In [1]:
import pandas as pd

# Read the quinone dataset from disk
dfQuinone = pd.read_csv('Quinone.csv')

# Partition the rows at 2.5 on the 'VoltageQuinone' column: strictly above
# goes to the high-voltage subset, at or below goes to the low-voltage subset
# (a missing voltage satisfies neither comparison, so such a row lands in neither)
quinone_voltage = dfQuinone['VoltageQuinone']
high_voltage_Quinone = dfQuinone[quinone_voltage.gt(2.5)]
low_voltage_Quinone = dfQuinone[quinone_voltage.le(2.5)]


In [2]:
# Show (rows, columns) of the high-voltage quinone subset
high_voltage_Quinone.shape

(438, 2)

In [3]:
# Show (rows, columns) of the low-voltage quinone subset
low_voltage_Quinone.shape

(203, 2)

In [4]:

# Sample sizes per voltage group and the seed that makes every draw reproducible.
# The two groups are NOT sampled equally: 400 high-voltage vs. 200 low-voltage rows.
N_HIGH_VOLTAGE_QUINONE = 400
N_LOW_VOLTAGE_QUINONE = 200
QUINONE_SAMPLE_SEED = 42

# Randomly select 400 high-voltage and 200 low-voltage molecules.
# `sample` draws without replacement by default, so each group must contain
# at least as many rows as requested or pandas raises a ValueError.
sample_high_voltage_Quinone = high_voltage_Quinone.sample(
    n=N_HIGH_VOLTAGE_QUINONE, random_state=QUINONE_SAMPLE_SEED
)
sample_low_voltage_Quinone = low_voltage_Quinone.sample(
    n=N_LOW_VOLTAGE_QUINONE, random_state=QUINONE_SAMPLE_SEED
)

# Combine the samples and shuffle the combined dataset (frac=1 returns every row in random order)
final_sample_Quinone = pd.concat(
    [sample_high_voltage_Quinone, sample_low_voltage_Quinone]
).sample(frac=1, random_state=QUINONE_SAMPLE_SEED)

# Write the shuffled sample to a new CSV file, leaving out the index column
final_sample_Quinone.to_csv('Selected_Quinone.csv', index=False)

In [5]:
# Preview the first five rows of the shuffled quinone sample
final_sample_Quinone.head()

Unnamed: 0,SMILESquinone,VoltageQuinone
612,Clc1c(=O)c(=O)c(Cl)c(c12)c(Cl)cc(=O)c2=O,3.663789
145,Cc1ccc(C)c(c12)C(=O)C=CC2=O,2.390244
186,c1cccc(c12)C=C(N)C(=O)C2=O,2.477216
225,c1cccc(cc2)c1c(c2c34)ccc4C=CC(=O)C3=O,2.553843
353,C1C(=O)C(=O)C=c(c=12)c(N)ccc2N,2.898283


In [6]:
# Read the CSA dataset from disk
dfCSA = pd.read_csv('CSA.csv')

# Partition the rows at 2.5 on the 'VoltageCSA' column: strictly above
# goes to the high-voltage subset, at or below goes to the low-voltage subset
# (a missing voltage satisfies neither comparison, so such a row lands in neither)
csa_voltage = dfCSA['VoltageCSA']
high_voltage_CSA = dfCSA[csa_voltage.gt(2.5)]
low_voltage_CSA = dfCSA[csa_voltage.le(2.5)]

In [7]:
# Show (rows, columns) of the high-voltage CSA subset
high_voltage_CSA.shape

(8746, 2)

In [8]:
# Show (rows, columns) of the low-voltage CSA subset
low_voltage_CSA.shape

(2686, 2)

In [9]:
# Sample size per voltage group and the seed that makes every draw reproducible.
# Both groups contribute the same number of rows: 400 each.
N_PER_GROUP_CSA = 400
CSA_SAMPLE_SEED = 42

# Randomly select 400 molecules from each voltage group.
# `sample` draws without replacement by default, so each group must contain
# at least as many rows as requested or pandas raises a ValueError.
sample_high_voltage_CSA = high_voltage_CSA.sample(
    n=N_PER_GROUP_CSA, random_state=CSA_SAMPLE_SEED
)
sample_low_voltage_CSA = low_voltage_CSA.sample(
    n=N_PER_GROUP_CSA, random_state=CSA_SAMPLE_SEED
)

# Combine the samples and shuffle the combined dataset (frac=1 returns every row in random order)
final_sample_CSA = pd.concat(
    [sample_high_voltage_CSA, sample_low_voltage_CSA]
).sample(frac=1, random_state=CSA_SAMPLE_SEED)

# Write the shuffled sample to a new CSV file, leaving out the index column
final_sample_CSA.to_csv('Selected_CSA.csv', index=False)

In [10]:
# Preview the first five rows of the shuffled CSA sample
final_sample_CSA.head()

Unnamed: 0,SMILEScsa,VoltageCSA
4108,Cc1c(C)ccc(c12)/C(=N\S(=O)(=O)C)c3c(C\2=N\S(=O...,2.228517
652,COc1ccc(OC)c(c12)C=C(OC)C(=N/S(=O)(=O)C)\C\2=N...,2.428499
10968,N#Cc(c1)c(C#N)c(C#N)c(c1c23)/C(=N\S(=O)(=O)C)C...,3.194718
7187,CS(=O)(=O)\N=C\1C=C(N)C(=N/S(=O)(=O)C)\c2cc(N)...,2.275049
1402,CS(=O)(=O)\N=C\1/C(=N/S(=O)(=O)C)C=Cc(c2)c1cc(...,2.774487


In [11]:
# Read the organic-electrode publications table from its CSV file
publications_csv = 'OrganicElectrodesPublications.csv'
dfPubl = pd.read_csv(publications_csv)

In [12]:
# Preview the first five rows of the publications table
dfPubl.head()

Unnamed: 0,Voltage,SMILES
0,2.95,O=C1C2=C(O)C=CC(O)=C2C(=O)C2=C1C(=O)C1=C(C2=O)...
1,2.4,O=C(C1=C(C(S(=O)(O[Na])=O)=CC=C1)C2=O)C3=C2C=C...
2,2.4,OC(C1=C2C=CC=C1S(=O)(O)=O)=C3C(C(S(=O)(O)=O)=C...
3,2.25,O=C1C2=C(C=CC=C2)C(=O)C2=C1C=CC=C2S(=O)(=O)O[Na]
4,2.25,OC(C1=C2C=CC=C1S(=O)(O)=O)=C3C(C=CC=C3)=C2O


In [13]:
# Read the anima table from its CSV file
anima_csv = 'anima.csv'
dfAnima = pd.read_csv(anima_csv)

In [14]:
# Preview the first five rows of the anima table
dfAnima.head()

Unnamed: 0,VoltageAnima,SMILESanima
0,2.9716,CC1=C2C(=O)C(=NC(=O)C2=CC(=N)C1=O)C#C
1,2.9915,BrC1=NOC2=C(C(=O)C3=NON=C3C2=O)C1=O
2,3.0476,BrC1=NC2=C3C(O1)=NC=CN=C3C(Br)=NC2=O
3,3.2469,CC1=C(O)C2=C(SN=N1)C(=O)N=C(C#N)C2=O
4,3.353,BrC1=CN=C2C(=NC3=NC(=O)C1=C23)C(=O)C#N


In [15]:
# Give every dataframe the same two column names, 'SMILES' and 'Voltage', so that
# pd.concat lines the columns up by name instead of creating extra, half-empty ones.
# dfPubl already uses exactly these names, so it needs no rename.
dfAnima = dfAnima.rename(columns={'SMILESanima': 'SMILES', 'VoltageAnima': 'Voltage'})
final_sample_CSA = final_sample_CSA.rename(columns={'SMILEScsa': 'SMILES', 'VoltageCSA': 'Voltage'})
final_sample_Quinone = final_sample_Quinone.rename(columns={'SMILESquinone': 'SMILES', 'VoltageQuinone': 'Voltage'})

# Stack the four sources into one table; ignore_index=True discards the
# per-source row labels and numbers the result 0..n-1
merged_df = pd.concat([dfPubl, dfAnima, final_sample_CSA, final_sample_Quinone], ignore_index=True)

# Save the merged dataframe to a CSV file, leaving out the index column
merged_df.to_csv('OrganicElectrodesAll.csv', index=False)

In [16]:
# Show (rows, columns) of the merged dataset
merged_df.shape

(2457, 2)

In [17]:
# Preview the first five rows of the merged dataset
merged_df.head()

Unnamed: 0,Voltage,SMILES
0,2.95,O=C1C2=C(O)C=CC(O)=C2C(=O)C2=C1C(=O)C1=C(C2=O)...
1,2.4,O=C(C1=C(C(S(=O)(O[Na])=O)=CC=C1)C2=O)C3=C2C=C...
2,2.4,OC(C1=C2C=CC=C1S(=O)(O)=O)=C3C(C(S(=O)(O)=O)=C...
3,2.25,O=C1C2=C(C=CC=C2)C(=O)C2=C1C=CC=C2S(=O)(=O)O[Na]
4,2.25,OC(C1=C2C=CC=C1S(=O)(O)=O)=C3C(C=CC=C3)=C2O
