In [2]:
#importing Libraries

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.linear_model
import seaborn as sns
from sklearn.metrics import r2_score
import pprint
from rdkit import Chem
from rdkit.Chem import Descriptors

# Read the data and drop unneeded columns.
# NOTE(review): the path below is absolute and machine-specific; the notebook
# only runs where this exact file exists.

data = pd.read_csv("/Users/iainquinn/Python Projects/Data Sets/Newest Data Set .csv")
data = data[[ "smiles", "Source", "Method", "Molecule", "solvent","induction_time" ,"Supersaturation", "solvent_smiles"]].copy()

# Two log-transformed columns: ln(induction time) and ln(supersaturation - 1).
# np.log gives NaN / -inf for rows where Supersaturation <= 1.

data['ln_t'] = np.log(data['induction_time'])
data['ln_(S-1)'] = np.log(data['Supersaturation']-1)

# Combined "<Molecule> <Source>" key, built with one vectorised string
# concatenation instead of a row-by-row iterrows() loop.

data['molecule_source'] = data['Molecule'] + ' ' + data['Source']
moleculesource_lst = data['molecule_source'].tolist()  # plain-list copy of the key column

# One subset per crystallisation method.

data_anti = data.loc[data.Method == "Antisolvent"]
data_cool = data.loc[data.Method == "Cooling"]

# Unique molecules: overall and per method.

molecule_list = data.Molecule.unique()
molecule_list_anti = data_anti.Molecule.unique()
molecule_list_cool = data_cool.Molecule.unique()

# Unique molecule-source keys: overall (sorted by np.unique) and per method
# (in order of first appearance).

molsource_list = np.unique(moleculesource_lst)
molsource_list_anti = data_anti.molecule_source.unique()
molsource_list_cool = data_cool.molecule_source.unique()



In [3]:
def fit_induction_lines(method_data, group_names, method_label):
    """Fit ln_t against ln_(S-1) once per molecule-source group.

    Parameters
    ----------
    method_data : pandas.DataFrame
        Rows belonging to a single crystallisation method. Must contain the
        columns 'molecule_source', 'ln_t', 'ln_(S-1)', 'Supersaturation',
        'smiles' and 'solvent_smiles'.
    group_names : iterable
        The 'molecule_source' values to fit, one regression per value.
    method_label : str
        Value written to the 'method' column of every output row.

    Returns
    -------
    pandas.DataFrame
        One row per group with the columns molecule_source, slope, intercept,
        r2, median (of Supersaturation), method, smiles and solvent_smiles.
    """
    columns = ['molecule_source', 'slope', 'intercept', 'r2', 'median',
               'method', 'smiles', 'solvent_smiles']
    records = []
    for name in group_names:
        # Select from the per-method subset only, so a molecule-source key
        # that exists under both methods is not fitted on mixed rows.
        group = method_data.loc[method_data.molecule_source == name]
        X = group[["ln_(S-1)"]].to_numpy()
        y = group["ln_t"].to_numpy()

        model = sklearn.linear_model.LinearRegression()
        model.fit(X, y)

        records.append({
            'molecule_source': name,
            # With a 1-D target, coef_ has shape (1,) and intercept_ is a
            # scalar, so no multi-dimensional array is converted to float.
            'slope': float(model.coef_[0]),
            'intercept': float(model.intercept_),
            'r2': r2_score(y, model.predict(X)),
            'median': group["Supersaturation"].median(),
            'method': method_label,
            # The first row of the group supplies the SMILES strings.
            'smiles': str(group.iloc[0]['smiles']),
            'solvent_smiles': str(group.iloc[0]['solvent_smiles']),
        })
    return pd.DataFrame(records, columns=columns)


df_anti = fit_induction_lines(data_anti, molsource_list_anti, 'antisolvent')
df_cool = fit_induction_lines(data_cool, molsource_list_cool, 'cooling')

df = pd.concat([df_anti, df_cool], ignore_index=True)
df.head()

Unnamed: 0,molecule_source,slope,intercept,r2,median,method,smiles,solvent_smiles
0,Abecarnil Beckmann_1999,-2.679235,3.495162,0.901418,1.773416,antisolvent,CC(C)OC(=O)C1=NC=C2C(=C1COC)C3=C(N2)C=CC(=C3)O...,C(C)(=O)OC(C)C
1,Benzoic acid Zhao_2019,-1.054949,2.597384,0.099874,1.346847,antisolvent,C1=CC=C(C=C1)C(=O)O,O
2,Cefodizime sodium Zhang_2013,-0.747076,5.079866,0.123162,2.247142,antisolvent,CC1=C(SC(=N1)SCC2=C(N3C(C(C3=O)NC(=O)C(=NOC)C4...,C(C)O
3,Cefuroxime Sodium Zhao_2012,-2.99277,3.383713,0.894575,1.457791,antisolvent,CON=C(C1=CC=CO1)C(=O)NC2C3N(C2=O)C(=C(CS3)COC(...,O
4,Dexamethasone Hao_2005,-1.41273,5.591404,0.256615,2.082191,antisolvent,CC1CC2C3CCC4=CC(=O)C=CC4(C3(C(CC2(C1(C(=O)CO)O...,CO


In [6]:
# Remove all poorly correlated fits: keep only rows whose r2 exceeds the cutoff.
R2_THRESHOLD = 0.5

# The index is reset to 0..n-1 because the descriptor tables are joined onto
# df by index further down.
df = df.loc[df.r2 > R2_THRESHOLD].reset_index(drop=True)

good_list = df['molecule_source'].tolist()

smiles_list = df.smiles.tolist()
solvent_smiles_list = df.solvent_smiles.tolist()

df.to_csv('19_datasets.csv')

# Preview goes last: a notebook cell only renders its final expression.
df.head(30)

In [4]:
# Compute every RDKit descriptor for each solute SMILES (one row per molecule).
# Descriptors.descList holds (name, function) pairs, so the functions are
# called directly rather than through exec() on a built string, and each
# SMILES is parsed once instead of once per descriptor.

mol_descriptors = [desc[0] for desc in Descriptors.descList]
dict_list=[]

for smile in smiles_list:
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # MolFromSmiles returns None for a string it cannot parse.
        raise ValueError(f"RDKit could not parse solute SMILES: {smile!r}")
    dict_list.append({name: func(mol) for name, func in Descriptors.descList})

# Columns are the bare descriptor names, in descList order.
solute_df_desc = pd.DataFrame(dict_list, columns=mol_descriptors)

In [5]:
# Compute every RDKit descriptor for each solvent SMILES (one row per entry).
# Descriptors.descList holds (name, function) pairs, so the functions are
# called directly rather than through exec() on a built string, and each
# SMILES is parsed once instead of once per descriptor.

mol_descriptors = [desc[0] for desc in Descriptors.descList]
dict_list=[]

for smile in solvent_smiles_list:
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # MolFromSmiles returns None for a string it cannot parse.
        raise ValueError(f"RDKit could not parse solvent SMILES: {smile!r}")
    # Prefix the names so they do not collide with the solute descriptor columns.
    dict_list.append({'solvent_' + name: func(mol) for name, func in Descriptors.descList})

solvent_df_desc = pd.DataFrame(dict_list, columns=['solvent_' + name for name in mol_descriptors])

In [6]:
# Attach the solute descriptor columns, then the solvent ones, to the fit
# table. Both joins align on the row index.
new_df = df.join(solute_df_desc).join(solvent_df_desc)
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Columns: 424 entries, molecule_source to solvent_fr_urea
dtypes: float64(212), int64(208), object(4)
memory usage: 63.1+ KB


In [7]:
# Correlation of every numeric column with the fitted slope, largest first.
# new_df also holds text columns; corr() raises on those in pandas >= 2.0,
# so the frame is restricted to numeric columns first.

corr_matrix = new_df.select_dtypes(include="number").corr()
corr = corr_matrix["slope"].sort_values(ascending=False)
corr.head(7)

slope                  1.000000
intercept              0.623847
BCUT2D_MRHI            0.476903
fr_aniline             0.475998
solvent_EState_VSA6    0.472545
SlogP_VSA10            0.426702
BCUT2D_MWHI            0.411075
Name: slope, dtype: float64

In [8]:
# Correlation of every numeric column with the median supersaturation,
# largest first. new_df also holds text columns; corr() raises on those in
# pandas >= 2.0, so the frame is restricted to numeric columns first.

corr_matrix = new_df.select_dtypes(include="number").corr()
corr = corr_matrix["median"].sort_values(ascending=False)
corr.head(6)

median                 1.000000
MaxAbsPartialCharge    0.533690
PEOE_VSA8              0.530252
MaxPartialCharge       0.477536
fr_imidazole           0.447682
VSA_EState1            0.415509
Name: median, dtype: float64

In [None]:
#importing Libraries

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.linear_model
import seaborn as sns
from sklearn.metrics import r2_score
import pprint
from rdkit import Chem
from rdkit.Chem import Descriptors
# Reload the raw table, keep only the columns of interest, and preview it.
raw_columns = ["smiles", "Source", "Method", "Molecule", "solvent",
               "induction_time", "Supersaturation", "solvent_smiles"]
data = pd.read_csv("/Users/iainquinn/Crystalisation Internship/Data/DataRaw.csv")
data = data.loc[:, raw_columns].copy()
data.head()
