In [1]:
#importing Libraries

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.linear_model
import seaborn as sns
from sklearn.metrics import r2_score
import pprint
from rdkit import Chem
from rdkit.Chem import Descriptors

#Reading the data & dropping unneeded columns

# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs on the original author's machine -- consider a configurable data directory.
data = pd.read_csv("/Users/iainquinn/Crystalisation Internship/Data/DataRaw.csv")
data = data[[ "smiles", "Source", "Method", "Molecule", "solvent","induction_time" ,"Supersaturation", "solvent_smiles"]].copy()

#Making two new columns: ln(induction_time) and ln(Supersaturation - 1)
# NOTE(review): np.log yields -inf/NaN for induction_time <= 0 or
# Supersaturation <= 1; presumably the raw data contains no such rows -- verify.

data['ln_t'] = np.log(data['induction_time'])
data['ln_(S-1)'] = np.log(data['Supersaturation']-1)

#Adding a new column called molecule_source ("<Molecule> <Source>")
# Vectorised string concatenation replaces the former row-by-row iterrows() loop.

data['molecule_source'] = data['Molecule'] + ' ' + data['Source']

# Kept as a plain list because the name is reused below (and possibly elsewhere).
moleculesource_lst = data['molecule_source'].tolist()

#Making two new datasets, one per crystallisation method

data_anti = data.loc[data.Method == "Antisolvent"]
data_cool = data.loc[data.Method == "Cooling"]

#Showing the simplified data
#data.head()

#Making lists of each unique molecule in the datasheet

molecule_list = data.Molecule.unique()
molecule_list_anti = data_anti.Molecule.unique()
molecule_list_cool = data_cool.Molecule.unique()

#Making a list of each unique molecule-source object in the datasheet
# np.unique returns a sorted array; Series.unique keeps order of first appearance.

molsource_list = np.unique(moleculesource_lst)
molsource_list_anti = data_anti.molecule_source.unique()
molsource_list_cool = data_cool.molecule_source.unique()

In [9]:
def _fit_induction_models(frame, entries, method_label):
    """Fit ln_t = slope * ln_(S-1) + intercept for each molecule_source series.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to fit; must contain the columns 'molecule_source', 'ln_t',
        'ln_(S-1)', 'Supersaturation', 'smiles' and 'solvent_smiles'.
    entries : iterable of str
        molecule_source values to fit, one regression per value.
    method_label : str
        Value written to the 'method' column of every result row.

    Returns
    -------
    pandas.DataFrame
        One row per entry with the columns 'molecule_source', 'slope',
        'intercept', 'r2', 'median', 'method', 'smiles', 'solvent_smiles'.
        'median' is the median Supersaturation of the fitted rows; 'smiles' and
        'solvent_smiles' are taken from the first fitted row.
    """
    columns = ['molecule_source', 'slope', 'intercept', 'r2', 'median',
               'method', 'smiles', 'solvent_smiles']
    records = []
    for entry in entries:
        data_set = frame.loc[frame.molecule_source == entry]
        X = data_set[["ln_(S-1)"]].to_numpy()   # 2-D feature matrix, one column
        Y = data_set["ln_t"].to_numpy()         # 1-D target
        model = sklearn.linear_model.LinearRegression()
        model.fit(X, Y)
        prediction = model.predict(X)
        records.append({
            'molecule_source': entry,
            # With a 1-D target coef_ has shape (1,) and intercept_ is a scalar,
            # which avoids calling float() on a multi-dimensional array
            # (deprecated in recent NumPy releases).
            'slope': float(model.coef_[0]),
            'intercept': float(model.intercept_),
            'r2': r2_score(Y, prediction),
            'median': data_set["Supersaturation"].median(),
            'method': method_label,
            'smiles': str(data_set.iloc[0]['smiles']),
            'solvent_smiles': str(data_set.iloc[0]['solvent_smiles']),
        })
    return pd.DataFrame(records, columns=columns)


# Each method is fitted on its own subset, so a molecule_source that appears
# under both methods is no longer regressed on the two methods' rows mixed.
df_anti = _fit_induction_models(data_anti, molsource_list_anti, 'antisolvent')
df_cool = _fit_induction_models(data_cool, molsource_list_cool, 'cooling')

df = pd.concat([df_anti, df_cool], ignore_index=True)
df.head()

Unnamed: 0,molecule_source,slope,intercept,r2,median,method,smiles,solvent_smiles
0,Abecarnil Beckmann_1999,-2.679235,3.495162,0.901418,1.773416,antisolvent,CC(C)OC(=O)C1=NC=C2C(=C1COC)C3=C(N2)C=CC(=C3)O...,C(C)(=O)OC(C)C
1,Benzoic acid Zhao_2019,-1.054949,2.597384,0.099874,1.346847,antisolvent,C1=CC=C(C=C1)C(=O)O,O
2,Cefodizime sodium Zhang_2013,-0.747076,5.079866,0.123162,2.247142,antisolvent,CC1=C(SC(=N1)SCC2=C(N3C(C(C3=O)NC(=O)C(=NOC)C4...,C(C)O
3,Cefuroxime Sodium Zhao_2012,-2.99277,3.383713,0.894575,1.457791,antisolvent,CON=C(C1=CC=CO1)C(=O)NC2C3N(C2=O)C(=C(CS3)COC(...,O
4,Dexamethasone Hao_2005,-1.41273,5.591404,0.256615,2.082191,antisolvent,CC1CC2C3CCC4=CC(=O)C=CC4(C3(C(CC2(C1(C(=O)CO)O...,CO


In [10]:
#Removing all poorly correlated data

# Minimum R^2 a per-series fit must exceed to be kept.
R2_THRESHOLD = 0.5

# Start again from the per-method fit tables, which this cell never modifies, so
# the cell can be re-run safely (previously it overwrote `df` and dropped the
# very columns it reads, so a second run failed on `df.smiles`).
df_fits = pd.concat([df_anti, df_cool], ignore_index=True)
df_good = df_fits.loc[df_fits.r2 > R2_THRESHOLD].reset_index(drop=True)

good_list = df_good['molecule_source'].tolist()

# Kept as separate lists, in the same row order as `df`.
smiles_list = df_good.smiles.tolist()
solvent_list = df_good.solvent_smiles.tolist()

df = df_good.drop(columns=['smiles', 'solvent_smiles'])

df.head()

Unnamed: 0,molecule_source,slope,intercept,r2,median,method
0,Abecarnil Beckmann_1999,-2.679235,3.495162,0.901418,1.773416,antisolvent
1,Cefuroxime Sodium Zhao_2012,-2.99277,3.383713,0.894575,1.457791,antisolvent
2,Glycine Ramakers 2020,-2.103595,4.009274,0.731073,1.37,antisolvent
3,L-Histidine Liu_2018,-1.12089,5.36831,0.817177,3.112457,antisolvent
4,Paracetamol O'Ciardha_2011,-2.393867,4.126005,0.56771,1.245241,antisolvent


In [8]:
# Compute every RDKit descriptor for each solvent SMILES in `solvent_list`.
# Descriptors.descList holds (name, function) pairs, so the functions are called
# directly instead of building source strings and running them through exec().
mol_descriptors = [desc[0] for desc in Descriptors.descList]
dict_list=[]
descriptor_cache = {}   # SMILES -> descriptor dict; many rows share a solvent

for smile in solvent_list:
    if smile not in descriptor_cache:
        # Parse once per unique SMILES rather than once per descriptor.
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            raise ValueError(f"RDKit could not parse solvent SMILES: {smile!r}")
        my_dict = {name: func(mol) for name, func in Descriptors.descList}
        my_dict['solvent_smiles'] = smile
        descriptor_cache[smile] = my_dict
    # Copy so each row owns its own dict, as in the original one-dict-per-row list.
    dict_list.append(dict(descriptor_cache[smile]))

# One row per entry of solvent_list, in the same order.
df_desc = pd.DataFrame(dict_list)
df_desc.head(20)

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,solvent_smiles
0,10.031944,-0.212963,10.031944,0.025463,0.460748,102.133,92.053,102.06808,42,0,...,0,0,0,0,0,0,0,0,0,C(C)(=O)OC(C)C
1,0.0,0.0,0.0,0.0,0.327748,18.015,15.999,18.010565,8,0,...,0,0,0,0,0,0,0,0,0,O
2,0.0,0.0,0.0,0.0,0.327748,18.015,15.999,18.010565,8,0,...,0,0,0,0,0,0,0,0,0,O
3,0.0,0.0,0.0,0.0,0.327748,18.015,15.999,18.010565,8,0,...,0,0,0,0,0,0,0,0,0,O
4,7.0,1.0,7.0,1.0,0.385284,32.042,28.01,32.026215,14,0,...,0,0,0,0,0,0,0,0,0,CO
5,8.055556,-0.166667,8.055556,0.166667,0.428405,60.096,52.032,60.057515,26,0,...,0,0,0,0,0,0,0,0,0,C(C)(C)O
6,0.0,0.0,0.0,0.0,0.327748,18.015,15.999,18.010565,8,0,...,0,0,0,0,0,0,0,0,0,O
7,7.0,1.0,7.0,1.0,0.385284,32.042,28.01,32.026215,14,0,...,0,0,0,0,0,0,0,0,0,CO
8,0.0,0.0,0.0,0.0,0.327748,18.015,15.999,18.010565,8,0,...,0,0,0,0,0,0,0,0,0,O
9,7.569444,0.25,7.569444,0.25,0.406808,46.069,40.021,46.041865,20,0,...,0,0,0,0,0,0,0,0,0,C(C)O


In [12]:
#Joining the descriptors to the dataset

# DataFrame.join aligns on the index, i.e. row i of `df` is paired with row i of
# `df_desc`. That is only meaningful if `df_desc` was built from the current
# `solvent_list`; a stale `df_desc` (cells run out of order) would silently attach
# the wrong solvent descriptors, so check before joining.
assert len(df) == len(df_desc), "df and df_desc have different numbers of rows"
assert df_desc['solvent_smiles'].tolist() == solvent_list, (
    "df_desc is out of date with solvent_list - re-run the descriptor cell"
)

new_df=df.join(df_desc)

#new_df.head()

In [16]:
#Making a correlation matrix for the slope

# new_df also holds text columns (molecule_source, method, solvent_smiles).
# Recent pandas versions raise on those in DataFrame.corr() instead of silently
# skipping them, so restrict to numeric columns explicitly.
corr_matrix = new_df.select_dtypes(include="number").corr()

# Signed correlations, largest first; strong negative correlations end up at the
# bottom of this ordering.
corr = corr_matrix["slope"].sort_values(ascending=False)
corr.head(6)

slope               1.000000
intercept           0.623847
EState_VSA6         0.472545
median              0.410247
MinPartialCharge    0.378954
MolLogP             0.368870
Name: slope, dtype: float64

In [18]:
#Making a correlation matrix for the median

# new_df also holds text columns (molecule_source, method, solvent_smiles).
# Recent pandas versions raise on those in DataFrame.corr() instead of silently
# skipping them, so restrict to numeric columns explicitly.
corr_matrix = new_df.select_dtypes(include="number").corr()

# Signed correlations, largest first; strong negative correlations end up at the
# bottom of this ordering.
corr = corr_matrix["median"].sort_values(ascending=False)
corr.head(7)

median              1.000000
slope               0.410247
r2                  0.395398
intercept           0.390080
Kappa3              0.373254
MinPartialCharge    0.211977
EState_VSA6         0.206416
Name: median, dtype: float64