In [1]:
#importing Libraries

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.linear_model
import seaborn as sns
from sklearn.metrics import r2_score
import pprint
from rdkit import Chem

In [None]:
# Read the raw data and keep only the columns this analysis needs

# NOTE(review): absolute, machine-specific path - point DATA_PATH at your own copy of the file
DATA_PATH = "/Users/iainquinn/Crystalisation Internship/Data/DataRaw.csv"
KEEP_COLUMNS = ["smiles", "Source", "Method", "Molecule", "solvent", "induction_time", "Supersaturation"]

data = pd.read_csv(DATA_PATH)
data = data[KEEP_COLUMNS].copy()

# Add the two log-transformed columns: ln(induction_time) and ln(Supersaturation - 1)
# NOTE(review): np.log gives -inf/NaN when induction_time <= 0 or Supersaturation <= 1;
# confirm the raw data never contains such rows
data['ln_t'] = np.log(data['induction_time'])
data['ln_(S-1)'] = np.log(data['Supersaturation'] - 1)

# Add a "molecule_source" column of the form "<Molecule> <Source>".
# Built with vectorised string concatenation instead of a row-by-row iterrows() loop.
data['molecule_source'] = data['Molecule'] + ' ' + data['Source']

# Plain-list copy of the new column; the cell that builds molsource_list reads it
moleculesource_lst = data['molecule_source'].tolist()

# Make one dataset per crystallisation method

data_anti = data.loc[data.Method == "Antisolvent"]
data_cool = data.loc[data.Method == "Cooling"]

# Show the simplified data
data.head()

In [None]:
# Unique molecule names: overall, and within each method subset.
# Series.unique() keeps the values in order of first appearance.

molecule_list = data["Molecule"].unique()
molecule_list_anti = data_anti["Molecule"].unique()
molecule_list_cool = data_cool["Molecule"].unique()

# Unique molecule-source labels.
# The overall list goes through np.unique, so it comes back sorted;
# the per-method lists keep order of first appearance.

molsource_list = np.unique(moleculesource_lst)
molsource_list_anti = data_anti["molecule_source"].unique()
molsource_list_cool = data_cool["molecule_source"].unique()

In [None]:
def _fit_molecule_source_models(method_data, entries, method_label):
    """Fit ln_t against ln_(S-1) separately for each molecule-source label.

    Parameters
    ----------
    method_data : pandas.DataFrame
        Rows belonging to a single method. The columns 'molecule_source',
        'ln_t', 'ln_(S-1)', 'Supersaturation' and 'smiles' are read.
    entries : iterable
        The molecule_source labels to fit; one linear regression per label.
    method_label : str
        Value written to the 'method' column of every returned row.

    Returns
    -------
    pandas.DataFrame
        One row per entry with the columns 'molecule_source', 'slope',
        'intercept', 'r2', 'median', 'method' and 'smiles'.
    """
    result_columns = ['molecule_source', 'slope', 'intercept', 'r2', 'median', 'method', 'smiles']
    records = []

    for entry in entries:
        # Only rows of this method are used, so a molecule-source measured with
        # both methods is not fitted on a mixture of antisolvent and cooling data.
        subset = method_data.loc[method_data["molecule_source"] == entry]

        features = subset[["ln_(S-1)"]].to_numpy()   # 2-D, shape (n_samples, 1)
        target = subset["ln_t"].to_numpy()           # 1-D, so coef_ is 1-D and intercept_ a scalar

        model = sklearn.linear_model.LinearRegression()
        model.fit(features, target)
        prediction = model.predict(features)

        records.append({
            'molecule_source': entry,
            # Index the single coefficient explicitly: calling float() on a
            # multi-dimensional array is deprecated in recent NumPy releases.
            'slope': float(model.coef_[0]),
            'intercept': float(model.intercept_),
            'r2': r2_score(target, prediction),
            'median': subset["Supersaturation"].median(),
            'method': method_label,
            # str() of the array of unique SMILES (bracketed text such as "['CCO']");
            # the format is left unchanged for whatever reads this column later.
            'smiles': str(subset["smiles"].unique()),
        })

    return pd.DataFrame(records, columns=result_columns)


# One table of per-molecule-source fits for each method
df_anti = _fit_molecule_source_models(data_anti, molsource_list_anti, 'antisolvent')
df_cool = _fit_molecule_source_models(data_cool, molsource_list_cool, 'cooling')

# Combined table: antisolvent rows first, then cooling rows
df = pd.concat([df_anti, df_cool], ignore_index=True)
df.head(50)


In [None]:
# Split the fit table into two groups based on how good the R2 value is

R2_THRESHOLD = 0.5

# A fit is "good" when its R2 reaches the threshold; every other row is "poor".
# Taking the complement keeps the split exhaustive: rows with R2 exactly at the
# threshold or with an undefined (NaN) R2 are no longer dropped from both groups.
is_good_fit = df["r2"] >= R2_THRESHOLD

df_poor = df.loc[~is_good_fit]
df_good = df.loc[is_good_fit]

# List the molecule-source labels in each group

poor_list = df_poor['molecule_source'].tolist()
good_list = df_good['molecule_source'].tolist()

In [None]:
# Collect the 'smiles' entries of the good fits into a plain list and pretty-print it
smiles_list = df_good["smiles"].tolist()
pprint.pprint(smiles_list)

In [None]:
# Parse one SMILES string with RDKit; leaving `m` as the last expression makes the notebook display it
example_smiles = 'CC(C)OC(=O)C1=NC=C2C(=C1COC)C3=C(N2)C=CC(=C3)OCC4=CC=CC=C4'
m = Chem.MolFromSmiles(example_smiles)
m