In [1]:
#importing Libraries

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.linear_model
import seaborn as sns
from sklearn.metrics import r2_score

In [2]:
# Read the raw data and keep only the columns this notebook needs.

data = pd.read_csv("/Users/iainquinn/Crystalisation Internship/Data/DataRaw.csv")
data = data[["Smiles", "Source", "Method", "Molecule", "solvent", "induction_time", "Supersaturation"]].copy()

# Two derived columns: ln(induction_time) and ln(Supersaturation - 1).
# NOTE(review): np.log gives -inf/NaN when induction_time <= 0 or
# Supersaturation <= 1 -- confirm the raw data contains no such rows.

data['ln_t'] = np.log(data['induction_time'])
data['ln_(S-1)'] = np.log(data['Supersaturation'] - 1)

# Label every row "<Molecule> <Source>". A vectorised string concat replaces
# the former row-by-row iterrows() loop (same labels, far less overhead).
data['molecule_source'] = data['Molecule'] + ' ' + data['Source']

# Kept as a plain Python list because a later cell passes it to np.unique.
moleculesource_lst = data['molecule_source'].tolist()

# One dataset per crystallisation method.

data_anti = data.loc[data.Method == "Antisolvent"]
data_cool = data.loc[data.Method == "Cooling"]

# To preview the simplified data, uncomment:
#data.head()

In [3]:
# Raise the resolution of figures, both as displayed and as saved to disk.

_dpi_settings = {
    "figure.dpi": 400,
    'savefig.dpi': 800,
}
sns.set(rc=_dpi_settings)


In [4]:
# Unique molecules, sources, solvents and molecule-source labels, collected
# for the full dataset and for each method-specific subset.

# Full dataset
molecule_list = data["Molecule"].unique()
source_list = data["Source"].unique()
solvent_list = data["solvent"].unique()
molsource_list = np.unique(moleculesource_lst)

# Antisolvent subset
molecule_list_anti = data_anti["Molecule"].unique()
source_list_anti = data_anti["Source"].unique()
solvent_list_anti = data_anti["solvent"].unique()
molsource_list_anti = data_anti["molecule_source"].unique()

# Cooling subset
molecule_list_cool = data_cool["Molecule"].unique()
source_list_cooling = data_cool["Source"].unique()
solvent_list_cool = data_cool["solvent"].unique()
molsource_list_cool = data_cool["molecule_source"].unique()

In [None]:
# Create and save one linear-regression plot (ln_t against ln_(S-1)) for every
# molecule-source label found in the cooling data.

for entry in molsource_list_cool:
    # Filter the cooling subset rather than the full dataset, so a label that
    # also occurs under another method cannot leak those rows into a figure
    # stored in the "Cool" folder.
    plot = sns.lmplot(y="ln_t", x="ln_(S-1)", height=4,
                      data=data_cool.loc[data_cool.molecule_source == entry]).set(title=entry)
    plot.savefig("/Users/iainquinn/Crystalisation Internship/Figures/Molecule-Source Figs/Cool/" + entry + '.png')
    # Close the figure once it is on disk; otherwise every high-DPI figure
    # stays open for the lifetime of the kernel. The figures are therefore
    # written to file instead of being rendered inline.
    plt.close(plot.fig)

In [5]:
def _fit_induction_models(frame, method_label):
    """Fit ln_t against ln_(S-1) for every molecule-source label in `frame`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows belonging to one crystallisation method. Must contain the columns
        'molecule_source', 'ln_t', 'ln_(S-1)' and 'Supersaturation'.
    method_label : str
        Value stored in the 'method' column of every returned row.

    Returns
    -------
    pandas.DataFrame
        One row per molecule-source label with the columns 'molecule_source',
        'slope', 'intercept', 'r2', 'median' (median Supersaturation of the
        fitted rows) and 'method'.
    """
    columns = ['molecule_source', 'slope', 'intercept', 'r2', 'median', 'method']
    rows = []
    for entry in frame["molecule_source"].unique():
        subset = frame.loc[frame["molecule_source"] == entry]
        # Column vectors of shape (n, 1), as LinearRegression expects for X.
        X = np.c_[subset["ln_(S-1)"]]
        Y = np.c_[subset["ln_t"]]
        model = sklearn.linear_model.LinearRegression()
        model.fit(X, Y)
        rows.append({
            'molecule_source': entry,
            # coef_ / intercept_ are size-1 arrays here; .item() extracts the
            # scalar without float(array), which NumPy has deprecated for
            # arrays with ndim > 0.
            'slope': model.coef_.item(),
            'intercept': model.intercept_.item(),
            'r2': r2_score(Y, model.predict(X)),
            'median': subset["Supersaturation"].median(),
            'method': method_label,
        })
    # Passing `columns` keeps the schema fixed even when `frame` is empty.
    return pd.DataFrame(rows, columns=columns)


# Each method is fitted on its own subset, so the 'method' column always
# describes the rows that were actually used for the regression.
df_anti = _fit_induction_models(data_anti, 'antisolvent')
df_cool = _fit_induction_models(data_cool, 'cooling')

df = pd.concat([df_anti, df_cool], ignore_index=True)


In [None]:
# Scatter of fitted slope against median supersaturation, coloured by r2,
# saved into the molecule-source figure folder.
_scatter_options = dict(
    kind='scatter',
    x="slope",
    y="median",
    c="r2",
    colorbar=True,
    alpha=0.9,
    cmap=plt.get_cmap("jet"),
    figsize=(7, 5),
)
df.plot(**_scatter_options)

plt.xlabel("Slope")
plt.ylabel("Median Supersaturation")
plt.title('Molecule & Source')

_figure_dir = '/Users/iainquinn/Crystalisation Internship/Figures/Molecule-Source Figs/'
plt.savefig(_figure_dir + 'figure.png')

In [6]:
# Split the fits at r2 = 0.5. The good set uses >= so that a fit with r2 of
# exactly 0.5 is not silently dropped from both frames.
# Rows whose r2 is NaN still belong to neither frame.
df_poor = df.loc[df.r2 < 0.5]
df_good = df.loc[df.r2 >= 0.5]

In [7]:
# Molecule-source labels of the poorly and well fitted groups, as plain lists.
poor_graphs = df_poor.loc[:, 'molecule_source'].tolist()
good_graphs = df_good.loc[:, 'molecule_source'].tolist()

In [None]:
# Save a regression plot (ln_t against ln_(S-1)) for every poorly fitted
# molecule-source label.
poor_graphs = df_poor['molecule_source'].tolist()

for entry in poor_graphs:
    plot = sns.lmplot(y="ln_t", x="ln_(S-1)", height=4,
                      data=data.loc[data.molecule_source == entry]).set(title=entry)
    plot.savefig("/Users/iainquinn/Crystalisation Internship/Figures/Poor Graphs/" + entry + '.png')
    # Close the figure once it is on disk; otherwise every high-DPI figure
    # stays open for the lifetime of the kernel. The figures are therefore
    # written to file instead of being rendered inline.
    plt.close(plot.fig)