In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# Load the survey data and attach a complexity estimate computed from the
# fixed linear equation below (the coefficients are hard-coded, not fitted here).
sf = pd.read_csv('parsed_survey.csv')
# Inspect the frame that was just loaded. `df` is only defined by a later
# cell, so printing `df.columns` here fails on a fresh kernel.
print(sf.columns)
ols_est = (
    -0.989
    + 0.0175 * sf['UNIQUETT']
    + 4.6 * sf['CHIRAL_ALLATOM_RATIO']
    + 0.51 * sf['MOE_2D_VDISTMA']
    - 0.514 * sf['MOE_2D_VDISTEQ']
)
# Place the estimate at column position 5 so it shows up in the preview.
sf.insert(5, 'PREDICTED_OLS', ols_est)
sf.head()

In [None]:
# Add natural-log versions of the two count descriptors; the regression
# formula refers to them by their `ln_` names.
for raw_col in ('TOTALATOM_COUNT', 'UNIQUETT'):
    sf['ln_' + raw_col] = np.log(sf[raw_col])

In [None]:
# Fit an OLS regression of meanComplexity on the chirality ratio and the two
# log-transformed count descriptors.
model = smf.ols(
    formula='meanComplexity ~ CHIRAL_ALLATOM_RATIO + ln_TOTALATOM_COUNT + ln_UNIQUETT',
    data=sf,
).fit()
# Preview the untransformed predictor columns (first 100 rows).
sf.loc[:, ['CHIRAL_ALLATOM_RATIO', 'TOTALATOM_COUNT', 'UNIQUETT']].head(100)

In [None]:
# Display the regression summary table for the model fitted on the survey frame `sf`.
model.summary()

In [None]:
# Score the molecules in parsed_gilead2.csv with the fitted model and save the result.
df = pd.read_csv('parsed_gilead2.csv')
# The model formula uses log-transformed counts, so derive them on this frame too.
for raw_col in ('TOTALATOM_COUNT', 'UNIQUETT'):
    df['ln_' + raw_col] = np.log(df[raw_col])
print(df.columns)

predicted = model.predict(df)
# Put the prediction at column position 1 so it is visible in the preview.
df.insert(loc=1, column='PREDICTED_OLS', value=predicted)
df.to_csv('gilead_results2.csv', index=False)
df.head(10)

In [None]:
# Store the model's in-sample fitted values on the survey frame, then their
# difference from the PREDICTED_RF column.
sf['FITTED'] = model.fittedvalues.to_numpy()
sf['DIFF'] = sf['FITTED'].sub(sf['PREDICTED_RF'])

In [None]:
# Build a long-format frame (one row per molecule and estimate type) and
# scatter both estimates against the molecule identifier.
# NOTE(review): the series labelled 'Random Forest (paper)' is read from the
# PREDICTED_OLS column, not PREDICTED_RF — confirm which column/label is intended.
p = pd.DataFrame({
    'molecule': sf['MOLECULE'],
    'type': 'Random Forest (paper)',
    'complexity': sf['PREDICTED_OLS'],
})
f = pd.DataFrame({
    'molecule': sf['MOLECULE'],
    'type': 'OLS Proposed',
    'complexity': sf['FITTED'],
})
pf = pd.concat([p, f])
pf['size'] = 0.5  # same marker size for every point

import plotly.express as px
fig = px.scatter(pf, x='molecule', y='complexity', color='type', size='size')

# Anchor the legend by its top-left corner near the top-left of the plot area.
legend_position = dict(yanchor="top", y=0.99, xanchor="left", x=0.005)
fig.update_layout(legend=legend_position)

fig.show()

In [None]:
# Print the selected columns of `df` as plain text.
# NOTE(review): when the notebook is run top to bottom, `df` here is the frame
# loaded from parsed_gilead2.csv. Up to this point 'FITTED' has only been added
# to `sf`, and no visible cell creates 'pred', so this raises KeyError unless
# the CSV already contains those columns — confirm the intended frame/columns.
print(df[['MOLECULE', 'PREDICTED_OLS', 'FITTED', 'pred']])

In [None]:
def rename_cols(col):
    """Strip a leading descriptor-family prefix from a column name.

    Names that start with ``DESCRIPTORCOMPLEXITY_`` or ``SP3CARBONS_`` are
    returned without that prefix; every other name is returned unchanged.
    Only the first matching prefix is removed.

    The prefix is matched at the start of the name. A substring test would
    also fire on names that merely contain the tag somewhere else, and the
    fixed-length slice would then chop off unrelated leading characters.

    Args:
        col: Column name (string).

    Returns:
        The column name with the recognised prefix removed, or ``col`` itself.
    """
    for prefix in ('DESCRIPTORCOMPLEXITY_', 'SP3CARBONS_'):
        if col.startswith(prefix):
            return col[len(prefix):]
    return col

In [None]:
# Load ci5001778_si_001.txt (this rebinds `df`) and shorten its column names
# with rename_cols; the cell output is the number of rows.
df = pd.read_csv('ci5001778_si_001.txt')
df.columns = list(map(rename_cols, df.columns))
len(df)

In [None]:
# Pairwise correlations between meanComplexity and every column from
# position 5 onward whose name does not contain 'MOE'.
cols = ['meanComplexity']
cols.extend(name for name in df.columns[5:] if 'MOE' not in name)
print(cols)
corr = df.loc[:, cols].corr()
# Colour the table so strong positive/negative correlations stand out.
corr.style.background_gradient(cmap='coolwarm')


In [None]:
# OLS on the untransformed descriptors (no log transform), fitted on `df`.
model2 = smf.ols(
    formula='meanComplexity ~ CHIRAL_ALLATOM_RATIO + TOTALATOM_COUNT + UNIQUETT',
    data=df,
).fit()
model2.summary()

In [None]:
# OLS using UNIQUETT, the chirality ratio and the two MOE_2D_VDIST* descriptors,
# fitted on `df`.
model_paper = smf.ols(
    formula='meanComplexity ~ UNIQUETT + CHIRAL_ALLATOM_RATIO + MOE_2D_VDISTMA + MOE_2D_VDISTEQ',
    data=df,
).fit()
model_paper.summary()

In [None]:
# In-sample fitted values from model2 and their residual against the
# regression target (fitted minus meanComplexity).
df['FITTED'] = model2.fittedvalues.to_numpy()
df['DIFF'] = df['FITTED'].sub(df['meanComplexity'])

In [None]:
# Long-format frame with the target and fitted values per molecule. It is
# assembled here but the figure in this cell does not read it.
# NOTE(review): the rows labelled 'Predicted' hold meanComplexity, which is
# the regression target — confirm the label.
p = pd.DataFrame({
    'molecule': df['MOLECULE'],
    'type': 'Predicted',
    'complexity': df['meanComplexity'],
})
f = pd.DataFrame({
    'molecule': df['MOLECULE'],
    'type': 'Fitted',
    'complexity': df['FITTED'],
})
pf = pd.concat([p, f])
pf['size'] = 0.5

import plotly.express as px
# Scatter of the residual column against the row index.
fig = px.scatter(df, y='DIFF')
fig.show()

In [None]:
# Recompute model2's fitted values and residuals, then plot the residuals.
df['FITTED'] = model2.fittedvalues.to_numpy()
df['DIFF'] = df['FITTED'].sub(df['meanComplexity'])

# Long-format frame with the target and fitted values per molecule. It is
# assembled here but the figure in this cell does not read it.
p = pd.DataFrame({
    'molecule': df['MOLECULE'],
    'type': 'Predicted',
    'complexity': df['meanComplexity'],
})
f = pd.DataFrame({
    'molecule': df['MOLECULE'],
    'type': 'Fitted',
    'complexity': df['FITTED'],
})
pf = pd.concat([p, f])
pf['size'] = 0.5

import plotly.express as px
# Scatter of the residual column against the row index.
fig = px.scatter(df, y='DIFF')
fig.show()

In [None]:
# Residual scatter for model2 (fitted value minus meanComplexity).
# The same statement sequence used to appear twice in this cell and drew the
# identical figure twice; it runs once here. FITTED/DIFF are recomputed first
# so the cell does not rely on another cell having populated them.
df['FITTED'] = model2.fittedvalues.values
df['DIFF'] = df['FITTED'] - df['meanComplexity']

# Long-format frame with the target and fitted values per molecule. The
# figure below does not read it; it is kept so `p`, `f` and `pf` are still
# defined after this cell runs.
p = pd.DataFrame({
    'molecule': df['MOLECULE'],
    'type': 'Predicted',
    'complexity': df['meanComplexity'],
})
f = pd.DataFrame({
    'molecule': df['MOLECULE'],
    'type': 'Fitted',
    'complexity': df['FITTED'],
})
pf = pd.concat([p, f])
pf['size'] = 0.5

import plotly.express as px
# Scatter of the residual column against the row index.
fig = px.scatter(df, y='DIFF')
fig.show()

In [None]:
import numpy as np

def cummulative(diffs, n_steps=1000):
    """Return the empirical cumulative distribution of absolute errors.

    The absolute values of ``diffs`` are taken, and ``n_steps`` evenly spaced
    thresholds are generated starting at the smallest absolute value with a
    spacing of ``(max - min) / n_steps``. For each threshold the percentage
    of entries whose absolute value is strictly below it is computed.

    Because the last threshold lies one step below the maximum and the
    comparison is strict, the curve starts at 0 and stops short of 100.

    Args:
        diffs: 1-D array-like of signed numeric errors (NumPy array, list,
            pandas Series, ...).
        n_steps: Number of thresholds to evaluate. Defaults to 1000.

    Returns:
        A tuple ``(x, y)`` of two lists of length ``n_steps``: ``x`` holds the
        thresholds and ``y`` the matching percentages, truncated to integers.
        Both lists are empty when ``diffs`` is empty.

    Raises:
        ValueError: If ``n_steps`` is smaller than 1.
    """
    if n_steps < 1:
        raise ValueError('n_steps must be a positive integer')

    # np.abs/np.asarray accept plain lists as well; builtin abs() does not.
    abs_diffs = np.abs(np.asarray(diffs, dtype=float))
    total = abs_diffs.size
    if total == 0:
        return [], []

    # Sorting once lets every threshold be counted with a binary search
    # instead of rescanning the whole array for each step.
    ordered = np.sort(abs_diffs)
    lowest, highest = ordered[0], ordered[-1]
    step = (highest - lowest) / n_steps
    thresholds = lowest + np.arange(n_steps) * step

    # side='left' gives the number of entries strictly below each threshold.
    below = np.searchsorted(ordered, thresholds, side='left')
    percent = (100 * below / total).astype(int)  # truncate, as int() would
    return thresholds.tolist(), percent.tolist()

# cummulative(np.array(df['DIFF']))

In [None]:
# Histogram of model2's residuals (fitted value minus meanComplexity).
# The same statement sequence used to appear twice in this cell and drew two
# histograms of the same data (x='DIFF', then x='Error'); it runs once here.
# FITTED/DIFF are recomputed first so the cell does not rely on another cell
# having populated them.
df['FITTED'] = model2.fittedvalues.values
df['DIFF'] = df['FITTED'] - df['meanComplexity']

# Long-format frame with the target and fitted values per molecule. The
# figure below does not read it; it is kept so `p`, `f` and `pf` are still
# defined after this cell runs.
p = pd.DataFrame({
    'molecule': df['MOLECULE'],
    'type': 'Predicted',
    'complexity': df['meanComplexity'],
})
f = pd.DataFrame({
    'molecule': df['MOLECULE'],
    'type': 'Fitted',
    'complexity': df['FITTED'],
})
pf = pd.concat([p, f])
pf['size'] = 0.5

# Copy of DIFF under a reader-friendly name; it becomes the x-axis label.
df['Error'] = df['DIFF']

import plotly.express as px
fig = px.histogram(df, x='Error')
fig.show()

In [None]:
# Cumulative curve of model2's absolute residuals.
# The same statement sequence used to appear twice in this cell and drew the
# identical figure twice; it runs once here. FITTED/DIFF are recomputed first
# so the cell does not rely on another cell having populated them.
df['FITTED'] = model2.fittedvalues.values
df['DIFF'] = df['FITTED'] - df['meanComplexity']

# Long-format frame with the target and fitted values per molecule. The
# figure below does not read it; it is kept so `p`, `f` and `pf` are still
# defined after this cell runs.
p = pd.DataFrame({
    'molecule': df['MOLECULE'],
    'type': 'Predicted',
    'complexity': df['meanComplexity'],
})
f = pd.DataFrame({
    'molecule': df['MOLECULE'],
    'type': 'Fitted',
    'complexity': df['FITTED'],
})
pf = pd.concat([p, f])
pf['size'] = 0.5

# Thresholds (x) and percentage of rows with |DIFF| below each threshold (y).
diffs = np.array(df['DIFF'])
cf_x, cf_y = cummulative(diffs)

import plotly.express as px
fig = px.scatter(x=cf_x, y=cf_y)
fig.show()

In [None]:
# Same cumulative curve, wrapped in a frame so the axes get readable titles.
x_label, y_label = 'Absolute Error', '% Data'
cdf = pd.DataFrame({x_label: cf_x, y_label: cf_y})
fig = px.scatter(cdf, x=x_label, y=y_label)
fig.show()