# Import Libraries

In [1]:
import tdims_ext
import pandas as pd

# Load Data

In [2]:
# Column holding the molecule SMILES strings and column holding the regression target.
# NOTE(review): in the rows previewed below, def_EmAbs equals
# 'Emission max (nm)' minus 'Abs' (504.0 - 393.0 = 111.0) -- confirm for the full table.
smi_col, target_prop = 'SMILES', 'def_EmAbs'

# Read the dataset from the CSV file.
data_df = pd.read_csv('./data/cmpCl3_200.csv')

# Materialise the input and target columns as Python lists for the later cells.
sm_list = [*data_df[smi_col].values]
y = [*data_df[target_prop].values]

In [3]:
# Report the (rows, columns) shape of the loaded table ...
print(f'Data size: {data_df.shape}')
# ... and display its first two rows as the cell output.
data_df.head(n=2)

Data size: (200, 6)


Unnamed: 0,SMILES,Solvent,Emission max (nm),Abs,Solvent_id,def_EmAbs
0,CCCCCCn1c2ccccc2c2cc(/C=C/c3cc(=O)oc4cc(C)ccc3...,ClC(Cl)Cl,504.0,393.0,1,111.0
1,CC(=O)c1c2c(cc3cc(N(C)C)ccc13)CCC2,ClC(Cl)Cl,511.0,373.0,1,138.0


# Feature Embeddings

### Full embeddings

In [4]:
# Settings forwarded to tdims_ext.get_representation. Their exact meaning is
# defined by that library and is not visible here -- TODO confirm against its docs.
radius, fragment = 2, False
func_dis, func_dup = -1, max  # func_dup is passed on as `func_merge`

# Build the full (unselected) representation for every SMILES string.
# Returns the embedding and the list of feature keys describing it.
emb, key_all = tdims_ext.get_representation(
    sm_list,
    radius=radius,
    func_dis=func_dis,
    func_merge=func_dup,
    fragment_set=fragment,
    atom_set=True,
    fingerprint_set=True,
)

Full embedding shape: (200, 17394)
Execution time for full embedding:  3.884267 sec


In [5]:
# Peek at the first five feature keys returned alongside the embedding.
# NOTE(review): each key appears to be two substructure strings joined by ' & ' -- confirm.
key_all[:5]

['ccc(c(c)C)c(c)o & cc1ccc(c)n1CC',
 'ccc(c(c)C)c(c)o & ccc1c(c)ccn1C',
 'ccc(c(c)C)c(c)o & ccc1c(c)cnc1c',
 'ccc(c(c)C)c(c)o & C=Cc(cc)c(c)c',
 'ccc(c(c)C)c(c)o & cc(C)cc(=O)o']

### Embeddings with feature selection and hyperparameter optimization

In [6]:
# Same representation settings as the full-embedding cell, repeated here so this
# cell can be run on its own; `reg_model` names the regressor used for selection.
radius, fragment = 2, False
func_dis, func_dup = -1, max  # func_dup is passed on as `func_merge`
reg_model = 'Lasso'

# Compute the representation and let tdims_ext select features against the target `y`.
# Returns: selected feature matrix, keys of the selected features, keys of all
# features, and the optimized parameters for the chosen regressor.
x_slc, key_slc, key_all, optimized_param = tdims_ext.get_representation_with_fs_selection(
    sm_list,
    y,
    reg_model=reg_model,
    radius=radius,
    func_dis=func_dis,
    func_merge=func_dup,
    fragment_set=fragment,
    atom_set=True,
    fingerprint_set=True,
)

Full embedding shape: (200, 17394)
Execution time for full embedding:  3.884812 sec

Feature were selected from (200, 17394) to (200, 283)
Optimized parameter for Lasso: {'alpha': 1.0}
Execution time for feature selection:  92.097607 sec


In [7]:
# Number of feature keys kept by the selection step.
# NOTE(review): expected to equal the column count of x_slc -- confirm.
len(key_slc)

283

# Regression task

In [8]:
from sklearn.model_selection import RepeatedKFold, cross_val_score
from sklearn.linear_model import Lasso
import numpy as np

In [9]:
# Fixed seed for the repeated K-fold shuffling. Without it every run draws
# different splits, so the reported score changes on each Restart & Run All.
CV_SEED = 0

# Lasso configured with the parameters returned by the feature-selection step.
estimator = Lasso(**optimized_param)

# 3 folds x 10 repeats -> 30 R^2 scores, summarised as mean ± standard deviation.
kf = RepeatedKFold(n_splits=3, n_repeats=10, random_state=CV_SEED)
cv_score = cross_val_score(estimator, x_slc, y, cv=kf, scoring='r2')

# NOTE(review): x_slc and optimized_param come from a call that was given the
# full target list `y`, so the held-out folds here may already have influenced
# which features and which alpha were chosen. If so, this score is optimistic;
# consider running the selection inside each training fold -- confirm.
print(f'r2 score: {np.mean(cv_score):.4f} ± {np.std(cv_score):.4f}')

r2 score: 0.5287 ± 0.0979


# Feature importance analysis (SHAP)

In [10]:
import shap

In [11]:
# Refit the Lasso estimator on the complete selected-feature matrix.
estimate = estimator.fit(x_slc, y)

# Array form of the selected features; used both as the masker's background
# data and as the rows to be explained.
x_matrix = np.array(x_slc)
background = shap.maskers.Independent(x_matrix)
explainer = shap.LinearExplainer(estimate, background)

# Per-sample, per-feature SHAP values and their absolute magnitudes.
shap_values = explainer(x_matrix)
shap_val_X = shap_values.values
shap_val_absX = np.absolute(shap_val_X)

# One column per selected feature key, one row per explained sample.
df_shap_val_abs = pd.DataFrame(data=shap_val_absX, columns=key_slc)

In [12]:
df_shap_val_abs.head()

Unnamed: 0,ccc(oc)c(c)c & cccc(c)C,ccc(oc)c(c)c & O,C=Cc(cc)cc & O,cc(c)C=CC & cc(c)C=CC,cc(c)C=CC & ccccc,cccc(c)n & N,cccc(c)n & O,cccc(c)c & cccc(c)c,cccc(c)C & N,ccccc & ccccc,...,CC=C1C(=O)NN=C1C & cc(c)N1N=CCC1=O,cc(c)-c1nccn1c & ccc(cc)-c(n)n,cc(c)-c1nccn1c & ccc(cc)OC,cc(c)-c1nccn1c & cccc(c)C,cc(c)-c1nccn1c & O,ccc(cc)-c(n)n & ccn1c(C)ncc1c,ccn1c(C)ncc1c & ccc(cc)OC,ccn1c(C)ncc1c & O,cc(c)C(=O)N(C)C & ccc(cc)C(N)=O,C=C(C)P(=C(C)C)(c(c)c)c(c)c & C=CC(C=O)=CC
0,9.145156,3.053305,0.0,7.664688,2.485097,12.223294,0.0,0.728583,2.250029,0.0,...,0.0,4.143807e-16,6.049771e-17,2.0431990000000002e-17,4.793067e-18,0.0,3.1877560000000003e-22,0.0,0.0,5.1530810000000004e-17
1,0.260626,0.265505,0.0,0.775368,0.211193,1.666813,0.0,3.143217,5.187792,0.0,...,0.0,4.143807e-16,6.049771e-17,2.0431990000000002e-17,4.793067e-18,0.0,3.1877560000000003e-22,0.0,0.0,5.1530810000000004e-17
2,0.260626,0.265505,0.0,0.775368,0.211193,1.666813,0.0,8.361803,2.250029,0.0,...,0.0,4.143807e-16,6.049771e-17,2.0431990000000002e-17,4.793067e-18,0.0,3.1877560000000003e-22,0.0,0.0,5.1530810000000004e-17
3,4.484634,3.053305,0.0,0.775368,0.211193,1.666813,0.0,8.361803,2.398609,0.0,...,0.0,4.143807e-16,6.049771e-17,2.0431990000000002e-17,4.793067e-18,0.0,3.1877560000000003e-22,0.0,0.0,5.1530810000000004e-17
4,0.260626,0.265505,0.0,0.775368,0.211193,1.666813,0.0,3.143217,2.250029,0.0,...,0.0,4.143807e-16,6.049771e-17,2.0431990000000002e-17,4.793067e-18,0.0,3.1877560000000003e-22,0.0,0.0,5.1530810000000004e-17
