In [None]:
# === Basic libraries ===
import numpy as np
import pandas as pd
import time

# === Molecular descriptor calculation ===
from rdkit import Chem
from mordred import Calculator, descriptors

# === Data preprocessing and visuals ===
from sklearn import preprocessing as pp
import seaborn as sns
from sklearn.decomposition import PCA

# === Model and metrics ===
from sklearn.linear_model import LogisticRegression
from sklearn import metrics as met
from sklearn.model_selection import cross_val_predict

In [None]:
# === Read in files with X,Y ===
# Load the SMILES strings (model inputs) and the 'Sens' labels (targets).
RS_XY = pd.read_csv('Documents/rsxy_v1.csv')
clist = list(RS_XY['SMILES'])
y = np.array(RS_XY['Sens'])
y = np.reshape(y,(-1,1))

# === Calculate descriptors ===
calc = Calculator(descriptors)
# Size the matrix from the calculator itself instead of hard-coding the
# descriptor count, so a different mordred version or descriptor set still fits.
X = np.zeros(shape=(len(clist), len(calc.descriptors)))
for i, smiles in enumerate(clist):
    mol = Chem.MolFromSmiles(smiles)
    # RDKit signals a parse failure by returning None; fail with the offending
    # row instead of an obscure error from inside the descriptor calculator.
    if mol is None:
        raise ValueError(f'Row {i}: could not parse SMILES {smiles!r}')
    X[i,:] = calc(mol)
sh1 = np.shape(X)
print(f'Shape | raw: {sh1}')

# === Filter data and scale ===
# Keep only columns that are finite for every molecule. This drops NaN columns
# (as before) and also +/-inf columns, which the scaler would otherwise reject.
X = X[:, np.all(np.isfinite(X), axis=0)]
# Drop constant columns: they carry no information and cannot be min-max scaled.
X = X[:, np.var(X, axis=0) != 0]
# NOTE(review): the scaler is fit on the full data set, before any
# cross-validation split is made downstream.
scaler = pp.MinMaxScaler().fit(X)
Xs = scaler.transform(X)
sh2 = np.shape(Xs)
print(f'Shape | filtered/scaled: {sh2}')

In [None]:
# === Heatmap of feature correlation ===
# Wrap the scaled feature matrix in a DataFrame so the pairwise column
# correlations can be computed, then render that correlation matrix.
Xs_pd = pd.DataFrame(data=Xs)
feature_corr = Xs_pd.corr()
sns.heatmap(data=feature_corr)

In [None]:
# === Conduct PCA and display updated heatmap ===
# Number of principal components to keep; change here to try other sizes.
n_pca = 50
# Pass the seed itself: np.random.seed(0) returns None, so the previous
# random_state argument was effectively None and relied on the global RNG.
pca = PCA(n_components=n_pca, random_state=0)
# Fit on the scaled features and project them in one step.
Xr = pca.fit_transform(Xs_pd)
Xr_pd = pd.DataFrame(Xr)
# Correlation between the projected components.
sns.heatmap(Xr_pd.corr())

In [None]:
# === Logistic Regression ===
# Evaluate a liblinear logistic regression on the PCA-reduced features using
# cross-validated predictions, repeated over several solver seeds, and report
# the mean accuracy, F1 score and ROC AUC together with the elapsed time.
y = y.ravel()  # scikit-learn expects a 1-D target vector
nseed = 100

acc_tst = np.zeros(nseed)
f1s_tst = np.zeros(nseed)
roc_tst = np.zeros(nseed)

# Sorted class labels; cross_val_predict orders predict_proba columns the same way.
classes = np.unique(y)

t_sta = time.perf_counter()
for i in range(nseed):
    # Pass the seed itself: np.random.seed(i) returns None, so the previous
    # random_state argument was effectively None.
    model = LogisticRegression(solver='liblinear', random_state=i)
    # Out-of-fold class probabilities for every sample.
    proba = cross_val_predict(model, Xr, y, method='predict_proba')
    # Hard labels derived from the probabilities (most probable class).
    pred = classes[np.argmax(proba, axis=1)]

    acc_tst[i] = met.accuracy_score(y,pred)
    f1s_tst[i] = met.f1_score(y,pred)
    # ROC AUC needs a continuous score; use the probability of the
    # greater-labelled class rather than the thresholded labels.
    roc_tst[i] = met.roc_auc_score(y,proba[:,1])

t_end = time.perf_counter()
t_ela = t_end-t_sta

m_acc_tst = np.mean(acc_tst)
m_f1s_tst = np.mean(f1s_tst)
m_roc_tst = np.mean(roc_tst)

print(f'Testing Accuracy: {m_acc_tst}')
print(f'Testing F1 Score: {m_f1s_tst}')
print(f'Testing ROC AUC: {m_roc_tst}')

print(f'Elapsed Time: {t_ela}')

In [None]:
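# === Results Log | LIBLINEAR ===
# Legend: acc = accuracy, f1s = F1 score, roc = ROC AUC.
# Rows (presumably): Raw = unfiltered descriptors, F/S = filtered/scaled,
# PCA<k> = PCA with k components -- TODO confirm.
#
# Data   | Test                  | Time (s)
#        | acc   | f1s   | roc   |
# ------------------------------------------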
# Raw    | 0.703 | 0.254 | 0.573 | 3
# F/S    | 0.842 | 0.742 | 0.798 | 3
# PCA100 | 0.842 | 0.742 | 0.798 | 1
# PCA50  | 0.829 | 0.722 | 0.784 | 1
# PCA25  | 0.835 | 0.740 | 0.798 | 1
