In [None]:
# === Basic libraries ===
import numpy as np
import pandas as pd
import time

# === Molecular descriptor calculation ===
from rdkit import Chem
from mordred import Calculator, descriptors

# === Data preprocessing and visuals ===
from sklearn import preprocessing as pp
import seaborn as sns
from sklearn.decomposition import PCA

# === Model and metrics ===
from xgboost import XGBClassifier, cv, DMatrix
from sklearn.metrics import accuracy_score

In [None]:
# === Read in files with X,Y ===
# The CSV must provide a 'SMILES' column (structures) and a 'Sens' column (labels).
RS_XY = pd.read_csv('Documents/rsxy_v1.csv')
clist = list(RS_XY['SMILES'])
# Labels as a column vector of shape (n_samples, 1).
y = RS_XY['Sens'].to_numpy().reshape(-1, 1)

# === Calculate descriptors ===
calc = Calculator(descriptors)
# Size the feature matrix from the calculator itself rather than a hard-coded
# descriptor count, so the cell keeps working if the descriptor set changes.
X = np.zeros(shape=(len(clist), len(calc.descriptors)))
for i, smiles in enumerate(clist):
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles returns None when the string cannot be parsed; fail with a
    # message that identifies the offending row instead of an opaque error later.
    if mol is None:
        raise ValueError(f'Row {i}: RDKit could not parse SMILES {smiles!r}')
    X[i, :] = calc(mol)
sh1 = np.shape(X)
print(f'Shape | raw: {sh1}')

# === Filter data and scale ===
# Drop every descriptor column that is NaN for at least one molecule.
X = X[:, ~np.any(np.isnan(X), axis=0)]
# Drop constant columns (zero variance carries no information).
X = X[:, np.var(X, axis=0) != 0]
# Fit the min-max scaler on the filtered matrix and apply it.
scaler = pp.MinMaxScaler().fit(X)
Xs = scaler.transform(X)
sh2 = np.shape(Xs)
print(f'Shape | filtered/scaled: {sh2}')

In [None]:
# === Heatmap of feature correlation ===
# Wrap the scaled matrix in a DataFrame (the PCA cell reuses Xs_pd) and plot
# the pairwise correlation between its columns.
Xs_pd = pd.DataFrame(data=Xs)
sns.heatmap(data=Xs_pd.corr())

In [None]:
# === Conduct PCA and display updated heatmap ===
# Pass the seed itself as random_state: np.random.seed(0) returns None, which
# would leave random_state unset and reseed NumPy's global RNG as a side effect.
pca = PCA(n_components=50, random_state=0)
pca.fit(Xs_pd)
# Project the scaled descriptors onto the fitted components.
Xr = pca.transform(Xs_pd)
Xr_pd = pd.DataFrame(Xr)
# Correlation between the projected components.
sns.heatmap(Xr_pd.corr())

In [None]:
# === XGBoost ===
# Repeated 5-fold cross-validation: one cv() run per seed, then average the
# final-row test metrics across all seeds.
data_dmatrix = DMatrix(data=Xs,label=y)
params = {'objective':'binary:logistic'}
nseed = 100

# NOTE: acc_tst stores the test *error* rate per seed; it is converted to
# accuracy only when the mean is taken below.
acc_tst = np.zeros(nseed)
roc_tst = np.zeros(nseed)
prc_tst = np.zeros(nseed)

t_sta = time.perf_counter()
for i in range(nseed):
    xgb_cv = cv(dtrain=data_dmatrix,params=params,nfold=5,num_boost_round=50,metrics=['error','auc','aucpr'],
                as_pandas=True,early_stopping_rounds=10,seed=i)

    # Read the metrics from the last row of the evaluation history by column
    # name, so a change in metric order or column layout cannot silently
    # select the wrong value.
    final_round = xgb_cv.iloc[-1]
    acc_tst[i] = final_round['test-error-mean']
    roc_tst[i] = final_round['test-auc-mean']
    prc_tst[i] = final_round['test-aucpr-mean']

t_end = time.perf_counter()
t_ela = t_end-t_sta

# Accuracy is the complement of the mean error rate.
m_acc_tst = 1-np.mean(acc_tst)
m_roc_tst = np.mean(roc_tst)
m_prc_tst = np.mean(prc_tst)

print(f'Testing Accuracy: {m_acc_tst}')
print(f'Testing ROC AUC: {m_roc_tst}')
print(f'Testing PRC AUC: {m_prc_tst}')

print(f'Elapsed Time: {t_ela}')

In [None]:
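# === Results Log ===
#
# Legend (assumed from the cells above -- confirm):
#   Raw    = descriptor matrix before filtering/scaling
#   F/S    = filtered and min-max scaled descriptors
#   PCA<k> = PCA projection with k components
#   acc / roc / prc = test accuracy / ROC AUC / PR AUC from the XGBoost cell
#
# Data   | Test                  | Time (s)
#        | acc   | roc   | prc   |
# ------------------------------------------
# Raw    | 0.827 | 0.899 | 0.856 | 18
# F/S    | 0.827 | 0.900 | 0.857 | 18
# PCA100 | 0.700 | 0.747 | 0.636 | 6
# PCA50  | 0.739 | 0.803 | 0.715 | 6
# PCA25  | 0.765 | 0.823 | 0.753 | 7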
