In [None]:
# === Basic libraries ===
import os
import numpy as np
import pandas as pd
import pickle
import time

# === Oversample ===
import imblearn
from imblearn.over_sampling import SMOTE

In [None]:
# === Classifiers ===
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, DMatrix, train
from tabpfn import TabPFNClassifier

# === Neural networks from Pytorch Lightning ===
import torch
from torch import nn as nn
from torch.utils.data import DataLoader
from torch.nn import functional as F
import lightning as L
from lightning.pytorch import seed_everything

# === Metrics and cross-validation ====
import logging
import warnings
from sklearn import metrics as met
from sklearn.model_selection import cross_val_predict, LeaveOneOut, KFold


# === Check for GPU ===
# Last expression of the cell, so the notebook displays the returned boolean
# (True when torch can see a CUDA device).
torch.cuda.is_available()

In [None]:
# === Set project ===
# Project key: selects the input pickle below and prefixes the output CSV names.
# 'LSI' was the value assigned (and immediately overwritten) here before;
# edit this single line to switch projects.
proj = 'RS'

# === Read Xy ===
# NOTE: pickle.load can execute arbitrary code stored in the file, so only
# load pickles that were produced by this project's own preprocessing step.
with open('temp_out/'+proj+'_vars.pkl','rb') as f:
    X, y, Xs, Xis, Xsr, Xisr, Xs_pd = pickle.load(f)

# === Set up oversampler ===
# `oversample` is a notebook-level default; the CV helpers take their own
# `oversample` argument. `ovs` is the shared SMOTE instance used by qsar_cv.
oversample = False
ovs = SMOTE()

In [None]:
# === Reduce verbosity ===
# Raise the level of the chattiest Lightning loggers to WARNING so that
# per-Trainer informational messages do not flood the cell output.
for _noisy_logger in ("lightning.pytorch.utilities.rank_zero",
                      "lightning.pytorch.accelerators.cuda"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)
# Suppress warnings whose message matches this pattern.
warnings.filterwarnings(action="ignore", message=".*set a lower value*")

# === Convert to torch tensors ===
def to_tensor(X_trn, y_trn, X_tst, y_tst):
    """Convert one train/test split into torch tensors.

    Feature inputs become float32 tensors; label inputs become int32 tensors
    reshaped to a single column, i.e. shape (-1, 1).

    Returns the four tensors in the order (X_trn, y_trn, X_tst, y_tst).
    """
    def _features(values):
        return torch.tensor(values, dtype=torch.float32)

    def _labels(values):
        return torch.tensor(values, dtype=torch.int32).reshape(-1, 1)

    return _features(X_trn), _labels(y_trn), _features(X_tst), _labels(y_tst)

# === Lightning module for MLP ===
class rscls_mlp(L.LightningModule):
    """Fully connected binary classifier trained with a BCE-with-logits loss.

    Layer stack (see ``self.net``): Dropout(p=0.2) on the input ->
    Linear(xsh, width) -> ReLU -> Linear(width, width) -> ReLU ->
    optional [Linear(width, width) -> ReLU] -> Linear(width, 1).
    ``forward`` returns the raw output of the last linear layer (logits);
    no sigmoid is applied inside the module.
    """

    def __init__(self, xsh: int, ehl: bool, width: int):
        """Build the network.

        xsh   -- number of input features
        ehl   -- when truthy, add the extra hidden layer before the output
        width -- number of units in every hidden layer
        """
        super().__init__()
        self.dr = nn.Dropout(p=0.2)
        self.l1 = nn.Linear(xsh, width)
        self.l2 = nn.Linear(width, width)
        # An empty Sequential passes its input through unchanged, so `l3`
        # is a no-op when the extra hidden layer is disabled.
        self.l3 = nn.Sequential(nn.Linear(width, width), nn.ReLU()) if ehl else nn.Sequential()
        self.out = nn.Linear(width, 1)
        self.rla = nn.ReLU()
        stages = [self.dr, self.l1, self.rla, self.l2, self.rla, self.l3, self.out]
        self.net = nn.Sequential(*stages)
        # Per-batch validation losses, collected until the epoch ends
        self.validation_step_outputs = []

    def forward(self, x):
        """Return the logits produced by the layer stack for input ``x``."""
        return self.net(x)

    def configure_optimizers(self):
        """Adam over all module parameters with learning rate 1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)

    def _batch_loss(self, batch):
        """Mean binary cross-entropy between the batch logits and its labels."""
        x, y = batch
        return F.binary_cross_entropy_with_logits(self.forward(x), y.float())

    def training_step(self, train_batch, batch_idx):
        """Compute, log ('train_loss') and return the loss of one training batch."""
        loss = self._batch_loss(train_batch)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, val_batch, batch_idx):
        """Compute, log ('val_loss'), remember and return one validation batch loss."""
        loss = self._batch_loss(val_batch)
        self.log('val_loss', loss)
        self.validation_step_outputs.append(loss)
        return loss

    def on_validation_epoch_end(self):
        """Log the mean of the collected validation losses, then reset the buffer."""
        epoch_average = torch.stack(self.validation_step_outputs).mean()
        self.log("validation_epoch_average", epoch_average)
        self.validation_step_outputs.clear()

In [None]:
# === Cross validation function ===
def _qsar_make_model(m, rs, p1, p2, xsh):
    """Return a fresh, unfitted model for model code `m`, seeded with `rs` where supported."""
    if m == 'LR':
        return LogisticRegression(solver=p1, random_state=rs, max_iter=200)
    if m == 'SVM':
        return svm.SVC(kernel=p1, random_state=rs, probability=True, max_iter=200)
    if m == 'RF':
        return RandomForestClassifier(n_estimators=p1, max_depth=p2, random_state=rs)
    if m == 'GBT':
        return XGBClassifier(max_depth=p1, seed=rs)
    if m == 'PFN':
        return TabPFNClassifier(device='cpu')
    if m == 'MLP':
        # === Use GPU if available, else use CPU ===
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        return rscls_mlp(xsh=xsh, ehl=p1, width=p2).to(device)
    raise ValueError("Unknown model code: " + repr(m))


def _qsar_mlp_fit_predict(model, X_trn, y_trn, X_tst, y_tst):
    """Train `model` on one fold and return (probabilities, hard labels, true labels) as 1-D arrays."""
    # === Dataloaders ===
    X_trn, y_trn, X_tst, y_tst = to_tensor(X_trn, y_trn, X_tst, y_tst)
    trainset = torch.utils.data.TensorDataset(X_trn, y_trn)
    testset = torch.utils.data.TensorDataset(X_tst, y_tst)
    trn_load = DataLoader(trainset, persistent_workers=True, num_workers=12, batch_size=32, shuffle=True)
    tst_load = DataLoader(testset, persistent_workers=True, num_workers=12, batch_size=32, shuffle=False)
    # limit_val_batches=0 disables the validation loop, so the test fold
    # passed as validation data is never used during fitting.
    trainer = L.Trainer(max_epochs=100, deterministic=True,
                        enable_model_summary=False, enable_progress_bar=True,
                        num_sanity_val_steps=0, limit_val_batches=0)
    trainer.fit(model, trn_load, tst_load)
    # Predict from a features-only DataLoader: each batch is a plain
    # (batch, features) tensor, which is what the module's forward expects.
    pred_load = DataLoader(X_tst, batch_size=32, shuffle=False)
    logits = torch.cat(trainer.predict(model, pred_load))
    # The network is trained with a BCE-with-logits loss, so its outputs are
    # logits; map them to probabilities before applying the 0.5 threshold.
    prob = torch.sigmoid(logits).detach().cpu().numpy().ravel()
    pred = (prob >= 0.5).astype(int)
    return prob, pred, y_tst.numpy().ravel()


def qsar_cv(X, y, n, m, p1, p2, p3, nspl=5, oversample=False):
    """Repeated k-fold cross validation of one model configuration.

    Parameters
    ----------
    X : 2-D array indexed with integer index arrays (``X[idx]``); columns are features.
    y : label array indexed the same way as ``X``.
    n : number of repetitions; repetition ``rs`` (0..n-1) uses ``rs`` as its seed.
    m : model code, one of 'LR', 'SVM', 'RF', 'GBT', 'PFN', 'MLP'.
    p1, p2 : hyperparameters whose meaning depends on ``m``
        (LR: solver; SVM: kernel; RF: n_estimators, max_depth;
        GBT: max_depth; MLP: extra-hidden-layer flag, hidden width).
    p3 : not used by any model; only recorded in the returned summary.
    nspl : number of folds per repetition.
    oversample : when true, the training part of every fold is resampled with
        the notebook-level oversampler ``ovs`` before fitting.

    Returns
    -------
    ret : dict with the model code, the three parameters, the mean accuracy,
        precision, recall and F1 over all ``n * nspl`` folds, and the elapsed time.
    prc : output of ``sklearn.metrics.precision_recall_curve`` on the pooled
        out-of-fold scores of all repetitions.
    roc : output of ``sklearn.metrics.roc_curve`` on the same pooled scores.

    Raises
    ------
    ValueError : if ``m`` is not a known model code.
    """
    if m not in ('LR', 'SVM', 'RF', 'GBT', 'PFN', 'MLP'):
        raise ValueError("Unknown model code: " + repr(m))

    xsh = np.shape(X)[1]
    # === Initialize metrics and iterator ===
    acc, pre, rec, f1s = (np.zeros(nspl*n) for _ in range(4))
    i = 0
    # === Start stopwatch ===
    t_sta = time.perf_counter()

    # === Loop over n random seeds ===
    ytst_cv = []
    prob_cv = []
    for rs in range(n):
        # Seed the global NumPy RNG explicitly: components created without
        # their own random_state (e.g. the shared oversampler) draw from it.
        np.random.seed(rs)
        if m == 'MLP':
            seed_everything(rs)
        # === Initialize k-fold cross validation ===
        # np.random.seed() returns None, so the seed itself must be passed.
        kfold = KFold(n_splits=nspl, shuffle=True, random_state=rs)
        # === Set model ===
        # Estimators with a fit() method start from scratch on every fit()
        # and can be shared across folds; the MLP is rebuilt per fold below.
        model = None if m == 'MLP' else _qsar_make_model(m, rs, p1, p2, xsh)

        for trn_i, tst_i in kfold.split(X):
            # === Split data ===
            X_trn, X_tst = X[trn_i], X[tst_i]
            y_trn, y_tst = y[trn_i], y[tst_i]
            # === Oversample (training part only) ===
            if oversample:
                X_trn, y_trn = ovs.fit_resample(X_trn, y_trn)

            # === Fit and predict ===
            if m == 'MLP':
                # A Lightning module keeps its weights between fit() calls, so
                # reusing one instance would score later folds with a network
                # that has already been trained on their test samples.
                model = _qsar_make_model(m, rs, p1, p2, xsh)
                prob, pred, y_eval = _qsar_mlp_fit_predict(model, X_trn, y_trn, X_tst, y_tst)
            else:
                model.fit(X_trn, y_trn.ravel())
                prob = model.predict_proba(X_tst)[:,1]
                pred = model.predict(X_tst)
                y_eval = np.ravel(y_tst)

            # === Append for performance curves ===
            ytst_cv.append(y_eval)
            prob_cv.append(prob)

            # === Calculate metrics ===
            acc[i] = met.accuracy_score(y_eval, pred)
            pre[i] = met.precision_score(y_eval, pred)
            rec[i] = met.recall_score(y_eval, pred)
            f1s[i] = met.f1_score(y_eval, pred)

            i += 1

    # === Stop stopwatch ===
    t_end = time.perf_counter()
    t_ela = t_end-t_sta

    # === Average metrics ===
    m_acc = np.mean(acc)
    m_pre = np.mean(pre)
    m_rec = np.mean(rec)
    m_f1s = np.mean(f1s)
    ytst_cv = np.concatenate(ytst_cv)
    prob_cv = np.concatenate(prob_cv)
    prc = met.precision_recall_curve(ytst_cv, prob_cv)
    roc = met.roc_curve(ytst_cv, prob_cv)

    ret = {'Model':m, 'Param_1':p1, 'Param_2':p2, 'Param_3':p3, 'Accuracy':m_acc, 'Precision':m_pre, 'Recall':m_rec, 'F1_Score':m_f1s, 'Time':t_ela}
    return ret, prc, roc

In [None]:
# === Retrieve processed data ===
# I: Imputed
# S: Scaled
# O: Oversampled
# R: Reduced
def get_X(i):
    """Return the (feature matrix, tag) pair for data-variant index `i`.

    0 -> (Xs, 'S'), 1 -> (Xis, 'IS'), 2 -> (Xsr, 'SR'), 3 -> (Xisr, 'ISR').
    Any other index yields None.
    """
    selected = None
    if i == 0:
        selected = (Xs, 'S')
    elif i == 1:
        selected = (Xis, 'IS')
    elif i == 2:
        selected = (Xsr, 'SR')
    elif i == 3:
        selected = (Xisr, 'ISR')
    return selected

# === Initialize metrics dataframe ===
# One row per evaluated (model, parameters, data variant) combination.
metrics = pd.DataFrame(
    columns=[
        'Model', 'Param_1', 'Param_2', 'Param_3',
        'Accuracy', 'Precision', 'Recall', 'F1_Score',
        'Time', 'Data',
    ]
)
# Long-format curve points; the 'M' column carries the curve label.
prc_m = pd.DataFrame(columns=['P', 'R', 'M'])
roc_m = pd.DataFrame(columns=['F', 'T', 'M'])

# === PRC and ROC processing function ===
def prc_roc(prc, roc, lab):
    """Turn a PR curve and a ROC curve into labelled long-format frames.

    Only the first two sequences of each curve are kept (the third is
    dropped). The PR frame gets columns 'P', 'R'; the ROC frame gets
    'F', 'T'. Both receive an 'M' column holding `lab` on every row.

    Returns (prc_frame, roc_frame).
    """
    def _as_frame(curve, names):
        # The two sequences come in as rows; transpose so they become columns.
        frame = pd.DataFrame(curve[0:2]).transpose()
        frame.columns = names
        frame['M'] = [lab] * len(curve[0])
        return frame

    return _as_frame(prc, ['P', 'R']), _as_frame(roc, ['F', 'T'])

In [None]:
# === Define parameter loops ===
def zeroparam(X, y, dt, m, metrics, prc_m, roc_m, oversample):
    """Cross-validate a model that has no tunable parameter and record the result.

    Runs qsar_cv with 10 seeds and all three parameters set to 0, tags the
    summary with the data label (`dt`, plus an 'O' suffix when oversampling),
    and appends the summary row and both curves to the running frames.

    Returns the extended (metrics, prc_m, roc_m).
    """
    dt = dt + 'O' if oversample == True else dt
    summary, prc, roc = qsar_cv(X, y, 10, m, 0, 0, 0, oversample=oversample)
    row = pd.DataFrame({**summary, 'Data': dt}, index=[0])
    # === Append to metrics records ===
    metrics = pd.concat([metrics, row], axis=0, ignore_index=True)
    prc_new, roc_new = prc_roc(prc, roc, '.'.join((m, dt, '_')))
    prc_m = pd.concat([prc_m, prc_new], axis=0, ignore_index=True)
    roc_m = pd.concat([roc_m, roc_new], axis=0, ignore_index=True)
    return metrics, prc_m, roc_m

def oneparam(X, y, dt, m, p1l, metrics, prc_m, roc_m, oversample):
    """Cross-validate model `m` once per value in `p1l` and record every result.

    Each value is passed to qsar_cv as the first parameter (10 seeds, the
    other two parameters set to 0). Rows are tagged with the data label
    (`dt`, plus an 'O' suffix when oversampling); curves are labelled
    '<m>.<data label>.<value>'. The value is printed after each run.

    Returns the extended (metrics, prc_m, roc_m).
    """
    dt = dt + 'O' if oversample == True else dt
    for pos, value in enumerate(p1l):
        # === Get outputs from cross validation ===
        summary, prc, roc = qsar_cv(X, y, 10, m, value, 0, 0, oversample=oversample)
        row = pd.DataFrame({**summary, 'Data': dt}, index=[pos])
        # === Append to metrics records ===
        metrics = pd.concat([metrics, row], axis=0, ignore_index=True)
        prc_new, roc_new = prc_roc(prc, roc, '.'.join((m, dt, str(value))))
        prc_m = pd.concat([prc_m, prc_new], axis=0, ignore_index=True)
        roc_m = pd.concat([roc_m, roc_new], axis=0, ignore_index=True)
        print(value)
    return metrics, prc_m, roc_m

def twoparam(X, y, dt, m, p1l, p2l, metrics, prc_m, roc_m, oversample):
    """Cross-validate model `m` on every (p1, p2) pair from `p1l` x `p2l`.

    Pairs are visited with `p1l` as the outer and `p2l` as the inner loop.
    Each pair goes to qsar_cv as the first two parameters (10 seeds, third
    parameter 0). Rows are tagged with the data label (`dt`, plus an 'O'
    suffix when oversampling); curves are labelled
    '<m>.<data label>.<p1>.<p2>'. The pair is printed after each run.

    Returns the extended (metrics, prc_m, roc_m).
    """
    dt = dt + 'O' if oversample == True else dt
    inner_len = len(p2l)
    for outer_pos, first in enumerate(p1l):
        for inner_pos, second in enumerate(p2l):
            summary, prc, roc = qsar_cv(X, y, 10, m, first, second, 0, oversample=oversample)
            row = pd.DataFrame({**summary, 'Data': dt}, index=[outer_pos * inner_len + inner_pos])
            metrics = pd.concat([metrics, row], axis=0, ignore_index=True)
            prc_new, roc_new = prc_roc(prc, roc, '.'.join((m, dt, str(first), str(second))))
            prc_m = pd.concat([prc_m, prc_new], axis=0, ignore_index=True)
            roc_m = pd.concat([roc_m, roc_new], axis=0, ignore_index=True)
            print(', '.join((str(first), str(second))))
    return metrics, prc_m, roc_m

In [None]:
# === Logistic regression ===
# p1l lists the solvers to compare.
p1l = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky']
for a in range(4):
    X, dt = get_X(a)
    # Each data variant is evaluated without oversampling first, then with it.
    for use_oversampling in (False, True):
        metrics, prc_m, roc_m = oneparam(X, y, dt, 'LR', p1l, metrics, prc_m, roc_m,
                                         oversample=use_oversampling)

print('LR complete')

In [None]:
# === Support vector machines ===
# p1l lists the kernels to compare.
p1l = ['linear', 'poly', 'rbf', 'sigmoid']
ll1 = len(p1l)
for a in range(4):
    X, dt = get_X(a)
    # Each data variant is evaluated without oversampling first, then with it.
    for use_oversampling in (False, True):
        metrics, prc_m, roc_m = oneparam(X, y, dt, 'SVM', p1l, metrics, prc_m, roc_m,
                                         oversample=use_oversampling)

print('SVM complete')

In [None]:
# === Random forest ===
# p1l: numbers of trees; p2l: maximum tree depths.
p1l = [50, 100, 300]
p2l = [5, 10]
for a in range(4):
    X, dt = get_X(a)
    # Each data variant is evaluated without oversampling first, then with it.
    for use_oversampling in (False, True):
        metrics, prc_m, roc_m = twoparam(X, y, dt, 'RF', p1l, p2l, metrics, prc_m, roc_m,
                                         oversample=use_oversampling)

print('RF complete')

In [None]:
# === Gradient boosted trees ===
# p1l lists the maximum tree depths to compare.
p1l = [3, 5, 10]
for a in range(4):
    X, dt = get_X(a)
    # Each data variant is evaluated without oversampling first, then with it.
    for use_oversampling in (False, True):
        metrics, prc_m, roc_m = oneparam(X, y, dt, 'GBT', p1l, metrics, prc_m, roc_m,
                                         oversample=use_oversampling)

print('GBT complete')

In [None]:
# === Prior-Data Fitted Networks ===
# Only data variants 2 and 3 (the 'SR' and 'ISR' sets) are evaluated for this model.
for a in (2, 3):
    X, dt = get_X(a)
    # Each data variant is evaluated without oversampling first, then with it.
    for use_oversampling in (False, True):
        metrics, prc_m, roc_m = zeroparam(X, y, dt, 'PFN', metrics, prc_m, roc_m,
                                          oversample=use_oversampling)

print('PFN complete')

In [None]:
# === Multilayer perceptron ===
# p1l: whether to add the extra hidden layer; p2l: hidden-layer widths.
p1l = [False, True]
p2l = [20, 40]
for a in range(4):
    X, dt = get_X(a)
    # Each data variant is evaluated without oversampling first, then with it.
    for use_oversampling in (False, True):
        metrics, prc_m, roc_m = twoparam(X, y, dt, 'MLP', p1l, p2l, metrics, prc_m, roc_m,
                                         oversample=use_oversampling)

print('MLP complete')

In [None]:
# === Write metrics to file ===
# mode='a' appends to any existing file and header=False never writes column
# names, so re-running this cell adds the same rows again.
for _frame, _suffix in ((metrics, '_MULTI_MET.csv'),
                        (prc_m, '_MULTI_PRC.csv'),
                        (roc_m, '_MULTI_ROC.csv')):
    _frame.to_csv(proj+_suffix, mode='a', index=False, header=False)