In [None]:
#Import standard packages for model training

import sklearn as SK
from sklearn.svm import SVC
from sklearn.metrics import *
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import *
from skopt import BayesSearchCV
import numpy as np
import pandas as pd
import os 
import io
from tensorboard.plugins.hparams import api as hp

# Export the TensorFlow C++ log-level setting through the process environment.
# NOTE(review): TensorFlow documents levels 0-3 for this variable, and it is
# presumably only honoured if set before TensorFlow is first imported — confirm
# that "-1" and this placement (after the imports) have the intended effect.
TF_CPP_LOG_LEVEL = "-1"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = TF_CPP_LOG_LEVEL

In [None]:
# Import the train, validation and test sets (fold 0 of the random split)

training = './data/Classification/T.cruzi/random_split/train_fold_0.csv'
validation = './data/Classification/T.cruzi/random_split/valid_fold_0.csv'
test = './data/Classification/T.cruzi/random_split/test_fold_0.csv'

# Read the three comma-separated files with identical parser settings.
train_dataset, valid_dataset, test_dataset = (
    pd.read_csv(csv_path, sep=',', low_memory=False)
    for csv_path in (training, validation, test)
)

# Preview the first rows of the training table as the cell output.
train_dataset.head()

In [None]:
# Label columns are selected positionally: columns task_start .. task_index - 1.
task_start = 2
task_index = 3

# Build the training labels.
# The validation rows are stacked on top of the training rows because the
# hyperparameter search below uses cross-validation instead of a fixed
# validation split.
# WARNING: this rebinds `train_dataset`, so re-running this cell without first
# re-running the loading cell appends the validation rows a second time.
train_dataset = pd.concat([valid_dataset, train_dataset], axis=0).reset_index(drop=True)
y_train = train_dataset.iloc[:, task_start:task_index].to_numpy()
print(f"loaded y_train data: {y_train.shape}")

# Build the test labels from the same column window.
y_test = test_dataset.iloc[:, task_start:task_index].to_numpy()
print(f"loaded y_test data: {y_test.shape}")

In [None]:
# Calculate ECFP (default) fingerprints; per the original note this is done with RDKit.
# `assing_fp`, `FP_SIZE` and `RADIUS` are expected to come from the star import below.

from utils.fingerprints import *

train_smiles = train_dataset["SMILES"].values
test_smiles = test_dataset["SMILES"].values

# Featurise both SMILES arrays with the same fingerprint size and radius.
X_train, X_test = (
    assing_fp(smiles, FP_SIZE, RADIUS)
    for smiles in (train_smiles, test_smiles)
)

# Flatten the label arrays to 1-D and cast them to integers.
y_train = np.asarray(y_train.ravel()).astype(int)
y_test = np.asarray(y_test.ravel()).astype(int)

X_train.shape, X_test.shape

In [None]:
# Parameters for the hyperparameter search

# 5-fold stratified cross-validation, repeated 3 times with a fixed seed.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

# mean_squared_error is a loss (lower is better). make_scorer defaults to
# greater_is_better=True, which would make the search pick the parameters with
# the LARGEST error. Passing greater_is_better=False negates the loss so the
# search minimises it; reported scores are therefore <= 0.
# NOTE(review): for 0/1 class labels this loss equals the misclassification
# rate — confirm the labels are binary 0/1.
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Search space notes:
# - 'C' is sampled uniformly from [1e-3, 1e-2].
# - the disabled 'gamma' entry uses a log-uniform prior, i.e. it searches over
#   p = exp(x) by varying x.
model = BayesSearchCV(SVC(probability=True),
    {
        'C': (1e-3, 1e-2, 'uniform'),
        #'gamma': (1e-4, 1e+1, 'log-uniform'),
        'kernel': ['rbf']
    },
    n_iter=1, # Number of parameter settings that are sampled
    cv=cv,
    scoring = scorer,
    refit = False, # Do not refit on the full data here; only best_params_ is read afterwards.
    random_state=42,
    n_jobs = -1
)

model.fit(X_train, y_train)

print("Best parameters: %s" % model.best_params_)

In [None]:
# Fit the final SVC on the full training data using the best hyperparameters.
# Note: `model` is rebound from the search object to the SVC here, so this cell
# cannot be re-run on its own without re-running the search cell first.

best_params = dict(model.best_params_)
model = SVC(**best_params)
model.fit(X_train, y_train)

In [None]:
# Statistical characteristics of the model

def _report_binary_metrics(title, y_true, y_pred):
    """Print ACC, MCC, SE and SP for one data split and return the raw counts.

    Parameters
    ----------
    title : str
        Heading printed before the metrics.
    y_true, y_pred : array-like
        True and predicted class labels for the split.

    Returns
    -------
    tuple
        ``(confusion, TP, TN, FP, FN)`` where ``confusion`` is the matrix
        returned by ``confusion_matrix(y_true, y_pred)``.

    Notes
    -----
    Indexes the matrix as a 2x2 table, so it assumes exactly two classes with
    the positive class sorted last (e.g. labels 0/1) — verify for other label
    encodings. Extra metrics that were disabled in the original cell (kappa,
    PPV, NPV, TPR, FPR, F1) are not printed.
    """
    matrix = confusion_matrix(y_true, y_pred)
    # Layout is [row = true label, column = predicted label].
    tn, fp = matrix[0, 0], matrix[0, 1]
    fn, tp = matrix[1, 0], matrix[1, 1]

    print(title)
    print("ACC\t%.2f" % ((tn + tp) / (tn + tp + fn + fp)))
    print("MCC\t%.2f" % matthews_corrcoef(y_true, y_pred))
    print("SE\t%.2f" % (tp / (tp + fn)))
    print("SP\t%.2f" % (tn / (tn + fp)))
    return matrix, tp, tn, fp, fn


y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

confusion, TP, TN, FP, FN = _report_binary_metrics(
    "Train set results", y_train, y_pred_train
)

# The test counts must come from the test confusion matrix; previously they
# were read from the training matrix, so test ACC/SE/SP repeated the train values.
confusion_test, TP_test, TN_test, FP_test, FN_test = _report_binary_metrics(
    "Test set results", y_test, y_pred_test
)