In [1]:
import random
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.cluster import KMeans
import pickle
from sklearn.metrics import silhouette_score, roc_auc_score, roc_curve, precision_recall_curve, mean_squared_error
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
sns.set_theme(style="whitegrid")
import math
from sklearn.utils import shuffle
from sklearn.svm import SVC, SVR
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load train set
with open('data_splits/BBBP/train.pkl', 'rb') as f:
    BBBP_train = pickle.load(f)
# Load test set
with open('data_splits/BBBP/test.pkl', 'rb') as f:
    BBBP_test = pickle.load(f)
# Load test fps
with open('Fingerprints/fine-tuned/take-2/BBBP_preds_and_fps.pkl', 'rb') as f:
    BBBP_ftfps_test = pickle.load(f)
# Load train fps
with open('Fingerprints/fine-tuned/take-2/BBBP_preds_and_fps_train.pkl', 'rb') as f:
    BBBP_ftfps_train = pickle.load(f)
BBBP_ftfps = BBBP_ftfps_test.copy()
BBBP_ftfps['fps'] = {**BBBP_ftfps_test['fps'], **BBBP_ftfps_train['fps']}
print(len(BBBP_ftfps['fps']), len(BBBP_ftfps_test['fps']), len(BBBP_ftfps_train['fps']))

2050 410 1640


In [3]:
# Load train set
with open('data_splits/ClinTox/train.pkl', 'rb') as f:
    clintox_train = pickle.load(f)
# Load test set
with open('data_splits/ClinTox/test.pkl', 'rb') as f:
    clintox_test = pickle.load(f)
# Load test fps
with open('Fingerprints/fine-tuned/take-2/clintox_preds_and_fps.pkl', 'rb') as f:
    clintox_ftfps_test = pickle.load(f)
# Load train fps
with open('Fingerprints/fine-tuned/take-2/clintox_preds_and_fps_train.pkl', 'rb') as f:
    clintox_ftfps_train = pickle.load(f)
clintox_ftfps = clintox_ftfps_test.copy()
clintox_ftfps['fps'] = {**clintox_ftfps_test['fps'],**clintox_ftfps_train['fps']}
print(len(clintox_ftfps['fps']), len(clintox_ftfps_test['fps']), len(clintox_ftfps_train['fps']))

1484 296 1188


In [4]:
# Load train set
with open('data_splits/HIV/train.pkl', 'rb') as f:
    hiv_train = pickle.load(f)
# Load test set
with open('data_splits/HIV/test.pkl', 'rb') as f:
    hiv_test = pickle.load(f)
# Load test ftfps
with open('Fingerprints/fine-tuned/take-2/HIV_preds_and_fps.pkl', 'rb') as f:
    hiv_ftfps_test = pickle.load(f)
# Load train ftfps
with open('Fingerprints/fine-tuned/take-2/HIV_preds_and_fps_train.pkl', 'rb') as f:
    hiv_ftfps_train = pickle.load(f)
hiv_ftfps = hiv_ftfps_test.copy()
hiv_ftfps['fps'] = {**hiv_ftfps_test['fps'],**hiv_ftfps_train['fps']}
print(len(hiv_ftfps['fps']), len(hiv_ftfps_test['fps']), len(hiv_ftfps_train['fps']))

41127 8225 32902


In [5]:
# PCA+kmeans 
def pca_kmeans(ds_test, fps_res_dict):
    X=[]
    y=[]
    for k,v in ds_test.items():
        X.append(fps_res_dict['fps'][k].squeeze())
        y.append(ds_test[k])
    pca = PCA(n_components=2)
    X = pca.fit_transform(X)
    kmeans = KMeans(n_clusters=2, random_state=42).fit(X)
    rocauc = roc_auc_score(y, kmeans.labels_)
    sillhouette = silhouette_score(X, kmeans.labels_)
    return max(rocauc,1-rocauc), sillhouette

In [6]:
# SVM
def svm_pred_test(ds_test, fps_res_dict, ds_train):
    c = random.uniform(1,2)
    clf = SVC(C=c, gamma='auto')
    X_test=[]
    y_test=[]
    X_train=[]
    y_train=[]
    for k,v in ds_test.items():
        X_test.append(fps_res_dict['fps'][k].squeeze())
        y_test.append(ds_test[k])
    
    for k,v in ds_train.items():
        X_train.append(fps_res_dict['fps'][k].squeeze())
        y_train.append(ds_train[k])

    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    svm_clf = roc_auc_score(y_test, preds)
    return svm_clf


In [7]:
# RF classifier
def rf_pred_test(ds_test, fps_res_dict, ds_train):
    clf = RandomForestClassifier(max_depth=50)
    X_test=[]
    y_test=[]
    X_train=[]
    y_train=[]
    for k,v in ds_test.items():
        X_test.append(fps_res_dict['fps'][k].squeeze())
        y_test.append(ds_test[k])
    
    for k,v in ds_train.items():
        X_train.append(fps_res_dict['fps'][k].squeeze())
        y_train.append(ds_train[k])
        
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    rf_clf = roc_auc_score(y_test, preds)
    return rf_clf


In [8]:
# FFNN classifier
def ffnn_pred_test(ds_test, fps_res_dict, train_ds):
    X=[]
    y=[]
    for k,v in ds_test.items():
        X.append(fps_res_dict['results'][k].squeeze())
        y.append(ds_test[k])
    ffnn_clf = roc_auc_score(y, X)
    return ffnn_clf

In [9]:
for name, ds, fps, train in [('BBBP', BBBP_test, BBBP_ftfps, BBBP_train), ('clintox',clintox_test, clintox_ftfps, clintox_train), ('HIV',hiv_test, hiv_ftfps, hiv_train)]:
    for func in [svm_pred_test, rf_pred_test, ffnn_pred_test]:
        scores = []
        for i in range(5):
            scores.append(func(ds,fps, train))
        scores = np.array(scores)
        mean_score = [np.mean(scores), np.std(scores)]
        print(name+'-'+func.__name__,mean_score)

BBBP-svm_pred_test [0.8809459503968908, 0.002016443402554102]
BBBP-rf_pred_test [0.8797700997990845, 0.0018831377462825939]
BBBP-ffnn_pred_test [0.9664372056256381, 1.1102230246251565e-16]
clintox-svm_pred_test [0.9246376811594204, 0.0007246376811594234]
clintox-rf_pred_test [0.9450000000000001, 0.009999999999999964]
clintox-ffnn_pred_test [0.9932971014492754, 0.0]
HIV-svm_pred_test [0.683807961074682, 2.518257365902166e-05]
HIV-rf_pred_test [0.6850820836221618, 0.0008779341639047376]
HIV-ffnn_pred_test [0.8112152326985488, 1.1102230246251565e-16]


In [10]:
for name, ds,fps in [('BBBP', BBBP_test, BBBP_ftfps), ('clintox',clintox_test, clintox_ftfps), ('HIV',hiv_test, hiv_ftfps)]:
    print(name,pca_kmeans(ds,fps))

BBBP (0.9033628668357432, 0.8331254937140715)
clintox (0.9481884057971014, 0.856007050859861)
HIV (0.744450935359092, 0.6787382049476393)
