In [1]:
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.cluster import KMeans
import pickle
from sklearn.metrics import silhouette_score, roc_auc_score, roc_curve, precision_recall_curve, mean_squared_error
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
sns.set_theme(style="whitegrid")
import math
from sklearn.utils import shuffle
from sklearn.svm import SVC, SVR
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load test set
with open('data_splits/BBBP/test.pkl', 'rb') as f:
    BBBP_test = pickle.load(f)
# Load test fps
with open('Fingerprints/fine-tuned/BBBP_preds_and_fps.pkl', 'rb') as f:
    BBBP_ftfps = pickle.load(f)

In [3]:
# Load test set
with open('data_splits/ClinTox/test.pkl', 'rb') as f:
    clintox_test = pickle.load(f)
# Load ftfps
with open('Fingerprints/fine-tuned/clintox_preds_and_fps.pkl', 'rb') as f:
    clintox_ftfps = pickle.load(f)

In [4]:
# Load test set
with open('data_splits/HIV/test.pkl', 'rb') as f:
    hiv_test = pickle.load(f)
# Load ftfps
with open('Fingerprints/fine-tuned/HIV_preds_and_fps.pkl', 'rb') as f:
    hiv_ftfps = pickle.load(f)

In [19]:
# PCA+kmeans 
def pca_kmeans(ds_test, fps_res_dict):
    X=[]
    y=[]
    for k,v in ds_test.items():
        X.append(fps_res_dict['fps'][k].squeeze())
        y.append(ds_test[k])
    pca = PCA(n_components=2)
    X = pca.fit_transform(X)
    kmeans = KMeans(n_clusters=2, random_state=42).fit(X)
    rocauc = roc_auc_score(y, kmeans.labels_)
    sillhouette = silhouette_score(X, kmeans.labels_)
    return max(rocauc,1-rocauc), sillhouette

In [5]:
# SVM
def svm_pred_test(ds_test, fps_res_dict):
    clf = SVC(gamma='auto')
    X=[]
    y=[]
    for k,v in ds_test.items():
        X.append(fps_res_dict['fps'][k].squeeze())
        y.append(ds_test[k])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    while len(set(y_test))<=1:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    svm_clf = roc_auc_score(y_test, preds)
    return svm_clf


In [6]:
# RF classifier
def rf_pred_test(ds_test, fps_res_dict):
    clf = RandomForestClassifier(max_depth=50)
    X=[]
    y=[]
    for k,v in ds_test.items():
        X.append(fps_res_dict['fps'][k].squeeze())
        y.append(ds_test[k])
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    while len(set(y_test))<=1:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    rf_clf = roc_auc_score(y_test, preds)
    return rf_clf


In [7]:
# FFNN classifier
def ffnn_pred_test(ds_test, fps_res_dict):
    X=[]
    y=[]
    for k,v in ds_test.items():
        X.append(fps_res_dict['results'][k].squeeze())
        y.append(ds_test[k])
    ffnn_clf = roc_auc_score(y, X)
    return ffnn_clf

In [11]:
for name, ds,fps in [('BBBP', BBBP_test, BBBP_ftfps), ('clintox',clintox_test, clintox_ftfps), ('HIV',hiv_test, hiv_ftfps)]:
    for func in [svm_pred_test, rf_pred_test, ffnn_pred_test]:
        scores = []
        for i in range(5):
            scores.append(func(ds,fps))
        scores = np.array(scores)
        mean_score = [np.mean(scores), np.std(scores)]
        print(name+'-'+func.__name__,mean_score)

BBBP-svm_pred_test [0.9399246363525322, 0.03793474767548659]
BBBP-rf_pred_test [0.936186568050928, 0.05008886073152611]
BBBP-ffnn_pred_test [0.9664372056256381, 1.1102230246251565e-16]
clintox-svm_pred_test [0.9838961038961038, 0.0235454312648075]
clintox-rf_pred_test [1.0, 0.0]
clintox-ffnn_pred_test [0.9932971014492754, 0.0]
HIV-svm_pred_test [0.6145201224502956, 0.028126398457782382]
HIV-rf_pred_test [0.6489466993601477, 0.02367915817461174]
HIV-ffnn_pred_test [0.8112152326985488, 1.1102230246251565e-16]


In [20]:
for name, ds,fps in [('BBBP', BBBP_test, BBBP_ftfps), ('clintox',clintox_test, clintox_ftfps), ('HIV',hiv_test, hiv_ftfps)]:
    print(name,pca_kmeans(ds,fps))

BBBP (0.9033628668357432, 0.8331255057630665)
clintox (0.9481884057971014, 0.8560070509366445)
HIV (0.7443879789249442, 0.6785649264390821)
