# Validation of t-SNE overlap approach
Compute a t-SNE embedding of all FreeSolv quintuplicates. Split the data such that the test quintuplicates are not well represented in the training set, then check whether a model predicts them poorly. Finally, incrementally add back the most under-represented cases and update the statistics.

In [49]:
import glob
import csv
import re
import pandas as pd
import itertools
import numpy as np
from tqdm.notebook import tqdm
import csv
import matplotlib.pyplot as plt 
import seaborn as sns 
plot_kwds = {'alpha' : 0.5, 's' : 80, 'linewidths':0}
from tqdm.notebook import tqdm

import sklearn
from sklearn.manifold import TSNE
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn import metrics
from scipy import stats

import tensorflow as tf
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 
os.environ["CUDA_VISIBLE_DEVICES"] = ""

from tensorflow import keras
from tensorflow.keras import layers

from rdkit import Chem, DataStructs
from rdkit.Chem import Draw, rdFMCS, AllChem, rdmolfiles, Descriptors, rdchem

In [2]:
# Read the info table (column names supplied here) and the fingerprint table
# (no header row), then join them side by side into a single frame.
info_column_names = ["set", "pertname", "pertsmarts", "num_ha", "sem"]
quints_infos = pd.read_csv("output/quints_infos.csv", names=info_column_names)
quints_fps = pd.read_csv("output/quints_fps.csv", header=None)
quints_whole_df = pd.concat([quints_infos, quints_fps], axis=1)

# Drop columns containing NaN (happens with molprop generation, where
# "(error)" strings can't be subtracted).
quints_whole_df = quints_whole_df.dropna(axis="columns")

# Drop rows where SEM is (effectively) 0.0. Some very large perturbations get
# this value too, which makes training noisy.
sem_is_nonzero = quints_whole_df["sem"] > 0.0001
quints_whole_df = quints_whole_df[sem_is_nonzero]

# Keep only columns that hold at least one non-zero value.
column_has_nonzero = (quints_whole_df != 0).any(axis=0)
quints_whole_df = quints_whole_df.loc[:, column_has_nonzero]

# TMP DROP DUPLICATES --> should be fewer duplicates when we move up to more
# features. Rows are compared on every column except the four listed below
# (note that "num_ha" therefore still takes part in the comparison).
ignored_for_duplicates = ['sem', 'set', 'pertname', 'pertsmarts']
duplicate_key_columns = quints_whole_df.columns.difference(ignored_for_duplicates)
quints_whole_df = quints_whole_df.drop_duplicates(subset=duplicate_key_columns)

# Notebook display of the cleaned frame.
quints_whole_df

Unnamed: 0,set,pertname,pertsmarts,num_ha,sem,1,4,5,8,10,...,71,72,73,74,75,77,78,79,80,81
0,TrainingSet,mobley_1662128~mobley_7047032,[C*]C~[C*]C.[C*]Cl,2,0.156846,0,0,1,1,0,...,0.2173,5.0240,0.00,2.5,-0.222222,33.961028,2.425788,6,-1,10.0
1,TrainingSet,mobley_7047032~mobley_1662128,[C*]C.[C*]Cl~[C*]C,2,0.156846,0,0,-1,-1,0,...,-0.2173,-5.0240,0.00,-2.5,0.222222,-33.961028,-2.425788,-6,1,-10.0
2,TrainingSet,mobley_7015518~mobley_303222,[C*]OC~[C*]CC,1,0.108653,0,2,0,-1,1,...,1.1537,3.0320,-9.23,0.0,0.000000,-1.979265,-1.110072,0,0,0.0
3,TrainingSet,mobley_303222~mobley_7015518,[C*]CC~[C*]OC,1,0.108653,0,-2,0,1,-1,...,-1.1537,-3.0320,9.23,0.0,0.000000,1.979265,1.110072,0,0,0.0
4,TrainingSet,mobley_1046331~mobley_3515580,[C*]O~[C*]O,1,0.113167,0,0,0,0,0,...,-0.3492,0.2280,0.00,0.5,0.666667,0.000000,0.000000,-8,2,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5365,TrainingSet,mobley_5326154~mobley_1019269,[C*]N(C)C1CCCCC1~[C*]CCCO,8,0.155166,-1,-11,-4,0,-4,...,-1.1019,-18.5662,16.99,-1.0,-0.666667,-53.062935,0.048361,-68,-7,-26.0
5366,TrainingSet,mobley_1019269~mobley_7774695,[C*]CCCO~[C*]NC1CCCCC1,7,0.119793,1,8,3,0,3,...,0.7597,13.9389,-8.20,0.5,0.222222,39.047285,-0.019931,44,5,20.0
5367,TrainingSet,mobley_7774695~mobley_1019269,[C*]NC1CCCCC1~[C*]CCCO,7,0.119793,-1,-8,-3,0,-3,...,-0.7597,-13.9389,8.20,-0.5,-0.222222,-39.047285,0.019931,-44,-5,-20.0
5368,TrainingSet,mobley_1019269~mobley_1189457,[C*]CCCO~[C*]SC1CCCCC1,7,0.281720,0,7,3,0,3,...,1.9033,18.3942,5.07,0.5,0.222222,56.008457,0.974590,44,5,20.0


### Discretize SEM label into categorical bins

In [3]:
### TMP DISCRETIZE BY STRATIFICATION:
# Quantile-bin the SEM values into n_bins integer-labelled bins and store the
# bin index of each row in a new "sem_bin" column.
n_bins = 10
binned_sem = pd.qcut(quints_whole_df["sem"], n_bins, labels=False)
quints_whole_df["sem_bin"] = binned_sem

# Report the SEM range and row count of every bin, and collect the mean SEM of
# each bin (in bin order) in bin_means.
bin_means = []
print("Bin, Min, Max, Volume")
for n_bin, df_group in quints_whole_df.groupby(by="sem_bin"):
    group_sems = df_group["sem"].values
    lowest = round(group_sems.min(), 2)
    highest = round(group_sems.max(), 2)
    print(n_bin, lowest, highest, len(df_group))
    bin_means.append(group_sems.mean())

Bin, Min, Max, Volume
0 0.01 0.05 357
1 0.05 0.07 358
2 0.07 0.1 356
3 0.1 0.12 356
4 0.12 0.15 358
5 0.15 0.19 356
6 0.19 0.26 356
7 0.26 0.49 356
8 0.49 1.47 358
9 1.47 97.24 356


In [4]:
# Split the cleaned frame into its two parts: the info/label columns (kept as
# a DataFrame) and every remaining column (the fingerprint features, kept as a
# 2D array).
info_and_label_columns = ["set", "pertname", "pertsmarts", "num_ha", "sem", "sem_bin"]
quints_infos = quints_whole_df[info_and_label_columns]
quints_fps = quints_whole_df.drop(columns=info_and_label_columns).values

### Split into train (orange) and test (blue)

In [5]:
def takeSubset(indices, quints_whole_df):
    """Return the rows of `quints_whole_df` at the given positional indices.

    Selection is done with `.iloc`, so `indices` are row positions, not
    index labels.
    """
    return quints_whole_df.iloc[indices]
    

In [6]:
def takeInfo(perts_df):
    """Split a perturbation dataframe into a feature array and a label array.

    Returns a tuple `(fps, sems)`:
      * `fps`  - 2D array of every column that is not one of the info/label
                 columns ("set", "pertname", "pertsmarts", "num_ha", "sem",
                 "sem_bin").
      * `sems` - 2D array with two columns, in the order ["sem_bin", "sem"].
    """
    label_columns = ["sem_bin", "sem"]
    info_columns = ["set", "pertname", "pertsmarts", "num_ha"]

    sems = perts_df[label_columns].values

    # Whatever is left after removing the info and label columns is the
    # fingerprint itself.
    fps = perts_df.drop(columns=info_columns + ["sem", "sem_bin"]).values

    return fps, sems

In [7]:
def preProcessSets(test1, test2, train):
    """Standardise all three sets and project them onto the PCA components
    explaining 95% of the training-set variance.

    Both the scaler and the PCA are fitted on `train` only; the two test sets
    are merely transformed with the fitted objects.

    Returns `(test1_preprocessed, test2_preprocessed, train_preprocessed)` -
    note that the training set comes last, mirroring the argument order.
    """
    # Fit the standardiser on the training data only.
    scaler = preprocessing.StandardScaler()
    scaler.fit(train)
    train_scaled = scaler.transform(train)

    # Fit the PCA (95% variance explained) on the scaled training data.
    pca = PCA(n_components=0.95)
    train_preprocessed = pca.fit_transform(train_scaled)

    # Push each test set through the already-fitted scaler and PCA.
    test1_preprocessed, test2_preprocessed = (
        pca.transform(scaler.transform(held_out)) for held_out in (test1, test2)
    )

    return test1_preprocessed, test2_preprocessed, train_preprocessed

In [8]:
## TMP RANDOM SPLIT INSTEAD OF TSNE SPLIT
from sklearn.model_selection import train_test_split

# Select every row (positions 0 .. len-1) and extract features and labels.
all_row_positions = np.arange(len(quints_whole_df))
whole_set, whole_set_sem_bins = takeInfo(takeSubset(all_row_positions, quints_whole_df))

# Random 80/20 train/test split with a fixed seed.
train_set, test_set1, train_sems, upper_test_sems = train_test_split(
    whole_set,
    whole_set_sem_bins,
    test_size=0.2,
    random_state=42,
)

# The label arrays have two columns, [sem_bin, sem]. Adjust them such that we
# train on the classes (column 0), but keep the actual SEM values (column 1)
# for later.
train_sems, train_sems_values = train_sems[:, 0], train_sems[:, 1]
upper_test_sems, upper_test_sems_values = upper_test_sems[:, 0], upper_test_sems[:, 1]

# Number of distinct classes present in the training labels.
n_classes = len(set(train_sems))

In [50]:
from sklearn.ensemble import RandomForestClassifier

# Accumulator for the class probabilities summed over all repeats. It has to
# start at zero: np.empty returns uninitialised memory, so accumulating into it
# with += would add arbitrary leftover values to every summed probability.
grid_predictions = np.zeros((len(test_set1), n_classes))

# Hyperparameter search space. It is the same for every repeat, so it is
# defined once outside the loop.
n_estimators = [100, 300, 500, 800, 1200]
max_depth = [5, 8, 15, 25, 30, 50]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

hyperF = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

# Repeat the randomised search 15 times and sum the predicted class
# probabilities of each repeat's best estimator on the test set.
for i in tqdm(range(15)):
    forest = RandomForestClassifier(random_state=1)

    # 50 sampled hyperparameter settings, 5-fold cross-validation, all cores.
    # No random_state is passed to the search itself, so the sampled settings
    # can differ from repeat to repeat.
    gridF = RandomizedSearchCV(forest, hyperF, cv=5, n_iter=50, verbose=0,
                               n_jobs=-1)
    bestF = gridF.fit(np.array(train_set), np.array(train_sems))

    # NOTE(review): the accumulation assumes predict_proba returns exactly
    # n_classes columns (one per class seen in train_sems) - confirm if the
    # label set ever changes.
    predicted_probas = gridF.best_estimator_.predict_proba(test_set1)
    grid_predictions += predicted_probas


HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [51]:
print(grid_predictions)

# Predicted class of each test sample = index of the column with the largest
# summed probability.
# NOTE(review): this treats the column index as the class label - presumably
# valid because the bins are labelled 0..n_classes-1; confirm.
model_preds = list(np.argmax(grid_predictions, axis=1))

# Rank correlation (Kendall's tau) between the true bins and the predicted
# bins; element 0 of the result is the correlation coefficient.
kendall_result = stats.kendalltau(upper_test_sems, model_preds)
kendall_result[0]

[[1.96801061 1.17062468 1.94932262 ... 0.4874873  1.3690186  1.16131963]
 [0.57384288 1.88299317 0.85634773 ... 1.63634524 4.22314636 0.98776268]
 [0.48887449 1.02600217 1.75130489 ... 1.73596163 5.51161382 1.15311694]
 ...
 [4.28991602 1.85002048 2.04313152 ... 0.7833922  0.30148887 0.39307762]
 [2.31426453 1.11549703 1.76583567 ... 1.03186111 1.74338187 1.23934574]
 [1.08630746 1.98034514 1.43055496 ... 1.8191541  0.74580002 2.30006743]]


0.2974901887383297

In [52]:
# Balanced accuracy of the predicted SEM bins against the true test-set bins.
metrics.balanced_accuracy_score(y_true=upper_test_sems, y_pred=model_preds)

0.22214773004257618