# Bayes Model on Fingerprints



In [1]:
import collections
import time
import pdb
import torch
from matplotlib import pyplot as plt
import corner
import numpy as np
from tqdm import tqdm
import torch
from sklearn import linear_model
import sys
sys.path.append('../')

from bayes_vs import bayes_models
from bayes_vs import chem_ops



In [2]:
# Load the saved oracle checkpoint from a path relative to the notebook's working directory.
# NOTE(review): torch.load deserialises via pickle by default in many PyTorch versions,
# so only point this at checkpoint files from a trusted source.
chkpt = torch.load('../scripts/trained_oracles_13May2020.chkpt')
# Show the top-level entries; 'ground-truth' is the one used by the cells below.
chkpt.keys()

dict_keys(['ground-truth', 'cheap-docking_state_dict', 'expensive-docking_state_dict', 'FEP_state_dict'])

In [3]:
# Number of entries in the ground-truth mapping (its .items() are shuffled and split below).
len(chkpt['ground-truth'])

220613

In [4]:
# Fixed-seed generator so the shuffle that defines the train/test split is repeatable.
rng = np.random.RandomState(4184189)

In [5]:
# Shuffle the (key, value) pairs of the ground-truth mapping with the seeded generator,
# then separate them into two parallel columns.
# NOTE(review): permuting a list of mixed (str, number) pairs presumably goes through a
# NumPy array of a common dtype, which is why the values are cast back to float32 -- confirm.
shuffled = rng.permutation(list(chkpt['ground-truth'].items()))
smiles, values = map(list, zip(*shuffled))
values = np.array(values, dtype=np.float32)

In [6]:
test_set_size = 2500

# The last `test_set_size` shuffled entries are held out for testing;
# everything before them is available for training.
smiles_train = smiles[:-test_set_size]
smiles_test = smiles[-test_set_size:]
values_train = values[:-test_set_size]
values_test = values[-test_set_size:]

In [7]:

def test_on_data(smiles_train, smiles_test, y_train, y_test):
    """Score three regressors on fingerprint features and return table rows.

    The models compared are a constant predictor (the training mean), a
    least-squares linear fit without intercept, and the project's
    ``bayes_models.BayesianRegression``. Each is fitted on the training split
    and evaluated on the test split.

    Args:
        smiles_train: molecules to fit on; each item is passed to
            ``chem_ops.morgan_fp_from_smiles`` (presumably SMILES strings).
        smiles_test: molecules to evaluate on, featurised the same way.
        y_train: targets for ``smiles_train``; must support ``.mean()`` and
            ``[:, None]`` indexing (assumed 1-D NumPy array -- TODO confirm).
        y_test: targets for ``smiles_test`` (assumed 1-D NumPy array).

    Returns:
        A list with one row per model, laid out as
        ``[name, train set size, MSE, average log-likelihood, explained SS]``.
        The three metrics are strings formatted to two decimals. "Explained
        SS" is the mean squared deviation of the predictions from the mean of
        ``y_test``.

    Side effects:
        Shows tqdm progress bars while featurising and prints the wall-clock
        time spent in the Bayesian regression section.
    """
    n_train = len(smiles_train)

    def as_row(label, mse_value, avg_ll, explained_ss):
        # One table row; metrics are rendered with two decimals.
        return [label, n_train, f'{mse_value:.2f}', f'{avg_ll:.2f}', f'{explained_ss:.2f}']

    # Featurise both splits as float32 fingerprint matrices (train first, then test).
    train_fp_list = [
        chem_ops.morgan_fp_from_smiles(smi)
        for smi in tqdm(smiles_train, desc='smiles fp train')
    ]
    fps_train = np.stack(train_fp_list).astype(np.float32)
    test_fp_list = [
        chem_ops.morgan_fp_from_smiles(smi)
        for smi in tqdm(smiles_test, desc='smiles fp test')
    ]
    fps_test = np.stack(test_fp_list).astype(np.float32)
    test_target_mean = y_test.mean()

    # The Bayesian model receives an identity "embedding" that only advertises
    # the fingerprint width through its ``fp_dim`` attribute.
    bayes_embed = lambda x: x
    bayes_embed.fp_dim = fps_train.shape[1]

    results = []

    # --- Baseline 1: constant prediction (training mean), unit-variance Gaussian ---
    train_mean = y_train.mean()
    dummy_sq_err = (y_test - train_mean) ** 2
    dummy_mse = np.mean(dummy_sq_err)
    dummy_ll = -0.5 * np.mean(np.log(2 * np.pi) + dummy_sq_err)
    dummy_ess = np.mean((train_mean * np.ones_like(y_test) - test_target_mean) ** 2)
    results.append(as_row('Dummy Gaussian (var=1)', dummy_mse, dummy_ll, dummy_ess))

    # --- Baseline 2: point-estimate linear regression, unit-variance Gaussian ---
    linear_reg = linear_model.LinearRegression(fit_intercept=False)
    linear_reg.fit(fps_train, y_train)
    linear_pred = linear_reg.predict(fps_test)
    linear_sq_err = (y_test - linear_pred) ** 2
    linear_mse = np.mean(linear_sq_err)
    linear_ll = -0.5 * np.mean(np.log(2 * np.pi) + linear_sq_err)
    linear_ess = np.mean((linear_pred - test_target_mean) ** 2)
    results.append(as_row('Linear Regression/w Gaussian likelihood (var=1)',
                          linear_mse, linear_ll, linear_ess))

    # --- Model 3: Bayesian regression ---
    # The timed span covers model construction, fitting, prediction and the
    # metric computation that follows.
    t_start = time.time()
    bayes_reg = bayes_models.BayesianRegression(bayes_embed, False)
    bayes_reg.fit(torch.tensor(fps_train), torch.tensor(y_train[:, None]))
    predictive = bayes_reg.predict(torch.tensor(fps_test))
    bayes_pred = predictive.mean.detach().numpy()
    bayes_mse = np.mean((y_test - bayes_pred) ** 2)
    # Average per-point Gaussian log-density using only the diagonal of the
    # predictive covariance (not the joint ``log_prob`` of the distribution).
    pointwise_var = torch.diag(predictive.covariance_matrix)
    bayes_ll = -0.5 * torch.mean(
        (torch.log(2 * np.pi * pointwise_var)
         + (torch.tensor(y_test) - predictive.mean) ** 2 / pointwise_var)
    )
    bayes_ll = bayes_ll.item()
    bayes_ess = np.mean((bayes_pred - test_target_mean) ** 2)
    t_end = time.time()
    print(f"Bayesian regression took {t_end - t_start}s")

    results.append(as_row('Bayesian Regression', bayes_mse, bayes_ll, bayes_ess))

    # A separate sklearn BayesianRidge baseline is intentionally not run here:
    # the Bayes model is reported to set its hyperparameters the sklearn way,
    # so the numbers should coincide (not verifiable from this notebook).
    return results
    



In [8]:
# Sweep over increasing training-set sizes, always scoring on the same test split.
out = []
for train_size in (10, 20, 50, 100, 500, 1000, 2500, 5000, 7500, 10000):
    out += test_on_data(
        smiles_train[:train_size], smiles_test,
        values_train[:train_size], values_test,
    )
    # Visual separator between sweeps: three blank rows followed by a '---' rule,
    # each as wide as the row that precedes it.
    for filler in ("", "", "", "---"):
        out.append([filler] * len(out[-1]))

smiles fp train: 100%|██████████| 10/10 [00:00<00:00, 1008.49it/s]
smiles fp test: 100%|██████████| 2500/2500 [00:01<00:00, 2211.85it/s]


Convergence after  115  iterations
Setting hyperparameters to 7.075835757613219 and 0.08797857213461628


smiles fp train: 100%|██████████| 20/20 [00:00<00:00, 2285.66it/s]
smiles fp test:   8%|▊         | 210/2500 [00:00<00:01, 2095.89it/s]

Bayesian regression took 0.3963510990142822s


smiles fp test: 100%|██████████| 2500/2500 [00:01<00:00, 2174.55it/s]


Convergence after  97  iterations
Setting hyperparameters to 1.7980716300908408 and 148.41593544948893


smiles fp train: 100%|██████████| 50/50 [00:00<00:00, 2222.01it/s]
smiles fp test:   0%|          | 0/2500 [00:00<?, ?it/s]

Bayesian regression took 0.3149890899658203s


smiles fp test: 100%|██████████| 2500/2500 [00:01<00:00, 2203.49it/s]


Convergence after  140  iterations
Setting hyperparameters to 29.880334039235812 and 0.04899125934601764


smiles fp train: 100%|██████████| 100/100 [00:00<00:00, 1897.09it/s]

Bayesian regression took 0.35333871841430664s



smiles fp test: 100%|██████████| 2500/2500 [00:01<00:00, 2248.34it/s]


Convergence after  103  iterations
Setting hyperparameters to 3.029945644241448 and 0.149049778608405


smiles fp train:  40%|████      | 201/500 [00:00<00:00, 2007.95it/s]

Bayesian regression took 0.4404418468475342s


smiles fp train: 100%|██████████| 500/500 [00:00<00:00, 2016.50it/s]
smiles fp test: 100%|██████████| 2500/2500 [00:01<00:00, 2153.07it/s]


Convergence after  23  iterations
Setting hyperparameters to 3.198941894036357 and 0.14902637554876008


smiles fp train:  20%|██        | 202/1000 [00:00<00:00, 2017.32it/s]

Bayesian regression took 0.9387872219085693s


smiles fp train: 100%|██████████| 1000/1000 [00:00<00:00, 2113.95it/s]
smiles fp test: 100%|██████████| 2500/2500 [00:01<00:00, 2232.89it/s]


Convergence after  17  iterations
Setting hyperparameters to 3.8295233579066705 and 0.13219480717203158


smiles fp train:   8%|▊         | 201/2500 [00:00<00:01, 2005.84it/s]

Bayesian regression took 3.0288782119750977s


smiles fp train: 100%|██████████| 2500/2500 [00:01<00:00, 2246.28it/s]
smiles fp test: 100%|██████████| 2500/2500 [00:01<00:00, 2273.21it/s]


Convergence after  13  iterations
Setting hyperparameters to 4.681162904815806 and 0.11545261729004037


smiles fp train:   4%|▍         | 191/5000 [00:00<00:02, 1909.06it/s]

Bayesian regression took 2.092288017272949s


smiles fp train: 100%|██████████| 5000/5000 [00:02<00:00, 2174.58it/s]
smiles fp test: 100%|██████████| 2500/2500 [00:01<00:00, 1848.83it/s]


Convergence after  10  iterations
Setting hyperparameters to 5.728517541198217 and 0.11097330726938862


smiles fp train:   3%|▎         | 212/7500 [00:00<00:03, 2113.89it/s]

Bayesian regression took 2.470202922821045s


smiles fp train: 100%|██████████| 7500/7500 [00:03<00:00, 2232.03it/s]
smiles fp test: 100%|██████████| 2500/2500 [00:01<00:00, 2147.40it/s]


Convergence after  9  iterations
Setting hyperparameters to 6.047769846611154 and 0.11075563691268203


smiles fp train:   2%|▏         | 193/10000 [00:00<00:05, 1922.27it/s]

Bayesian regression took 3.0296828746795654s


smiles fp train: 100%|██████████| 10000/10000 [00:04<00:00, 2254.48it/s]
smiles fp test: 100%|██████████| 2500/2500 [00:01<00:00, 2182.53it/s]


Convergence after  8  iterations
Setting hyperparameters to 6.215214574990658 and 0.11169962417862576
Bayesian regression took 3.4373080730438232s


In [9]:
import tabulate

In [10]:
# Render the collected rows as a plain-text table; the header order matches the
# [name, train size, MSE, avg log-likelihood, explained SS] rows built by test_on_data.
print(tabulate.tabulate(out, headers=['Name', "Train set size", "MSE (↓)", "Avg LL (↑)", "Avg Explained SS"]))

Name                                             Train set size    MSE (↓)    Avg LL (↑)    Avg Explained SS
-----------------------------------------------  ----------------  ---------  ------------  ------------------
Dummy Gaussian (var=1)                           10                23.69      -12.77        3.68
Linear Regression/w Gaussian likelihood (var=1)  10                21.74      -11.79        2.02
Bayesian Regression                              10                20.83      -2.95         0.85



---                                              ---               ---        ---           ---
Dummy Gaussian (var=1)                           20                21.05      -11.44        1.03
Linear Regression/w Gaussian likelihood (var=1)  20                19.98      -10.91        2.19
Bayesian Regression                              20                19.98      -2.92         2.19



---                                              ---               ---        ---           ---


In [12]:
# Extra run on the first 50,000 training molecules; its rows are appended to `out`.
# NOTE(review): this cell comes after the table was printed, so the tabulate cell
# must be re-run for these rows to show up.
out.extend(test_on_data(smiles_train[:50000], smiles_test, values_train[:50000], values_test))

smiles fp train: 100%|██████████| 50000/50000 [00:23<00:00, 2116.52it/s]
smiles fp test: 100%|██████████| 2500/2500 [00:01<00:00, 2274.59it/s]


Convergence after  5  iterations
Setting hyperparameters to 7.129981199467363 and 0.1143105138674678
Bayesian regression took 18.59956407546997s
