# This notebook includes code for benchmarking ThermoMPNN models on the datasets used in the ThermoMPNN-D paper

To run the benchmarks, run the cells of each section below from top to bottom. Each model is demonstrated on a few different datasets to show how each dataset is configured.

## Shared Utility Classes and Functions

In [1]:
from copy import deepcopy

from omegaconf import OmegaConf
from thermompnn.trainer.v2_trainer import TransferModelPLv2, TransferModelPLv2Siamese
from thermompnn.train_thermompnn import parse_cfg
from thermompnn.inference.v2_inference import run_prediction_batched
from thermompnn.datasets.v2_datasets import MegaScaleDatasetv2, FireProtDatasetv2, ddgBenchDatasetv2, ProteinGymDataset

from scipy.stats import spearmanr, pearsonr
from sklearn.metrics import root_mean_squared_error as rmse



## Single Mutant Models

In [5]:
# Default single-mutant configuration: local.yaml merged with single.yaml
# (single.yaml is merged last), then post-processed by parse_cfg.
cfg = parse_cfg(
    OmegaConf.merge(
        OmegaConf.load('examples/configs/local.yaml'),
        OmegaConf.load('examples/configs/single.yaml'),
    )
)

# Restore the single-mutant model from its checkpoint file.
model_path = 'model_weights/ThermoMPNN-ens1.ckpt'
model = TransferModelPLv2.load_from_checkpoint(
    checkpoint_path=model_path,
    cfg=cfg,
    device='gpu',
)


Loading model %s /home/hdieckhaus/scripts/ThermoMPNN/vanilla_model_weights/v_48_020.pt
setting ProteinMPNN dropout: 0.0
MLP HIDDEN SIZES: [384, 64, 32, 21]




In [6]:
# Single-mutant dataset of choice: the Megascale-S test set.
cfg.data.dataset = 'megascale'
cfg.data.splits = ['test']
cfg.data.mut_types = ['single']
cfg.training.batch_size = 256

# With keep=True the raw predictions are returned by run_prediction_batched.
keep = True

dataset = MegaScaleDatasetv2(cfg, split='test')
results = run_prediction_batched(
    name='ThermoMPNN_single',
    model=model,
    dataset_name='megascale-S-test',
    dataset=dataset,
    results=[],
    keep=keep,
    zero_shot=False,
    cfg=cfg,
)


100%|██████████| 28/28 [00:03<00:00,  8.43it/s]


Including 28172 direct single/double mutations
Testing Model ThermoMPNN_single on dataset megascale-S-test


100%|██████████| 111/111 [00:21<00:00,  5.21it/s]


28172 mutations evaluated
r2 0.5497757196426392
mse 0.5006169676780701
rmse 0.707542896270752
spearman 0.7186527252197266
pearson 0.7441239356994629


In [7]:
# Alternative single-mutant benchmark: the SSYM inverse dataset.
cfg.data.dataset = 'ssym'
cfg.data.splits = ['inv']
cfg.data.mut_types = ['single']
cfg.training.batch_size = 1

# Raw predictions are not requested for this run.
keep = False

# This dataset is read from an explicit CSV file plus a directory of PDB structures.
dataset = ddgBenchDatasetv2(
    cfg,
    csv_fname='data/protddg-bench-master/SSYM/ssym-5fold_clean_inv.csv',
    pdb_dir='data/protddg-bench-master/SSYM/pdbs',
)
results = run_prediction_batched(
    name='ThermoMPNN_single',
    model=model,
    dataset_name='ssym-inv',
    dataset=dataset,
    results=[],
    keep=keep,
    zero_shot=False,
    cfg=cfg,
)


Reverse mutations: False




Testing Model ThermoMPNN_single on dataset ssym-inv


100%|██████████| 342/342 [00:04<00:00, 84.24it/s]

342 mutations evaluated
r2 0.0019396543502807617
mse 2.371711492538452
rmse 1.5400362014770508
spearman 0.6066070795059204
pearson 0.5833054780960083





## Additive Models

In [8]:
# The additive baseline uses the single-mutant setup: the default single-mutant
# configuration (local.yaml merged with single.yaml, then parse_cfg) ...
cfg = parse_cfg(
    OmegaConf.merge(
        OmegaConf.load('examples/configs/local.yaml'),
        OmegaConf.load('examples/configs/single.yaml'),
    )
)

# ... and the single-mutant model checkpoint.
model_path = 'model_weights/ThermoMPNN-ens1.ckpt'
model = TransferModelPLv2.load_from_checkpoint(
    checkpoint_path=model_path,
    cfg=cfg,
    device='gpu',
)


Loading model %s /home/hdieckhaus/scripts/ThermoMPNN/vanilla_model_weights/v_48_020.pt
setting ProteinMPNN dropout: 0.0
MLP HIDDEN SIZES: [384, 64, 32, 21]




In [9]:
# Additive baseline on the Megascale-D test set.
cfg.training.batch_size = 256
cfg.data.dataset = 'megascale'
cfg.data.splits = ['test']
cfg.data.mut_types = ['double']
cfg.data.pick = 0

# Load the double mutant dataset twice, once for each mutation. The two configs
# differ only in data.pick (0 -> first mutation, 1 -> second mutation).
# deepcopy is imported in the notebook's import cell.
cfg2 = deepcopy(cfg)
cfg2.data.pick = 1
keep = True  # raw predictions are needed below to build the additive score

dataset_1 = MegaScaleDatasetv2(cfg, split='test') # first mutation
dataset_2 = MegaScaleDatasetv2(cfg2, split='test') # second mutation
results_1 = run_prediction_batched(name='ThermoMPNN_additive', model=model, dataset_name='megascale-D-test-1', results=[], dataset=dataset_1, keep=keep, zero_shot=False, cfg=cfg)
results_2 = run_prediction_batched(name='ThermoMPNN_additive', model=model, dataset_name='megascale-D-test-2', results=[], dataset=dataset_2, keep=keep, zero_shot=False, cfg=cfg)

# The element-wise sum below is only meaningful if both runs scored the same
# set of double mutants; fail loudly instead of producing NaNs or misaligned sums.
assert len(results_1.ddG_pred) == len(results_2.ddG_pred), (
    'first/second mutation runs returned different numbers of predictions: '
    f'{len(results_1.ddG_pred)} vs {len(results_2.ddG_pred)}'
)

# add single mutant ddGs to get additive prediction
pred = results_1.ddG_pred + results_2.ddG_pred
true = results_1.ddG_true

print('=' * 50)
print('Additive Model Combined Score:')
print('SCC:', spearmanr(pred, true))
print('PCC:', pearsonr(pred, true))
# sklearn's signature is (y_true, y_pred); RMSE is symmetric, so the value is unchanged.
print('RMSE:', rmse(true, pred))


100%|██████████| 28/28 [00:03<00:00,  8.35it/s]


Including 19671 direct single/double mutations


100%|██████████| 28/28 [00:03<00:00,  8.49it/s]


Including 19671 direct single/double mutations
Testing Model ThermoMPNN_additive on dataset megascale-D-test-1


100%|██████████| 77/77 [00:16<00:00,  4.73it/s]


19671 mutations evaluated
r2 -0.5609180927276611
mse 1.8514941930770874
rmse 1.3606961965560913
spearman 0.36170217394828796
pearson 0.38949763774871826
Testing Model ThermoMPNN_additive on dataset megascale-D-test-2


100%|██████████| 77/77 [00:16<00:00,  4.73it/s]


19671 mutations evaluated
r2 -0.7386915683746338
mse 2.062361478805542
rmse 1.4360923767089844
spearman 0.40318194031715393
pearson 0.40872424840927124
Additive Model Combined Score:
SCC: SignificanceResult(statistic=0.5299539335035481, pvalue=0.0)
PCC: PearsonRResult(statistic=0.5166494156363526, pvalue=0.0)
RMSE: 1.1220313


In [10]:
# Alternate benchmark for the additive baseline: PTMUL-D dataset.
cfg.training.batch_size = 1
cfg.data.dataset = 'ptmul'
cfg.data.splits = ['alt']
cfg.data.mut_types = ['double']
cfg.data.pick = 0

# Same scheme as for Megascale-D: build the dataset once per mutation, with
# configs that differ only in data.pick (0 -> first mutation, 1 -> second mutation).
# deepcopy is imported in the notebook's import cell.
cfg2 = deepcopy(cfg)
cfg2.data.pick = 1
keep = True  # raw predictions are needed below to build the additive score

dataset_1 = ddgBenchDatasetv2(cfg, pdb_dir='data/protddg-bench-master/PTMUL/pdbs', csv_fname='data/protddg-bench-master/PTMUL/ptmul-5fold-mutateeverything_FINAL.csv')
dataset_2 = ddgBenchDatasetv2(cfg2, pdb_dir='data/protddg-bench-master/PTMUL/pdbs', csv_fname='data/protddg-bench-master/PTMUL/ptmul-5fold-mutateeverything_FINAL.csv')
results_1 = run_prediction_batched(name='ThermoMPNN_additive', model=model, dataset_name='ptmul-D-1', results=[], dataset=dataset_1, keep=keep, zero_shot=False, cfg=cfg)
results_2 = run_prediction_batched(name='ThermoMPNN_additive', model=model, dataset_name='ptmul-D-2', results=[], dataset=dataset_2, keep=keep, zero_shot=False, cfg=cfg)

# The element-wise sum below is only meaningful if both runs scored the same
# set of double mutants; fail loudly instead of producing NaNs or misaligned sums.
assert len(results_1.ddG_pred) == len(results_2.ddG_pred), (
    'first/second mutation runs returned different numbers of predictions: '
    f'{len(results_1.ddG_pred)} vs {len(results_2.ddG_pred)}'
)

# add single mutant ddGs to get additive prediction
pred = results_1.ddG_pred + results_2.ddG_pred
true = results_1.ddG_true

print('=' * 50)
print('Additive Model Combined Score:')
print('SCC:', spearmanr(pred, true))
print('PCC:', pearsonr(pred, true))
# sklearn's signature is (y_true, y_pred); RMSE is symmetric, so the value is unchanged.
print('RMSE:', rmse(true, pred))

Reverse mutations: False
Reverse mutations: False




Testing Model ThermoMPNN_additive on dataset ptmul-D-1


100%|██████████| 846/846 [00:06<00:00, 139.61it/s]


536 mutations evaluated
r2 -0.004411101341247559
mse 5.010765075683594
rmse 2.238473892211914
spearman 0.401434063911438
pearson 0.3395371735095978
Testing Model ThermoMPNN_additive on dataset ptmul-D-2


100%|██████████| 846/846 [00:06<00:00, 139.10it/s]

536 mutations evaluated
r2 0.008609890937805176
mse 4.94580602645874
rmse 2.22391676902771
spearman 0.42514491081237793
pearson 0.3918190002441406
Additive Model Combined Score:
SCC: SignificanceResult(statistic=0.538188925473584, pvalue=1.408090842557814e-41)
PCC: PearsonRResult(statistic=0.46378033089781784, pvalue=6.0999240436491386e-30)
RMSE: 2.0045671





## Epistatic Double Mutant Model

In [2]:
# Default epistatic double-mutant configuration: local.yaml merged with
# epistatic.yaml (epistatic.yaml is merged last), then post-processed by parse_cfg.
cfg = parse_cfg(
    OmegaConf.merge(
        OmegaConf.load('examples/configs/local.yaml'),
        OmegaConf.load('examples/configs/epistatic.yaml'),
    )
)

# Restore the epistatic double-mutant model (siamese variant) from its checkpoint file.
model_path = 'model_weights/ThermoMPNN-D-ens1.ckpt'
model = TransferModelPLv2Siamese.load_from_checkpoint(
    checkpoint_path=model_path,
    cfg=cfg,
    device='gpu',
)


Multi-mutant siamese network enabled!
Loading model %s /home/hdieckhaus/scripts/ThermoMPNN/vanilla_model_weights/v_48_020.pt
setting ProteinMPNN dropout: 0.0
MLP HIDDEN SIZES: [256, 128, 128, 1]
Relative loss weights:
ALPHA:	1.0
BETA:	1.0


In [4]:
# Epistatic model on the Megascale-D test set.
cfg.data.dataset = 'megascale'
cfg.data.splits = ['test']
cfg.data.mut_types = ['double']
cfg.training.batch_size = 256

# Raw predictions are not requested for this run.
keep = False

# Each dataset entry is scored as a double mutation.
dataset = MegaScaleDatasetv2(cfg, split='test')
results = run_prediction_batched(
    name='ThermoMPNN_epistatic',
    model=model,
    dataset_name='megascale-D-test-epi',
    dataset=dataset,
    results=[],
    keep=keep,
    zero_shot=False,
    cfg=cfg,
)


100%|██████████| 28/28 [00:03<00:00,  8.45it/s]


Including 19671 direct single/double mutations
Testing Model ThermoMPNN_epistatic on dataset megascale-D-test-epi


100%|██████████| 77/77 [00:22<00:00,  3.44it/s]

19671 mutations evaluated
r2 0.03427821397781372
mse 1.1454977989196777
rmse 1.0702793598175049
spearman 0.5570732355117798
pearson 0.5463431477546692





In [5]:

# Epistatic model on the PTMUL-D dataset.
cfg.data.dataset = 'ptmul'
cfg.data.splits = ['alt']
cfg.data.mut_types = ['double']
cfg.training.batch_size = 1

# Request the raw predictions back from run_prediction_batched.
keep = True

# This dataset is read from an explicit CSV file plus a directory of PDB structures.
dataset = ddgBenchDatasetv2(
    cfg,
    pdb_dir='data/protddg-bench-master/PTMUL/pdbs',
    csv_fname='data/protddg-bench-master/PTMUL/ptmul-5fold-mutateeverything_FINAL.csv',
)
results = run_prediction_batched(
    name='ThermoMPNN_epistatic',
    model=model,
    dataset_name='ptmul-D-test-epi',
    dataset=dataset,
    results=[],
    keep=keep,
    zero_shot=False,
    cfg=cfg,
)


Reverse mutations: False




Testing Model ThermoMPNN_epistatic on dataset ptmul-D-test-epi


100%|██████████| 846/846 [00:07<00:00, 118.68it/s]

536 mutations evaluated
r2 0.20691817998886108
mse 3.9564943313598633
rmse 1.9890938997268677
spearman 0.5657690167427063
pearson 0.5374158024787903





In [6]:

# Check stabilizing mutation scores for PTMUL.
# Requires `results` to hold the raw predictions of a run_prediction_batched call
# made with keep=True, because the ddG_pred / ddG_true columns are read here.
cutoff = -0.5  # rows with ddG at or below this value count as hits

pred_hits = results.loc[results['ddG_pred'] <= cutoff]      # predicted hits
true_hits = pred_hits.loc[pred_hits['ddG_true'] <= cutoff]  # predicted hits confirmed by the true ddG

# PPV = confirmed hits / predicted hits. With no predicted hits the ratio is
# undefined, so report NaN instead of raising ZeroDivisionError.
if pred_hits.shape[0] == 0:
    ppv = float('nan')
else:
    ppv = round(true_hits.shape[0] / pred_hits.shape[0], 3)
print('PPV:', ppv)


PPV: 0.512


In [7]:

# Example DMS sweep: ProteinGym assay GFP_AEQVI, double mutants.
cfg.data.dataset = 'proteingym'
cfg.data.splits = ['GFP_AEQVI']
cfg.data.mut_types = ['double']
cfg.training.batch_size = 1

# Set to True if you want to get your predictions back.
keep = False

# This dataset is read from an explicit CSV file plus a directory of structures.
dataset = ProteinGymDataset(
    cfg,
    pdb_dir='data/protein-gym/ProteinGym_AF2_structures',
    csv_fname='data/protein-gym/csvs/GFP_AEQVI.csv',
)
results = run_prediction_batched(
    name='ThermoMPNN_epistatic',
    model=model,
    dataset_name='gfp-aeqvi-D-test-epi',
    dataset=dataset,
    results=[],
    keep=keep,
    zero_shot=False,
    cfg=cfg,
)


100%|██████████| 1/1 [00:00<00:00, 65.28it/s]


Testing Model ThermoMPNN_epistatic on dataset gfp-aeqvi-D-test-epi


100%|██████████| 12777/12777 [02:38<00:00, 80.75it/s]


12777 mutations evaluated
r2 -42.911376953125
mse 21.095693588256836
rmse 4.5930047035217285
spearman 0.39240726828575134
pearson 0.42119482159614563
