In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn.functional as F
from tokenizers import Tokenizer
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer, EsmForMaskedLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from plm_compare_progen2 import *
from protein_data import *

In [3]:
# Pick the device label: "cuda" when PyTorch can see a GPU, "cpu" otherwise.
# NOTE(review): `device` is only reported in this cell; it is not passed to
# initialize_progen2 here — confirm the helper handles model placement itself.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

# Build the model and tokenizer for the named checkpoint via the project helper.
model_name = "hugohrban/progen2-medium"
model, tokenizer = initialize_progen2(model_name)

Using cpu device


### B2L11_HUMAN, CALM1_HUMAN, GDIA_HUMAN

In [39]:
# GDIA_HUMAN: read the Silverstein 2021 CSV from the ProteinGym substitutions folder.
filename = '/Users/johnhutchens/Desktop/Practicum/Data/DMS_ProteinGym_substitutions/GDIA_HUMAN_Silverstein_2021.csv'
df = pd.read_csv(filepath_or_buffer=filename)

In [49]:
# (rows, columns) of the current frame.
# NOTE(review): this cell's execution count (49) is later than the column-drop
# cell's (40), so the displayed shape presumably reflects the frame after the
# drop — re-run the notebook top to bottom to confirm.
df.shape

(1154, 3)

In [40]:
# Drop the 'DMS_score_bin' column, which the cells that follow do not read.
# errors='ignore' keeps the cell safe to re-run: `df` is rebound to the result,
# so a second execution would otherwise raise KeyError because the column is
# already gone.
df = df.drop(columns='DMS_score_bin', errors='ignore')
df.columns

Index(['mutant', 'mutated_sequence', 'DMS_score'], dtype='object')

In [41]:
# Index the records by the first column (the mutant label) and turn each row
# into a nested dict of its remaining columns.
label_column = df.columns[0]
pg_dict = df.set_index(label_column).to_dict(orient='index')

In [42]:
# Snapshot of the dictionary's labels in insertion order (iterating a dict yields its keys).
keys = list(pg_dict)
print(keys)

['A126S', 'A126T', 'A126V', 'A128P', 'A128S', 'A128T', 'A128V', 'A148P', 'A148S', 'A148T', 'A148V', 'A188G', 'A188S', 'A188T', 'A188V', 'A190S', 'A190T', 'A190V', 'A217S', 'A217T', 'A217V', 'A239S', 'A239T', 'A239V', 'A243S', 'A243T', 'A243V', 'A275S', 'A275T', 'A275V', 'A294S', 'A294T', 'A294V', 'A314S', 'A314T', 'A314V', 'A340S', 'A340T', 'A340V', 'A344T', 'A344V', 'A345S', 'A345T', 'A345V', 'A351D', 'A351S', 'A351T', 'A351V', 'A353S', 'A353T', 'A353V', 'A369S', 'A369T', 'A369V', 'A382S', 'A382T', 'A382V', 'A406D', 'A406S', 'A406T', 'A406V', 'A425D', 'A425S', 'A425T', 'A425V', 'A428S', 'A428T', 'A428V', 'A445G', 'A445S', 'A445V', 'A83S', 'A83T', 'A83V', 'C17F', 'C17Y', 'C202F', 'C202R', 'C202Y', 'C277F', 'C277R', 'C277Y', 'C282F', 'C282R', 'C282Y', 'C302F', 'C302G', 'C302R', 'C302Y', 'C317F', 'C317R', 'C317W', 'C317Y', 'C335F', 'C335R', 'C335S', 'C335Y', 'C394F', 'C394R', 'C394Y', 'C400F', 'C400R', 'C400Y', 'C402F', 'C402R', 'C402Y', 'C414F', 'C414R', 'C414S', 'C414Y', 'D101G', 'D101

In [43]:
# Peek at the record stored under the first label in `keys`.
pg_dict[keys[0]]

{'mutated_sequence': 'MDEEYDVIVLGTGLTECILSGIMSVNGKKVLHMDRNPYYGGESSSITPLEELYKRFQLLEGPPESMGRGRDWNVDLIPKFLMANGQLVKMLLYTEVTRYLDFKVVEGSFVYKGGKIYKVPSTETESLASNLMGMFEKRRFRKFLVFVANFDENDPKTFEGVDPQTTSMRDVYRKFDLGQDVIDFTGHALALYRTDDYLDQPCLETVNRIKLYSESLARYGKSPYLYPLYGLGELPQGFARLSAIYGGTYMLNKPVDDIIMENGKVVGVKSEGEVARCKQLICDPSYIPDRVRKAGQVIRIICILSHPIKNTNDANSCQIIIPQNQVNRKSDIYVCMISYAHNVAAQGKYIAIASTTVETTDPEKEVEPALELLEPIDQKFVAISDLYEPIDDGCESQVFCSCSYDATTHFETTCNDIKDIYKRMAGTAFDFENMKRKQNDVFGEAEQ',
 'DMS_score': 0.69110813}

## Adding the wild type to the dictionary

In [44]:
# Use the first row of the table as the template for rebuilding the wild type.
first_row = df.iloc[0]
mut = first_row['mutant']
seq = first_row['mutated_sequence']
print(mut, seq, sep='\n')

A126S
MDEEYDVIVLGTGLTECILSGIMSVNGKKVLHMDRNPYYGGESSSITPLEELYKRFQLLEGPPESMGRGRDWNVDLIPKFLMANGQLVKMLLYTEVTRYLDFKVVEGSFVYKGGKIYKVPSTETESLASNLMGMFEKRRFRKFLVFVANFDENDPKTFEGVDPQTTSMRDVYRKFDLGQDVIDFTGHALALYRTDDYLDQPCLETVNRIKLYSESLARYGKSPYLYPLYGLGELPQGFARLSAIYGGTYMLNKPVDDIIMENGKVVGVKSEGEVARCKQLICDPSYIPDRVRKAGQVIRIICILSHPIKNTNDANSCQIIIPQNQVNRKSDIYVCMISYAHNVAAQGKYIAIASTTVETTDPEKEVEPALELLEPIDQKFVAISDLYEPIDDGCESQVFCSCSYDATTHFETTCNDIKDIYKRMAGTAFDFENMKRKQNDVFGEAEQ


In [45]:
# Rebuild the wild-type sequence from a single-substitution record.
# `mut` is read as <original residue><1-based position><new residue>, e.g. 'A126S'.
len_mut = len(mut)  # kept so any later cell that reads it still works
orig = mut[0]
new = mut[-1]
pos = int(mut[1:-1]) - 1  # 1-based label position -> 0-based string index

# Guard against a label that does not describe `seq` (shifted numbering, wrong
# row, out-of-range position): without this check the substitution below would
# silently store an incorrect wild type.
if not 0 <= pos < len(seq) or seq[pos] != new:
    raise ValueError(
        f"Mutant label {mut!r} does not match the sequence: "
        f"expected residue {new!r} at position {pos + 1}"
    )

# Put the original residue back at the mutated position.
wild_seq = seq[:pos] + orig + seq[pos+1:]

# The wild type is stored under the key None, alongside the mutant-labelled entries.
pg_dict[None] = {'mutated_sequence': wild_seq}

In [46]:
# Display the wild-type entry, which is stored under the key None.
pg_dict[None]

{'mutated_sequence': 'MDEEYDVIVLGTGLTECILSGIMSVNGKKVLHMDRNPYYGGESSSITPLEELYKRFQLLEGPPESMGRGRDWNVDLIPKFLMANGQLVKMLLYTEVTRYLDFKVVEGSFVYKGGKIYKVPSTETEALASNLMGMFEKRRFRKFLVFVANFDENDPKTFEGVDPQTTSMRDVYRKFDLGQDVIDFTGHALALYRTDDYLDQPCLETVNRIKLYSESLARYGKSPYLYPLYGLGELPQGFARLSAIYGGTYMLNKPVDDIIMENGKVVGVKSEGEVARCKQLICDPSYIPDRVRKAGQVIRIICILSHPIKNTNDANSCQIIIPQNQVNRKSDIYVCMISYAHNVAAQGKYIAIASTTVETTDPEKEVEPALELLEPIDQKFVAISDLYEPIDDGCESQVFCSCSYDATTHFETTCNDIKDIYKRMAGTAFDFENMKRKQNDVFGEAEQ'}

In [47]:
# Score the wild-type sequence with the model and attach the three returned
# objects to its dictionary entry.
wt_entry = pg_dict[None]
sequence = wt_entry['mutated_sequence']
lp, rlp, llr = collect_log_prob_pg2(sequence, model, tokenizer)

wt_entry.update({'log_probs': lp, 'ref_log_probs': rlp, 'llr_matrix': llr})

In [48]:
# Re-display the wild-type entry, which now also holds 'log_probs',
# 'ref_log_probs' and 'llr_matrix'.
pg_dict[None]

{'mutated_sequence': 'MDEEYDVIVLGTGLTECILSGIMSVNGKKVLHMDRNPYYGGESSSITPLEELYKRFQLLEGPPESMGRGRDWNVDLIPKFLMANGQLVKMLLYTEVTRYLDFKVVEGSFVYKGGKIYKVPSTETEALASNLMGMFEKRRFRKFLVFVANFDENDPKTFEGVDPQTTSMRDVYRKFDLGQDVIDFTGHALALYRTDDYLDQPCLETVNRIKLYSESLARYGKSPYLYPLYGLGELPQGFARLSAIYGGTYMLNKPVDDIIMENGKVVGVKSEGEVARCKQLICDPSYIPDRVRKAGQVIRIICILSHPIKNTNDANSCQIIIPQNQVNRKSDIYVCMISYAHNVAAQGKYIAIASTTVETTDPEKEVEPALELLEPIDQKFVAISDLYEPIDDGCESQVFCSCSYDATTHFETTCNDIKDIYKRMAGTAFDFENMKRKQNDVFGEAEQ',
 'log_probs': tensor([[-2.8561, -5.0028, -2.8489,  ..., -3.4245, -6.1626, -5.1094],
         [-2.4881, -5.5486, -2.9440,  ..., -2.9980, -5.6950, -4.8063],
         [-3.4881, -5.4757, -0.5342,  ..., -4.2637, -7.0952, -4.9590],
         ...,
         [-2.3523, -4.6985, -2.5489,  ..., -2.7169, -5.3956, -4.1655],
         [-2.2087, -4.6168, -2.5093,  ..., -2.8856, -5.1534, -3.8925],
         [-2.2948, -4.7270, -2.7052,  ..., -2.8128, -5.2303, -4.0549]]),
 'ref_log_probs': tensor([[-4.0605],
         [-0.9776],
         [-0.534

## Adding model scores for the mutants to the dictionary

In [50]:
# Score every sequence in the dictionary and store the three returned objects
# on its entry. The loop visits all keys, so the None (wild-type) entry is
# scored here as well.
for k, entry in pg_dict.items():
    sequence = entry['mutated_sequence']
    lp, rlp, llr = collect_log_prob_pg2(sequence, model, tokenizer)
    entry.update({'log_probs': lp, 'ref_log_probs': rlp, 'llr_matrix': llr})


In [52]:
# Spot-check one scored entry: the label at index 100 of `keys`.
pg_dict[keys[100]]

{'mutated_sequence': 'MDEEYDVIVLGTGLTECILSGIMSVNGKKVLHMDRNPYYGGESSSITPLEELYKRFQLLEGPPESMGRGRDWNVDLIPKFLMANGQLVKMLLYTEVTRYLDFKVVEGSFVYKGGKIYKVPSTETEALASNLMGMFEKRRFRKFLVFVANFDENDPKTFEGVDPQTTSMRDVYRKFDLGQDVIDFTGHALALYRTDDYLDQPCLETVNRIKLYSESLARYGKSPYLYPLYGLGELPQGFARLSAIYGGTYMLNKPVDDIIMENGKVVGVKSEGEVARCKQLICDPSYIPDRVRKAGQVIRIICILSHPIKNTNDANSCQIIIPQNQVNRKSDIYVCMISYAHNVAAQGKYIAIASTTVETTDPEKEVEPALELLEPIDQKFVAISDLYEPIDDGCESQVFFSCSYDATTHFETTCNDIKDIYKRMAGTAFDFENMKRKQNDVFGEAEQ',
 'DMS_score': 0.898775498,
 'log_probs': tensor([[-2.8488, -4.9887, -2.8450,  ..., -3.4157, -6.1596, -5.0981],
         [-2.4666, -5.5561, -2.9176,  ..., -2.9964, -5.7141, -4.7988],
         [-3.4992, -5.4951, -0.5318,  ..., -4.2714, -7.1130, -4.9699],
         ...,
         [-2.3785, -4.6365, -2.5862,  ..., -2.7425, -5.3649, -4.0623],
         [-2.2211, -4.5545, -2.5354,  ..., -2.8818, -5.1384, -3.8444],
         [-2.3350, -4.6626, -2.7342,  ..., -2.8254, -5.2342, -4.0283]]),
 'ref_log_probs': tensor([[-4.0418],
         

In [53]:
# Total number of entries: the mutant records plus the wild type stored under None.
len(pg_dict)

1155

In [54]:
# Persist the scored dictionary as a pickle file.
filename = '/Users/johnhutchens/Desktop/Practicum/Data/GDIA_HUMAN/pg2_GDIA_HUMAN_matrices.pickle'

# Create the output folder first: open(..., 'wb') raises FileNotFoundError when
# the parent directory is missing, which would fail the save only after the
# long scoring loop has already run.
os.makedirs(os.path.dirname(filename), exist_ok=True)

with open(filename, 'wb') as f:
    pickle.dump(pg_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
