In [1]:
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer, EsmForMaskedLM
from tokenizers import Tokenizer
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from plm_compare_progen2 import *
from protein_data import *

In [3]:
# Report which backend torch can see: CUDA when available, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

# NOTE(review): `device` is only printed; no cell in view moves the model onto
# it -- confirm that initialize_progen2 handles device placement itself.
# initialize_progen2 is not defined in this notebook (presumably it comes from
# the star import of plm_compare_progen2); it returns the (model, tokenizer)
# pair that the scoring cells below pass to collect_log_prob_pg2.
model_name = "hugohrban/progen2-medium"
model, tokenizer = initialize_progen2(model_name)

Using cpu device


### B2L11_HUMAN, CALM1_HUMAN, GDIA_HUMAN, HECD1_HUMAN, ISDH_STAAW

In [4]:
# HECD1_HUMAN
# NOTE(review): absolute, machine-specific input location; adjust before
# running on another machine.
path = '/Users/johnhutchens/Desktop/Practicum/Data/zInput_Data/DMS_ProteinGym_substitutions/'
filename = 'HECD1_HUMAN_Tsuboyama_2023_3DKM.csv'

# `path` already ends with a separator, so plain concatenation yields the file path.
df = pd.read_csv(f"{path}{filename}")

In [5]:
# Dimensions of the loaded table as (rows, columns).
df.shape

(5586, 4)

In [6]:
# Remove the 'DMS_score_bin' column, leaving mutant, mutated_sequence and DMS_score.
# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so
# without it a second execution raises KeyError because the column is already gone.
df = df.drop(columns='DMS_score_bin', errors='ignore')
df.columns

Index(['mutant', 'mutated_sequence', 'DMS_score'], dtype='object')

In [7]:
# Key each row by the value in the first column (the mutant label); every value
# is a dict of that row's remaining columns.
index_column = df.columns[0]
pg_dict = df.set_index(index_column).to_dict(orient='index')

In [8]:
# Materialise the mutant labels in dictionary order so they can be indexed by position.
keys = list(pg_dict)
print(keys)

['A14C', 'A14D', 'A14E', 'A14F', 'A14G', 'A14H', 'A14I', 'A14K', 'A14L', 'A14M', 'A14N', 'A14P', 'A14Q', 'A14R', 'A14S', 'A14T', 'A14V', 'A14W', 'A14Y', 'A52C', 'A52D', 'A52E', 'A52F', 'A52G', 'A52H', 'A52I', 'A52K', 'A52L', 'A52M', 'A52N', 'A52P', 'A52Q', 'A52R', 'A52S', 'A52T', 'A52V', 'A52W', 'A52Y', 'A62C', 'A62D', 'A62E', 'A62F', 'A62G', 'A62H', 'A62I', 'A62K', 'A62L', 'A62M', 'A62N', 'A62P', 'A62Q', 'A62R', 'A62S', 'A62T', 'A62V', 'A62W', 'A62Y', 'A71C', 'A71D', 'A71E', 'A71F', 'A71G', 'A71H', 'A71I', 'A71K', 'A71L', 'A71M', 'A71N', 'A71P', 'A71Q', 'A71R', 'A71S', 'A71T', 'A71V', 'A71W', 'A71Y', 'D21A', 'D21A:K65A', 'D21A:K65C', 'D21A:K65E', 'D21A:K65F', 'D21A:K65G', 'D21A:K65H', 'D21A:K65I', 'D21A:K65L', 'D21A:K65M', 'D21A:K65N', 'D21A:K65Q', 'D21A:K65R', 'D21A:K65S', 'D21A:K65T', 'D21A:K65V', 'D21A:K65W', 'D21A:K65Y', 'D21C', 'D21C:K65A', 'D21C:K65C', 'D21C:K65D', 'D21C:K65E', 'D21C:K65F', 'D21C:K65G', 'D21C:K65H', 'D21C:K65I', 'D21C:K65L', 'D21C:K65M', 'D21C:K65N', 'D21C:K65P'

In [9]:
# Inspect the record stored for the first mutant label.
pg_dict[keys[0]]

{'mutated_sequence': 'NLYFQGLKYMVPGCRVTRGLDWKWRDQDGSPQGEGTVTGELHNGWIDVTWDAGGSNSYRMGAEGKFDLKLAP',
 'DMS_score': 0.2863520023709176}

## Adding wild type to dictionary

In [10]:
# Take the mutant label and its sequence from the first row; the wild type is
# reconstructed from this pair in the next cell.
mut, seq = df.iloc[0][['mutant', 'mutated_sequence']]
print(mut, seq, sep='\n')

A14C
NLYFQGLKYMVPGCRVTRGLDWKWRDQDGSPQGEGTVTGELHNGWIDVTWDAGGSNSYRMGAEGKFDLKLAP


In [11]:
# Rebuild the wild-type sequence by reverting every substitution named in the
# first row's mutant label. Labels look like 'A14C' (original residue, 1-based
# position, new residue); multi-mutants join several with ':' ('D21A:K65A'),
# so each part is reverted in turn instead of assuming a single substitution.
residues = list(seq)
for substitution in mut.split(':'):
    orig = substitution[0]
    pos = int(substitution[1:-1]) - 1  # label positions are 1-based
    new = substitution[-1]
    # Fail loudly if the label does not describe this sequence, rather than
    # silently writing a wrong wild type into the dictionary.
    assert residues[pos] == new, (
        f"{substitution}: expected {new!r} at position {pos + 1}, found {residues[pos]!r}"
    )
    residues[pos] = orig
wild_seq = ''.join(residues)

# The wild type is stored under the key None; it has no DMS_score.
pg_dict[None] = {'mutated_sequence': wild_seq}

In [12]:
# Show the wild-type record that was just added under the key None.
pg_dict[None]

{'mutated_sequence': 'NLYFQGLKYMVPGARVTRGLDWKWRDQDGSPQGEGTVTGELHNGWIDVTWDAGGSNSYRMGAEGKFDLKLAP'}

In [13]:
# Score the wild-type sequence with the model and attach the three results
# returned by collect_log_prob_pg2 to its record.
sequence = pg_dict[None]['mutated_sequence']
lp, rlp, llr = collect_log_prob_pg2(sequence, model, tokenizer)

pg_dict[None].update({'log_probs': lp, 'ref_log_probs': rlp, 'llr_matrix': llr})

In [14]:
# Wild-type record again, now carrying log_probs, ref_log_probs and llr_matrix.
pg_dict[None]

{'mutated_sequence': 'NLYFQGLKYMVPGARVTRGLDWKWRDQDGSPQGEGTVTGELHNGWIDVTWDAGGSNSYRMGAEGKFDLKLAP',
 'log_probs': tensor([[-2.7495, -4.6717, -3.1321,  ..., -3.6410, -4.5148, -4.0542],
         [-3.3371, -4.8444, -4.2508,  ..., -1.5759, -4.4987, -4.4964],
         [-2.8537, -4.7232, -3.6531,  ..., -2.9396, -4.2763, -2.2710],
         ...,
         [-4.6678, -5.0035, -6.8013,  ..., -3.0886, -5.6123, -5.1027],
         [-1.1857, -4.0971, -4.1663,  ..., -2.0979, -5.0846, -2.4035],
         [-2.6705, -5.4710, -4.7268,  ..., -4.1887, -6.6460, -5.2710]]),
 'ref_log_probs': tensor([[-10.4654],
         [ -3.9707],
         [ -2.0299],
         [ -1.8943],
         [ -3.3182],
         [ -3.7726],
         [ -0.6170],
         [ -1.2712],
         [ -1.4426],
         [ -4.1274],
         [ -3.6388],
         [ -2.0061],
         [ -3.2181],
         [ -2.2066],
         [ -2.7905],
         [ -1.1917],
         [ -1.8632],
         [ -2.5740],
         [ -1.5577],
         [ -0.4351],
         [ 

## Adding mutants to dictionary

In [15]:
# Score every record in the dictionary and store the results on the record.
# The loop also covers the wild-type entry (key None) that was scored in an
# earlier cell, so that entry is recomputed and overwritten here.
# Only the inner record dicts are modified; pg_dict's own key set is unchanged,
# so iterating over it while updating is safe.
for k, entry in pg_dict.items():
    sequence = entry['mutated_sequence']
    lp, rlp, llr = collect_log_prob_pg2(sequence, model, tokenizer)

    entry.update({'log_probs': lp, 'ref_log_probs': rlp, 'llr_matrix': llr})



In [18]:
# Spot-check one scored mutant: the record for the 11th label in `keys`.
pg_dict[keys[10]]

{'mutated_sequence': 'NLYFQGLKYMVPGNRVTRGLDWKWRDQDGSPQGEGTVTGELHNGWIDVTWDAGGSNSYRMGAEGKFDLKLAP',
 'DMS_score': -1.5178683307435286,
 'log_probs': tensor([[-2.6696, -4.4545, -3.4151,  ..., -3.3919, -4.3997, -3.7645],
         [-3.3267, -4.7920, -4.0221,  ..., -1.8703, -4.7244, -4.3430],
         [-2.9267, -4.6265, -3.5827,  ..., -2.9494, -4.1244, -2.0768],
         ...,
         [-4.5291, -5.2019, -6.9191,  ..., -2.8714, -5.7220, -5.1513],
         [-1.1205, -4.2246, -4.1788,  ..., -2.1363, -5.1134, -2.5445],
         [-2.7269, -5.4335, -5.0474,  ..., -4.2502, -6.6472, -4.9192]]),
 'ref_log_probs': tensor([[-10.5682],
         [ -3.8757],
         [ -2.2433],
         [ -2.4251],
         [ -3.1921],
         [ -3.5479],
         [ -1.1219],
         [ -1.3647],
         [ -2.3272],
         [ -3.9345],
         [ -3.5550],
         [ -2.4530],
         [ -3.1921],
         [ -2.4770],
         [ -2.8745],
         [ -1.2172],
         [ -1.9110],
         [ -2.6399],
         [ -1.5780

In [19]:
# Number of records: one per row of the CSV plus the wild-type entry stored under None.
len(pg_dict)

5587

In [20]:
# Persist the whole dictionary (wild type plus every mutant, tensors included).
# NOTE(review): absolute, machine-specific output directory; open() does not
# create it, so it must already exist.
path = '/Users/johnhutchens/Desktop/Practicum/Data/HECD1_HUMAN/'
filename = 'pg2_HECD1_HUMAN_matrices.pickle'

output_file = path + filename
with open(output_file, 'wb') as handle:
    pickle.dump(pg_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
