In [2]:
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer, EsmForMaskedLM
from tokenizers import Tokenizer
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from plm_compare_progen2 import *
from protein_data import *

In [4]:
# Pick the compute backend: CUDA when torch reports it as available, CPU otherwise.
# NOTE(review): `device` is only printed in this cell; it is not passed to
# initialize_progen2 below — confirm the helper handles placement itself.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

# Checkpoint identifier handed to the project helper, which returns the
# (model, tokenizer) pair used by every scoring cell in this notebook.
model_name = "hugohrban/progen2-medium"
model, tokenizer = initialize_progen2(model_name)

Using cpu device


### B2L11_HUMAN, CALM1_HUMAN, GDIA_HUMAN, HECD1_HUMAN

In [7]:
# ISDH_STAAW (file: ISDH_STAAW_Tsuboyama_2023_2LHR.csv) — DMS substitution table.
# NOTE(review): `path` is an absolute, machine-specific location; it must end with
# '/' because the file name is appended by plain string concatenation.
path = '/Users/johnhutchens/Desktop/Practicum/Data/zInput_Data/DMS_ProteinGym_substitutions/'
filename = 'ISDH_STAAW_Tsuboyama_2023_2LHR.csv'
df = pd.read_csv(path+filename)

In [8]:
# Sanity check: (n_rows, n_columns) of the table that was just loaded.
df.shape

(1944, 4)

In [9]:
# Remove the 'DMS_score_bin' column, which none of the cells shown here use.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column has already been dropped.
df = df.drop(columns='DMS_score_bin', errors='ignore')
df.columns

Index(['mutant', 'mutated_sequence', 'DMS_score'], dtype='object')

In [10]:
# Build {mutant code -> {column name -> value}} from the table.
# The key column is selected by name rather than by position so the mapping does
# not silently change if the column order of the CSV ever differs.
pg_dict = df.set_index('mutant').to_dict(orient='index')

In [11]:
# Snapshot the dictionary keys (mutant codes) as a list so entries can be
# addressed by position in later cells.
keys = [*pg_dict]
print(keys)

['A13C', 'A13E', 'A13G', 'A13I', 'A13L', 'A13N', 'A13Q', 'A13S', 'A13T', 'A13V', 'A13W', 'A36C', 'A36D', 'A36E', 'A36F', 'A36G', 'A36H', 'A36I', 'A36K', 'A36L', 'A36M', 'A36N', 'A36P', 'A36Q', 'A36R', 'A36S', 'A36T', 'A36V', 'A36W', 'A36Y', 'A50C', 'A50D', 'A50E', 'A50F', 'A50G', 'A50H', 'A50I', 'A50K', 'A50L', 'A50M', 'A50N', 'A50P', 'A50Q', 'A50R', 'A50S', 'A50T', 'A50V', 'A50W', 'A50Y', 'A8C', 'A8D', 'A8E', 'A8F', 'A8G', 'A8H', 'A8I', 'A8K', 'A8L', 'A8M', 'A8N', 'A8P', 'A8Q', 'A8R', 'A8S', 'A8T', 'A8V', 'A8W', 'A8Y', 'D43A', 'D43C', 'D43E', 'D43F', 'D43G', 'D43H', 'D43I', 'D43K', 'D43L', 'D43M', 'D43N', 'D43Q', 'D43R', 'D43S', 'D43T', 'D43V', 'D43W', 'D43Y', 'D51A', 'D51C', 'D51E', 'D51F', 'D51G', 'D51H', 'D51I', 'D51K', 'D51L', 'D51M', 'D51N', 'D51P', 'D51Q', 'D51R', 'D51S', 'D51T', 'D51V', 'D51W', 'D51Y', 'E17A', 'E17C', 'E17D', 'E17F', 'E17G', 'E17H', 'E17I', 'E17K', 'E17L', 'E17M', 'E17N', 'E17P', 'E17Q', 'E17R', 'E17S', 'E17T', 'E17V', 'E17W', 'E17Y', 'E22A', 'E22C', 'E22D', 'E

In [12]:
# Peek at the first record to confirm the per-mutant layout of pg_dict.
pg_dict[keys[0]]

{'mutated_sequence': 'YNLQKLLAPYHKCKTLERQVYELEKLQEKLPEKYKAEYKKKLDQTRVELADQVKS',
 'DMS_score': -1.4992375794294996}

## Adding wild type to dictionary

In [13]:
# Use the first table row as the template from which the wild type is rebuilt:
# its mutant code and the corresponding mutated sequence.
mut = df['mutant'].iloc[0]
seq = df['mutated_sequence'].iloc[0]
print(mut, seq, sep='\n')

A13C
YNLQKLLAPYHKCKTLERQVYELEKLQEKLPEKYKAEYKKKLDQTRVELADQVKS


In [14]:
# Rebuild the wild-type sequence by undoing the substitution(s) encoded in `mut`.
# A mutant code reads <wild-type residue><1-based position><new residue>, e.g. A13C.
# Several substitutions may be joined with ':' (ProteinGym convention), so every
# one is reverted in turn instead of assuming a single substitution.
wild_residues = list(seq)
for sub in mut.split(':'):
    orig = sub[0]               # residue in the wild type
    pos = int(sub[1:-1]) - 1    # 1-based position -> 0-based index
    new = sub[-1]               # residue present in the mutated sequence

    # A position of 0 (or less) would otherwise wrap around to the end of the list.
    if not 0 <= pos < len(wild_residues):
        raise ValueError(
            f"Mutation {sub!r} is outside the sequence (length {len(wild_residues)})"
        )
    # The mutated sequence must actually carry the new residue at that position;
    # otherwise the code and the sequence do not belong together.
    if wild_residues[pos] != new:
        raise ValueError(
            f"Mutation {sub!r} does not match the sequence: "
            f"found {wild_residues[pos]!r} at position {pos + 1}"
        )
    wild_residues[pos] = orig
wild_seq = ''.join(wild_residues)

# The wild type is stored next to the mutants under the key None.
pg_dict[None] = {'mutated_sequence': wild_seq}

In [15]:
# Show the wild-type entry, which is stored under the key None.
pg_dict[None]

{'mutated_sequence': 'YNLQKLLAPYHKAKTLERQVYELEKLQEKLPEKYKAEYKKKLDQTRVELADQVKS'}

In [16]:
# Score the wild-type sequence with the project helper and attach the three
# returned objects to the wild-type entry (key None).
sequence = pg_dict[None]['mutated_sequence']
lp, rlp, llr = collect_log_prob_pg2(sequence, model, tokenizer)

pg_dict[None].update({
    'log_probs': lp,
    'ref_log_probs': rlp,
    'llr_matrix': llr,
})

In [17]:
# Display the wild-type entry (key None) again, now with the score tensors attached.
pg_dict[None]

{'mutated_sequence': 'YNLQKLLAPYHKAKTLERQVYELEKLQEKLPEKYKAEYKKKLDQTRVELADQVKS',
 'log_probs': tensor([[-2.7886, -4.2732, -3.3953,  ..., -3.0360, -4.4955, -3.6931],
         [-2.4751, -4.9017, -2.4215,  ..., -3.0170, -4.9507, -3.8907],
         [-2.6729, -4.7263, -2.6006,  ..., -3.1189, -5.0229, -3.7400],
         ...,
         [-2.8077, -4.6281, -3.2954,  ..., -2.7869, -4.7609, -3.3061],
         [-2.2796, -5.1680, -2.9557,  ..., -3.2113, -4.8108, -3.6628],
         [-2.1897, -5.2352, -2.9951,  ..., -3.0277, -5.0121, -4.0434]]),
 'ref_log_probs': tensor([[-10.1177],
         [ -3.8907],
         [ -2.9156],
         [ -1.9943],
         [ -2.7246],
         [ -2.2603],
         [ -2.1750],
         [ -2.3566],
         [ -2.6737],
         [ -3.4306],
         [ -3.5515],
         [ -3.7275],
         [ -2.1086],
         [ -2.5764],
         [ -2.0935],
         [ -2.9304],
         [ -1.8027],
         [ -1.6748],
         [ -2.8217],
         [ -2.6787],
         [ -3.1889],
       

## Adding mutants to dictionary

In [18]:
# Score every entry that has not been scored yet.
# Entries that already carry an 'llr_matrix' are skipped: this avoids repeating the
# model pass for the wild type (key None), which is scored in its own cell, and
# lets the cell be re-run without recomputing everything.
# Only the inner per-entry dicts are modified, so iterating pg_dict itself is safe.
for k, entry in pg_dict.items():
    if 'llr_matrix' in entry:
        continue

    sequence = entry['mutated_sequence']
    lp, rlp, llr = collect_log_prob_pg2(sequence, model, tokenizer)

    entry['log_probs'] = lp
    entry['ref_log_probs'] = rlp
    entry['llr_matrix'] = llr


In [19]:
# Spot-check one scored mutant entry (the key at position 100 in `keys`).
pg_dict[keys[100]]

{'mutated_sequence': 'YNLQKLLAPYHKAKTLERQVYELEKLQEKLPEKYKAEYKKKLDQTRVELASQVKS',
 'DMS_score': 0.0290230473988555,
 'log_probs': tensor([[-2.7876, -4.2574, -3.4103,  ..., -3.0391, -4.4981, -3.6897],
         [-2.4766, -4.8622, -2.4550,  ..., -3.0103, -4.9434, -3.8866],
         [-2.6687, -4.6889, -2.6348,  ..., -3.1090, -5.0050, -3.7344],
         ...,
         [-2.8411, -4.6012, -3.2864,  ..., -2.8150, -4.7958, -3.3744],
         [-2.2590, -5.2410, -2.8349,  ..., -3.3165, -4.9859, -3.8501],
         [-2.1991, -5.2000, -2.7406,  ..., -3.1015, -5.0841, -4.1089]]),
 'ref_log_probs': tensor([[-10.1508],
         [ -3.8866],
         [ -2.9227],
         [ -2.0049],
         [ -2.7121],
         [ -2.2717],
         [ -2.1569],
         [ -2.3503],
         [ -2.6705],
         [ -3.4402],
         [ -3.5458],
         [ -3.7111],
         [ -2.1271],
         [ -2.5807],
         [ -2.1052],
         [ -2.9272],
         [ -1.7998],
         [ -1.6824],
         [ -2.7987],
         [ -2.6

In [20]:
# Number of entries in the dictionary (all mutants plus the None wild-type entry).
len(pg_dict)

1945

In [21]:
# Persist the whole dictionary (wild type and mutants, including their tensors).
# NOTE(review): the target directory is GDIA_HUMAN while the file name and the CSV
# loaded earlier refer to ISDH_STAAW — confirm this is the intended location.
# NOTE(review): this rebinds `filename`, which previously held the input CSV name.
filename = '/Users/johnhutchens/Desktop/Practicum/Data/GDIA_HUMAN/pg2_ISDH_STAAW_matrices.pickle' 

# Binary mode is required for pickle; the highest protocol is used for the dump.
with open(filename, 'wb') as f:
    pickle.dump(pg_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
