In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import json, time, os, sys, glob
import shutil
import warnings
import numpy as np
import pandas as pd
import torch
from torch import optim
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split, Subset
import copy
import torch.nn as nn
import torch.nn.functional as F
import random
import os.path
import subprocess
from tqdm import tqdm
from omegaconf import OmegaConf

from protein_mpnn_utils import loss_nll, loss_smoothed, gather_edges, gather_nodes, gather_nodes_t, cat_neighbors_nodes, _scores, _S_to_seq, tied_featurize, parse_PDB
from protein_mpnn_utils import StructureDataset, StructureDatasetPDB, ProteinMPNN
from kaggle_dataset import KaggleTrainDataset

In [5]:
cfg = OmegaConf.load("config.yaml")
dataset = KaggleTrainDataset(cfg, "train")
wt_feat, mut_feat, out, position = dataset[0]

In [16]:
from fireprot_dataset import FireProtDataset
dataset = FireProtDataset(cfg, "train")

In [24]:
dataset.seq_to_data[dataset.wt_sequences[1]]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,experiment_id,protein_name,uniprot_id,wild_type,mutation,ddG,dTm,is_curated,...,datasets,pdb_id_corrected,position_corrected,chain_corrected,sequence_corrected,sequence_length,oligomer_label,structure_method,resolution,dupe_detector
0,2904,4802,PT005142,Alpha-1-antitrypsin,P01009,F,C,-3.01,,True,...,ProTherm,1QLP,50,A,MDPQGDAAQKTDTSHHDQDHPTFNKITPNLAEFAFSLYRQLAHQSN...,394,monomer,x-ray diffraction,-1,1QLP-C-50-F-P01009
1,3442,5628,PT009330,Alpha-1-antitrypsin,P01009,F,L,-2.1,,True,...,Saraboji2204|PopMuSiC|SAAFEC983|STRUM3421|AUTO...,1QLP,50,A,MDPQGDAAQKTDTSHHDQDHPTFNKITPNLAEFAFSLYRQLAHQSN...,394,monomer,x-ray diffraction,-1,1QLP-L-50-F-P01009
2,3443,5629,PT009331,Alpha-1-antitrypsin,P01009,T,A,-1.0,,True,...,Saraboji2204|PopMuSiC|SAAFEC983|STRUM3421|AUTO...,1QLP,58,A,MDPQGDAAQKTDTSHHDQDHPTFNKITPNLAEFAFSLYRQLAHQSN...,394,monomer,x-ray diffraction,-1,1QLP-A-58-T-P01009
3,3444,5630,PT009332,Alpha-1-antitrypsin,P01009,T,A,-1.0,,True,...,Saraboji2204|PopMuSiC|SAAFEC983|STRUM3421|AUTO...,1QLP,67,A,MDPQGDAAQKTDTSHHDQDHPTFNKITPNLAEFAFSLYRQLAHQSN...,394,monomer,x-ray diffraction,-1,1QLP-A-67-T-P01009
4,3445,5631,PT009333,Alpha-1-antitrypsin,P01009,A,G,-1.6,,True,...,Saraboji2204|PopMuSiC|SAAFEC983|STRUM3421|Broo...,1QLP,69,A,MDPQGDAAQKTDTSHHDQDHPTFNKITPNLAEFAFSLYRQLAHQSN...,394,monomer,x-ray diffraction,-1,1QLP-G-69-A-P01009
5,3446,5632,PT009334,Alpha-1-antitrypsin,P01009,M,I,-2.3,,True,...,Saraboji2204|PopMuSiC|SAAFEC983|STRUM3421|Broo...,1QLP,373,A,MDPQGDAAQKTDTSHHDQDHPTFNKITPNLAEFAFSLYRQLAHQSN...,394,monomer,x-ray diffraction,-1,1QLP-I-373-M-P01009
6,3447,5633,PT009335,Alpha-1-antitrypsin,P01009,S,A,-1.0,,True,...,Saraboji2204|PopMuSiC|SAAFEC983|STRUM3421|Broo...,1QLP,380,A,MDPQGDAAQKTDTSHHDQDHPTFNKITPNLAEFAFSLYRQLAHQSN...,394,monomer,x-ray diffraction,-1,1QLP-A-380-S-P01009
7,3448,5634,PT009336,Alpha-1-antitrypsin,P01009,K,R,-1.0,,True,...,Saraboji2204|PopMuSiC|SAAFEC983|STRUM3421|AUTO...,1QLP,386,A,MDPQGDAAQKTDTSHHDQDHPTFNKITPNLAEFAFSLYRQLAHQSN...,394,monomer,x-ray diffraction,-1,1QLP-R-386-K-P01009
8,3642,5931,PT011578,Alpha-1-antitrypsin,P01009,A,V,-0.6,,True,...,PopMuSiC|SAAFEC983|Broom|ProTherm,1QLP,30,A,MDPQGDAAQKTDTSHHDQDHPTFNKITPNLAEFAFSLYRQLAHQSN...,394,monomer,x-ray diffraction,-1,1QLP-V-30-A-P01009
9,3643,5932,PT011579,Alpha-1-antitrypsin,P01009,A,I,-0.9,,True,...,ProTherm,1QLP,30,A,MDPQGDAAQKTDTSHHDQDHPTFNKITPNLAEFAFSLYRQLAHQSN...,394,monomer,x-ray diffraction,-1,1QLP-I-30-A-P01009


In [70]:
from Bio import pairwise2

def seq1_index_to_seq2_index(align, index):
    cur_seq1_index = 0

    # first find the aligned index
    for aln_idx, char in enumerate(align.seqA):
        if char != '-':
            cur_seq1_index += 1
        if cur_seq1_index > index:
            print(char, aln_idx)
            break
    
    # now the index in seq 2 cooresponding to aligned index
    seq2_to_idx = align.seqB[:aln_idx+1]
    seq2_idx = aln_idx
    for char in seq2_to_idx:
        if char == '-':
            seq2_idx -= 1

    return seq2_idx

seq1 = "FNKITPNLAEFAFSLYRQLAHQSNSTNIFFSPVSIATAFAMLSLGTKADTHDEILEGLNFNLTEIPEAQIHEGFQELLRTLNQPDSQLQLTTGNGLFLSEGLKLVDKFLEDVKKLYHSEAFTVNFGDTEEAKKQINDYVEKGTQGKIVDLVKELDRDTVFALVNYIFFKGKWERPFEVKDTEEEDFHVDQVTTVKVPMMKRLGMFNIQHCKKLSSWVLLMKYLGNATAIFFLPDEGKLQHLENELTHDIITKFLENEDRRSASLHLPKLSITGTYDLKSVLGQLGITKVFSNGADLSGVTEEAPLKLSKAVHKAVLTIDEKGTEAAGAMFLEAIPMSIPPEVKFNKPFVFLMIEQNTKSPLFMGKVVNPTQK"
seq2 = "MDPQGDAAQKTDTSHHDQDHPTFNKITPNLAEFAFSLYRQLAHQSNSTNIFFSPVSIATAFAMLSLGTKADTHDEILEGLNFNLTEIPEAQIHEGFQELLRTLNQPDSQLQLTTGNGLFLSEGLKLVDKFLEDVKKLYHSEAFTVNFGDTEEAKKQINDYVEKGTQGKIVDLVKELDRDTVFALVNYIFFKGKWERPFEVKDTEEEDFHVDQVTTVKVPMMKRLGMFNIQHCKKLSSWVLLMKYLGNATAIFFLPDEGKLQHLENELTHDIITKFLENEDRRSASLHLPKLSITGTYDLKSVLGQLGITKVFSNGADLSGVTEEAPLKLSKAVHKAVLTIDEKGTEAAGAMFLEAIPMSIPPEVKFNKPFVFLMIEQNTKSPLFMGKVVNPTQK"
pos = 10
print(seq1[pos])
align, *rest = pairwise2.align.globalxx(seq1, seq2)
idx2 = seq1_index_to_seq2_index(align, pos)
print(seq2[idx2])


F
F 32
F


In [74]:
"MISLIAALAVDRVIG-----PWNLPADLA".replace("-", "X")

'MISLIAALAVDRVIGXXXXXPWNLPADLA'

In [79]:
seq1 = "KLHKEPATLIKAIDGDTVKLMYKGQPMTFRLLLVDTPETKHPKKGVEKYGPEASAFTKKMVENAKKIEVEFDKGQRTDKYGRGLAYIYADGKMVNEALVRQGLAKVAYVYKPNNTHEQHLRKSEAQAKKEKLNIWS"
len(seq1)

136

In [86]:
for item in dataset:
    if item is None:
        print("!")

TypeError: '>=' not supported between instances of 'slice' and 'int'