In [1]:
import pandas as pd
import torch
import matplotlib.pyplot as plt
import numpy as np
import torch
import random
# Seed the torch, NumPy and Python `random` generators with the same fixed value (0).
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)
from torch.utils.data import Dataset, DataLoader
from ast import literal_eval
from Network import *
import yaml



class Config:
    """Attribute-style holder for a set of keyword settings.

    Every keyword given to the constructor becomes an instance attribute of
    the same name; the keyword dict itself is also kept on ``self.entries``.
    """

    def __init__(self, **entries):
        # Copy each keyword straight into the instance namespace.
        vars(self).update(entries)
        # Keep the mapping too; ``print`` below reports it.
        self.entries = entries

    def print(self):
        """Write the stored keyword dict to standard output."""
        print(self.entries)

def load_config_from_yaml(file_path):
    """Read a YAML file and wrap its top-level mapping in a ``Config``.

    Args:
        file_path: Path of the YAML file to read.

    Returns:
        A ``Config`` whose attributes are the top-level keys of the file.
        An empty file yields a ``Config`` with no settings.

    Raises:
        TypeError: If the top level of the document is not a mapping.
    """
    with open(file_path, 'r') as file:
        config = yaml.safe_load(file)
    # ``safe_load`` returns None for an empty document; expanding None with
    # ``**`` would fail with an unhelpful error, so treat it as "no settings".
    if config is None:
        config = {}
    if not isinstance(config, dict):
        raise TypeError(
            f"Top level of {file_path!r} must be a mapping, got {type(config).__name__}"
        )
    return Config(**config)

class finetuned_RibonanzaNet(RibonanzaNet):
    """``RibonanzaNet`` with its ``decoder`` replaced by a 5-output linear layer.

    Args:
        config: Configuration object forwarded unchanged to ``RibonanzaNet``.
        pretrained: When true, load a checkpoint from ``weights_path`` into
            the parent network before the decoder is replaced.
        weights_path: Checkpoint file used when ``pretrained`` is true. The
            default is the Kaggle dataset path that used to be hard-coded, so
            existing calls behave exactly as before.
    """

    def __init__(self, config, pretrained=False,
                 weights_path="/kaggle/input/ribonanzanet-weights/RibonanzaNet.pt"):
        super(finetuned_RibonanzaNet, self).__init__(config)
        if pretrained:
            # The checkpoint is loaded before ``decoder`` is swapped out below.
            self.load_state_dict(torch.load(weights_path, map_location='cpu'))
        # NOTE(review): 256 is presumably the width of the sequence features
        # returned by ``get_embeddings`` -- confirm against the config.
        self.decoder = nn.Linear(256, 5)

    def forward(self, src):
        """Apply the linear decoder to the sequence features of ``src``.

        Args:
            src: Integer token tensor; the notebook passes one sequence with a
                leading batch dimension of 1.

        Returns:
            The decoder output (last dimension 5 with the decoder above).
        """
        # All-ones tensor shaped like ``src``, on the same device; presumably a
        # mask marking every position as valid -- confirm against RibonanzaNet.
        mask = torch.ones_like(src, dtype=torch.long)
        sequence_features, _ = self.get_embeddings(src, mask)
        output = self.decoder(sequence_features)

        # squeeze(-1) only changes the shape if the last dimension has size 1.
        return output.squeeze(-1)
    

class RNA_test_Dataset(Dataset):
    """Label-free dataset that turns RNA strings into token-id tensors.

    Args:
        data: DataFrame with a ``'sequence'`` column of strings made of the
            characters ``A``, ``C``, ``G`` and ``U``.
    """

    def __init__(self, data):
        self.data = data
        # Token ids: A=0, C=1, G=2, U=3.
        self.tokens = {nt: i for i, nt in enumerate('ACGU')}

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{'sequence': int64 tensor}`` for the row at position ``idx``.

        Raises:
            KeyError: If the sequence contains a character other than A/C/G/U.
        """
        # Positional lookup: ``__len__`` counts rows, so 0..len-1 must all be
        # valid even when the frame's index labels are not 0..n-1 (for example
        # after filtering). Label-based ``.loc`` fails or picks the wrong row.
        rna = self.data['sequence'].iloc[idx]
        sequence = torch.tensor([self.tokens[nt] for nt in rna], dtype=torch.long)

        return {'sequence': sequence}

class RNA_Dataset(Dataset):
    """Dataset yielding token-id tensors together with five label tracks.

    Args:
        data: DataFrame with a ``'sequence'`` column of strings made of
            ``A``/``C``/``G``/``U`` and one column per entry of
            ``label_names``. Each label cell is assumed to hold an array-like
            of per-position values -- confirm against the training data.
    """

    def __init__(self, data):
        self.data = data
        # Token ids: A=0, C=1, G=2, U=3.
        self.tokens = {nt: i for i, nt in enumerate('ACGU')}
        # Column order here fixes the order along the last axis of ``labels``.
        self.label_names = ['reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenised sequence and stacked labels of row ``idx``.

        Returns:
            ``{'sequence': int64 tensor, 'labels': tensor}`` where ``labels``
            stacks the label columns along a new last axis.

        Raises:
            KeyError: If the sequence contains a character other than A/C/G/U.
        """
        # Positional lookups: ``__len__`` counts rows, so 0..len-1 must all be
        # valid even when the frame's index labels are not 0..n-1 (for example
        # after filtering). Label-based ``.loc`` fails or picks the wrong row.
        rna = self.data['sequence'].iloc[idx]
        sequence = torch.tensor([self.tokens[nt] for nt in rna], dtype=torch.long)

        per_label = [self.data[name].iloc[idx] for name in self.label_names]
        labels = torch.tensor(np.stack(per_label, -1))

        return {'sequence': sequence,
                'labels': labels}

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Build the network from the pairwise YAML configuration.
config = load_config_from_yaml(
    "/Users/lihongmin/Research/ideas/RibonanzaNet/ribonanzanet2d-final/configs/pairwise.yaml"
)

# Device preference: MPS when available, otherwise CUDA, otherwise CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = finetuned_RibonanzaNet(config, pretrained=False).to(device)

# 1. Initial model training (confident labels only): load that checkpoint.
# Kept as the cell's last expression so the key-matching report is displayed.
model.load_state_dict(torch.load("RibonanzaNet-Deg_30_68_re.pt", map_location=device))

constructing 9 ConvTransformerEncoderLayers


<All keys matched successfully>

In [25]:
# Directory holding the two spreadsheets (pnas.1908052116.sd01.xlsx / sd02.xlsx).
# It is named once so the location only has to be edited in a single place.
HALF_LIFE_DIR = "/Users/lihongmin/Research/ideas/RibonanzaNet/ribonanzanet2d-final/deg-finetune/half-life"

# sd01, sheet "ffLuc": the first two rows form a two-level column header and
# the first column is used as the row index.
data1_ffLuc = pd.read_excel(f"{HALF_LIFE_DIR}/pnas.1908052116.sd01.xlsx", sheet_name="ffLuc", header=[0, 1], index_col=0)

# sd02: read with pandas defaults (first sheet, single header row).
data2 = pd.read_excel(f"{HALF_LIFE_DIR}/pnas.1908052116.sd02.xlsx")

# Work on a copy so `data2` stays as loaded, then add an RNA-alphabet column
# by replacing every literal 'T' in 'ORF Sequence' with 'U'.
data = data2.copy()
data['sequence'] = data['ORF Sequence'].str.replace('T', 'U', regex=False)
data.head()



Unnamed: 0,Figure Reference,Protein,ID,ORF Sequence,sequence
0,1B,eGFP,G1,ATGGACTATAAAGACGACGACGACAAGGACTACAAGGACGACGACG...,AUGGACUAUAAAGACGACGACGACAAGGACUACAAGGACGACGACG...
1,1B,eGFP,G2,ATGGACTACAAGGACGACGACGACAAGGATTATAAAGACGACGACG...,AUGGACUACAAGGACGACGACGACAAGGAUUAUAAAGACGACGACG...
2,1B,eGFP,G3,ATGGACTATAAGGACGACGACGACAAGGACTACAAGGACGACGACG...,AUGGACUAUAAGGACGACGACGACAAGGACUACAAGGACGACGACG...
3,1B,eGFP,G4,ATGGACTACAAGGACGACGACGACAAGGACTACAAGGACGACGACG...,AUGGACUACAAGGACGACGACGACAAGGACUACAAGGACGACGACG...
4,1C-1D,hEPO,ECO,ATGGGAGTGCACGAGTGTCCCGCGTGGTTGTGGTTGCTGCTGTCGC...,AUGGGAGUGCACGAGUGUCCCGCGUGGUUGUGGUUGCUGCUGUCGC...


In [28]:
# Wrap `data` in the label-free dataset and display its first item
# (the token ids of the first row's 'sequence' string) as a quick check.
test_dataset = RNA_test_Dataset(data)
test_dataset[0]

{'sequence': tensor([0, 3, 2, 2, 0, 1, 3, 0, 3, 0, 0, 0, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1,
         0, 0, 2, 2, 0, 1, 3, 0, 1, 0, 0, 2, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1,
         0, 0, 2, 2, 0, 1, 3, 0, 1, 0, 0, 2, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1,
         0, 0, 2, 0, 3, 2, 2, 3, 1, 0, 2, 1, 0, 0, 2, 2, 2, 1, 2, 0, 2, 2, 0, 2,
         1, 3, 2, 3, 3, 1, 0, 1, 1, 2, 2, 2, 2, 3, 2, 2, 3, 2, 1, 1, 1, 0, 3, 1,
         1, 3, 2, 2, 3, 2, 2, 0, 2, 1, 3, 2, 2, 0, 1, 2, 2, 1, 2, 0, 1, 2, 3, 2,
         0, 0, 1, 2, 2, 1, 1, 0, 1, 0, 0, 2, 3, 3, 1, 0, 2, 1, 2, 3, 2, 0, 2, 1,
         2, 2, 2, 2, 0, 2, 2, 2, 1, 2, 0, 2, 2, 2, 1, 2, 0, 1, 2, 1, 1, 0, 1, 2,
         3, 0, 1, 2, 2, 2, 0, 0, 2, 1, 3, 2, 0, 1, 1, 1, 3, 2, 0, 0, 2, 3, 3, 1,
         0, 3, 1, 3, 2, 3, 0, 1, 1, 0, 1, 1, 2, 2, 1, 0, 0, 2, 1, 3, 2, 1, 1, 1,
         2, 3, 2, 1, 1, 1, 3, 2, 2, 1, 1, 1, 0, 1, 1, 1, 3, 1, 2, 3, 2, 0, 1, 2,
         0, 1, 1, 1, 3, 2, 0, 1, 1, 3, 0, 1, 2, 2, 1, 2, 3, 0, 1, 0, 2, 3, 2, 1,
         3, 3, 1

In [29]:
from tqdm import tqdm

# Run the model on every dataset item, one sequence per forward pass, and
# collect each output as a NumPy array on the CPU.
test_preds = []
model.eval()
with torch.no_grad():  # no gradients are needed for inference
    for i in tqdm(range(len(test_dataset))):
        example = test_dataset[i]
        # Add a leading batch dimension of size 1 and move to the model's device.
        sequence = example['sequence'].unsqueeze(0).to(device)
        test_preds.append(model(sequence).cpu().numpy())

100%|██████████| 86/86 [15:10<00:00, 10.59s/it]


In [32]:
import pickle
# Persist the list of prediction arrays to "half_life_pred.pkl" in the current
# working directory. Pickle files should only be loaded back from trusted sources.
with open("half_life_pred.pkl","wb") as f:
    pickle.dump(test_preds,f)

In [37]:
# Number of prediction arrays collected (one was appended per dataset item).
len(test_preds)

86

In [38]:
# Select the columns grouped under the top-level header 'U'
# (the sheet was read with a two-level column header).
data1_ffLuc['U']

RNAID,L1,L2,L3,L4,L5,L6,L7,L8,L9,L10,...,L30,L31,L32,L33,L34,L35,L36,L37,L38,L39
1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,0.649352,-0.149798,,,,0.000000,-0.037376,-0.319066,-0.871059,,...,,,0.362112,,,,,,0.0,
5,-0.187314,-0.330836,,0.0,,,1.650435,,,,...,,,-0.070155,,,,,,,
6,1.338052,0.184714,,,0.0,2.545009,,0.386985,1.549758,,...,,,-0.099106,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1811,,,,,,,,,,,...,,,,,,,,,,
1812,,,,,,,,,,,...,,,,,,,,,,
1813,,,,,,,,,,,...,,,,,,,,,,
1814,,,,,,,,,,,...,,,,,,,,,,
