# Transfer Learning for mRNA Translation Efficiency Prediction

## 1. Importing Required Libraries

First, let's import the necessary libraries:


In [25]:
import os

# Set the cache directory to your desired path
os.environ['HF_HOME'] = '/data1/Jack/projects/TranslonFormer/.cache'

# Optionally, you can print the path to verify
print(os.getenv('HF_HOME'))

/data1/Jack/projects/TranslonFormer/.cache


In [26]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer



## 2. Defining the Model

Now, we'll define our model class that uses a pre-trained BERT model and adds a custom classifier for our specific task.


In [27]:
class mRNATranslationEfficiencyPredictor(nn.Module):
    def __init__(self, pretrained_model_name, num_labels):
        super(mRNATranslationEfficiencyPredictor, self).__init__()
        self.nucleotide_transformer = AutoModel.from_pretrained(pretrained_model_name)
        
        # Freeze the Nucleotide Transformer parameters
        for param in self.nucleotide_transformer.parameters():
            param.requires_grad = False
        
        # Add a custom classifier
        self.classifier = nn.Sequential(
            nn.Linear(self.nucleotide_transformer.config.hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, num_labels)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.nucleotide_transformer(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state[:, 0, :]  # We use the [CLS] token representation
        return self.classifier(sequence_output)

**Motivation:** 
- We freeze the pre-trained layers to preserve the valuable features learned during pre-training.
- We replace the output layer with a custom classifier tailored to our task of predicting translation efficiency.

## 3. Instantiating the Model

Let's create an instance of our model:

In [28]:
pretrained_model_name = "InstaDeepAI/nucleotide-transformer-2.5b-multi-species"
num_labels = 1  # For regression task (e.g., predicting translation efficiency score)

model = mRNATranslationEfficiencyPredictor(pretrained_model_name, num_labels)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of EsmModel were not initialized from the model checkpoint at InstaDeepAI/nucleotide-transformer-2.5b-multi-species and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



## 4. Preparing Data and Making Predictions

Here's how you can prepare your data and use the model for predictions:

In [29]:
# Prepare your data
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
mrna_sequence = "AUGGGCUAA"
inputs = tokenizer(mrna_sequence, return_tensors="pt")

# Make predictions
with torch.no_grad():
    prediction = model(**inputs)

print(f"Predicted translation efficiency: {prediction.item()}")

Predicted translation efficiency: 0.03822122514247894


# 6. Creating a Custom Dataset and DataLoader
To efficiently handle your mRNA data, we'll create a custom Dataset class and use it with PyTorch's DataLoader. Here's how you can do this:

In [30]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

class mRNADataset(Dataset):
    def __init__(self, sequences, efficiency_scores, tokenizer, max_length=512):
        self.sequences = sequences
        self.efficiency_scores = efficiency_scores
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        efficiency_score = self.efficiency_scores[idx]

        # Tokenize the sequence
        encoding = self.tokenizer.encode_plus(
            sequence,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(efficiency_score, dtype=torch.float)
        }

# Example usage:
pretrained_model_name = "InstaDeepAI/nucleotide-transformer-2.5b-multi-species"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

# Assume you have these lists of sequences and scores
mRNA_sequences = ["AUGGGCUAA", "CUAGUGAAU", "GGGAAAUUU"]  # Add your mRNA sequences here
efficiency_scores = [0.75, 0.62, 0.88]  # Add corresponding efficiency scores here

# Create the dataset
dataset = mRNADataset(mRNA_sequences, efficiency_scores, tokenizer)

# Create the DataLoader
batch_size = 16
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


In [32]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.squeeze(), labels)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

Epoch 1/10, Average Loss: 0.4589
Epoch 2/10, Average Loss: 0.4908
Epoch 3/10, Average Loss: 0.4886
Epoch 4/10, Average Loss: 0.4135
Epoch 5/10, Average Loss: 0.5923
Epoch 6/10, Average Loss: 0.4903
Epoch 7/10, Average Loss: 0.4997
Epoch 8/10, Average Loss: 0.5017
Epoch 9/10, Average Loss: 0.4868
Epoch 10/10, Average Loss: 0.5062
