In [2]:
!pip install torch transformers

Collecting torch
  Downloading torch-2.4.1-cp312-cp312-manylinux1_x86_64.whl.metadata (26 kB)
Collecting transformers
  Using cached transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
Collecting filelock (from torch)
  Using cached filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4.8.0 (from torch)
  Using cached typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting sympy (from torch)
  Downloading sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.4.1-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Using cached jinja2-3.1.4-py3-none-any.whl.metadata (2.6 kB)
Collecting fsspec (from torch)
  Using cached fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
 

In [1]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Masked-language-model pipeline backed by the multi-species Nucleotide Transformer.
pipe = pipeline(
    "fill-mask",
    model="InstaDeepAI/nucleotide-transformer-2.5b-multi-species",
)

# Nucleotide sequence (DNA alphabet) used as the fill-mask input below.
# NOTE(review): presumably the 5' leader of the AMD1 transcript — confirm the source.
AMD1_leader = "GCTTACACAGTATGGCCGGCGACATTAGCTAGCGCTCGCTCTACTCTCTCTAACGGGAAAGCAGCGGAATACAAGAGACTGAACTGTATCTGCCTCTATTTCCAAAAGACTCACGTTCAACTTTCGCTCACACAAAGCCGGGAAAATTTTATTAGTCCTTTTTTTAAAAAAAGTTAATATAAAATTATAGCAAAAAAAAAAAGGAACCTGAACTTTAGTAACACAGCTGGAACAATCCGCAGCGGCGGCGGCAGCGGCGGGAGAAGAGGTTTAATTTAGTTGATTTTCTGTGGTTGTTGGTTGTTCGCTAGTCTCACGGTGATGGAAGCTGCACATTTTTTCGAAGGGACCGAGAAGCTGCTGGAGGTTTGGTTCTCCCGGCAGCAGCCCGACGCAAACCAAGGATCTGGGGATCTTCGCACTATCCCAAG"



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Swap every "A" in the leader for the mask token and ask the pipeline to fill them in.
pipe("<mask>".join(AMD1_leader.split("A")))

In [2]:
import torch
from torch import nn
from transformers import AutoModelForTokenClassification, AutoTokenizer, Trainer, TrainingArguments


In [3]:
# Checkpoint shared by the tokenizer and the token-classification model.
model_name = "InstaDeepAI/nucleotide-transformer-2.5b-multi-species"

# Tokenizer belonging to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pre-trained backbone plus a 2-class per-token head; the head weights are newly
# initialised, so the model has to be fine-tuned before its predictions mean anything.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=2,
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of EsmForTokenClassification were not initialized from the model checkpoint at InstaDeepAI/nucleotide-transformer-2.5b-multi-species and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
import torch
from transformers import AutoTokenizer

# The tokenizer must come from the same checkpoint as the model being fine-tuned:
# token ids index straight into that model's embedding table, so a tokenizer from a
# different checkpoint (the previous value was a protein ESM-2 vocabulary) feeds the
# model ids that mean something else entirely.
# NOTE(review): this checkpoint was pre-trained on DNA; RNA inputs containing "U"
# presumably need converting to "T" before tokenization — confirm against the model card.
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-2.5b-multi-species")

class RNADataset(torch.utils.data.Dataset):
    """Token-classification dataset pairing sequences with per-token class labels.

    Each item is the tokenizer output for one sequence, padded/truncated to
    ``max_length`` with the batch dimension removed, plus a ``labels`` tensor
    (``torch.long``) of the same length as ``input_ids``.

    Labels are assigned in order, one per *content* token. Positions holding a
    CLS/EOS/SEP/PAD token, and content tokens for which no label was supplied,
    receive ``IGNORE_INDEX`` so the cross-entropy loss skips them. Labels beyond
    the available content tokens (e.g. after truncation) are dropped.

    NOTE(review): this assumes one label per tokenizer token. If the tokenizer
    merges several nucleotides into one token (k-mer vocabularies), the caller
    must supply labels already aggregated per token — confirm for the tokenizer
    in use.

    Args:
        sequences: Sequence strings.
        labels: One list of integer class ids per sequence.
        tokenizer: Optional tokenizer callable. When omitted, the module-level
            ``tokenizer`` is looked up each time an item is accessed.
        max_length: Token length every item is padded/truncated to.

    Raises:
        ValueError: If ``sequences`` and ``labels`` differ in length.
    """

    # Default ``ignore_index`` of torch.nn.CrossEntropyLoss.
    IGNORE_INDEX = -100

    # Tokenizer attributes naming ids that never carry a label of their own.
    _STRUCTURAL_TOKEN_ATTRS = ("cls_token_id", "eos_token_id", "sep_token_id", "pad_token_id")

    def __init__(self, sequences, labels, tokenizer=None, max_length=512):
        if len(sequences) != len(labels):
            raise ValueError(
                f"Got {len(sequences)} sequences but {len(labels)} label lists."
            )
        self.sequences = sequences
        self.labels = labels
        self.max_length = max_length
        self._tokenizer = tokenizer

    def __len__(self):
        """Return the number of sequences."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the encoded sequence at ``idx`` with token-aligned ``labels``."""
        seq = self.sequences[idx]
        label = self.labels[idx]

        # Fall back to the notebook-level tokenizer when none was injected.
        tok = self._tokenizer if self._tokenizer is not None else tokenizer

        encoded = tok(
            seq,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # return_tensors="pt" adds a leading batch dimension of size 1; drop it.
        # Done before labels are attached so a length-1 label list is not
        # collapsed to a scalar.
        for key in list(encoded.keys()):
            encoded[key] = encoded[key].squeeze(0)

        input_ids = encoded["input_ids"]

        # Mark the positions that hold real sequence tokens.
        is_content = torch.ones_like(input_ids, dtype=torch.bool)
        for attr in self._STRUCTURAL_TOKEN_ATTRS:
            token_id = getattr(tok, attr, None)
            if token_id is not None:
                is_content &= input_ids != token_id
        content_positions = torch.nonzero(is_content, as_tuple=True)[0]

        # Class indices must be integer (long) for the classification loss.
        label_values = torch.as_tensor(label, dtype=torch.long).reshape(-1)
        n_labelled = min(label_values.numel(), content_positions.numel())

        # Same length as input_ids so logits and labels flatten to matching sizes.
        aligned = torch.full(input_ids.shape, self.IGNORE_INDEX, dtype=torch.long)
        aligned[content_positions[:n_labelled]] = label_values[:n_labelled]
        encoded["labels"] = aligned

        return encoded


tokenizer_config.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/93.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:

# Load and preprocess your data
# Each sequence is a string of nucleotides; each label list holds one 0/1 entry per
# position of its sequence (1 for translated).
train_sequences = ["AUGCUAAAG", "GCAUAAAGCU"]
train_labels = [
    [1, 1, 1, 0, 0, 0, 1, 1, 1],
    # This sequence has 10 nucleotides but previously carried only 9 labels; the
    # trailing 0 is a placeholder. NOTE(review): confirm the intended value.
    [0, 0, 0, 1, 1, 1, 0, 0, 0, 0],
]
assert all(
    len(seq) == len(seq_labels)
    for seq, seq_labels in zip(train_sequences, train_labels)
), "every sequence needs exactly one label per position"
train_dataset = RNADataset(train_sequences, train_labels)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # A fixed 500-step warmup is longer than the whole run on a small dataset (this
    # one trains for 2 steps), which keeps the learning rate near zero throughout.
    # A ratio scales the warmup with the actual number of optimisation steps.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir="./logs",
)

# Create Trainer and fine-tune the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_rna_region_model")



  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Function to predict translated regions
def predict_translated_regions(sequence, clf_model=None, clf_tokenizer=None):
    """Predict a class id for every token of ``sequence``.

    Inference runs in evaluation mode with gradient tracking disabled, and the
    tokenized inputs are moved to the device holding the model's parameters.
    The model's previous train/eval mode is restored before returning.

    Args:
        sequence: Sequence string to classify.
        clf_model: Optional model to use instead of the module-level ``model``.
        clf_tokenizer: Optional tokenizer to use instead of the module-level
            ``tokenizer``.

    Returns:
        A list of predicted class ids (argmax over the last logits dimension),
        one per token produced by the tokenizer — including any special tokens
        the tokenizer inserts.
    """
    net = model if clf_model is None else clf_model
    tok = tokenizer if clf_tokenizer is None else clf_tokenizer

    inputs = tok(sequence, return_tensors="pt")

    # After training the model may live on an accelerator while freshly tokenized
    # inputs are on the CPU; a device mismatch would make the forward pass fail.
    first_param = next(net.parameters(), None)
    if first_param is not None:
        inputs = {
            name: value.to(first_param.device) if torch.is_tensor(value) else value
            for name, value in inputs.items()
        }

    # Training leaves dropout active; eval() makes predictions deterministic and
    # no_grad() avoids building an autograd graph that is never used.
    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            outputs = net(**inputs)
    finally:
        net.train(was_training)

    predictions = torch.argmax(outputs.logits, dim=-1)
    return predictions[0].tolist()  # Convert to list for easier interpretation

# Demo: run the classifier on a short sequence and show the input next to its
# per-token predictions.
test_sequence = "AUGCUAAAGCUAGCUAGCUAGCUGA"
predicted_regions = predict_translated_regions(test_sequence)
print(
    f"Sequence: {test_sequence}",
    f"Predicted translated regions: {predicted_regions}",
    sep="\n",
)