# Load datasets

In [1]:
#pip install transformers
#pip install torch
#pip install torch.nn

In [None]:
from Bio import SeqIO
from Bio.Seq import Seq
import pandas as pd
import numpy as np
import transformers

In [3]:
training = 'Data/Train.fasta'
tr_records = SeqIO.parse(training, 'fasta')
tr_data = []
for record in tr_records:
    # Extract the sequence ID (before "_label") and the label (after "_label")
    parts = record.id.split('_')
    seq_id = '_'.join(parts[:-2])  # Join the parts of ID before "_label"
    label = parts[-1]  # The label is the last part after "_label"
    
    tr_data.append({"ID": seq_id, "Label": label, "sequence": str(record.seq)})
df_training = pd.DataFrame(tr_data)
df_training['Label'] = df_training['Label'].astype(int)
df_training.head()

Unnamed: 0,ID,Label,sequence
0,seq_0,1,MANILNLKHLLTLALILLALATKSSTSSSSSITRVKGIYWLENPFF...
1,seq_1,1,MVALLLFPMLLQLLSPTCAQTQKNITLGSTLAPQGPASSWLSPSGD...
2,seq_2,1,MDILQLAPTHLLAILLSSTSALFLITYLLRAGHRPSDLPNGPPTVP...
3,seq_3,1,MRLHQSPPRLLVCILSVLQVSAGLSSNCRCMPGDSCWPSLNDWARF...
4,seq_4,1,MAKIDVHHHFYPPAMRQALDRAGGDPSGWYIPPWTLELDQDITRQM...


In [4]:
testing = 'Data/Train.fasta'
t_records = SeqIO.parse(training, 'fasta')
t_data = []
for record in t_records:
    # Extract the sequence ID (before "_label") and the label (after "_label")
    parts = record.id.split('_')
    seq_id = '_'.join(parts[:-2])  # Join the parts of ID before "_label"
    label = parts[-1]  # The label is the last part after "_label"
    
    t_data.append({"ID": seq_id, "Label": label, "sequence": str(record.seq)})
df_testing = pd.DataFrame(t_data)
df_testing['Label'] = df_training['Label'].astype(int)
df_testing.head()

Unnamed: 0,ID,Label,sequence
0,seq_0,1,MANILNLKHLLTLALILLALATKSSTSSSSSITRVKGIYWLENPFF...
1,seq_1,1,MVALLLFPMLLQLLSPTCAQTQKNITLGSTLAPQGPASSWLSPSGD...
2,seq_2,1,MDILQLAPTHLLAILLSSTSALFLITYLLRAGHRPSDLPNGPPTVP...
3,seq_3,1,MRLHQSPPRLLVCILSVLQVSAGLSSNCRCMPGDSCWPSLNDWARF...
4,seq_4,1,MAKIDVHHHFYPPAMRQALDRAGGDPSGWYIPPWTLELDQDITRQM...


# Using LLM

## Data Preparation

Convert dataframe of sequences to list of sequences. This is to pre-process a dataset of protein sequence in a format that can processed by an LLM later on.

In [5]:
protein_sequences = list(df_training['sequence'])

In [None]:
# Load a pre-trained protein model tokenizer, for example, ProtBERT tokenizer.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Rostlab/prot_bert")
max_length = 150
# Tokenize the protein sequences
tokenized_sequences = tokenizer(protein_sequences, 
                                padding=True, 
                                truncation=True, 
                                max_length=max_length, 
                                return_tensors="pt")

## Using Pre-trained LLM (ProtBERT)

In [None]:
from transformers import AutoModel, AutoTokenizer
import torch

# Load the pre-trained model
model = AutoModel.from_pretrained("Rostlab/prot_bert")

# Get embeddings for the tokenized sequences
with torch.no_grad():
    embeddings = model(**tokenized_sequences).last_hidden_state


## Add a Binary Classification Head

In [None]:
import torch
import torch.nn as nn

# Define a simple binary classification head
class ProteinBinaryClassifier(nn.Module):
    def __init__(self, embedding_dim):
        super(ProteinBinaryClassifier, self).__init__()
        self.fc = nn.Linear(embedding_dim, 1)  # Binary classification
        self.sigmoid = nn.Sigmoid()

    def forward(self, embeddings):
        # Use the first token (CLS token) as a representation
        cls_token_embedding = embeddings[:, 0, :]  # Taking the [CLS] token representation
        output = self.fc(cls_token_embedding)
        return self.sigmoid(output)

# Initialize the classifier
embedding_dim = embeddings.size(-1)
classifier = ProteinBinaryClassifier(embedding_dim)
