# Allow Google Colab to access Google Drive

# Import Libraries

## Install necessary packages

## Load libraries

In [None]:
from Bio import SeqIO
from transformers import AutoTokenizer
import torch
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import numpy as np
import torch
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc, matthews_corrcoef, roc_auc_score
import matplotlib.pyplot as plt

# Load datasets

In [3]:
def parse_fasta(fasta_file):
    """Read a FASTA file and return its sequences together with their labels.

    Each record's label is the last ``_``-separated field of the record ID,
    converted with ``int`` (expected to be 0 or 1).

    Args:
        fasta_file: FASTA source passed straight to ``SeqIO.parse``.

    Returns:
        A ``(sequences, labels)`` tuple of two parallel lists: the sequences
        as plain strings and the integer labels, in the order the records
        are yielded by ``SeqIO.parse``.
    """
    # One pass over the records, keeping each sequence next to its label.
    parsed = [
        (str(rec.seq), int(rec.id.rsplit('_', 1)[-1]))
        for rec in SeqIO.parse(fasta_file, "fasta")
    ]
    sequences = [seq for seq, _ in parsed]
    labels = [label for _, label in parsed]
    return sequences, labels

# Training split: sequences and their labels.
train_sequences, train_labels = parse_fasta(
    fasta_file="Data/Train.fasta",
)
# Test split: sequences and their labels.
test_sequences, test_labels = parse_fasta(
    fasta_file="Data/Test.fasta",
)

# Tokenize the sequences

In [4]:
# Tokenizer belonging to the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_sequences(sequences):
    """Tokenize *sequences* in a single call and return PyTorch tensors.

    Padding and truncation are both switched on in the tokenizer call.

    Args:
        sequences: The sequence strings to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "return_tensors": "pt",
    }
    return tokenizer(sequences, **tokenizer_options)


# Encode each split separately.
train_encodings = tokenize_sequences(sequences=train_sequences)
test_encodings = tokenize_sequences(sequences=test_sequences)

# Prepare a Dataset for Fine-Tuning

In [None]:
class SequenceDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with labels.

    Args:
        encodings: Mapping from feature name to an indexable collection
            holding one row per example (tensors or nested lists).
        labels: Indexable collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor_copy(value):
        """Return *value* as a tensor that is independent of its source."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) emits a UserWarning on every call;
            # clone().detach() is the recommended way to copy a tensor.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return the features and label of example *idx* as a dict of tensors.

        The dict has one entry per key in ``encodings`` plus a ``labels``
        entry holding the example's label.
        """
        item = {
            key: self._as_tensor_copy(val[idx])
            for key, val in self.encodings.items()
        }
        item['labels'] = self._as_tensor_copy(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

# Wrap each split's encodings and labels in a SequenceDataset.
train_dataset = SequenceDataset(encodings=train_encodings, labels=train_labels)
test_dataset = SequenceDataset(encodings=test_encodings, labels=test_labels)

# Load pre-trained model for Binary Sequence Classification

In [None]:
# Pre-trained "bert-base-uncased" checkpoint configured with two output
# labels for binary sequence classification.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

# Settings for the fine-tuning run.
training_args = TrainingArguments(
    # Where run outputs and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Five epochs, with an evaluation pass after each one.
    num_train_epochs=5,
    eval_strategy="epoch",
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=100,
    per_device_eval_batch_size=100,
    # Log every 10 steps; external reporting (e.g. W&B) is disabled.
    logging_steps=10,
    report_to="none",
)

# Bundle the model, its training configuration and both datasets into a
# Trainer; the test split is the one passed as the evaluation dataset.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning.
trainer.train()

# Make predictions

In [None]:
# Run the trained model over the test dataset.
predictions = trainer.predict(test_dataset)

# Wrap the NumPy prediction array in a tensor, then take the index of the
# largest value along the last dimension as the predicted class.
pred_tensor = torch.from_numpy(predictions.predictions)
preds = pred_tensor.argmax(dim=-1)

# Evaluation Metrics

In [None]:

# Ground-truth test labels as a tensor, for comparison with the predictions.
y_true = torch.as_tensor(test_labels)

In [None]:
# Confusion matrix of true vs. predicted labels. The label set is pinned to
# [0, 1] so the matrix is always 2x2 and lines up with the two tick labels
# below, even when one class is absent from both y_true and preds.
conf_matrix = confusion_matrix(y_true, preds, labels=[0, 1])

# Plot the confusion matrix as an annotated heatmap using seaborn.
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False,
            xticklabels=["Class 0", "Class 1"], yticklabels=["Class 0", "Class 1"])
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()