In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import re
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from Bio import SeqIO
import seaborn as sns
import torch.optim as optim

# Check for GPU
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")
print("GPU is available" if is_cuda else "GPU not available, CPU used")

# Data
records = list(SeqIO.parse("./ird_influenzaA_HA_allspecies.fa", "fasta"))

def extract_information(description, word):
    match = re.search(rf'{word}:([^()|]+)', description)
    if match:
        return match.group(1).strip()
    return None

sequences = []
total_seq = ""

for record in tqdm(records):
    seq = str(record.seq)
    label = extract_information(record.description, "Host")
    sequences.append((label, seq))
    total_seq += seq

vocab_list = list(set(total_seq))
vocab_dict = {letter: idx for idx, letter in enumerate(vocab_list)}

datasets = defaultdict(list)
for label, seq in sequences:
    datasets[label].append(seq)

data = pd.DataFrame(sequences, columns=["Host", "Sequence"])
data['Label'] = data['Host'].apply(lambda x: 1 if x == "Human" else 0)

def encode_sequence(sequence, vocab=vocab_dict):
    return np.array([vocab[aa] for aa in sequence])

data['Encoded_seq'] = data['Sequence'].apply(encode_sequence)

# Pad sequences
max_seq_length = max(data['Encoded_seq'].apply(len))

def pad_sequence(seq, max_length=max_seq_length):
    return np.pad(seq, (0, max_length - len(seq)), 'constant')

data['Padded_seq'] = data['Encoded_seq'].apply(pad_sequence)

# Prepare data for training
X = np.stack(data['Padded_seq'].values)
y = data['Label'].values
x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y)

# Convert to PyTorch tensors
x_train = torch.tensor(x_train, dtype=torch.long)
y_train = torch.tensor(y_train, dtype=torch.float32)
x_test = torch.tensor(x_test, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.float32)

# Create Tensor datasets
train_data = TensorDataset(x_train, y_train)
valid_data = TensorDataset(x_test, y_test)

# Dataloaders
batch_size = 50
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)

# Print the shapes of the data
print(f'Shape of train data: {x_train.shape}')
print(f'Shape of test data: {x_test.shape}')


GPU not available, CPU used


100%|██████████| 94560/94560 [00:00<00:00, 245687.23it/s]


KeyboardInterrupt: 

In [None]:

class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv_0 = nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(filter_sizes[0], embedding_dim))
        self.conv_1 = nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(filter_sizes[1], embedding_dim))
        self.conv_2 = nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(filter_sizes[2], embedding_dim))
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        embedded = self.embedding(x).unsqueeze(1)
        conved_0 = F.relu(self.conv_0(embedded).squeeze(3))
        conved_1 = F.relu(self.conv_1(embedded).squeeze(3))
        conved_2 = F.relu(self.conv_2(embedded).squeeze(3))
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim=1))
        return self.fc(cat)


In [3]:
INPUT_DIM = len(vocab_list)
EMBEDDING_DIM = 100
N_FILTERS = 1000
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = 1
DROPOUT = 0.2

# Instantiate the model
model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)
model = model.to(device)

# Loss and optimization functions
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)


In [5]:
import torch
from tqdm import tqdm

# Function to calculate binary accuracy
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for inputs, labels in iterator:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        predictions = model(inputs).squeeze(1)
        loss = criterion(predictions, labels)
        acc = binary_accuracy(predictions, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for inputs, labels in iterator:
            inputs, labels = inputs.to(device), labels.to(device)
            predictions = model(inputs).squeeze(1)
            loss = criterion(predictions, labels)
            acc = binary_accuracy(predictions, labels)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

N_EPOCHS = 5

# Initialize lists to store the results
train_losses = []
train_accuracies = []
valid_losses = []
valid_accuracies = []

for epoch in tqdm(range(N_EPOCHS)):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_loader, criterion)

    # Append the results to the lists
    train_losses.append(train_loss)
    train_accuracies.append(train_acc)
    valid_losses.append(valid_loss)
    valid_accuracies.append(valid_acc)

    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')



 20%|██        | 1/5 [05:08<20:34, 308.66s/it]

| Epoch: 01 | Train Loss: 0.264 | Train Acc: 91.39% | Val. Loss: 0.177 | Val. Acc: 93.80% |


 40%|████      | 2/5 [10:17<15:26, 308.89s/it]

| Epoch: 02 | Train Loss: 0.177 | Train Acc: 94.60% | Val. Loss: 0.145 | Val. Acc: 96.13% |


 60%|██████    | 3/5 [15:26<10:17, 308.88s/it]

| Epoch: 03 | Train Loss: 0.152 | Train Acc: 95.75% | Val. Loss: 0.138 | Val. Acc: 96.15% |


 80%|████████  | 4/5 [20:35<05:08, 308.84s/it]

| Epoch: 04 | Train Loss: 0.138 | Train Acc: 96.19% | Val. Loss: 0.118 | Val. Acc: 96.74% |


100%|██████████| 5/5 [25:44<00:00, 308.81s/it]

| Epoch: 05 | Train Loss: 0.128 | Train Acc: 96.55% | Val. Loss: 0.127 | Val. Acc: 96.83% |





In [1]:
epochs = range(1, N_EPOCHS + 1)

plt.figure(figsize=(14, 7))

# Plot training and validation loss
plt.subplot(1, 2, 1)
plt.plot(epochs, train_losses, 'b', label='Training loss')
plt.plot(epochs, valid_losses, 'r', label='Validation loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

# Plot training and validation accuracy
plt.subplot(1, 2, 2)
plt.plot(epochs, train_accuracies, 'b', label='Training accuracy')
plt.plot(epochs, valid_accuracies, 'r', label='Validation accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()


NameError: name 'N_EPOCHS' is not defined