In [221]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import torch.nn as nn
import torch.optim as optim
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence, pad_packed_sequence


In [222]:
import torch
import pandas as pd

class Preprocessing:
    def __init__(self):
        self.amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
        self.mapping = {'C': 0, 'H': 1, 'E': 2}

    def one_hot_encode(self, sequence):
        # Create a tensor for the sequence where each amino acid is replaced by its index
        indices = torch.tensor([self.amino_acids.index(aa) for aa in sequence], dtype=torch.long)
        # One-hot encode
        return torch.nn.functional.one_hot(indices, num_classes=len(self.amino_acids)).type(torch.float)

    def process_labels(self, label):
        # Convert labels to indices
        return torch.tensor([self.mapping[char] for char in label], dtype=torch.long)

    def preprocess_data(self, filepath, num_sequences=None):
        data = pd.read_csv(filepath)
        filtered_data = data[data['has_nonstd_aa'] == False]

        if num_sequences:
            filtered_data = filtered_data[:num_sequences]

        # Initialize lists for sequences and labels
        sequence_tensors = [self.one_hot_encode(seq) for seq in filtered_data['seq']]
        label_tensors = [self.process_labels(lbl) for lbl in filtered_data['sst3']]

        # Padding sequences and labels
        seq_lengths = torch.tensor([len(seq) for seq in sequence_tensors])
        seq_tensor = torch.nn.utils.rnn.pad_sequence(sequence_tensors, batch_first=True, padding_value=0)
        label_tensor = torch.nn.utils.rnn.pad_sequence(label_tensors, batch_first=True, padding_value=-1)  # Use -1 for label padding

        # Sort by sequence length in descending order for pack_padded_sequence
        lengths, perm_idx = seq_lengths.sort(0, descending=True)
        seq_tensor = seq_tensor[perm_idx]
        label_tensor = label_tensor[perm_idx]
        return seq_tensor, label_tensor, lengths



In [223]:

class Model(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(Model,self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim

        #the problem problem lies in the lstm or in the pad_packed_sequence, after that there is a dimension mismatch
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x,lengths):
        print(f'before padcking {x.size()}')
        packed_x = pack_padded_sequence(x,lengths, batch_first=True, enforce_sorted=True)
        packed_output, (hn,cn) = self.lstm(packed_x)
        assert packed_x.data.size(0) == packed_output.data.size(0)
        #this is still true, no assertion error
        #so problem must lie in pad_packed_sequence
        output, _ = pad_packed_sequence(packed_output, batch_first=True)
        print(f"after padding:{output.size()}")
        out = self.fc(output)
        return out

        

class AminoAcidDataset(Dataset):
    def __init__(self, sequences, labels, lengths):
        self.sequences = sequences 
        self.lengths = lengths # A list of encoded amino acid sequences (as tensors)
        self.labels = labels        # Corresponding labels or targets

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx], self.lengths[idx]


In [224]:
#set num_sequences to False if you want all of them
preprocessor = Preprocessing()
sequence_tensors, label_tensors, lengths_tensor = preprocessor.preprocess_data("2018-06-06-ss.cleaned.csv", num_sequences= 2000)

In [225]:
#split into train,validation,test
total = len(sequence_tensors)
train_size = int(0.8 * total)
val_size = int(0.2 * total)


indices = torch.randperm(len(sequence_tensors)).tolist()
train_indices = indices[:train_size]
val_indices = indices[train_size:]

train_sequences = sequence_tensors[:train_size]
train_labels = label_tensors[:train_size]
train_lengths = lengths_tensor[:train_size]

test_sequences = sequence_tensors[train_size:]
test_labels = label_tensors[train_size:]
test_lengths = lengths_tensor[train_size:]

In [226]:


#lengths = torch.tensor([len(seq) for seq in sequence_tensors])
dataset = AminoAcidDataset(sequence_tensors, label_tensors, lengths_tensor)
loader = DataLoader(dataset, batch_size = 1000, shuffle = False)
train_dataset = AminoAcidDataset(train_sequences,train_labels, train_lengths)
val_dataset = AminoAcidDataset(test_sequences, test_labels, test_lengths)
trainloader = DataLoader(train_dataset, batch_size = 100, shuffle = True)
valloader = DataLoader(val_dataset, batch_size = 100, shuffle = True)
#pack_padded_sequence(padded_sequences,...)



In [227]:
#Define the model, loss function and optimizer

model = Model(input_dim = 20, hidden_dim =64 , layer_dim = 1, output_dim = 3)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = 0.001)




In [228]:
num_epochs = 100
# Initialize lists to monitor loss
train_losses = []

# Assuming you have a validation dataset and loader
# val_loader = ...

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for sequences, labels, lengthss in loader:
        count =0
        optimizer.zero_grad()
        sequences = sequences.float()
        
        outputs = model(sequences, lengthss)
        print(f"Expected output size: [batch_size, max_seq_length, num_classes] -> {labels.size(0)}, {torch.max(lengthss)}, {outputs.size(2)}")
        print(f"Actual output size: {outputs.size()}")
        print(f"Labels size: {labels.size()}")
        print(outputs.size())
        print(labels.size())
        outputs_flat = outputs.view(-1, 3)
        labels_flat = labels.view(-1)

        #since we have used -1 for padding for the labels and cross_entropy can
        #not handle -1, we ignore those in our loss calculation
        mask = labels_flat != -1
        outputs_flat = outputs_flat[mask]
        labels_flat = labels_flat[mask]
        loss = criterion(outputs_flat, labels_flat)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item() * sequences.size(0)
        count+=1
        print(count)
        
    # Average loss for this epoch
    epoch_loss /= len(trainloader.dataset)
    train_losses.append(epoch_loss)

    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {epoch_loss:.4f}')

    model.eval()
    with torch.no_grad():
        val_loss = 0
        for sequences, lengths, labels in valloader:
            # Similar steps to compute validation loss
            # Update val_loss
            print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss:.4f}')

# Plotting the training loss
plt.figure()
plt.plot(train_losses, label='Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss Over Epochs')
plt.legend()
plt.show()


before padcking torch.Size([1000, 9, 20])
after padding:torch.Size([1000, 9, 64])
Expected output size: [batch_size, max_seq_length, num_classes] -> 1000, 9, 3
Actual output size: torch.Size([1000, 9, 3])
Labels size: torch.Size([1000, 9])
torch.Size([1000, 9, 3])
torch.Size([1000, 9])
1
before padcking torch.Size([1000, 9, 20])
after padding:torch.Size([1000, 7, 64])
Expected output size: [batch_size, max_seq_length, num_classes] -> 1000, 7, 3
Actual output size: torch.Size([1000, 7, 3])
Labels size: torch.Size([1000, 9])
torch.Size([1000, 7, 3])
torch.Size([1000, 9])


IndexError: The shape of the mask [9000] at index 0 does not match the shape of the indexed tensor [7000, 3] at index 0