In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader,ConcatDataset
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import numpy as np
from torch.optim.lr_scheduler import StepLR
import os
#from ax.service.ax_client import AxClient
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

class ProteinDataset(Dataset):
    """Dataset yielding one protein per item: encoded sequence, labels and PSSM.

    The sequence CSV is read positionally: column 0 is the protein ID and
    column 1 is the amino-acid string. The label CSV must have a ``PDB_ID``
    column; its second column holds the per-residue label string. Each
    protein's PSSM is read from ``<pssm_files_path>/<id><pssm_suffix>`` and
    the first two columns of that file are dropped.

    Args:
        sequence_file: Path of the sequence CSV.
        pssm_files_path: Directory containing one PSSM CSV per protein.
        amino_acid_to_ix: Mapping from amino-acid letter to integer index.
        struct_to_ix: Mapping from label letter to class index. Ignored
            (stored as ``None``) when no ``label_file`` is given.
        label_file: Optional path of the label CSV. Without it every item
            gets an empty label tensor.
        indices: Optional positional row indices into the sequence CSV that
            select the subset of proteins served by this dataset.
        pssm_suffix: File-name suffix appended to the protein ID to locate
            its PSSM file. Defaults to the training layout.
    """

    def __init__(self, sequence_file, pssm_files_path, amino_acid_to_ix, struct_to_ix,
                 label_file=None, indices=None, pssm_suffix='_train.csv'):
        self.sequence_data = pd.read_csv(sequence_file)
        if indices is not None:
            self.sequence_data = self.sequence_data.iloc[indices].reset_index(drop=True)

        self.pssm_files_path = pssm_files_path
        self.pssm_suffix = pssm_suffix

        if label_file:
            label_data = pd.read_csv(label_file)
            if indices is not None:
                # Select labels by protein ID rather than by row position, so
                # the label file does not have to be row-aligned with the
                # sequence file.
                selected_ids = self.sequence_data.iloc[:, 0]
                keep = label_data['PDB_ID'].isin(selected_ids)
                label_data = label_data[keep].reset_index(drop=True)
            self.label_data = label_data
        else:
            # Placeholder table so unlabeled data follows the same code path.
            self.label_data = pd.DataFrame({'PDB_ID': self.sequence_data['PDB_ID'], 'LABEL': [''] * len(self.sequence_data)})

        # Build the ID -> label-string lookup once instead of scanning the
        # whole label table on every __getitem__ call. setdefault keeps the
        # first row when an ID appears more than once.
        self._labels_by_id = {}
        for pdb_id, label_sequence in zip(self.label_data['PDB_ID'], self.label_data.iloc[:, 1]):
            self._labels_by_id.setdefault(pdb_id, label_sequence)

        self.amino_acid_to_ix = amino_acid_to_ix
        self.struct_to_ix = struct_to_ix if label_file else None

    def __len__(self):
        """Return the number of proteins served by this dataset."""
        return len(self.sequence_data)

    def __getitem__(self, idx):
        """Return the tensors for the protein at positional index ``idx``.

        Returns:
            A dict with ``'sequence'`` (long tensor of amino-acid indices),
            ``'labels'`` (long tensor of class indices, empty when the
            dataset is unlabeled) and ``'pssm'`` (float tensor with one row
            per line of the PSSM file).

        Raises:
            KeyError: If the dataset is labeled but the label file has no
                row for this protein, or a residue/label letter is missing
                from the corresponding mapping.
        """
        pdb_id = self.sequence_data.iloc[idx, 0]
        sequence = self.sequence_data.iloc[idx, 1]
        sequence_encoded = [self.amino_acid_to_ix[aa] for aa in sequence]

        if self.struct_to_ix:
            try:
                label_sequence = self._labels_by_id[pdb_id]
            except KeyError:
                raise KeyError(f"No label row found for PDB_ID {pdb_id!r}") from None
            label_encoded = [self.struct_to_ix[label] for label in label_sequence]
        else:
            label_encoded = []

        pssm_file = os.path.join(self.pssm_files_path, f"{pdb_id}{self.pssm_suffix}")
        # The first two columns of the PSSM file are not used as features.
        pssm_data = pd.read_csv(pssm_file).iloc[:, 2:].to_numpy()

        return {
            'sequence': torch.tensor(sequence_encoded, dtype=torch.long),
            # Always long, so an empty label tensor has the same dtype as a filled one.
            'labels': torch.tensor(label_encoded, dtype=torch.long),
            'pssm': torch.tensor(pssm_data, dtype=torch.float),
        }

def protein_collate_fn(batch):
    """Merge a list of per-protein samples into zero-padded batch tensors.

    Each sample is a dict with 'sequence', 'labels' and 'pssm' tensors whose
    first dimension is the protein length. Every field is padded along that
    dimension to the longest sample in the batch and stacked batch-first.

    NOTE: the padding value is 0 for all three fields, so padded label
    positions are indistinguishable from class index 0; consumers that need
    to ignore padding have to mask it themselves.
    """
    def _pad_field(key):
        # pad_sequence pads with 0 unless told otherwise.
        return pad_sequence([sample[key] for sample in batch], batch_first=True)

    return {
        'sequence': _pad_field('sequence'),
        'labels': _pad_field('labels'),
        # All PSSM profiles are assumed to share the same number of columns.
        'pssm': _pad_field('pssm'),
    }

# ConvNet model definition
class ConvNet(nn.Module):
    """Fully convolutional per-position classifier.

    Three Conv1d -> ReLU -> BatchNorm1d stages (64, 128 and 256 channels,
    kernel size 5 with padding 2, so the length is preserved) followed by a
    1x1 convolution that produces ``num_classes`` scores per position.

    Args:
        num_amino_acids: Number of input feature channels per position.
        num_classes: Number of output classes per position.
    """

    def __init__(self, num_amino_acids=20, num_classes=3):
        super().__init__()
        channel_widths = (num_amino_acids, 64, 128, 256)
        stages = [
            nn.Sequential(
                nn.Conv1d(in_width, out_width, kernel_size=5, padding=2),
                nn.ReLU(),
                nn.BatchNorm1d(out_width),
            )
            for in_width, out_width in zip(channel_widths, channel_widths[1:])
        ]
        # Attribute names are part of the saved state_dict keys; keep them.
        self.layer1, self.layer2, self.layer3 = stages
        self.output_layer = nn.Conv1d(channel_widths[-1], num_classes, kernel_size=1)

    def forward(self, x):
        """Map (batch, length, channels) features to (batch, length, num_classes) scores."""
        # Conv1d expects channels before length.
        features = x.permute(0, 2, 1)
        for stage in (self.layer1, self.layer2, self.layer3):
            features = stage(features)
        scores = self.output_layer(features)
        # Back to (batch, length, classes) for the caller.
        return scores.permute(0, 2, 1)



    
# Define the amino acid and structure mappings.
# Amino-acid indices start at 1, which leaves 0 free for sequence padding.
amino_acid_to_ix = {'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15, 'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20}
struct_to_ix = {'H': 0, 'E': 1, 'C': 2}

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

sequence_file = '/kaggle/input/deep-learning-for-msc-202324/seqs_train.csv'
label_file = '/kaggle/input/deep-learning-for-msc-202324/labels_train.csv'
pssm_files_path = '/kaggle/input/deep-learning-for-msc-202324/train/'

# Read the sequence table once, only to learn how many proteins there are.
sequence_data = pd.read_csv(sequence_file)

# Positional row indices of every protein in the sequence table.
indices = range(len(sequence_data))

# Hold out 20% of the proteins for validation; the seed is fixed so the split is reproducible.
train_indices, val_indices = train_test_split(indices, test_size=0.2, random_state=42)

# Build the training and validation datasets from the two index subsets.
train_dataset = ProteinDataset(sequence_file, pssm_files_path, amino_acid_to_ix, struct_to_ix, label_file, indices=train_indices)
val_dataset = ProteinDataset(sequence_file, pssm_files_path, amino_acid_to_ix, struct_to_ix, label_file, indices=val_indices)

# Pinned host memory only helps host-to-GPU copies; requesting it on a
# CPU-only machine just triggers a warning.
use_pinned_memory = torch.cuda.is_available()

# Training loader: shuffled every epoch.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=protein_collate_fn, pin_memory=use_pinned_memory)
# Validation loader: fixed order.
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=protein_collate_fn, pin_memory=use_pinned_memory)

model = ConvNet().to(device)  # my model
criterion = nn.CrossEntropyLoss()  # Cross entropy loss function for classification problems
optimizer = optim.Adam(model.parameters())  # Adam optimizer with its default learning rate
# Alternative tried earlier: optim.Adam(model.parameters(), lr=0.0001)

# Per-epoch training history, read by the plotting cell.
epoch_losses = []
epoch_accuracies = []


def _run_epoch(net, loader, loss_fn, run_device, opt=None):
    """Run one pass over ``loader`` and return (mean loss, accuracy) per real residue.

    The network is trained when ``opt`` is given and only evaluated otherwise.
    Padded positions are excluded from both the loss and the accuracy.
    """
    training = opt is not None
    net.train(training)

    loss_sum = 0.0
    correct = 0
    residues = 0

    with torch.set_grad_enabled(training):
        for batch in loader:
            labels = batch['labels'].to(run_device)
            pssms = batch['pssm'].to(run_device)
            # Amino-acid indices start at 1 and sequences are padded with 0,
            # so a non-zero entry marks a real residue. Labels cannot be
            # used for this because padding value 0 is also a valid class.
            valid = batch['sequence'].to(run_device) != 0
            valid_count = int(valid.sum().item())
            if valid_count == 0:
                continue

            outputs = net(pssms)
            # Boolean indexing flattens to (num_real_residues, num_classes).
            loss = loss_fn(outputs[valid], labels[valid])

            if training:
                opt.zero_grad()
                loss.backward()
                opt.step()

            # loss_fn is assumed to average over its inputs (the default for
            # nn.CrossEntropyLoss); re-weight by the residue count so the
            # epoch figure is a true per-residue mean.
            loss_sum += loss.item() * valid_count
            _, predicted = torch.max(outputs, 2)
            correct += (predicted[valid] == labels[valid]).sum().item()
            residues += valid_count

    if residues == 0:
        return 0.0, 0.0
    return loss_sum / residues, correct / residues


def train_model(train_loader, val_loader, num_epochs=10, *, net=None, loss_fn=None,
                opt=None, loss_log=None, accuracy_log=None):
    """Train a network and report training/validation metrics after every epoch.

    Args:
        train_loader: Iterable of batches with 'sequence', 'labels' and
            'pssm' tensors, used for optimization.
        val_loader: Iterable of batches in the same format, used for
            evaluation only.
        num_epochs: Number of passes over ``train_loader``.
        net: Network to train; defaults to the module-level ``model``.
        loss_fn: Loss taking (scores, targets); defaults to the module-level
            ``criterion``.
        opt: Optimizer; defaults to the module-level ``optimizer``.
        loss_log: List receiving the training loss of each epoch; defaults to
            the module-level ``epoch_losses``.
        accuracy_log: List receiving the training accuracy of each epoch;
            defaults to the module-level ``epoch_accuracies``.

    Returns:
        The trained network (the same object that was trained in place).
    """
    net = model if net is None else net
    loss_fn = criterion if loss_fn is None else loss_fn
    opt = optimizer if opt is None else opt
    loss_log = epoch_losses if loss_log is None else loss_log
    accuracy_log = epoch_accuracies if accuracy_log is None else accuracy_log

    # Move batches to wherever the network's parameters live.
    first_param = next(net.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    for epoch in range(num_epochs):
        train_loss, train_accuracy = _run_epoch(net, train_loader, loss_fn, run_device, opt)
        loss_log.append(train_loss)
        accuracy_log.append(train_accuracy)
        print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')

        val_loss, val_accuracy = _run_epoch(net, val_loader, loss_fn, run_device)
        print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')

    return net




if __name__ == '__main__':
    # Train with the loaders built above, then persist the weights of the
    # model that train_model hands back.
    torch.save(train_model(train_loader, val_loader).state_dict(), 'prediction_model_1.pth')


Epoch [1/10], Train Loss: 0.0929, Train Accuracy: 0.8608
Epoch [1/10], Validation Loss: 0.0851, Validation Accuracy: 0.8703
Epoch [2/10], Train Loss: 0.0803, Train Accuracy: 0.8770
Epoch [2/10], Validation Loss: 0.0786, Validation Accuracy: 0.8788
Epoch [3/10], Train Loss: 0.0762, Train Accuracy: 0.8821
Epoch [3/10], Validation Loss: 0.0782, Validation Accuracy: 0.8791
Epoch [4/10], Train Loss: 0.0741, Train Accuracy: 0.8850
Epoch [4/10], Validation Loss: 0.0755, Validation Accuracy: 0.8837
Epoch [5/10], Train Loss: 0.0730, Train Accuracy: 0.8868
Epoch [5/10], Validation Loss: 0.0755, Validation Accuracy: 0.8832
Epoch [6/10], Train Loss: 0.0704, Train Accuracy: 0.8906
Epoch [6/10], Validation Loss: 0.0765, Validation Accuracy: 0.8812
Epoch [7/10], Train Loss: 0.0702, Train Accuracy: 0.8911
Epoch [7/10], Validation Loss: 0.0766, Validation Accuracy: 0.8828
Epoch [8/10], Train Loss: 0.0682, Train Accuracy: 0.8936
Epoch [8/10], Validation Loss: 0.0749, Validation Accuracy: 0.8852
Epoch [9

In [None]:
import matplotlib.pyplot as plt

# Draw the training loss curve and then the training accuracy curve, one figure each.
for series, curve_label, figure_title, y_label in (
    (epoch_losses, 'Training Loss', 'Training Loss Per Epoch', 'Loss'),
    (epoch_accuracies, 'Training Accuracy', 'Training Accuracy Per Epoch', 'Accuracy'),
):
    plt.figure(figsize=(10, 5))
    plt.plot(series, label=curve_label)
    plt.title(figure_title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()
    plt.show()


In [None]:
# Inference-time variant of ProteinDataset: reads "<id>_test.csv" PSSM files,
# works without a label file, and also returns each protein's ID and length.
class ProteinDataset(Dataset):
    """Dataset of test proteins with encoded sequence, PSSM, ID and true length."""

    def __init__(self, sequence_file, pssm_files_path, amino_acid_to_ix, struct_to_ix, label_file=None,):
        self.amino_acid_to_ix = amino_acid_to_ix
        # Label encoding is only meaningful when a label file was supplied.
        self.struct_to_ix = struct_to_ix if label_file else None
        self.pssm_files_path = pssm_files_path

        self.sequence_data = pd.read_csv(sequence_file)
        # Unpadded residue count of every protein, in table order.
        self.actual_lengths = list(map(len, self.sequence_data['SEQUENCE']))

        if not label_file:
            # Placeholder table with empty labels so lookups work without a label file.
            self.label_data = pd.DataFrame({'PDB_ID': self.sequence_data['PDB_ID'], 'LABEL': [''] * len(self.sequence_data)})
        else:
            self.label_data = pd.read_csv(label_file)

    def __len__(self):
        """Return the number of proteins in the sequence table."""
        return len(self.sequence_data)

    def __getitem__(self, idx):
        """Return tensors plus 'pdb_id' and 'length' for the protein at row ``idx``."""
        pdb_id = self.sequence_data.iloc[idx, 0]
        residues = self.sequence_data.iloc[idx, 1]
        sequence_encoded = [self.amino_acid_to_ix[residue] for residue in residues]

        # First label row whose ID matches; its second column is the label string.
        id_matches = self.label_data['PDB_ID'] == pdb_id
        label_sequence = self.label_data[id_matches].iloc[0, 1]
        if self.struct_to_ix:
            label_encoded = [self.struct_to_ix[symbol] for symbol in label_sequence]
        else:
            label_encoded = []

        if label_encoded:
            labels_tensor = torch.tensor(label_encoded, dtype=torch.long)
        else:
            labels_tensor = torch.tensor([])

        # The first two columns of the PSSM file are not used as features.
        pssm_frame = pd.read_csv(f"{self.pssm_files_path}/{pdb_id}_test.csv")
        pssm_data = pssm_frame.iloc[:, 2:].to_numpy()

        sample = {}
        sample['sequence'] = torch.tensor(sequence_encoded, dtype=torch.long)
        sample['labels'] = labels_tensor
        sample['pssm'] = torch.tensor(pssm_data, dtype=torch.float)
        sample['pdb_id'] = pdb_id
        sample['length'] = self.actual_lengths[idx]
        return sample

def protein_collate_fn(batch):
    """Collate inference samples into a padded batch that keeps IDs and true lengths.

    Sequences and PSSMs are padded with 0, labels with -1. 'pssm' is None
    when every sample's PSSM tensor is empty. 'pdb_id' stays a plain list
    and 'length' becomes a 1-D tensor of the per-sample lengths.
    """
    def _field(key):
        return [item[key] for item in batch]

    pssm_list = _field('pssm')
    has_pssm = any(pssm.nelement() != 0 for pssm in pssm_list)

    return {
        'sequence': pad_sequence(_field('sequence'), batch_first=True),
        # -1 can never be a class index, so padded labels stay recognisable.
        'labels': pad_sequence(_field('labels'), batch_first=True, padding_value=-1),
        'pssm': pad_sequence(pssm_list, batch_first=True) if has_pssm else None,
        'pdb_id': _field('pdb_id'),
        'length': torch.tensor(_field('length')),
    }

# Placeholder list for validation losses; nothing in this cell fills it.
val_losses = []

# Define the amino acid and structure mappings
amino_acid_to_ix = {'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15, 'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20}
struct_to_ix = {'H': 0, 'E': 1, 'C': 2}
# Define the inverse mapping from index to structure label
sec_struct_mapping_inv = {0: 'H', 1: 'E', 2: 'C'}

test_sequence_file = '/kaggle/input/deep-learning-for-msc-202324/seqs_test.csv'
test_pssm_files_path = '/kaggle/input/deep-learning-for-msc-202324/test'

# Load the trained model. map_location lets weights saved on a GPU be
# restored on whatever device this session actually has.
model = ConvNet().to(device)
model.load_state_dict(torch.load('prediction_model_1.pth', map_location=device))
model.eval()

# Load the test data
test_dataset = ProteinDataset(sequence_file=test_sequence_file,
                              pssm_files_path=test_pssm_files_path,
                              amino_acid_to_ix=amino_acid_to_ix,
                              struct_to_ix=struct_to_ix,  # ignored by the dataset because no label file is given
                              label_file=None)  # No label file for testing
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=protein_collate_fn, pin_memory=True)

# Rows of the submission file, starting with the header.
submission = [['ID', 'STRUCTURE']]
with torch.no_grad():
    for batch_idx, batch in enumerate(test_loader):
        pdb_ids = batch['pdb_id']
        actual_lengths = batch['length'].tolist()
        # Show batch information
        print(f"Processing batch {batch_idx + 1}/{len(test_loader)}")

        # The network consumes PSSM features only, so a batch without them
        # cannot be scored.
        if batch['pssm'] is None:
            raise ValueError(f"Batch {batch_idx + 1} has no PSSM features for {pdb_ids}")
        pssms = batch['pssm'].to(device)

        outputs = model(pssms)
        _, predicted_labels = torch.max(outputs, dim=2)
        # One device-to-host transfer per batch instead of one per residue.
        predicted_labels = predicted_labels.cpu()

        # Emit exactly one row per real residue; padded positions are dropped.
        for i, pdb_id in enumerate(pdb_ids):
            actual_length = actual_lengths[i]
            print(f"Processing {pdb_id} with {actual_length} residues")

            residue_predictions = predicted_labels[i, :actual_length].tolist()
            if len(residue_predictions) != actual_length:
                # Slicing would otherwise silently yield too few rows.
                raise ValueError(
                    f"{pdb_id}: sequence has {actual_length} residues but only "
                    f"{len(residue_predictions)} predictions are available"
                )

            for residue_index, residue_prediction in enumerate(residue_predictions, start=1):
                submission.append([f"{pdb_id}_{residue_index}", sec_struct_mapping_inv[residue_prediction]])

# Save the prediction to a file
with open('predictions.csv', 'w') as f:
    f.writelines(','.join(line) + '\n' for line in submission)

print(f"Total predictions made: {len(submission) - 1}")
