In [None]:
# !pip install ax-platform

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from ax.plot.slice import plot_slice
from ax.service.ax_client import AxClient
from ax.service.utils.instantiation import ObjectiveProperties
from ax.utils.notebook.plotting import render
from captum.attr import LayerIntegratedGradients
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [None]:
# !conda install pytorch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 pytorch-cuda=12.1 -c pytorch -c nvidia

In [None]:
# Report the installed PyTorch version and whether CUDA is usable in this kernel.
print(torch.__version__)
print("CUDA Available:", torch.cuda.is_available())

# 1. Observe the data

In [None]:
# one_train_pssm = pd.read_csv('deep-learning-for-msc-202324/train/1A0A_3_A_train.csv')
# one_train_pssm

In [None]:
# seqs_test = pd.read_csv('deep-learning-for-msc-202324/seqs_test.csv')
# seqs_test

In [None]:
# seqs_train = pd.read_csv('deep-learning-for-msc-202324/seqs_train.csv')
# seqs_train

In [None]:
# labels_train = pd.read_csv('deep-learning-for-msc-202324/labels_train.csv')
# labels_train

# 2. Define the dataset and data loaders

In [None]:
class ProteinDataset(Dataset):
    """
    Load protein sequences, optional secondary-structure labels, and per-protein PSSM files.

    :param seq_file_path: The file path to the sequence data (CSV; column 0 is the PDB id, column 1 the sequence,
        and a column named ``SEQUENCE`` must exist)
    :type seq_file_path: str
    :param pssm_files_path: The directory path to the PSSM files (``<pdb_id>_train.csv`` when labels are given,
        ``<pdb_id>_test.csv`` otherwise)
    :type pssm_files_path: str
    :param label_file_path: The file path to the label data; ``None`` for unlabeled (test) data
    :type label_file_path: str or None
    :param indices: Row positions of the data to select (used to build the train and validation subsets)
    :type indices: sequence of int or None

    :return: The protein dataset
    :rtype: Dataset
    """

    def __init__(self, seq_file_path, pssm_files_path, label_file_path=None, indices=None):
        # Amino acids start at 1 so that 0 stays free for sequence padding.
        self.amino_acid_to_ix = {'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10,
                                 'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15, 'S': 16, 'T': 17, 'V': 18, 'W': 19,
                                 'Y': 20}
        self.struct_to_ix = {'H': 0, 'E': 1, 'C': 2}

        # Read the sequence data and keep only the requested rows, if any.
        self.seq_data = pd.read_csv(seq_file_path)
        if indices is not None:
            self.seq_data = self.seq_data.iloc[indices].reset_index(drop=True)

        # Lengths must be computed AFTER subsetting: __getitem__ indexes this list with the
        # subset-relative idx, so lengths taken from the full file would belong to other proteins.
        self.real_lan = [len(seq) for seq in self.seq_data['SEQUENCE']]

        # Directory holding one PSSM CSV per protein.
        self.pssm_files_path = pssm_files_path

        # Read the label data with the same row selection as the sequences.
        if label_file_path:
            self.label_data = pd.read_csv(label_file_path)
            if indices is not None:
                self.label_data = self.label_data.iloc[indices].reset_index(drop=True)
        else:
            self.label_data = None

    def __len__(self):
        return len(self.seq_data)

    def __getitem__(self, idx):
        """
        Get the encoded sequence, labels, and PSSM matrix for one protein.

        :param idx: The position of the protein inside this (possibly subsetted) dataset
        :type idx: int
        :return: Dict with keys ``pdb_id``, ``sequence`` (long tensor), ``labels`` (long tensor, all -1 when the
            dataset is unlabeled), ``pssm`` (float tensor) and ``real_len`` (unpadded sequence length)
        :rtype: dict
        :raises KeyError: If the sequence or label string contains a symbol missing from the mappings
        """
        pdb_id = self.seq_data.iloc[idx, 0]
        sequence = self.seq_data.iloc[idx, 1]
        sequence_encoded = [self.amino_acid_to_ix[aa] for aa in sequence]

        if self.label_data is not None:
            # Labels are matched by PDB id rather than by row position.
            label_sequence = self.label_data[self.label_data['PDB_ID'] == pdb_id].iloc[0, 1]
            label_encoded = [self.struct_to_ix[label] for label in label_sequence]
        else:
            # Placeholder labels for unlabeled data.
            label_encoded = [-1] * len(sequence)

        pssm_file_path = os.path.join(self.pssm_files_path,
                                      f"{pdb_id}_test.csv" if self.label_data is None else f"{pdb_id}_train.csv")
        # The first two CSV columns are dropped; the remaining columns are the PSSM features.
        pssm_data = pd.read_csv(pssm_file_path).iloc[:, 2:].to_numpy()

        return {
            'pdb_id': pdb_id,
            'sequence': torch.tensor(sequence_encoded, dtype=torch.long),
            'labels': torch.tensor(label_encoded, dtype=torch.long),
            'pssm': torch.tensor(pssm_data, dtype=torch.float),
            'real_len': self.real_lan[idx]
        }

In [None]:
def protein_collate_fn(batch):
    """
    Combine variable-length protein samples into one padded batch (for use as ``DataLoader(collate_fn=...)``).

    Sequences are padded with 0, which is not a valid amino-acid index. Labels are padded with -1 so that
    padded positions can be skipped by ``CrossEntropyLoss(ignore_index=-1)`` instead of being read as class 0.
    PSSM rows are padded with -1.

    :param batch: The samples to combine; each is a dict with keys ``sequence``, ``labels``, ``pssm``,
        ``pdb_id`` and ``real_len``
    :type batch: list of dict
    :return: A dictionary containing the padded tensors, the tuple of PDB ids, and a tensor of real lengths
    :rtype: dict
    """
    sequences, labels, pssms, pdb_ids, real_lens = zip(
        *[(sample['sequence'], sample['labels'], sample['pssm'], sample['pdb_id'], sample['real_len']) for sample in
          batch])

    # Pad sequence data with the reserved index 0
    sequences_padded = pad_sequence(sequences, batch_first=True)

    # Pad label data with -1: the default 0 would be indistinguishable from a real class index
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=-1)

    # Pad PSSM data
    pssms_padded = pad_sequence(pssms, batch_first=True, padding_value=-1)  # Use -1 to pad the PSSM data

    return {'sequence': sequences_padded,
            'labels': labels_padded,
            'pssm': pssms_padded,
            'pdb_id': pdb_ids,
            'real_len': torch.tensor(real_lens)}

In [None]:
def get_data_loaders(batch_size):
    """
    Build the training and validation data loaders from the labelled competition data.

    The rows of the sequence file are split 80/20 with a fixed random state, and each part is wrapped
    in a ProteinDataset and a DataLoader that pads batches with protein_collate_fn.

    :param batch_size: batch size for the data loaders
    :type batch_size: int
    :return: a tuple (training loader, validation loader)
    :rtype: tuple
    """
    pssm_files_path = 'kaggle/input/deep-learning-for-msc-202324/train/'
    seqs_file_path = 'kaggle/input/deep-learning-for-msc-202324/seqs_train.csv'
    label_file_path = 'kaggle/input/deep-learning-for-msc-202324/labels_train.csv'

    # The split is done on row positions, not on PDB ids.
    row_positions = range(len(pd.read_csv(seqs_file_path)))
    train_rows, val_rows = train_test_split(row_positions, test_size=0.2, random_state=10)

    def build_loader(rows, shuffle):
        # One dataset + loader per split; only the row selection and shuffling differ.
        dataset = ProteinDataset(seqs_file_path, pssm_files_path, label_file_path, rows)
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=protein_collate_fn,
                          pin_memory=True)

    return build_loader(train_rows, True), build_loader(val_rows, False)

# 3. Design the model (Fully Convolutional Network, FCN)

In [None]:
class ProteinSecondaryStructureFCNwithEmbedding(nn.Module):
    """
    Fully convolutional network (FCN) for per-residue secondary structure prediction.

    Amino-acid indices are embedded, concatenated channel-wise with the PSSM features, passed through
    four Conv1d -> ReLU -> BatchNorm1d -> Dropout blocks (64, 128, 256, 512 channels) and projected to
    three class scores per position by a kernel-size-1 convolution.
    """

    def __init__(self, embedding_dim=10, dropout_rate=0.5):
        super().__init__()
        pssm_width = 20
        vocabulary_size = 20 + 1  # 20 amino acids plus one extra index
        class_count = 3

        self.embedding = nn.Embedding(num_embeddings=vocabulary_size, embedding_dim=embedding_dim)

        # convolutional layers, widening the channel dimension at every block
        self.layer1 = self._conv_block(embedding_dim + pssm_width, 64, dropout_rate)
        self.layer2 = self._conv_block(64, 128, dropout_rate)
        self.layer3 = self._conv_block(128, 256, dropout_rate)
        self.layer4 = self._conv_block(256, 512, dropout_rate)
        self.output_layer = nn.Conv1d(512, class_count, kernel_size=1)

    @staticmethod
    def _conv_block(in_channels, out_channels, dropout_rate):
        # kernel_size=5 with padding=2 keeps the sequence length unchanged
        return nn.Sequential(
            nn.Conv1d(in_channels, out_channels, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.BatchNorm1d(out_channels),
            nn.Dropout(dropout_rate),
        )

    def forward(self, sequence, pssm):
        # sequence: [batch_size, seq_len]
        # pssm: [batch_size, seq_len, num_pssm_features]

        # Conv1d expects channels before length, so both inputs are moved to [batch_size, channels, seq_len]
        residue_channels = self.embedding(sequence).transpose(1, 2)
        pssm_channels = pssm.transpose(1, 2)
        features = torch.cat([residue_channels, pssm_channels], dim=1)

        for block in (self.layer1, self.layer2, self.layer3, self.layer4):
            features = block(features)

        # back to [batch_size, seq_len, num_classes]
        return self.output_layer(features).transpose(1, 2)

In [None]:
def train_model(batch_size=4, num_epochs=10, embedding_dim=10, dropout_rate=0.5, lr=0.0001):
    """
    Train the protein secondary structure prediction model and record per-epoch metrics.

    The function creates the data loaders, the model, the loss function and the optimizer, then runs
    ``num_epochs`` epochs of training, each followed by a validation pass. Padded positions are excluded
    from both the loss and the reported metrics.

    :param batch_size: batch size for the data loaders
    :type batch_size: int
    :param num_epochs: the number of epochs to train the model
    :type num_epochs: int
    :param embedding_dim: the dimension of the embedding layer
    :type embedding_dim: int
    :param dropout_rate: the dropout rate for the model
    :type dropout_rate: float
    :param lr: the learning rate for the optimizer
    :type lr: float
    :return: a dictionary with keys ``model`` (the trained model) and ``metrics`` (dict of per-epoch lists)
    :rtype: dict
    """
    train_loader, val_loader = get_data_loaders(batch_size)
    model = ProteinSecondaryStructureFCNwithEmbedding(embedding_dim=embedding_dim, dropout_rate=dropout_rate).to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=-1)  # positions labelled -1 do not contribute to the loss
    optimizer = optim.Adam(model.parameters(), lr=lr)  # Adam optimizer

    # one list per metric, one entry per epoch
    metrics = {f'{split}_{name}': []
               for split in ('train', 'val')
               for name in ('loss', 'accuracy', 'precision', 'recall', 'f1')}

    for epoch in range(num_epochs):
        # Training loop
        model.train()  # Set the model to training mode
        train_losses, train_true, train_pred = [], [], []

        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{num_epochs}"):
            loss, true_labels, predicted_labels = _forward_masked(model, criterion, batch)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())
            train_true.extend(true_labels)
            train_pred.extend(predicted_labels)

        _record_epoch_metrics(metrics, 'train', 'Train', epoch, num_epochs, train_losses, train_true, train_pred)

        torch.cuda.empty_cache()  # clean up CUDA memory

        # Validation loop
        model.eval()  # Set the model to evaluation mode
        val_losses, val_true, val_pred = [], [], []

        with torch.no_grad():
            for batch in val_loader:
                loss, true_labels, predicted_labels = _forward_masked(model, criterion, batch)

                val_losses.append(loss.item())
                val_true.extend(true_labels)
                val_pred.extend(predicted_labels)

        _record_epoch_metrics(metrics, 'val', 'Validation', epoch, num_epochs, val_losses, val_true, val_pred)

    return {'model': model, 'metrics': metrics}


def _forward_masked(model, criterion, batch):
    """Run one batch through the model; return the loss plus true/predicted labels of non-padded positions."""
    sequences = batch['sequence'].to(device)
    labels = batch['labels'].to(device)
    pssms = batch['pssm'].to(device)

    # Amino acids are encoded 1..20, so a 0 in the padded sequence tensor can only be padding.
    # Marking those positions -1 lets ignore_index drop them even if the labels were padded with 0.
    labels = labels.masked_fill(sequences == 0, -1)

    outputs = model(sequences, pssms)
    loss = criterion(outputs.reshape(-1, outputs.size(-1)), labels.reshape(-1))

    _, predicted = torch.max(outputs, 2)
    valid = labels != -1
    return loss, labels[valid].cpu().numpy(), predicted[valid].cpu().numpy()


def _record_epoch_metrics(metrics, prefix, title, epoch, num_epochs, losses, y_true, y_pred):
    """Append this epoch's loss/accuracy/precision/recall/F1 under ``prefix`` and print a summary line."""
    metrics[f'{prefix}_loss'].append(np.mean(losses))
    metrics[f'{prefix}_accuracy'].append(accuracy_score(y_true, y_pred))
    metrics[f'{prefix}_precision'].append(precision_score(y_true, y_pred, average='macro', zero_division=0))
    metrics[f'{prefix}_recall'].append(recall_score(y_true, y_pred, average='macro', zero_division=0))
    metrics[f'{prefix}_f1'].append(f1_score(y_true, y_pred, average='macro', zero_division=0))

    latest = {name: metrics[f'{prefix}_{name}'][-1] for name in ('loss', 'accuracy', 'precision', 'recall', 'f1')}
    print(
        f'Epoch [{epoch + 1}/{num_epochs}], {title} Loss: {latest["loss"]:.4f}, {title} Accuracy: {latest["accuracy"]:.4f}, {title} Precision: {latest["precision"]:.4f}, {title} Recall: {latest["recall"]:.4f}, {title} F1: {latest["f1"]:.4f}')

# 4. Train the model and evaluate the performance

In [None]:
# Display whether a CUDA device is available before choosing the training device.
torch.cuda.is_available()

In [None]:
# Global device used by train_model, the prediction function and layer_attributions:
# the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Show the selected device.
device

In [None]:
# # Ensure you have defined and instantiated your DataLoader here
# train_rsults = train_model(batch_size=32, num_epochs=10, embedding_dim=15, dropout_rate=0.234673, lr=0.000143)

In [None]:
# display(train_rsults['model'])

In [None]:
# # Save the trained model
# torch.save(train_rsults['model'].state_dict(), 'prediction_model_default_para.pth')

In [None]:
def plot_loss_accuracy_history(metrics):
    """
    Draw the per-epoch loss and accuracy curves for training and validation side by side.

    :param metrics: Performance metrics with the keys ``train_loss``, ``val_loss``, ``train_accuracy``
        and ``val_accuracy``, each a list with one value per epoch
    :type metrics: dict
    """
    # (quantity name, training series, validation series) for the left and right panel
    panels = (
        ('Loss', metrics['train_loss'], metrics['val_loss']),
        ('Accuracy', metrics['train_accuracy'], metrics['val_accuracy']),
    )
    epochs = range(1, len(panels[0][1]) + 1)

    plt.figure(figsize=(12, 6))
    for position, (quantity, train_series, val_series) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, train_series, label=f'Train {quantity}')
        plt.plot(epochs, val_series, label=f'Validation {quantity}')
        plt.title(f'{quantity} History')
        plt.xlabel('Epoch')
        plt.ylabel(quantity)
        plt.legend()

    plt.tight_layout()
    plt.show()

In [None]:
# display(train_rsults['metrics'])

In [None]:
# plot_loss_accuracy_history(train_rsults['metrics'])

# 5. Hyperparameter optimization with Ax

In [None]:
def train_evaluate(parameterization):
    """
    Evaluation function for Ax: train with one hyperparameter set and report the final validation metrics.

    :param parameterization: Hyperparameters with the keys ``lr``, ``dropout_rate``, ``batch_size``
        and ``embedding_dim``
    :type parameterization: dict
    :return: Metric name mapped to ``(value from the last epoch, 0.0)``, where the second entry is the
        standard error reported to Ax
    :rtype: dict
    """
    # pick out exactly the hyperparameters train_model understands
    hyperparameters = {
        "lr": parameterization["lr"],
        "dropout_rate": parameterization["dropout_rate"],
        "batch_size": parameterization["batch_size"],
        "embedding_dim": parameterization["embedding_dim"],
    }
    history = train_model(**hyperparameters)['metrics']

    # only the last epoch of each validation metric is reported
    reported = ("val_loss", "val_accuracy", "val_precision", "val_recall", "val_f1")
    return {metric_name: (history[metric_name][-1], 0.0) for metric_name in reported}

In [None]:
# Create the Ax client and declare the experiment: the hyperparameter search space,
# the objective to optimize, and the extra metrics to record for every trial.
ax_client = AxClient()
ax_client.create_experiment(
    name="protein_structure_prediction_experiment",
    parameters=[
        # parameter names must match the keys read in train_evaluate
        {"name": "lr", "type": "range", "bounds": [5e-4, 3e-3], "log_scale": True},
        {"name": "dropout_rate", "type": "range", "bounds": [0.15, 0.2]},
        {"name": "batch_size", "type": "choice", "values": [32, 64]},
        {"name": "embedding_dim", "type": "choice", "values": [4, 5, 6, 7]},
    ],
    # val_loss is minimized; the remaining validation metrics returned by train_evaluate are only tracked
    objectives={"val_loss": ObjectiveProperties(minimize=True)},
    tracking_metric_names=["val_accuracy", "val_precision", "val_recall", "val_f1"],
)


In [None]:
# Run times_ax optimization trials: ask Ax for the next parameter set, train and evaluate
# with it, and report the resulting metrics back to Ax.
times_ax = 10
for i in range(times_ax):
    print(f"Running optimization iteration {i + 1}/{times_ax}...")
    parameters, trial_index = ax_client.get_next_trial()
    ax_client.complete_trial(trial_index=trial_index, raw_data=train_evaluate(parameters))

In [None]:
# Ask Ax for the best parameters found so far and display the raw result.
best_parameters_with_eval_value = ax_client.get_best_parameters()
display(best_parameters_with_eval_value)

In [None]:
# Keep only the first element of the result; it is later indexed by hyperparameter name in test_train.
best_parameters = best_parameters_with_eval_value[0]

Show the best parameters found by Ax

In [None]:
# Display the selected hyperparameters.
best_parameters

In [None]:
# Collect the data of all trials into a DataFrame and display it.
results_df = ax_client.get_trials_data_frame()
results_df

After more than 70 optimization iterations, the best parameters are:
{'lr': 0.0015518165052243,
 'dropout_rate': 0.1858316733276795,
 'batch_size': 64,
 'embedding_dim': 7}

In [None]:
# results_df.to_csv('10_trials_2_results.csv', index=False)

Plots to help choose the hyperparameters

In [None]:
# Contour plot of val_accuracy over the learning rate (x) and dropout rate (y).
render(ax_client.get_contour_plot(param_x="lr", param_y="dropout_rate", metric_name="val_accuracy"))

In [None]:
# Render the optimization trace of the experiment.
render(ax_client.get_optimization_trace())

In [None]:
# Slice plot of val_accuracy along the learning rate, using the generation strategy's current model.
render(plot_slice(
    model=ax_client.generation_strategy.model,
    param_name='lr',
    metric_name='val_accuracy'
))

In [None]:
# Slice plot of val_accuracy along the dropout rate, using the generation strategy's current model.
render(plot_slice(
    model=ax_client.generation_strategy.model,
    param_name='dropout_rate',
    metric_name='val_accuracy'
))

# 6. Train the model with the best hyperparameters and make predictions of the test set

In [None]:
def test_train(best_parameters, num_epochs=100):
    """
    Train the model with the chosen hyperparameters and save its weights to disk.

    :param best_parameters: Hyperparameters with the keys ``batch_size``, ``embedding_dim``,
        ``dropout_rate`` and ``lr``
    :type best_parameters: dict
    :param num_epochs: Number of training epochs (defaults to the previously hard-coded 100)
    :type num_epochs: int
    :return: A tuple of the ``train_model`` result dict (``model`` and ``metrics``) and the file name stem
        under which the weights were saved as ``<stem>.pth``
    :rtype: tuple
    """
    batch_size = best_parameters['batch_size']
    embedding_dim = best_parameters['embedding_dim']
    dropout_rate = best_parameters['dropout_rate']
    lr = best_parameters['lr']

    best_results = train_model(batch_size=batch_size, num_epochs=num_epochs, embedding_dim=embedding_dim,
                               dropout_rate=dropout_rate, lr=lr)

    # save the trained model; the file name encodes the full training configuration
    filename = f"{batch_size}_{num_epochs}_{embedding_dim}_{dropout_rate}_{lr}"
    torch.save(best_results['model'].state_dict(), f'{filename}.pth')

    return best_results, filename

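Choose the best parameters: use the trial with the highest validation accuracy from previously saved Ax results if that file exists; otherwise fall back to the best parameters listed above.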

In [None]:
# NOTE(review): this is an absolute path at the filesystem root — confirm that is where the
# saved Ax results are expected to live.
file_path = '/all_ax.csv'
if not os.path.exists(file_path):
    # No saved trial table: fall back to the hard-coded parameters.
    best_parameters = dict(lr=0.0015518165052243, dropout_rate=0.1858316733276795, batch_size=64,
                           embedding_dim=7)
else:
    # Pick the trial with the highest validation accuracy from the saved table.
    all_ax = pd.read_csv(file_path)
    max_val_accuracy_idx = all_ax['val_accuracy'].idxmax()
    max_val_accuracy_row = all_ax.loc[max_val_accuracy_idx]

    print(
        f'Best parameters: \n lr:\t{max_val_accuracy_row["lr"]}, dropout_rate:\t{max_val_accuracy_row["dropout_rate"]}, batch_size:\t{max_val_accuracy_row["batch_size"]}, embedding_dim:\t{max_val_accuracy_row["embedding_dim"]}')

    # batch_size and embedding_dim are cast to int before use
    best_parameters = dict(lr=max_val_accuracy_row['lr'],
                           dropout_rate=max_val_accuracy_row['dropout_rate'],
                           batch_size=int(max_val_accuracy_row['batch_size']),
                           embedding_dim=int(max_val_accuracy_row['embedding_dim']))

print(best_parameters)

In [None]:
# Train the final model with the selected hyperparameters; filename is the stem of the saved weights file.
best_results, filename = test_train(best_parameters)

In [None]:
def predict_and_save_test_set_modified(model, batch_size, filename):
    """
    Predict the secondary structure of every test-set residue and write a submission CSV.

    The output file ``kaggle/output/<filename>.csv`` has the header ``ID,STRUCTURE`` and one row per
    residue, with the id ``<pdb_id>_<1-based position>``. Padded positions are not written.

    :param model: The trained model; it is switched to evaluation mode
    :type model: torch.nn.Module
    :param batch_size: Batch size for the test data loader
    :type batch_size: int
    :param filename: File name stem of the output CSV
    :type filename: str
    :return: A tuple of the model and the test data loader
    :rtype: tuple
    """
    seqs_test_path = 'kaggle/input/deep-learning-for-msc-202324/seqs_test.csv'
    pssm_test_path = 'kaggle/input/deep-learning-for-msc-202324/test/'
    output_csv_path = f'kaggle/output/{filename}.csv'

    struct_to_ix = {'H': 0, 'E': 1, 'C': 2}
    ix_to_struct = {v: k for k, v in struct_to_ix.items()}

    # initialize the test dataset and data loader
    dataset_test = ProteinDataset(seqs_test_path, pssm_test_path)
    dataloader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False, collate_fn=protein_collate_fn,
                                 pin_memory=True)

    submission = [['ID', 'STRUCTURE']]
    model.eval()

    with torch.no_grad():
        for batch_index, batch in enumerate(dataloader_test):
            sequences = batch['sequence'].to(device)
            pssms = batch['pssm'].to(device)
            pdb_ids = batch['pdb_id']
            real_lens = batch['real_len'].tolist()

            print(f"Batch {batch_index + 1}/{len(dataloader_test)}")

            outputs = model(sequences, pssms)
            _, predicted = torch.max(outputs, 2)
            # move the whole batch to the CPU once instead of synchronizing for every residue
            predicted = predicted.cpu()

            # split the batch predictions into individual sequences, dropping the padded tail
            for pdb_id, real_len, protein_prediction in zip(pdb_ids, real_lens, predicted):
                for position, class_ix in enumerate(protein_prediction[:real_len].tolist(), start=1):
                    submission.append([f"{pdb_id}_{position}", ix_to_struct[class_ix]])

    # make sure the output directory exists, then save the predictions to a CSV file
    os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
    with open(output_csv_path, 'w') as f:
        for line in submission:
            f.write(','.join(line) + '\n')

    print(f"Total predictions made: {len(submission) - 1}")
    print(f'file saved to {output_csv_path}')

    return model, dataloader_test

In [None]:
# Display the file name stem under which the trained weights were saved.
filename

In [None]:
# Write the test-set predictions; the returned model and loader are reused for the attribution analysis.
test_model, dataloader_test = predict_and_save_test_set_modified(best_results['model'], best_parameters['batch_size'],
                                                                 filename)

In [None]:
# Plot the loss and accuracy curves of the final training run.
plot_loss_accuracy_history(best_results['metrics'])

# 7. Explain the model predictions with Captum

In [None]:
def layer_attributions(model, dataloader):
    """
    Compute Layer Integrated Gradients attributions for the embedding layer on the first batch.

    The model emits one score per position and class. The scores are summed over the positions so that
    every sample has a single score per class, and the attribution targets class index 0.

    :param model: The trained model; it is switched to evaluation mode
    :type model: torch.nn.Module
    :param dataloader: Loader yielding batches with the keys ``sequence`` and ``pssm``
    :type dataloader: torch.utils.data.DataLoader
    :return: A tuple (attributions, delta) of NumPy arrays for the first batch
    :rtype: tuple
    :raises ValueError: If the data loader yields no batch
    """
    model.eval()

    def class_scores(sequences, pssms):
        # [batch_size, seq_len, num_classes] -> [batch_size, num_classes]; an integer target can then
        # select one class for every sample in the batch
        return model(sequences, pssms).sum(dim=1)

    # initialize the LayerIntegratedGradients object, specifying the forward function and the layer of interest
    lig = LayerIntegratedGradients(class_scores, model.embedding)

    # Batches are padded to different lengths, so only the first batch is attributed.
    sample = next(iter(dataloader), None)
    if sample is None:
        raise ValueError("dataloader yielded no batches; nothing to attribute")

    sequences = sample['sequence'].to(device).long()
    pssms = sample['pssm'].to(device).float()

    # set the baseline for the sequence data: index 0 at every position
    baseline = torch.zeros_like(sequences)

    # calculate the attributions for the first class; the PSSM input is used as its own baseline
    attributions, delta = lig.attribute(inputs=(sequences, pssms),
                                        baselines=(baseline, pssms),
                                        target=0,
                                        return_convergence_delta=True)

    # convert the attributions and delta tensors to numpy arrays
    attributions = attributions.cpu().detach().numpy()
    delta = delta.cpu().detach().numpy()

    return attributions, delta

In [None]:
# Compute the embedding-layer attributions and the convergence delta for the test model.
attributions, delta = layer_attributions(test_model, dataloader_test)

In [None]:
def visualize_attributions(attributions_seq, attributions_pssm, amino_acids=None):
    """
    Plot the attribution scores of the first sample in a batch: a bar chart for the amino acid
    sequence on top and a heat map for the PSSM features below.

    :param attributions_seq: Attributions for the amino acid sequence; only index 0 is plotted
    :type attributions_seq: numpy.ndarray
    :param attributions_pssm: Attributions for the PSSM features; only index 0 is plotted
    :type attributions_pssm: numpy.ndarray
    :param amino_acids: Optional tick labels for the sequence positions
    :type amino_acids: list
    """
    # only the first sample in the batch is visualized
    first_seq_scores = attributions_seq[0]
    first_pssm_scores = attributions_pssm[0]

    fig, (seq_axis, pssm_axis) = plt.subplots(2, 1, figsize=(12, 8))

    # top panel: one bar per sequence position
    positions = range(len(first_seq_scores))
    seq_axis.bar(positions, first_seq_scores)
    seq_axis.set_title('Attribution scores for amino acid sequences')
    seq_axis.set_xlabel('amino acid position')
    seq_axis.set_ylabel('attributable score')
    if amino_acids:
        seq_axis.set_xticks(range(len(amino_acids)))
        seq_axis.set_xticklabels(amino_acids, rotation=45, ha="right")

    # bottom panel: heat map of the PSSM attributions with a colour bar
    heatmap = pssm_axis.matshow(first_pssm_scores, aspect='auto', cmap='viridis')
    fig.colorbar(heatmap, ax=pssm_axis)
    pssm_axis.set_title('Attribution scores for PSSM characteristics')
    pssm_axis.set_xlabel('features')
    pssm_axis.set_ylabel('amino acid position')

    plt.tight_layout()
    plt.show()

In [None]:
# NOTE(review): layer_attributions returns `attributions` as a NumPy array, so indexing it with the
# string keys 'seq' and 'pssm' looks like it cannot work — confirm which arrays should be passed here.
visualize_attributions(attributions['seq'], attributions['pssm'])

Running this notebook from scratch can take a long time, so I have uploaded some of the output images; follow the link or click the images below to view them:

https://imgur.com/a/MtaC1b3


![64_100_7_0.1858316733276795_0.0015518165052243.png](https://i.imgur.com/I3EMIF1.png "64_100_7_0.1858316733276795_0.0015518165052243.png")

![lr_trend](https://i.imgur.com/ZY5wkac.png "lr_trend")

![lr&dropout_rate](https://i.imgur.com/nRN5T15.png "lr&dropout_rate")

![dropout_rate_trend](https://i.imgur.com/H3uQVPk.png "dropout_rate_trend")

![embedding_dim](https://i.imgur.com/soHrEwR.png "embedding_dim")

![64_150_7_0.1858316733276795_0.0015518165052243](https://i.imgur.com/3gQye4Z.png)