# Imports

In [29]:
# General Imports
from modify_dataset import sanity_check_dimensions
from math import ceil
from tqdm import tqdm
import os
import pandas as pd
import numpy as np

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import WeightedRandomSampler

# PyTorch Geometric
import torch_geometric
from torch_geometric.data.dataset import Dataset
from torch_geometric.loader import DataLoader

# Sets the seed for generating random numbers in PyTorch, numpy and Python.
torch_geometric.seed_everything(42)
dtype = torch.float
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Plotting Libraries
import plotly.express as px

In [30]:
print(f"Torch Version: {torch.__version__}")
print(f"Cuda Available: {torch.cuda.is_available()}")
print(f"Torch Geometric Version: {torch_geometric.__version__}")

Torch Version: 1.13.0
Cuda Available: True
Torch Geometric Version: 2.1.0


# Contact Maps Loading & Plotting

In [89]:
contact_map_A0A0A0MRZ7 = np.load("Dataset_Files/Protein_Graph_Data/raw/Contact_Map_Files/A0A0A0MRZ7.npy")
fig = px.imshow(contact_map_A0A0A0MRZ7, color_continuous_scale=["white", "black"])

fig.show()

In [90]:
contact_map_A0A0A0MRZ7

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1]])

In [87]:
contact_map_A0A0A0MRZ7_reshape = np.load("Dataset_Files/Protein_Graph_Data/raw/Contact_Map_Files/A0A0A0MRZ7.npy").reshape(-1,2)
contact_map_A0A0A0MRZ7_reshape

array([[1, 1],
       [1, 1],
       [0, 0],
       ...,
       [0, 1],
       [1, 1],
       [1, 1]])

# Dimensions Sanity Check

In [32]:
sanity_check_dimensions("A4D1B5", print_information=True)

Contact Map Shape: (854, 854)
Amino Acid Descriptors Shape: (854, 66)
PSSM Shape: (854, 20)
UniProt Embedding Shape: (854, 1024)


True

# Training & Test Sets (Drug Descriptors and Protein Sequence Descriptors)

In [33]:
feature_selection_columns = np.load("Dataset_Files/Feature_Selection/features_dd_psd_list.npy",
                                    allow_pickle=True)
unique_proteins_list = np.load("Dataset_Files/Unique_Proteins_List.npy",
                               allow_pickle=True)

In [34]:
X_train = np.load("Dataset_Files/Training_Test_Sets/X_train_dd_psd_feature_selection.npy")
X_train_accession_list = np.load("Dataset_Files/Training_Test_Sets/X_train_dd_psd_accession.npy", allow_pickle=True)
y_train = np.load("Dataset_Files/Training_Test_Sets/y_train_dd_psd.npy")

X_test_classification = np.load("Dataset_Files/Training_Test_Sets/X_test_classification_dd_psd_feature_selection.npy")
X_test_classification_accession_list = np.load("Dataset_Files/Training_Test_Sets/X_test_classification_accession.npy",
                                               allow_pickle=True)
y_test_classification = np.load("Dataset_Files/Training_Test_Sets/y_test_classification.npy")

In [35]:
# Useful Information & Sanity Checks
print(f"X_train shape: {X_train.shape}")
print(f"X_train accession shape: {X_train_accession_list.shape}")
print(f"y_train shape: {y_train.shape[0]}")

print(f"X_test_classification shape: {X_test_classification.shape}")
print(f"X_test_classification accession shape: {X_test_classification_accession_list.shape}")
print(f"y_test_classification shape: {y_test_classification.shape[0]} ", end="")
print(f"(Binding Count: {y_test_classification[y_test_classification == 1].shape[0]}, ", end="")
print(f"Non-Binding Count: {y_test_classification[y_test_classification == 0].shape[0]})")

X_train shape: (134734, 1044)
X_train accession shape: (134734,)
y_train shape: 134734
X_test_classification shape: (30141, 1044)
X_test_classification accession shape: (30141,)
y_test_classification shape: 30141 (Binding Count: 22001, Non-Binding Count: 8140)


# WeightedRandomSampler
To balance our batches, given the clear imbalance between the classes

In [36]:
# Reference
# https://www.maskaravivek.com/post/pytorch-weighted-random-sampler/

train_class_counts = pd.Series(y_train).groupby(y_train).count().to_numpy()
weights = 1. / train_class_counts

weights_all = np.array([weights[t] for t in y_train.astype(int)])
weights_all = torch.from_numpy(weights_all)

print(weights)
print(weights_all)
print(len(weights_all))

[2.78001724e-05 1.01252493e-05]
tensor([1.0125e-05, 2.7800e-05, 1.0125e-05,  ..., 1.0125e-05, 2.7800e-05,
        2.7800e-05], dtype=torch.float64)
134734


# Dataset Class

In [82]:
class MyDataset(Dataset):
    def __init__(self, root, drug_and_sequence_descriptors=None, unique_proteins_accessions_list=None,
                 index_to_accession=None, labels=None, transform=None, pre_transform=None, pre_filter=None):
        self.drug_and_sequence_descriptors = drug_and_sequence_descriptors
        self.unique_proteins_accessions_list = unique_proteins_accessions_list
        self.index_to_accession = index_to_accession
        self.labels = labels
        super(MyDataset, self).__init__(root, transform, pre_transform, pre_filter)

    @property
    def raw_file_names(self):
        return os.listdir(f"{self.root}/raw")

    @property
    def processed_file_names(self):
        return os.listdir(f"{self.root}/processed")

    def download(self):
        pass

    def process(self):
        unique_proteins_accessions_list = self.unique_proteins_accessions_list

        print("Creating Protein Graphs")
        for i in tqdm(range(len(unique_proteins_accessions_list))):
            accession = unique_proteins_accessions_list[i]

            if sanity_check_dimensions(accession) == True:
                amino_acid_descriptor = np.load(
                    f"{self.root}/raw/Amino_Acid_Descriptors_And_PSSM/{accession}_Descriptors.npy")
                pssm = np.load(f"{self.root}/raw/Amino_Acid_Descriptors_And_PSSM/{accession}_PSSM.npy")
                uniprot_embedding = np.load(f"{self.root}/raw/Amino_Acid_Embeddings/{accession}.npy")

                amino_acid_features = np.hstack((amino_acid_descriptor, pssm, uniprot_embedding))
                contact_map = np.load(f"{self.root}/raw/Contact_Map_Files/{accession}.npy")

                data = torch_geometric.data.Data(x=torch.Tensor(amino_acid_features),
                                                 edge_index=torch.LongTensor(contact_map),
                                                 size=contact_map.shape[0],
                                                 accession=accession)

                torch.save(data, f"{self.root}/processed/protein_graph_{accession}.pt")

    def len(self):
        return len(self.labels)

    def get(self, idx):
        accession = self.index_to_accession[idx]
        protein_graph = torch.load(f"{self.root}/processed/protein_graph_{accession}.pt")

        drug_descriptors = torch.tensor(self.drug_and_sequence_descriptors[idx])
        label = torch.tensor(self.labels[idx])

        print(protein_graph)
        print(drug_descriptors.shape)
        print(label.shape)

        return drug_descriptors, protein_graph, label

In [83]:
ProteinGraphDataset = MyDataset(root="Dataset_Files/Protein_Graph_Data",
                                drug_and_sequence_descriptors=X_train,
                                unique_proteins_accessions_list=unique_proteins_list,
                                index_to_accession=X_train_accession_list,
                                labels=y_train)
ProteinGraphDataset.get(42)

Data(x=[534, 1110], edge_index=[534, 534], accession='Q13564')
torch.Size([1044])
torch.Size([])


(tensor([ 4.9150e+02,  1.2998e+00,  4.9125e+02,  ...,  4.4849e-01,
          3.3911e-01, -5.4688e-01]),
 Data(x=[534, 1110], edge_index=[534, 534], accession='Q13564'),
 tensor(1, dtype=torch.int8))

In [24]:
# class Dataset(torch.utils.data.Dataset):
#     def __init__(self, drug_and_sequence_descriptors, index_to_accession, labels):
#         self.drug_and_sequence_descriptors = drug_and_sequence_descriptors
#         self.index_to_accession = index_to_accession
#         self.labels = labels
#
#     def __len__(self):
#         return len(self.labels)
#
#     def __getitem__(self, index):
#         accession = self.index_to_accession[index]
#         amino_acid_descriptor = np.load(f"Dataset_Files/Amino_Acid_Descriptors_And_PSSM/{accession}_Descriptors.npy")
#         pssm = np.load(f"Dataset_Files/Amino_Acid_Descriptors_And_PSSM/{accession}_PSSM.npy")
#         uniprot_embedding = np.load(f"Dataset_Files/Amino_Acid_Embeddings/{accession}.npy")
#
#         amino_acid_features = np.hstack((amino_acid_descriptor, pssm, uniprot_embedding))
#         contact_map = np.load(f"Dataset_Files/Contact_Map_Files/{accession}.npy")
#
#         drug_and_sequence_descriptor = self.drug_and_sequence_descriptors[index]
#         protein_data = DATA.Data(x=torch.Tensor(amino_acid_features), edge_index=torch.Tensor(contact_map))
#         protein_data.__setitem__('target_size', torch.LongTensor([contact_map.shape[0]]))
#
#         label = self.labels[index]
#
#         return drug_and_sequence_descriptor, protein_data, label

In [80]:
BATCH_SIZE = 64

sampler = WeightedRandomSampler(weights_all, len(weights_all))

trainloader = torch_geometric.loader.DataLoader(
    MyDataset(root="Dataset_Files/Protein_Graph_Data",
              drug_and_sequence_descriptors=X_train,
              unique_proteins_accessions_list=unique_proteins_list,
              index_to_accession=X_train_accession_list,
              labels=y_train),
    batch_size=BATCH_SIZE,
    sampler=sampler)

testloader = torch.utils.data.DataLoader(
    MyDataset(root="Dataset_Files/Protein_Graph_Data",
              drug_and_sequence_descriptors=X_test_classification,
              unique_proteins_accessions_list=unique_proteins_list,
              index_to_accession=X_test_classification_accession_list,
              labels=y_test_classification),
    batch_size=BATCH_SIZE,
    shuffle=True)

In [81]:
for i, x in enumerate(trainloader):
    print(i)
    print(x)
    break

Data(x=[359, 1110], edge_index=[359, 359], accession='P43088')
torch.Size([1044])
torch.Size([])
Data(x=[379, 1110], edge_index=[379, 379], accession='Q5MAI5')
torch.Size([1044])
torch.Size([])
Data(x=[419, 1110], edge_index=[419, 419], accession='O14733')
torch.Size([1044])
torch.Size([])
Data(x=[658, 1110], edge_index=[658, 658], accession='P23786')
torch.Size([1044])
torch.Size([])
Data(x=[1047, 1110], edge_index=[1047, 1047], accession='P20594')
torch.Size([1044])
torch.Size([])
Data(x=[415, 1110], edge_index=[415, 415], accession='P48730')
torch.Size([1044])
torch.Size([])
Data(x=[355, 1110], edge_index=[355, 355], accession='Q8N6T7')
torch.Size([1044])
torch.Size([])
Data(x=[108, 1110], edge_index=[108, 108], accession='Q9NZ45')
torch.Size([1044])
torch.Size([])
Data(x=[286, 1110], edge_index=[286, 286], accession='O60930')
torch.Size([1044])
torch.Size([])
Data(x=[1235, 1110], edge_index=[1235, 1235], accession='Q9UL54')
torch.Size([1044])
torch.Size([])
Data(x=[1634, 1110], edg

RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 359 but got size 379 for tensor number 1 in the list.

In [None]:
for batch_idx, data in enumerate(trainloader):
    print(batch_idx)
    print(data)

In [None]:
index = 765

print(X_train[index].shape)
accession = X_train_accession_list[index]

amino_acid_descriptor = np.load(f"Dataset_Files/Amino_Acid_Descriptors_And_PSSM/{accession}_Descriptors.npy")
print(amino_acid_descriptor.shape)
pssm = np.load(f"Dataset_Files/Amino_Acid_Descriptors_And_PSSM/{accession}_PSSM.npy")
print(pssm.shape)
contact_map = np.load(f"Dataset_Files/Contact_Map_Files/{accession}.npy")
print(contact_map.shape)