In [None]:
# Install the third-party packages this notebook imports into the current runtime.
# NOTE(review): versions are not pinned, so a fresh runtime may resolve different releases.
!pip install matchms
!pip install rdkit
!pip install torch_geometric
!pip install pickle5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting matchms
  Downloading matchms-0.18.0-py3-none-any.whl (109 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.6/109.6 KB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting deprecated
  Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Collecting pickydict>=0.4.0
  Downloading pickydict-0.4.0-py3-none-any.whl (6.1 kB)
Collecting sparsestack>=0.4.1
  Downloading sparsestack-0.4.1-py3-none-any.whl (10 kB)
Collecting pyteomics>=4.2
  Downloading pyteomics-4.5.6-py2.py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.1/232.1 KB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyteomics, pickydict, deprecated, sparsestack, matchms
Successfully installed deprecated-1.2.13 matchms-0.18.0 pickydict-0.4.0 pyteomics-4.5.6 sparsestack-0.4.1
Looking in indexes: https://pypi.org/simple, https:/

In [None]:
from matchms.importing import load_from_msp
import numpy as np
import os
import random
from rdkit import Chem
from rdkit.Chem import Descriptors
import matchms
import pickle
from matchms import Spectrum

import matplotlib.pyplot as plt
import warnings

from rdkit.Chem.rdmolops import GetAdjacencyMatrix

# Pytorch and Pytorch Geometric
import torch
from torch_geometric.data import Data
from torch.utils.data import DataLoader

import torch
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, TopKPooling, global_mean_pool
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp
import torch.nn as nn



In [None]:
# Mount Google Drive so the files under /content/drive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Work from the dataset folder so that relative file names used when loading data resolve.
os.chdir("/content/drive/MyDrive/NIST")

In [None]:
# Root folder; joined with MODEL_NAME to build the checkpoint directory (MODEL_SAVE).
BASE_DIRECTORY = "/content/drive/MyDrive/NIST"

In [None]:
# Read the training spectra from the MSP file, with metadata harmonization switched off.
# NOTE(review): the result is not materialized here — presumably load_from_msp is lazy; confirm.
TRAIN_PATH = 'train.msp'
nist_dataset_org = load_from_msp(TRAIN_PATH, metadata_harmonization=False)

In [None]:
# Read the held-out test spectra. A dedicated constant is used so that TRAIN_PATH
# keeps pointing at the training file instead of being silently overwritten.
TEST_PATH = 'test.msp'
nist_dataset_org_test = load_from_msp(TEST_PATH, metadata_harmonization=False)

In [None]:
# Preprocessing constants.
# NOTE(review): TEST_DATA_SIZE is not referenced anywhere in the visible notebook.
TEST_DATA_SIZE = 5000
# Length of the binned spectrum vector; also the width of the model's output layer.
OUTPUT_SIZE = 1000
# Exponent applied to intensities by spectrum_preparation when operation == "pow".
INTENSITY_POWER = 0.5

In [None]:
# Materialize the test spectra into a list so they can be counted and iterated repeatedly.
nist_dataset_test = list(nist_dataset_org_test)

In [None]:
len(nist_dataset_test)

29040

In [None]:
def one_hot_encoding(label, num_labels):
    """Encode a single value as a one-hot float vector of length ``num_labels``.

    Args:
        label: Either an exact ``bool`` or an integer position in the vector.
        num_labels: Number of slots in the returned vector.

    Returns:
        A 1-D torch tensor of zeros with at most one slot set:
        * ``bool`` labels are written (as 0/1) into slot 0;
        * positions below zero are clamped to slot 0;
        * positions at or beyond ``num_labels`` are clamped to the last slot
          and a warning is emitted;
        * any other position sets exactly that slot to 1.
    """
    encoded = torch.zeros(num_labels)

    # Flags are stored directly; only genuine ``bool`` objects take this path.
    if type(label) is bool:
        encoded[0] = label
        return encoded

    # Clamp the position into the valid range before writing the hot entry.
    slot = min(max(label, 0), num_labels - 1)
    encoded[slot] = float(1)

    if label >= num_labels:
        warnings.warn("Number of group is greater than one hot dimension representation")
    return encoded

In [None]:
# Vocabularies for the categorical atom features. They are built once here rather
# than on every call, because get_atom_features runs once per atom.
_PERMITTED_ATOM_SYMBOLS = ['C','N','O','S','F','Si','P','Cl','Br','Mg','Na','Ca','Fe','As','Al','I', 'B','V','K','Tl','Yb','Sb','Sn','Ag','Pd','Co','Se','Ti','Zn', 'Li','Ge','Cu','Au','Ni','Cd','In','Mn','Zr','Cr','Pt','Hg','Pb', 'Unknown']
_ATOM_SYMBOL_INDEX = {symbol: index for index, symbol in enumerate(_PERMITTED_ATOM_SYMBOLS)}

_HYBRIDIZATIONS = [Chem.HybridizationType.UNSPECIFIED,
                   Chem.HybridizationType.S,
                   Chem.HybridizationType.SP,
                   Chem.HybridizationType.SP2,
                   Chem.HybridizationType.SP3,
                   Chem.HybridizationType.SP3D,
                   Chem.HybridizationType.SP3D2,
                   Chem.HybridizationType.OTHER]
_HYBRIDIZATION_INDEX = {hyb: index for index, hyb in enumerate(_HYBRIDIZATIONS)}


def get_atom_features(atom):
    """Encode one RDKit atom as a flat float feature vector.

    The vector is the concatenation, in this order, of:
      * element symbol one-hot (43 slots, the last one is 'Unknown'),
      * total valence one-hot (8 slots),
      * aromaticity flag (1 slot),
      * hybridization one-hot (8 slots, the last one is OTHER),
      * formal charge one-hot, shifted by +2 (5 slots),
      * default valence of the element one-hot (8 slots),
      * membership in rings of size 3..7 (5 flags),
      * number of attached hydrogens one-hot (6 slots).

    Values outside a group's range are clamped by ``one_hot_encoding``.

    Args:
        atom: RDKit atom object.

    Returns:
        1-D torch float tensor with 84 entries.

    Raises:
        RuntimeError: if the hydrogen count cannot be encoded; the original
            error is attached as the cause.
    """
    parts = []

    # Symbols outside the vocabulary go straight to the explicit 'Unknown' slot.
    # (Previously this relied on the overflow clamp, which produced the same
    # vector but emitted a misleading warning for every such atom.)
    symbol_slot = _ATOM_SYMBOL_INDEX.get(atom.GetSymbol(), _ATOM_SYMBOL_INDEX['Unknown'])
    parts.append(one_hot_encoding(symbol_slot, len(_PERMITTED_ATOM_SYMBOLS)))

    parts.append(one_hot_encoding(atom.GetTotalValence(), 8))

    parts.append(one_hot_encoding(atom.GetIsAromatic(), 1))

    # Hybridization states missing from the table share the OTHER slot.
    hybridization_slot = _HYBRIDIZATION_INDEX.get(
        atom.GetHybridization(), _HYBRIDIZATION_INDEX[Chem.HybridizationType.OTHER])
    parts.append(one_hot_encoding(hybridization_slot, 8))

    # Shift the formal charge so that the expected range [-2, 2] maps onto slots 0..4.
    parts.append(one_hot_encoding(atom.GetFormalCharge() + 2, 5))

    default_valence = Chem.GetPeriodicTable().GetDefaultValence(atom.GetAtomicNum())
    parts.append(one_hot_encoding(default_valence, 8))

    ring_size = [atom.IsInRingSize(r) for r in range(3, 8)]
    parts.append(torch.tensor(ring_size).type(torch.float))

    # Hydrogens = explicit H neighbours in the graph + explicit and implicit H counts on the atom.
    attached_H = np.sum([neighbour.GetAtomicNum() == 1 for neighbour in atom.GetNeighbors()], dtype=np.uint8)
    explicit = atom.GetNumExplicitHs()
    implicit = atom.GetNumImplicitHs()
    H_num = attached_H + explicit + implicit
    try:
        H_hot = one_hot_encoding(H_num, 6)
    except Exception as err:
        # Keep the offending counts in the message and chain the real cause,
        # instead of catching everything and hiding it.
        raise RuntimeError(
            f"Could not encode hydrogen count {H_num} "
            f"(neighbours={attached_H}, explicit={explicit}, implicit={implicit})"
        ) from err
    parts.append(H_hot)

    return torch.cat(parts, 0)




In [None]:
def get_bond_features(bond, use_stereochemistry = True):
    """
    Takes an RDKit bond object as input and gives a 1-D torch float tensor of bond features as output.

    Layout: bond order one-hot over (1.0, 1.5, 2.0, 3.0), a conjugation flag,
    an in-ring flag and, when ``use_stereochemistry`` is True, a stereo one-hot
    over (STEREOZ, STEREOE, STEREOANY, STEREONONE).

    NOTE(review): bond orders / stereo labels missing from these tables are
    handed to ``one_hot_encoding`` with an out-of-range position, so they end up
    in the last slot of their group (3.0 / STEREONONE) — confirm that is intended.
    """
    pieces = []

    # Bond order.
    bond_orders = (1.0, 1.5, 2.0, 3.0)
    order_lookup = {order: slot for slot, order in enumerate(bond_orders)}
    order_slot = order_lookup.get(bond.GetBondTypeAsDouble(), len(order_lookup))
    pieces.append(one_hot_encoding(order_slot, len(bond_orders)))

    # Single-slot boolean flags.
    pieces.append(one_hot_encoding(bond.GetIsConjugated(), 1))
    pieces.append(one_hot_encoding(bond.IsInRing(), 1))

    # Optional stereo configuration, keyed by the string form of the RDKit enum.
    if use_stereochemistry == True:
        stereo_labels = ("STEREOZ", "STEREOE", "STEREOANY", "STEREONONE")
        stereo_lookup = {name: slot for slot, name in enumerate(stereo_labels)}
        stereo_slot = stereo_lookup.get(str(bond.GetStereo()), len(stereo_lookup))
        pieces.append(one_hot_encoding(stereo_slot, len(stereo_labels)))

    return torch.cat(pieces, 0)

In [None]:
def spectrum_preparation(spectrum, intensity_power, output_size, operation):
    """Turn a spectrum into a fixed-length intensity vector used as the prediction target.

    Args:
        spectrum: object exposing ``peaks.mz`` and ``peaks.intensities`` of equal length.
        intensity_power: exponent applied when ``operation == "pow"``.
        output_size: number of bins in the returned vector.
        operation: ``"pow"`` raises intensities to ``intensity_power``, ``"log"``
            applies ``log(x + 1)``; anything else leaves them unchanged.

    Returns:
        torch tensor of shape ``(1, output_size)`` and dtype float64.

    Each peak is written to the bin obtained by truncating its m/z to an
    integer. Peaks at or beyond ``output_size`` are written to the last bin.
    A later peak overwrites an earlier one that falls in the same bin.
    """
    binned = torch.zeros(1, output_size)
    last_bin = output_size - 1

    peaks = spectrum.peaks
    for mz, height in zip(peaks.mz, peaks.intensities):
        target_bin = last_bin if mz >= output_size else int(mz)
        binned[0, target_bin] = height

    # Optional intensity transformation.
    if operation == "pow":
        binned = torch.pow(binned, intensity_power)
    elif operation == "log":
        binned = torch.log(binned + 1)

    return binned.type(torch.float64)

In [None]:
def create_pytorch_geometric_graph_data_list(nist_data, intensity_power, output_size, operation):
    """
    Convert spectra carrying SMILES metadata into labeled molecular graphs.

    Inputs:

    nist_data ... iterable of spectrum objects exposing get('smiles'), get('mw', None) and peaks
    intensity_power, output_size, operation ... forwarded unchanged to spectrum_preparation

    Outputs:

    data_list = [G_1, G_2, ...] ... a list of torch_geometric.data.Data objects with
        x                 node features, shape (n_nodes, n_node_features), float64
        edge_index        directed edges (both directions per bond), shape (2, n_edges), long
        edge_attr         bond features, shape (n_edges, n_edge_features), float
        molecular_weight  0-d integer tensor: rounded 'mw' metadata, or RDKit's exact weight if absent
        y                 binned spectrum returned by spectrum_preparation

    Entries without a SMILES string, or whose SMILES RDKit cannot parse, are skipped,
    so the output can be shorter than the input.
    """

    # The feature widths do not depend on the molecule, so probe them once
    # instead of re-deriving them for every spectrum.
    probe_mol = Chem.MolFromSmiles("O=O")
    n_node_features = len(get_atom_features(probe_mol.GetAtomWithIdx(0)))
    n_edge_features = len(get_bond_features(probe_mol.GetBondBetweenAtoms(0,1)))

    data_list = []

    for nist_obj in nist_data:

        # convert SMILES to RDKit mol object; missing/empty SMILES cannot be parsed
        smiles = nist_obj.get('smiles')
        if not smiles:
            continue

        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue

        # get feature dimensions (every bond contributes two directed edges)
        n_nodes = mol.GetNumAtoms()
        n_edges = 2*mol.GetNumBonds()

        # construct node feature matrix X of shape (n_nodes, n_node_features)
        X = np.zeros((n_nodes, n_node_features))

        for atom in mol.GetAtoms():
            X[atom.GetIdx(), :] = get_atom_features(atom)

        X = torch.tensor(X, dtype = torch.float64)

        # construct edge index array E of shape (2, n_edges)
        (rows, cols) = np.nonzero(GetAdjacencyMatrix(mol))

        torch_rows = torch.from_numpy(rows.astype(np.int64)).to(torch.long)
        torch_cols = torch.from_numpy(cols.astype(np.int64)).to(torch.long)
        E = torch.stack([torch_rows, torch_cols], dim = 0)

        # construct edge feature array EF of shape (n_edges, n_edge_features),
        # row k describing the bond behind the k-th column of E
        EF = np.zeros((n_edges, n_edge_features))

        for (k, (i,j)) in enumerate(zip(rows, cols)):
            EF[k] = get_bond_features(mol.GetBondBetweenAtoms(int(i),int(j)))

        EF = torch.tensor(EF, dtype = torch.float)

        # molecular weight: prefer the metadata value, otherwise compute it from the structure
        MW = nist_obj.get("mw", None)
        if MW is None:
            MW = Descriptors.ExactMolWt(mol)
        MW = torch.tensor(int(round(float(MW))))

        # construct label tensor
        y_tensor = spectrum_preparation(nist_obj, intensity_power, output_size, operation)

        # construct Pytorch Geometric data object and append to data list
        data_list.append(Data(x = X, edge_index = E, edge_attr = EF, molecular_weight = MW, y = y_tensor))

    return data_list

In [None]:
# Build the test-set graphs with untransformed intensities (operation "none").
data_list_test = create_pytorch_geometric_graph_data_list(nist_dataset_test, INTENSITY_POWER, OUTPUT_SIZE, "none")
# Preview only the first few graphs instead of dumping the whole list into the cell output.
data_list_test[:10]

[09:36:30] Explicit valence for atom # 0 C, 5, is greater than permitted
[09:36:54] Explicit valence for atom # 0 B, 5, is greater than permitted
[09:39:02] Explicit valence for atom # 8 Br, 5, is greater than permitted


[Data(x=[20, 84], edge_index=[2, 46], edge_attr=[46, 10], y=[1, 1000], molecular_weight=281),
 Data(x=[24, 84], edge_index=[2, 50], edge_attr=[50, 10], y=[1, 1000], molecular_weight=340),
 Data(x=[8, 84], edge_index=[2, 16], edge_attr=[16, 10], y=[1, 1000], molecular_weight=125),
 Data(x=[37, 84], edge_index=[2, 80], edge_attr=[80, 10], y=[1, 1000], molecular_weight=566),
 Data(x=[8, 84], edge_index=[2, 14], edge_attr=[14, 10], y=[1, 1000], molecular_weight=138),
 Data(x=[23, 84], edge_index=[2, 46], edge_attr=[46, 10], y=[1, 1000], molecular_weight=333),
 Data(x=[27, 84], edge_index=[2, 54], edge_attr=[54, 10], y=[1, 1000], molecular_weight=376),
 Data(x=[21, 84], edge_index=[2, 42], edge_attr=[42, 10], y=[1, 1000], molecular_weight=292),
 Data(x=[17, 84], edge_index=[2, 36], edge_attr=[36, 10], y=[1, 1000], molecular_weight=254),
 Data(x=[29, 84], edge_index=[2, 58], edge_attr=[58, 10], y=[1, 1000], molecular_weight=432),
 Data(x=[30, 84], edge_index=[2, 64], edge_attr=[64, 10], y=[1

In [None]:
# PREPROCESSED_FILE = "/content/drive/MyDrive/NIST/Preprocessed_test_none_preparation.output"
# with open(PREPROCESSED_FILE, 'wb') as fid:
#   pickle.dump(data_list_test, fid)
#   fid.close()

In [None]:
# Load previously preprocessed graph lists (log-transformed, judging by the file names).
# NOTE(review): this rebinds data_list_test, replacing the list built earlier with operation "none".
# NOTE(review): pickle.load can execute arbitrary code — only load files you created yourself.
with open("/content/drive/MyDrive/NIST/Preprocessed_test_log_preparation.output", 'rb') as handle:
   data_list_test  = pickle.load(handle)

with open("/content/drive/MyDrive/NIST/Preprocessed_train_log_preparation.output", 'rb') as handle:
   data_list_train  = pickle.load(handle)

In [None]:
len(data_list_test)

29037

In [None]:
len(data_list_train)

261259

In [None]:
# def mask_prediction_by_mass(total_mass, raw_prediction, index_shift):
#     # Zero out predictions to the right of the maximum possible mass.
#     # input
#     # anchor_indices: shape (,batch_size) = ex [3,4,5]
#     #     total_mass = Weights of whole molecule, not only fragment
#     # data: shape (batch_size, embedding), embedding from GNN in our case
#     # index_shift: int constant how far can heaviest fragment differ from weight of original molecule
#     #

#     total_mass = torch.round(total_mass).type(torch.int32)
#     indices = torch.arange(raw_prediction.shape[-1])[None, ...].to(device)

#     right_of_total_mass = indices > (
#             total_mass[..., None] +
#             index_shift)
#     return torch.where(right_of_total_mass, torch.zeros_like(raw_prediction),
#                         raw_prediction)

In [None]:
# def reverse_prediction(total_mass, raw_prediction, index_shift):
#     # reverse vector by anchor_indices and rest set to zero and make preproessing
#     # input
#     # total_mass: shape (,batch_size) = ex [3,4,5]
#     #     total_mass = Weights of whole molecule, not only fragment
#     # raw_prediction: shape (batch_size, embedding), embedding from GNN in our case
#     # index_shift: int constant how far can heaviest fragment differ from weight of original molecule
#     #     total_mass = feature_dict[fmap_constants.MOLECULE_WEIGHT][..., 0]

#     total_mass = torch.round(total_mass).type(torch.int32)
#     return scatter_by_anchor_indices(
#         total_mass, raw_prediction, index_shift)

In [None]:
# def scatter_by_anchor_indices(anchor_indices, data, index_shift):
#     # reverse vector by anchor_indices and rest set to zero
#     # input
#     # anchor_indices: shape (,batch_size) = ex [3,4,5]
#     #     total_mass = Weights of whole molecule, not only fragment
#     # data: shape (batch_size, embedding), embedding from GNN in our case
#     # index_shift: int constant how far can heaviest fragment differ from weight of original molecule

#     index_shift = index_shift
#     anchor_indices = anchor_indices
#     data = data.type(torch.float64)
#     batch_size = data.shape[0]

#     num_data_columns = data.shape[-1]
#     indices = torch.arange(num_data_columns)[None, ...].to(device)
#     shifted_indices = anchor_indices[..., None] - indices + index_shift
#     valid_indices = shifted_indices >= 0



#     batch_indices = torch.tile(
#           torch.arange(batch_size)[..., None], [1, num_data_columns]).to(device)
#     shifted_indices += batch_indices * num_data_columns

#     shifted_indices = torch.reshape(shifted_indices, [-1])
#     num_elements = data.shape[0] * data.shape[1]
#     row_indices = torch.arange(num_elements)
#     stacked_indices = torch.stack([row_indices, shifted_indices], axis=1)


#     lower_batch_boundaries = torch.reshape(batch_indices * num_data_columns, [-1])
#     upper_batch_boundaries = torch.reshape(((batch_indices + 1) * num_data_columns),
#                                           [-1])

#     valid_indices = torch.logical_and(shifted_indices >= lower_batch_boundaries,
#                                      shifted_indices < upper_batch_boundaries)

#     stacked_indices = stacked_indices[valid_indices]

#     # num_elements[..., np.newaxis] v tf aj ked je shape (), tak vies urbit data[]
#     # teraz to z napr. 6 da na [6]
#     dense_shape = torch.tile(torch.tensor(num_elements)[..., None], [2]).type(torch.int32).to(device)

#     scattering_matrix = torch.sparse.FloatTensor(stacked_indices.type(torch.int64).T,
#                                                  torch.ones_like(stacked_indices[:, 0]).type(torch.float64),
#                                                 dense_shape.tolist())

#     flattened_data = torch.reshape(data, [-1])[..., None]
#     flattened_output = torch.sparse.mm(scattering_matrix, flattened_data)
#     return torch.reshape(torch.transpose(flattened_output, 0, 1), [-1, num_data_columns])

In [None]:
# Model hyper-parameters. They must exist before the GCN cell that follows is
# executed, because GCN.__init__ reads NODE_FEATURES and embedding_size and
# GCN.forward reads MASS_SHIFT.
# Width of every graph-convolution layer.
embedding_size = 64
# Input width of the `out` layer declared by the bidirectional GCN variant.
embedding_in = 32
# Length of the per-atom feature vector produced by get_atom_features.
NODE_FEATURES = 84
# Number of bins above the molecular weight that stay unmasked.
MASS_SHIFT = 5


In [None]:
class GCN(torch.nn.Module):
    """Graph-convolution network predicting a binned spectrum from a molecular graph.

    Four GCNConv layers (each followed by ReLU) produce node embeddings, which
    are pooled per graph with max- and mean-pooling. A single linear head maps
    the pooled embedding to OUTPUT_SIZE bins. Bins beyond the molecular weight
    plus MASS_SHIFT are zeroed, and a final ReLU is applied.

    Reads the module-level NODE_FEATURES, embedding_size, OUTPUT_SIZE and
    MASS_SHIFT, and calls mask_prediction_by_mass.
    """

    def __init__(self):
        super().__init__()
        # Seed before any layer is created so the initial weights are reproducible.
        torch.manual_seed(42)

        # Graph-convolution stack.
        self.initial_conv = GCNConv(NODE_FEATURES, embedding_size)
        self.conv1 = GCNConv(embedding_size, embedding_size)
        self.conv2 = GCNConv(embedding_size, embedding_size)
        self.conv3 = GCNConv(embedding_size, embedding_size)

        # The pooled embedding is max-pool and mean-pool concatenated, hence twice the layer width.
        pooled_width = embedding_size*2
        self.forward_prediction = Linear(pooled_width, OUTPUT_SIZE)

    def forward(self, x, edge_index, total_mass, batch_index):
        """Return ``(prediction, graph_embedding)`` for a batch of graphs.

        Args:
            x: node feature matrix.
            edge_index: edge list passed to every GCNConv layer.
            total_mass: per-graph molecular weight used for masking.
            batch_index: graph id of every node, used by the pooling operators.
        """
        node_state = x
        for conv in (self.initial_conv, self.conv1, self.conv2, self.conv3):
            node_state = F.relu(conv(node_state, edge_index))

        # Global pooling (stack different aggregations).
        graph_embedding = torch.cat([gmp(node_state, batch_index),
                                     gap(node_state, batch_index)], dim=1)

        # Forward prediction, masked above the molecular weight.
        raw_spectrum = self.forward_prediction(graph_embedding)
        masked_spectrum = mask_prediction_by_mass(total_mass, raw_spectrum, MASS_SHIFT)

        return F.relu(masked_spectrum), graph_embedding

# Instantiate the model, make sure its checkpoint directory exists, and report its size.
# GCN() reads the module-level NODE_FEATURES and embedding_size, so those must
# already be defined when this runs.
MODEL_NAME = "GCN_basic_one_linear"
model = GCN()
MODEL_SAVE = os.path.join(BASE_DIRECTORY, MODEL_NAME)
os.makedirs(MODEL_SAVE, mode=0o777, exist_ok=True)
print(model)
print("Number of parameters: ", sum(p.numel() for p in model.parameters()))

GCN(
  (initial_conv): GCNConv(84, 64)
  (conv1): GCNConv(64, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (forward_prediction): Linear(in_features=128, out_features=1000, bias=True)
)
Number of parameters:  146920


In [None]:
# Hyper-parameters read by the GCN classes (layer widths, input feature count)
# and passed as index_shift to the mass-based masking helpers (MASS_SHIFT).
embedding_size = 64
embedding_in = 32
NODE_FEATURES = 84
MASS_SHIFT = 5


In [None]:
def mask_prediction_by_mass(total_mass, raw_prediction, index_shift):
    """Zero out predicted bins that lie above the molecule's mass plus a tolerance.

    Args:
        total_mass: tensor of shape (batch_size,) with the weight of the whole
            molecule (not of a fragment); rounded to the nearest integer bin.
        raw_prediction: tensor of shape (batch_size, n_bins).
        index_shift: int, how many bins above the molecular weight may still
            carry signal.

    Returns:
        float64 tensor shaped like ``raw_prediction`` in which every bin with
        index ``> total_mass + index_shift`` is set to zero.
    """
    data = raw_prediction.type(torch.float64)

    total_mass = torch.round(total_mass).type(torch.int64).to(data.device)
    # Build the bin indices on the prediction's own device instead of relying on
    # a module-level `device` variable being defined (and matching the inputs).
    indices = torch.arange(data.shape[-1], device=data.device)[None, ...]

    right_of_total_mass = indices > (total_mass[..., None] + index_shift)
    return torch.where(right_of_total_mass, torch.zeros_like(data), data)

In [None]:
#############################
# Same as the version above, but in PyTorch
#############################
def reverse_prediction(total_mass, raw_prediction, index_shift):
    """Reverse each prediction row around its molecular mass; the rest is set to zero.

    Args:
        total_mass: tensor of shape (batch_size,), e.g. [3, 4, 5]; weight of the
            whole molecule, not only of a fragment.
        raw_prediction: tensor of shape (batch_size, embedding), the embedding
            coming from the GNN in our case.
        index_shift: int constant, how far the heaviest fragment may differ
            from the weight of the original molecule.

    Returns:
        Whatever ``scatter_by_anchor_indices`` returns for the rounded masses.
    """
    # Round to whole mass units and use them as int32 anchor positions.
    anchor_bins = torch.round(total_mass).type(torch.int32)
    return scatter_by_anchor_indices(anchor_bins, raw_prediction, index_shift)

In [None]:
#############################
# Same as the version above, but in PyTorch
#############################
def scatter_by_anchor_indices(anchor_indices, data, index_shift):
    """Reverse every row of ``data`` around its anchor index, zero-filling the rest.

    For row ``b`` and output column ``j`` the result holds
    ``data[b, anchor_indices[b] + index_shift - j]`` when that source column
    exists (``0 <= source < n_columns``) and ``0`` otherwise.

    Args:
        anchor_indices: integer tensor of shape (batch_size,), e.g. [3, 4, 5];
            the (rounded) weight of the whole molecule.
        data: tensor of shape (batch_size, n_columns), the embedding coming from
            the GNN in our case.
        index_shift: int constant, how far the heaviest fragment may differ
            from the weight of the original molecule.

    Returns:
        float64 tensor of shape (batch_size, n_columns) on ``data``'s device.

    The result matches the earlier sparse-matrix formulation (a 0/1 matrix of
    size (batch*n_columns)^2 multiplied with the flattened data). It is computed
    with a single ``torch.gather``, so it no longer depends on a module-level
    ``device`` variable or on the legacy ``torch.sparse.FloatTensor`` constructor.
    """
    data = data.type(torch.float64)
    num_data_columns = data.shape[-1]

    anchors = anchor_indices.to(device=data.device, dtype=torch.int64)
    columns = torch.arange(num_data_columns, device=data.device)[None, ...]

    # Source column that feeds each output column.
    source_columns = anchors[..., None] - columns + index_shift
    in_range = torch.logical_and(source_columns >= 0,
                                 source_columns < num_data_columns)

    # Clamp only to keep gather's indices legal; out-of-range slots are zeroed below.
    gathered = torch.gather(data, 1, source_columns.clamp(0, num_data_columns - 1))
    return torch.where(in_range, gathered, torch.zeros_like(gathered))

In [None]:
embedding_size = 64
embedding_in = 32

class GCN(torch.nn.Module):
    """Bidirectional GCN spectrum predictor with a learned gate.

    Four GCNConv layers (each followed by ReLU) are pooled per graph with max-
    and mean-pooling. From the pooled embedding three linear heads produce:
      * a forward prediction, masked above molecular weight + MASS_SHIFT,
      * a backward prediction, reversed around the molecular weight,
      * a sigmoid gate that blends the two per bin.

    NOTE(review): this class reuses the name ``GCN`` of the single-head model
    defined earlier in the notebook, so whichever cell ran last wins, and
    checkpoints of one variant do not load into the other.
    """

    def __init__(self):
        # Init parent
        super(GCN, self).__init__()
        # Seed before any layer is created so the initial weights are reproducible.
        torch.manual_seed(42)

        # GCN layers
        self.initial_conv = GCNConv(NODE_FEATURES, embedding_size)
        self.conv1 = GCNConv(embedding_size, embedding_size)
        self.conv2 = GCNConv(embedding_size, embedding_size)
        self.conv3 = GCNConv(embedding_size, embedding_size)

        # Heads operate on the pooled embedding (max-pool ++ mean-pool => 2 * embedding_size).
        self.forward_prediction = Linear(embedding_size*2, OUTPUT_SIZE)
        self.backward_prediction = Linear(embedding_size*2, OUTPUT_SIZE)
        self.gate = Linear(embedding_size*2, OUTPUT_SIZE)

        # Output layer
        # NOTE(review): never called in forward(); kept so parameter names and
        # counts of existing checkpoints stay unchanged.
        self.out = Linear(embedding_in, OUTPUT_SIZE)

    def forward(self, x, edge_index, total_mass, batch_index):
        """Return ``(prediction, pooled_embedding)`` for a batch of graphs.

        Args:
            x: node feature matrix.
            edge_index: edge list passed to every GCNConv layer.
            total_mass: per-graph molecular weight used for masking/reversal.
            batch_index: graph id of every node, used by the pooling operators.
        """
        # First Conv layer
        hidden = self.initial_conv(x, edge_index)
        hidden = F.relu(hidden)

        # Other Conv layers
        hidden = self.conv1(hidden, edge_index)
        hidden = F.relu(hidden)
        hidden = self.conv2(hidden, edge_index)
        hidden = F.relu(hidden)
        hidden = self.conv3(hidden, edge_index)
        hidden = F.relu(hidden)

        # Global Pooling (stack different aggregations)
        hidden = torch.cat([gmp(hidden, batch_index),
                            gap(hidden, batch_index)], dim=1)

        # Bidirectional layer
        # Forward prediction
        forward_prediction_hidden = self.forward_prediction(hidden)
        forward_prediction_hidden = mask_prediction_by_mass(total_mass, forward_prediction_hidden, MASS_SHIFT)

        # Backward prediction
        backward_prediction_hidden = self.backward_prediction(hidden)
        backward_prediction_hidden = reverse_prediction(total_mass, backward_prediction_hidden, MASS_SHIFT)

        # Gate (torch.sigmoid replaces the deprecated F.sigmoid; same values)
        gate_hidden = self.gate(hidden)
        gate_hidden = torch.sigmoid(gate_hidden)

        # Blend the two directions per bin and clip negatives.
        out = gate_hidden * forward_prediction_hidden + (1. - gate_hidden) * backward_prediction_hidden
        out = F.relu(out)

        return out, hidden

# Instantiate the model, make sure its checkpoint directory exists, and report its size.
# NOTE(review): MODEL_NAME is the same string used for the single-head model earlier
# in the notebook, so both variants write checkpoints into the same directory.
MODEL_NAME = "GCN_basic_one_linear"
model = GCN()
MODEL_SAVE = os.path.join(BASE_DIRECTORY, MODEL_NAME)
os.makedirs(MODEL_SAVE, mode=0o777, exist_ok=True)
print(model)
print("Number of parameters: ", sum(p.numel() for p in model.parameters()))

GCN(
  (initial_conv): GCNConv(84, 64)
  (conv1): GCNConv(64, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (forward_prediction): Linear(in_features=128, out_features=1000, bias=True)
  (backward_prediction): Linear(in_features=128, out_features=1000, bias=True)
  (gate): Linear(in_features=128, out_features=1000, bias=True)
  (out): Linear(in_features=32, out_features=1000, bias=True)
)
Number of parameters:  437920


In [None]:
# PyG moved its DataLoader to torch_geometric.loader; the torch_geometric.data
# alias is deprecated.
from torch_geometric.loader import DataLoader
import warnings
# NOTE(review): this silences every warning, including the clamping warnings
# raised by one_hot_encoding.
warnings.filterwarnings("ignore")

####################################
# TODO (from the original banner): try a Huber loss; plain MSE is used for now.
####################################

# Mean squared error between predicted and reference spectra
loss_fn = torch.nn.MSELoss()

# Use GPU for training when available; build the optimizer once the model is on its device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# `data_list` is not assigned anywhere else in the visible notebook (it only existed
# as leftover kernel state). Bind it explicitly to the preprocessed training graphs.
# NOTE(review): confirm data_list_train is the intended training set.
data_list = data_list_train

# Wrap data in data loaders
data_size = len(data_list)
NUM_GRAPHS_PER_BATCH = 64
loader = DataLoader(data_list,
                    batch_size=NUM_GRAPHS_PER_BATCH, shuffle=True)
# Evaluate on the held-out test graphs rather than on a slice of the training
# data, which the training loader already covers in full.
test_loader = DataLoader(data_list_test,
                         batch_size=NUM_GRAPHS_PER_BATCH, shuffle=False)

def train(data, number_of_epoch, save_every_x_epoch, start_epoch=181):
    """Train the module-level ``model`` and checkpoint it periodically.

    Uses the module-level ``loader``, ``model``, ``optimizer``, ``loss_fn``,
    ``device`` and ``MODEL_SAVE``.

    Args:
        data: unused; kept for backward compatibility. Batches come from the
            module-level ``loader``.
        number_of_epoch: exclusive upper bound of the epoch counter.
        save_every_x_epoch: a checkpoint and the loss history are written
            whenever ``epoch % save_every_x_epoch == 0``.
        start_epoch: first epoch number (defaults to 181, i.e. resuming after
            the epoch-180 checkpoint).

    Returns:
        ``(losses, embedding)``: ``losses`` holds one float per epoch, the loss
        of that epoch's last batch. ``embedding`` is the pooled embedding of the
        last processed batch, or ``None`` if no epoch ran.

    Raises:
        ValueError: if ``loader`` yields no batches.
    """
    print("Starting training...")
    losses = []
    embedding = None
    for epoch in range(start_epoch, number_of_epoch):
        loss = None
        for batch in loader:
            # Use GPU
            batch = batch.to(device)
            # Reset gradients
            optimizer.zero_grad()
            # Passing the node features and the connection info
            pred, embedding = model(batch.x.float(), batch.edge_index, batch.molecular_weight, batch.batch)
            # Calculating the loss and gradients
            loss = loss_fn(pred, batch.y)
            loss.backward()
            # Update using the gradients
            optimizer.step()

        if loss is None:
            raise ValueError("loader yielded no batches")

        # Keep plain floats: they are cheap to pickle and do not pin device memory.
        epoch_loss = loss.item()
        losses.append(epoch_loss)

        # Save once per qualifying epoch, after its last batch. (Saving inside
        # the batch loop rewrote the same files on every single batch.)
        if epoch % save_every_x_epoch == 0:
            SAVE_PATH = f"{epoch}.pt"

            # Save model
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,
                'metadata': {"loss" : "MSELoss", "Dataset": "Log_preprocessing"}
            }, os.path.join(MODEL_SAVE, SAVE_PATH))

            LOSS_FILE = f"all_loss_until_{epoch}.output"
            with open(os.path.join(MODEL_SAVE, LOSS_FILE), 'wb') as fid:
                pickle.dump(losses, fid)

        print(f"Epoch {epoch} | Train Loss {epoch_loss}")
    return losses, embedding

# print("Starting training...")
# losses = []
# for epoch in range(3000):
#     loss, h = train(data_list)
#     losses.append(loss)
#     if epoch % 100 == 0:
#         print(f"Epoch {epoch} | Train Loss {loss}")

In [None]:
# Make sure the checkpoint directory exists, then train up to (but excluding) epoch 2000,
# writing a checkpoint whenever the epoch number is a multiple of 30.
# NOTE(review): the first argument is not used by train(); batches come from the module-level loader.
MODEL_SAVE = os.path.join(BASE_DIRECTORY, MODEL_NAME)
os.makedirs(MODEL_SAVE, mode=0o777, exist_ok=True)
train(data_list, 2000, 30)

Starting training...
Epoch 181 | Train Loss 3.804481890552652
Epoch 182 | Train Loss 3.711095698318558
Epoch 183 | Train Loss 3.851199777471343


KeyboardInterrupt: ignored

In [None]:
# Number of mini-batches per epoch; no need to iterate the loader just to count them.
len(loader)

NameError: ignored

# Try to train again


In [None]:
# Rebuild model and optimizer, then restore both from the epoch-180 checkpoint.
model = GCN()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# map_location="cpu" lets a checkpoint written on a GPU runtime load on any machine;
# the freshly built model lives on the CPU at this point anyway.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
checkpoint = torch.load("/content/drive/MyDrive/NIST/GCN_basic_one_linear/180.pt", map_location="cpu")
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# epoch = checkpoint['epoch']
# loss = checkpoint['loss']


model.train()

GCN(
  (initial_conv): GCNConv(84, 64)
  (conv1): GCNConv(64, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (forward_prediction): Linear(in_features=128, out_features=1000, bias=True)
)

In [None]:
# PyG moved its DataLoader to torch_geometric.loader; the torch_geometric.data
# alias is deprecated.
from torch_geometric.loader import DataLoader
import warnings
# NOTE(review): this silences every warning, including the clamping warnings
# raised by one_hot_encoding.
warnings.filterwarnings("ignore")

####################################
# TODO (from the original banner): try a Huber loss; plain MSE is used for now.
####################################

# Mean squared error between predicted and reference spectra
loss_fn = torch.nn.MSELoss()

# Use GPU for training when available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Creating a fresh optimizer throws away the Adam state restored from the
# checkpoint, so reload it now that the parameters live on their final device.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# `data_list` is not assigned anywhere else in the visible notebook (it only existed
# as leftover kernel state). Bind it explicitly to the preprocessed training graphs.
# NOTE(review): confirm data_list_train is the intended training set.
data_list = data_list_train

# Wrap data in data loaders
data_size = len(data_list)
NUM_GRAPHS_PER_BATCH = 64
loader = DataLoader(data_list,
                    batch_size=NUM_GRAPHS_PER_BATCH, shuffle=True)
# Evaluate on the held-out test graphs rather than on a slice of the training
# data, which the training loader already covers in full.
test_loader = DataLoader(data_list_test,
                         batch_size=NUM_GRAPHS_PER_BATCH, shuffle=False)

def train(data, number_of_epoch, save_every_x_epoch, start_epoch=181):
    """Train the module-level ``model`` and checkpoint it periodically.

    Uses the module-level ``loader``, ``model``, ``optimizer``, ``loss_fn``,
    ``device`` and ``MODEL_SAVE``.

    Args:
        data: unused; kept for backward compatibility. Batches come from the
            module-level ``loader``.
        number_of_epoch: exclusive upper bound of the epoch counter.
        save_every_x_epoch: a checkpoint and the loss history are written
            whenever ``epoch % save_every_x_epoch == 0``.
        start_epoch: first epoch number (defaults to 181, i.e. resuming after
            the epoch-180 checkpoint).

    Returns:
        ``(losses, embedding)``: ``losses`` holds one float per epoch, the loss
        of that epoch's last batch. ``embedding`` is the pooled embedding of the
        last processed batch, or ``None`` if no epoch ran.

    Raises:
        ValueError: if ``loader`` yields no batches.
    """
    print("Starting training...")
    losses = []
    embedding = None
    for epoch in range(start_epoch, number_of_epoch):
        loss = None
        for batch in loader:
            # Use GPU
            batch = batch.to(device)
            # Reset gradients
            optimizer.zero_grad()
            # Passing the node features and the connection info
            pred, embedding = model(batch.x.float(), batch.edge_index, batch.molecular_weight, batch.batch)
            # Calculating the loss and gradients
            loss = loss_fn(pred, batch.y)
            loss.backward()
            # Update using the gradients
            optimizer.step()

        if loss is None:
            raise ValueError("loader yielded no batches")

        # Keep plain floats: they are cheap to pickle and do not pin device memory.
        epoch_loss = loss.item()
        losses.append(epoch_loss)

        # Save once per qualifying epoch, after its last batch. (Saving inside
        # the batch loop rewrote the same files on every single batch.)
        if epoch % save_every_x_epoch == 0:
            SAVE_PATH = f"{epoch}.pt"

            # Save model
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,
                'metadata': {"loss" : "MSELoss", "Dataset": "Log_preprocessing"}
            }, os.path.join(MODEL_SAVE, SAVE_PATH))

            LOSS_FILE = f"all_loss_until_{epoch}.output"
            with open(os.path.join(MODEL_SAVE, LOSS_FILE), 'wb') as fid:
                pickle.dump(losses, fid)

        print(f"Epoch {epoch} | Train Loss {epoch_loss}")
    return losses, embedding

# print("Starting training...")
# losses = []
# for epoch in range(3000):
#     loss, h = train(data_list)
#     losses.append(loss)
#     if epoch % 100 == 0:
#         print(f"Epoch {epoch} | Train Loss {loss}")

In [None]:
# Make sure the checkpoint directory exists, then continue training up to (but excluding)
# epoch 2000, writing a checkpoint whenever the epoch number is a multiple of 30.
# NOTE(review): the first argument is not used by train(); batches come from the module-level loader.
MODEL_SAVE = os.path.join(BASE_DIRECTORY, MODEL_NAME)
os.makedirs(MODEL_SAVE, mode=0o777, exist_ok=True)
train(data_list, 2000, 30)

Starting training...
Epoch 181 | Train Loss 3.7620364609912857
Epoch 182 | Train Loss 3.776804240853982
Epoch 183 | Train Loss 2.662681733435435
Epoch 184 | Train Loss 3.098810806938966
Epoch 185 | Train Loss 3.4932250500795714
Epoch 186 | Train Loss 3.0544433372894457
Epoch 187 | Train Loss 3.2501388201473174
Epoch 188 | Train Loss 3.034903836937095
Epoch 189 | Train Loss 4.495488702419223
Epoch 190 | Train Loss 3.873435588553832
Epoch 191 | Train Loss 2.905303597117914
Epoch 192 | Train Loss 3.0855139032998022
Epoch 193 | Train Loss 3.7683918173453987
Epoch 194 | Train Loss 3.7403965920300553
Epoch 195 | Train Loss 3.6374534615000016
Epoch 196 | Train Loss 3.123615187123527
Epoch 197 | Train Loss 3.100494321056385
Epoch 198 | Train Loss 2.736623381811416
Epoch 199 | Train Loss 4.111589386510741
Epoch 200 | Train Loss 2.2982570274911325
Epoch 201 | Train Loss 2.8252158070077034
Epoch 202 | Train Loss 3.2971253897091484
Epoch 203 | Train Loss 3.3466927633612773
Epoch 204 | Train Loss 3