# Installing dependencies

In [1]:
import subprocess

def install_dependencies(commands=None):
    """Run each install command in turn and report whether it succeeded.

    Args:
        commands: Optional list of command strings. Each string is split on
            whitespace and executed without a shell, so individual arguments
            must not contain spaces. When omitted, the notebook's default
            apt/pip install list is used.

    Returns:
        list: The command strings that failed, either because the process
        exited with a non-zero status or because it could not be started.
        The list is empty when every command succeeded.
    """
    if commands is None:
        commands = [
            "sudo apt install unzip -y",
            "pip install gdown",
            "pip install torch",
            "pip install torch-geometric",
            "pip install numpy",
            "pip install pandas",
            "pip install scikit-learn",
            "pip install xxhash",
            "pip install pyarrow",
            "pip install tensorflow"
        ]

    failed = []
    for command in commands:
        try:
            # Capture stderr too: with only stdout piped the error stream is
            # never handed back, so failures would be reported as "Output".
            result = subprocess.run(command.split(), capture_output=True, text=True)
        except OSError as exc:
            # The executable itself could not be launched (e.g. not installed).
            print(f"Error occurred: {exc}")
            failed.append(command)
            continue

        # The exit status decides success; stderr text alone does not, because
        # a command may write warnings to stderr and still succeed.
        if result.returncode != 0:
            print(f"Error occurred: {result.stderr}")
            failed.append(command)
        else:
            print(f"Output: {result.stdout}")

    return failed

In [2]:
import torch
import torch.nn.functional as F
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from torch_geometric.data import Data
import xxhash

## Loading edge files with a percentile threshold on the edge weights. A higher percentile keeps only the stronger relations. This parameter can be adjusted to control the strength of the trends that we want to predict for the future.

In [3]:
import pandas as pd

def load_data(year, data_dir, percentile=0.9):
    """Read one year's edge and node tables and keep only the heaviest edges.

    Args:
        year: Year used both as the sub-directory name and the file prefix.
        data_dir: Root location that holds one sub-directory per year.
        percentile: Quantile of the ``weight`` column (as understood by
            ``Series.quantile``) used as the inclusive cut-off.

    Returns:
        tuple: ``(filtered_edges, nodes)`` where ``filtered_edges`` contains
        the edge rows whose weight is at or above the cut-off and ``nodes`` is
        the unfiltered node table.
    """
    edge_table = pd.read_parquet(f'{data_dir}/{year}/{year}_edges.parquet', engine='pyarrow')
    node_table = pd.read_parquet(f'{data_dir}/{year}/{year}_nodes.parquet', engine='pyarrow')

    # Inclusive comparison: rows sitting exactly on the cut-off are kept.
    cutoff = edge_table['weight'].quantile(percentile)
    is_strong = edge_table['weight'].ge(cutoff)
    return edge_table.loc[is_strong], node_table

data_dir = "gs://datasets-dev-ded86f66/benchmarks/scientific_trend_prediction/new_parquet_data"
years = range(1980, 2024)

# Build the node universe shared by every yearly graph:
#   all_node_ids - every node ID that appears in any year's node table
#   id_to_label  - node ID -> label; when an ID occurs in several years the
#                  label from the latest year wins (later updates overwrite).
all_node_ids = set()
id_to_label = {}
for year in years:
    # Only the node table is needed here, so it is read directly (same path
    # layout as load_data) instead of also fetching and filtering the edges.
    year_nodes = pd.read_parquet(
        f'{data_dir}/{year}/{year}_nodes.parquet',
        engine='pyarrow',
        columns=['node_id', 'node_label'],
    )
    year_ids = year_nodes['node_id'].tolist()
    all_node_ids.update(year_ids)
    id_to_label.update(zip(year_ids, year_nodes['node_label'].tolist()))

# Constructing temporal graph sequences

In [4]:
import numpy as np
import torch
from torch_geometric.data import Data
import networkx as nx

def featurizer(edges, node_ids, id_to_label):
    """Build per-node features counting outgoing edges by destination label.

    Args:
        edges: Table-like object whose ``'source_id'`` and ``'destination_id'``
            entries are iterated in parallel, one pair per edge.
        node_ids: Sequence of node IDs; row ``i`` of the result describes
            ``node_ids[i]``.
        id_to_label: Mapping from node ID to one of ``'phenotype'``,
            ``'gene'`` or ``'compound'``.

    Returns:
        torch.Tensor: Float tensor of shape ``(len(node_ids), 3)``. The
        columns hold the number of outgoing edges that end in a phenotype,
        gene and compound node respectively.

    Raises:
        KeyError: If an edge source is not in ``node_ids``, a destination is
            not in ``id_to_label``, or a label is not one of the three above.
    """
    label_order = ['phenotype', 'gene', 'compound']
    column_of = {label: column for column, label in enumerate(label_order)}

    # One running tally per node, laid out in label_order.
    tallies = {node: [0] * len(label_order) for node in node_ids}
    for source, target in zip(edges['source_id'], edges['destination_id']):
        column = column_of[id_to_label[target]]
        tallies[source][column] += 1

    features = np.zeros((len(node_ids), 3), dtype=float)
    for row, node in enumerate(node_ids):
        features[row] = tallies[node]

    return torch.tensor(features, dtype=torch.float)

In [5]:
# Sort the IDs so that the node -> row assignment is identical on every run;
# iterating a set directly gives no such guarantee (e.g. for string IDs).
node_ids = sorted(all_node_ids)
node_id_to_index = {node_id: idx for idx, node_id in enumerate(node_ids)}

# One graph per year, all sharing the same node indexing.
graphs = []

for year in years:
    edges, _ = load_data(year, data_dir)
    node_feature = featurizer(edges, node_ids, id_to_label)

    # Translate node IDs into row positions of `node_feature`.
    edge_index = np.array([edges['source_id'].map(node_id_to_index).values,
                           edges['destination_id'].map(node_id_to_index).values])
    edge_index = torch.tensor(edge_index, dtype=torch.long)
    edge_weights = torch.tensor(edges['weight'].values, dtype=torch.float)

    # The edge weights are stored both as edge attributes and as the initial
    # target `y`.
    g = Data(x=node_feature, edge_index=edge_index, edge_attr=edge_weights, y=edge_weights)
    graphs.append(g)

In [6]:
# Sanity check: the loop that fills `graphs` appends one graph per entry in `years`.
print(f"Number of graphs: {len(graphs)}")

Number of graphs: 44


# GNN-LSTM Layer Implementation

In [7]:
from torch.nn import Parameter
from torch_geometric.nn import ChebConv
from torch_geometric.nn.inits import glorot, zeros

class GConvLSTM(torch.nn.Module):
    r"""An implementation of the Chebyshev Graph Convolutional Long Short Term Memory
    Cell. For details see this paper: `"Structured Sequence Modeling with Graph
    Convolutional Recurrent Networks." <https://arxiv.org/abs/1612.07659>`_

    Each gate combines a Chebyshev convolution of the input ``X`` with one of
    the previous hidden state ``H``; the input, forget and output gates also
    add an element-wise ("peephole") term on the cell state ``C``.

    Args:
        in_channels (int): Number of input features.
        out_channels (int): Number of output features.
        K (int): Chebyshev filter size :math:`K`.
        normalization (str, optional): The normalization scheme for the graph
            Laplacian (default: :obj:`"sym"`):

            1. :obj:`None`: No normalization
            :math:`\mathbf{L} = \mathbf{D} - \mathbf{A}`

            2. :obj:`"sym"`: Symmetric normalization
            :math:`\mathbf{L} = \mathbf{I} - \mathbf{D}^{-1/2} \mathbf{A}
            \mathbf{D}^{-1/2}`

            3. :obj:`"rw"`: Random-walk normalization
            :math:`\mathbf{L} = \mathbf{I} - \mathbf{D}^{-1} \mathbf{A}`

            You need to pass :obj:`lambda_max` to the :meth:`forward` method of
            this operator in case the normalization is non-symmetric.
            :obj:`\lambda_max` should be a :class:`torch.Tensor` of size
            :obj:`[num_graphs]` in a mini-batch scenario and a
            scalar/zero-dimensional tensor when operating on single graphs.
            You can pre-compute :obj:`lambda_max` via the
            :class:`torch_geometric.transforms.LaplacianLambdaMax` transform.
        bias (bool, optional): If set to :obj:`False`, the layer will not learn
            an additive bias. (default: :obj:`True`)
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        K: int,
        normalization: str = "sym",
        bias: bool = True,
    ):
        super(GConvLSTM, self).__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.K = K
        self.normalization = normalization
        self.bias = bias
        self._create_parameters_and_layers()
        self._set_parameters()

    # ------------------------------------------------------------------
    # Construction helpers. Attribute names and creation order are kept
    # stable because they determine the state_dict keys and the order in
    # which random initial weights are drawn.
    # ------------------------------------------------------------------

    def _new_conv(self, in_channels):
        """Build one Chebyshev convolution from ``in_channels`` to ``out_channels``."""
        return ChebConv(
            in_channels=in_channels,
            out_channels=self.out_channels,
            K=self.K,
            normalization=self.normalization,
            bias=self.bias,
        )

    def _new_gate_parameter(self):
        """Allocate an uninitialised ``(1, out_channels)`` parameter.

        The values are filled in by :meth:`_set_parameters`.
        """
        return Parameter(torch.empty(1, self.out_channels))

    def _create_input_gate_parameters_and_layers(self):
        """Create the input-gate convolutions, peephole weight and bias."""
        self.conv_x_i = self._new_conv(self.in_channels)
        self.conv_h_i = self._new_conv(self.out_channels)
        self.w_c_i = self._new_gate_parameter()
        self.b_i = self._new_gate_parameter()

    def _create_forget_gate_parameters_and_layers(self):
        """Create the forget-gate convolutions, peephole weight and bias."""
        self.conv_x_f = self._new_conv(self.in_channels)
        self.conv_h_f = self._new_conv(self.out_channels)
        self.w_c_f = self._new_gate_parameter()
        self.b_f = self._new_gate_parameter()

    def _create_cell_state_parameters_and_layers(self):
        """Create the candidate-cell convolutions and bias (no peephole term)."""
        self.conv_x_c = self._new_conv(self.in_channels)
        self.conv_h_c = self._new_conv(self.out_channels)
        self.b_c = self._new_gate_parameter()

    def _create_output_gate_parameters_and_layers(self):
        """Create the output-gate convolutions, peephole weight and bias."""
        self.conv_x_o = self._new_conv(self.in_channels)
        self.conv_h_o = self._new_conv(self.out_channels)
        self.w_c_o = self._new_gate_parameter()
        self.b_o = self._new_gate_parameter()

    def _create_parameters_and_layers(self):
        """Create all gates in the order input, forget, cell, output."""
        self._create_input_gate_parameters_and_layers()
        self._create_forget_gate_parameters_and_layers()
        self._create_cell_state_parameters_and_layers()
        self._create_output_gate_parameters_and_layers()

    def _set_parameters(self):
        """Glorot-initialise the peephole weights and zero the gate biases."""
        glorot(self.w_c_i)
        glorot(self.w_c_f)
        glorot(self.w_c_o)
        zeros(self.b_i)
        zeros(self.b_f)
        zeros(self.b_c)
        zeros(self.b_o)

    # ------------------------------------------------------------------
    # State initialisation
    # ------------------------------------------------------------------

    def _zero_state(self, X):
        """Return an all-zero ``(num_nodes, out_channels)`` state matching ``X``.

        The state takes its dtype and device from ``X`` so that the cell also
        works after ``.double()``/``.half()`` or when moved to another device.
        """
        return torch.zeros(
            X.shape[0], self.out_channels, dtype=X.dtype, device=X.device
        )

    def _set_hidden_state(self, X, H):
        """Return ``H``, or a zero hidden state when ``H`` is ``None``."""
        if H is None:
            H = self._zero_state(X)
        return H

    def _set_cell_state(self, X, C):
        """Return ``C``, or a zero cell state when ``C`` is ``None``."""
        if C is None:
            C = self._zero_state(X)
        return C

    # ------------------------------------------------------------------
    # Gate computations
    # ------------------------------------------------------------------

    def _calculate_input_gate(self, X, edge_index, edge_weight, H, C, lambda_max):
        """Input gate: ``sigmoid(conv_x(X) + conv_h(H) + w_c_i * C + b_i)``."""
        I = self.conv_x_i(X, edge_index, edge_weight, lambda_max=lambda_max)
        I = I + self.conv_h_i(H, edge_index, edge_weight, lambda_max=lambda_max)
        I = I + (self.w_c_i * C)
        I = I + self.b_i
        return torch.sigmoid(I)

    def _calculate_forget_gate(self, X, edge_index, edge_weight, H, C, lambda_max):
        """Forget gate: ``sigmoid(conv_x(X) + conv_h(H) + w_c_f * C + b_f)``."""
        F = self.conv_x_f(X, edge_index, edge_weight, lambda_max=lambda_max)
        F = F + self.conv_h_f(H, edge_index, edge_weight, lambda_max=lambda_max)
        F = F + (self.w_c_f * C)
        F = F + self.b_f
        return torch.sigmoid(F)

    def _calculate_cell_state(self, X, edge_index, edge_weight, H, C, I, F, lambda_max):
        """New cell state: ``F * C + I * tanh(conv_x(X) + conv_h(H) + b_c)``."""
        T = self.conv_x_c(X, edge_index, edge_weight, lambda_max=lambda_max)
        T = T + self.conv_h_c(H, edge_index, edge_weight, lambda_max=lambda_max)
        T = T + self.b_c
        T = torch.tanh(T)
        return F * C + I * T

    def _calculate_output_gate(self, X, edge_index, edge_weight, H, C, lambda_max):
        """Output gate: ``sigmoid(conv_x(X) + conv_h(H) + w_c_o * C + b_o)``.

        ``C`` is whatever the caller passes; :meth:`forward` passes the cell
        state that has already been updated for this step.
        """
        O = self.conv_x_o(X, edge_index, edge_weight, lambda_max=lambda_max)
        O = O + self.conv_h_o(H, edge_index, edge_weight, lambda_max=lambda_max)
        O = O + (self.w_c_o * C)
        O = O + self.b_o
        return torch.sigmoid(O)

    def _calculate_hidden_state(self, O, C):
        """New hidden state: ``O * tanh(C)``."""
        return O * torch.tanh(C)

    def forward(
        self,
        X: torch.FloatTensor,
        edge_index: torch.LongTensor,
        edge_weight: torch.FloatTensor = None,
        H: torch.FloatTensor = None,
        C: torch.FloatTensor = None,
        lambda_max: torch.Tensor = None,
    ) -> tuple:
        """
        Making a forward pass. If edge weights are not present the forward pass
        defaults to an unweighted graph. If the hidden state and cell state
        matrices are not present when the forward pass is called these are
        initialized with zeros (using the dtype and device of ``X``).

        Arg types:
            * **X** *(PyTorch Float Tensor)* - Node features.
            * **edge_index** *(PyTorch Long Tensor)* - Graph edge indices.
            * **edge_weight** *(PyTorch Float Tensor, optional)* - Edge weight vector.
            * **H** *(PyTorch Float Tensor, optional)* - Hidden state matrix for all nodes.
            * **C** *(PyTorch Float Tensor, optional)* - Cell state matrix for all nodes.
            * **lambda_max** *(PyTorch Tensor, optional but mandatory if normalization is not sym)* - Largest eigenvalue of Laplacian.

        Return types (as the tuple ``(H, C)``):
            * **H** *(PyTorch Float Tensor)* - Hidden state matrix for all nodes.
            * **C** *(PyTorch Float Tensor)* - Cell state matrix for all nodes.
        """
        H = self._set_hidden_state(X, H)
        C = self._set_cell_state(X, C)
        # Input and forget gates read the previous cell state ...
        I = self._calculate_input_gate(X, edge_index, edge_weight, H, C, lambda_max)
        F = self._calculate_forget_gate(X, edge_index, edge_weight, H, C, lambda_max)
        C = self._calculate_cell_state(X, edge_index, edge_weight, H, C, I, F, lambda_max)
        # ... while the output gate reads the freshly updated one.
        O = self._calculate_output_gate(X, edge_index, edge_weight, H, C, lambda_max)
        H = self._calculate_hidden_state(O, C)
        return H, C

# Temporal Link Predictor Architecture

In [8]:
class TemporalGNN(torch.nn.Module):
    """Recurrent graph encoder with an MLP head that scores node pairs.

    A ``GConvLSTM`` cell is unrolled over a sequence of graph snapshots; the
    final hidden state is turned into per-node embeddings, and ``edge_mlp``
    maps a concatenated (source, destination) embedding pair to a score in
    ``[0, 1]``.

    Args:
        num_nodes: Accepted for interface compatibility; not used by the model.
        node_features: Number of input features per node.
        hidden_channels: Width of the recurrent state and of the MLP hidden layer.
        output_channels: Width of the node embeddings returned by ``forward``.
    """

    def __init__(self, num_nodes, node_features, hidden_channels, output_channels):
        super(TemporalGNN, self).__init__()
        # Chebyshev filter size is fixed at 3.
        self.recurrent = GConvLSTM(node_features, hidden_channels, 3)
        self.linear = torch.nn.Linear(hidden_channels, output_channels)
        self.edge_mlp = torch.nn.Sequential(
                torch.nn.Linear(2 * output_channels, hidden_channels),
                torch.nn.ReLU(),
                torch.nn.Linear(hidden_channels, 1)
            )

    def forward(self, seq):
        """Encode a sequence of snapshots into per-node embeddings.

        Args:
            seq: Non-empty sequence of graph objects exposing ``x``,
                ``edge_index`` and ``edge_attr``; processed in order.

        Returns:
            Tensor with one row per node and ``output_channels`` columns,
            obtained as ``log_softmax(linear(relu(H)))`` of the last hidden
            state.

        Raises:
            ValueError: If ``seq`` is empty (there is no state to decode).
        """
        if len(seq) == 0:
            raise ValueError("seq must contain at least one graph snapshot")

        H, C = None, None
        for snapshot in seq:
            # The recurrent state is carried from one snapshot to the next.
            H, C = self.recurrent(
                snapshot.x, snapshot.edge_index, snapshot.edge_attr, H, C
            )

        H = F.relu(H)
        H = self.linear(H)
        return F.log_softmax(H, dim=1)

    def predict_edge_weight(self, node_embeddings, edge_index):
        """Score each (source, destination) pair listed in ``edge_index``.

        Args:
            node_embeddings: Per-node embeddings as returned by ``forward``.
            edge_index: Tensor with two rows: source indices and destination
                indices.

        Returns:
            1-D tensor with one sigmoid score per column of ``edge_index``.
        """
        src, dst = edge_index
        edge_features = torch.cat([node_embeddings[src], node_embeddings[dst]], dim=1)
        probs = torch.sigmoid(self.edge_mlp(edge_features))
        # Drop only the trailing singleton dimension: a bare squeeze() would
        # also collapse the edge dimension when exactly one edge is scored.
        return probs.squeeze(-1)

# Initializing model and creating train-test splits

In [9]:
import random

# Seed every RNG in use so that weight initialisation and any later random
# sampling start from the same state on each Restart & Run All.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Model dimensions are taken from the first graph; all graphs share one node set.
node_dim = graphs[0].x.shape[1]
num_nodes = graphs[0].x.shape[0]
hidden_channels = 64
output_channels = 64
learning_rate = 0.0001
epochs = 20
time_window = 10  # NOTE(review): not referenced elsewhere in the visible notebook
weight_decay = 0.0001

model = TemporalGNN(num_nodes, node_dim, hidden_channels, output_channels)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
# BCELoss expects probabilities; predict_edge_weight applies the sigmoid.
criterion = torch.nn.BCELoss()

In [10]:
import copy

def create_variable_length_sequences(data, d):
    """Pair growing prefixes of ``data`` with the item ``d`` steps after each.

    For every prefix length ``k`` from 1 up to ``len(data) - d`` the input is
    ``data[:k]`` and the target is ``data[k + d - 1]``, i.e. the element that
    lies ``d`` positions after the last element of the prefix.

    Args:
        data: Sequence to slice.
        d: Distance between the end of a prefix and its target.

    Returns:
        tuple: ``(X, Y)`` - the list of prefixes and the list of targets,
        aligned by position. Both are empty when ``data`` is too short.
    """
    prefix_lengths = range(1, len(data) - d + 1)
    X = [data[:length] for length in prefix_lengths]
    Y = [data[length + d - 1] for length in prefix_lengths]
    return X, Y

# Horizon: each target graph lies `d` positions after the last input graph.
d = 5
x, y = create_variable_length_sequences(graphs, d)

# Chronological split: the first 80% of the samples train, the rest test.
split_index = int(len(x) * 0.8)

# Work on deep copies so that later in-place edits of the targets do not leak
# into `graphs` (the same graph objects also appear inside the input sequences).
x_train = copy.deepcopy(x[:split_index])
x_test = copy.deepcopy(x[split_index:])
y_train = copy.deepcopy(y[:split_index])
y_test = copy.deepcopy(y[split_index:])

print("Size of x_train:", len(x_train))
print("Size of x_test:", len(x_test))
print("Size of y_train:", len(y_train))
print("Size of y_test:", len(y_test))


Size of x_train: 31
Size of x_test: 8
Size of y_train: 31
Size of y_test: 8


# Training Loop to generate edges for N+1th graph using last N graphs

In [11]:
import torch
import random
from torch_geometric.utils import negative_sampling

def add_negative_samples(data):
    """Append sampled non-edges to ``data`` and label every edge 1 or 0.

    The graph is modified **in place** and also returned. Every edge already
    in ``data.edge_index`` is treated as a positive, so calling this twice on
    the same object would relabel earlier negatives as positives.

    After the call:
        * ``data.edge_index`` holds the original edges plus the sampled
          negatives, in shuffled order;
        * ``data.y`` holds 1.0 for original edges and 0.0 for negatives;
        * ``data.edge_attr`` (when present) is extended with 0 for negatives.

    Args:
        data: Graph object exposing ``edge_index``, ``edge_attr``,
            ``num_nodes`` and a writable ``y``.

    Returns:
        The same ``data`` object.
    """
    device = data.edge_index.device
    num_pos_samples = data.edge_index.size(1)
    neg_edge_index = negative_sampling(
        data.edge_index, num_nodes=data.num_nodes, num_neg_samples=num_pos_samples
    )
    # Size the labels from what was actually returned: the sampler is not
    # guaranteed to deliver exactly the requested number of pairs.
    num_neg_samples = neg_edge_index.size(1)

    pos_weights = torch.ones(num_pos_samples, device=device)
    neg_weights = torch.zeros(num_neg_samples, device=device)

    # A single permutation keeps edges, labels and attributes aligned.
    perm = torch.randperm(num_pos_samples + num_neg_samples, device=device)

    edge_attr = data.edge_attr
    data.edge_index = torch.cat([data.edge_index, neg_edge_index], dim=1)[:, perm]
    data.y = torch.cat([pos_weights, neg_weights])[perm]
    if edge_attr is not None:
        # Negatives get weight 0 in the dtype of the existing attributes.
        data.edge_attr = torch.cat([edge_attr, neg_weights.to(edge_attr.dtype)])[perm]

    return data

# Rebuild the augmented targets from the untouched `y` split on every run of
# this cell. add_negative_samples treats all existing edges as positives, so
# augmenting an already-augmented graph (e.g. when the cell is re-executed)
# would relabel earlier negatives as positives.
y_train = [add_negative_samples(copy.deepcopy(graph)) for graph in y[:split_index]]
y_test = [add_negative_samples(copy.deepcopy(graph)) for graph in y[split_index:]]

In [12]:
import matplotlib.pyplot as plt
import torch

# Per-epoch history: mean training loss and validation accuracies.
train_losses = []
val_accuracies_combined = []
val_accuracies_presence = []
val_accuracies_absence = []

model.train()
for epoch in range(epochs):
    # ---- Training: one optimiser step per (history, target) pair ----
    model.train()
    step_losses = []
    for history, target in zip(x_train, y_train):
        optimizer.zero_grad()
        node_embeddings = model(history)
        probs = model.predict_edge_weight(node_embeddings, target.edge_index)
        loss = criterion(probs, target.y)
        loss.backward()
        optimizer.step()
        step_losses.append(loss.item())

    avg_train_loss = sum(step_losses) / len(x_train)
    train_losses.append(avg_train_loss)

    # Checkpoint after every epoch (the file is overwritten each time).
    torch.save(model.state_dict(), 'test_model_variable.pth')

    # ---- Validation: counts are pooled over all held-out graphs ----
    model.eval()
    total_correct_presence = total_correct_absence = 0
    total_presence = total_absence = 0
    total_correct = total_predictions = 0
    with torch.no_grad():
        for history, target in zip(x_test, y_test):
            node_embeddings = model(history)
            probs = model.predict_edge_weight(node_embeddings, target.edge_index)
            predictions = (probs > 0.5).int()

            labels = target.y
            is_hit = predictions == labels
            is_present = labels == 1
            is_absent = labels == 0

            total_correct_presence += (is_hit & is_present).sum().item()
            total_correct_absence += (is_hit & is_absent).sum().item()

            total_presence += is_present.sum().item()
            total_absence += is_absent.sum().item()

            total_correct += is_hit.sum().item()
            total_predictions += labels.size(0)

    # Guard each ratio against an empty denominator.
    val_accuracy_presence = total_correct_presence / total_presence if total_presence > 0 else 0
    val_accuracy_absence = total_correct_absence / total_absence if total_absence > 0 else 0
    val_accuracy_combined = total_correct / total_predictions if total_predictions > 0 else 0

    val_accuracies_presence.append(val_accuracy_presence)
    val_accuracies_absence.append(val_accuracy_absence)
    val_accuracies_combined.append(val_accuracy_combined)

    print(f'Epoch {epoch+1}, Loss: {avg_train_loss:.4f}, Val Accuracy (Combined): {val_accuracy_combined:.4f}, Val Accuracy (Presence): {val_accuracy_presence:.4f}, Val Accuracy (Absence): {val_accuracy_absence:.4f}')

Epoch 1, Loss: 0.7226, Val Accuracy (Combined): 0.5532, Val Accuracy (Presence): 0.9230, Val Accuracy (Absence): 0.1834
Epoch 2, Loss: 0.6855, Val Accuracy (Combined): 0.6495, Val Accuracy (Presence): 0.9619, Val Accuracy (Absence): 0.3372
Epoch 3, Loss: 0.6774, Val Accuracy (Combined): 0.7618, Val Accuracy (Presence): 0.8528, Val Accuracy (Absence): 0.6708
Epoch 4, Loss: 0.6670, Val Accuracy (Combined): 0.7820, Val Accuracy (Presence): 0.8499, Val Accuracy (Absence): 0.7141
Epoch 5, Loss: 0.6498, Val Accuracy (Combined): 0.7836, Val Accuracy (Presence): 0.8675, Val Accuracy (Absence): 0.6998
Epoch 6, Loss: 0.6230, Val Accuracy (Combined): 0.7845, Val Accuracy (Presence): 0.8582, Val Accuracy (Absence): 0.7108
Epoch 7, Loss: 0.5842, Val Accuracy (Combined): 0.7878, Val Accuracy (Presence): 0.8404, Val Accuracy (Absence): 0.7352
Epoch 8, Loss: 0.5358, Val Accuracy (Combined): 0.7904, Val Accuracy (Presence): 0.8328, Val Accuracy (Absence): 0.7480
Epoch 9, Loss: 0.4868, Val Accuracy (Com

In [None]:
# Persist the current weights (same file name the training loop writes after each epoch).
torch.save(model.state_dict(), 'test_model_variable.pth')

In [None]:
import torch
import gcsfs

# NOTE(review): this loads a different file from the one saved by the training
# cells, so it replaces the weights trained above - presumably a pre-trained
# checkpoint; confirm this is intended.
path = 'gs://datasets-dev-ded86f66/benchmarks/scientific_trend_prediction/model_weights/tgn_complete_model.pth'
fs = gcsfs.GCSFileSystem()

with fs.open(path, 'rb') as f:
    # map_location='cpu' lets the checkpoint load on hosts that lack the
    # device it was saved from; the rest of the notebook runs on CPU tensors.
    # NOTE(review): torch.load unpickles the file, which can execute arbitrary
    # code - only load checkpoints from trusted locations (or pass
    # weights_only=True where the installed torch version supports it).
    state_dict = torch.load(f, map_location='cpu')

model.load_state_dict(state_dict)

# Predicting edges of N+1th graph given last N graphs

In [None]:
import torch
from sklearn.metrics import accuracy_score

model.eval()
threshold = 0.5

# One accuracy value per test graph and class; graphs without any edge of a
# class contribute nothing to that class.
presence_scores = []
absence_scores = []

for history, target in zip(x_test, y_test):
    with torch.no_grad():
        node_embeddings = model(history)
        probs = model.predict_edge_weight(node_embeddings, target.edge_index)

    yhat = (probs > threshold).int()
    y_true = target.y

    for class_mask, scores in ((y_true == 1, presence_scores), (y_true == 0, absence_scores)):
        if class_mask.sum().item() > 0:
            scores.append(
                accuracy_score(
                    y_true[class_mask].numpy().astype(int),
                    yhat[class_mask].numpy().astype(int),
                )
            )

# Unweighted mean over graphs (each graph counts equally, whatever its size).
average_presence_accuracy = sum(presence_scores) / len(presence_scores) if presence_scores else 0
average_absence_accuracy = sum(absence_scores) / len(absence_scores) if absence_scores else 0

average_presence_accuracy_percentage = round(average_presence_accuracy * 100)
average_absence_accuracy_percentage = round(average_absence_accuracy * 100)

print(f'Average Presence Accuracy: {average_presence_accuracy_percentage}%')
print(f'Average Absence Accuracy: {average_absence_accuracy_percentage}%')

Average Presence Accuracy: 86%
Average Absence Accuracy: 74%


In [None]:
def calculate_confusion_matrix(total_positive, total_negative, accuracy_presence, accuracy_absence):
    """Derive confusion-matrix counts from class sizes and per-class accuracies.

    Args:
        total_positive: Number of positive samples.
        total_negative: Number of negative samples.
        accuracy_presence: Fraction of positives predicted correctly.
        accuracy_absence: Fraction of negatives predicted correctly.

    Returns:
        tuple: ``(TP, TN, FP, FN)``. The values are products of a fraction
        and a count, so they are generally not whole numbers.
    """
    true_pos = accuracy_presence * total_positive
    true_neg = accuracy_absence * total_negative
    # Whatever was not classified correctly within a class is that class's error.
    false_pos = total_negative - true_neg
    false_neg = total_positive - true_pos
    return true_pos, true_neg, false_pos, false_neg

def calculate_precision_recall(TP, FP, FN):
    """Compute precision and recall from confusion-matrix counts.

    Args:
        TP: True positives.
        FP: False positives.
        FN: False negatives.

    Returns:
        tuple: ``(Precision, Recall)``. A ratio whose denominator is zero is
        reported as 0 instead of raising.
    """
    predicted_positive = TP + FP
    actual_positive = TP + FN

    Precision = TP / predicted_positive if predicted_positive != 0 else 0
    Recall = TP / actual_positive if actual_positive != 0 else 0

    return Precision, Recall

total_samples = sum(len(graph.y) for graph in y_test)
# Count the class sizes from the labels themselves instead of assuming that
# exactly half of the samples are positive.
total_positive = sum(int((graph.y == 1).sum()) for graph in y_test)
total_negative = sum(int((graph.y == 0).sum()) for graph in y_test)

# NOTE(review): these accuracies are unweighted means over test graphs, so the
# confusion matrix below is an estimate, not an exact count of predictions.
accuracy_presence = average_presence_accuracy
accuracy_absence = average_absence_accuracy

TP, TN, FP, FN = calculate_confusion_matrix(total_positive, total_negative, accuracy_presence, accuracy_absence)
Precision, Recall = calculate_precision_recall(TP, FP, FN)

print(f"Precision: {Precision}")
print(f"Recall: {Recall}")

Precision: 0.7656880825209607
Recall: 0.8630174210995073
