In [1]:
import torch
import torch.nn.functional as F
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from torch_geometric.data import Data
import random

## Loading edge files with a percentile threshold on the edge weights. A higher percentile keeps only the stronger relations. This parameter can be adjusted to control the strength of the trends that we want to predict for the future.

In [2]:
def load_data(year, data_dir, percentile=0.9):
    """Read one year's edge and node tables and keep only the heaviest edges.

    Args:
        year: Year used for both the sub-directory and the file-name prefix.
        data_dir: Root location that holds one ``{year}`` folder per year.
        percentile: Quantile of the ``weight`` column used as the cut-off;
            edges whose weight is at or above that quantile are kept.

    Returns:
        A ``(filtered_edges, nodes)`` tuple: the edge rows that pass the
        weight cut-off, and the node table exactly as read.
    """
    edge_table = pd.read_parquet(f'{data_dir}/{year}/{year}_edges.parquet', engine='pyarrow')
    node_table = pd.read_parquet(f'{data_dir}/{year}/{year}_nodes.parquet', engine='pyarrow')

    # The cut-off is inclusive: rows sitting exactly on the quantile are kept.
    cutoff = edge_table['weight'].quantile(percentile)
    is_strong = edge_table['weight'] >= cutoff
    return edge_table[is_strong], node_table

data_dir = "gs://datasets-dev-ded86f66/benchmarks/scientific_trend_prediction/new_parquet_data"
years = range(1980, 2024)

# Single loading pass: load_data reads BOTH the edge and the node file of a
# year, so calling it once per year (instead of once per loop) halves the
# number of remote reads. Only the two endpoint columns of the already
# filtered edges are kept around until the global node index exists.
all_node_ids = set()
id_to_label = {}
edges_by_year = {}
for year in years:
    year_edges, year_nodes = load_data(year, data_dir)
    year_node_ids = year_nodes['node_id'].tolist()
    all_node_ids.update(year_node_ids)
    # Later years overwrite the label of an id seen in an earlier year.
    id_to_label.update(zip(year_node_ids, year_nodes['node_label'].tolist()))
    edges_by_year[year] = year_edges[['source_id', 'destination_id']]

# One contiguous index per node id, shared by every yearly graph.
node_ids = sorted(all_node_ids)
node_id_to_index = {node_id: idx for idx, node_id in enumerate(node_ids)}
num_of_nodes = len(node_ids)
trends = []

for year in years:
    edges = edges_by_year[year]
    source_index = edges['source_id'].map(node_id_to_index)
    destination_index = edges['destination_id'].map(node_id_to_index)
    # Series.map yields NaN for ids missing from the mapping; casting NaN to
    # torch.long would silently produce a meaningless node index, so fail loudly.
    if source_index.isna().any() or destination_index.isna().any():
        raise ValueError(f'Edges of year {year} reference node ids that are absent from the node tables')
    edge_index = np.array([source_index.values, destination_index.values])
    edge_index = torch.tensor(edge_index, dtype=torch.long)
    trend = Data(edge_index=edge_index)
    trends.append(trend)

# The per-year frames are no longer needed once the tensors are built.
del edges_by_year

## Creating an 80:20 train-test split, where the test data follows the training data in chronological order.

In [3]:
import copy

def create_sequences(data, time_step, forecast_length):
    """Build sliding-window samples over a chronologically ordered sequence.

    Args:
        data: Indexable sequence with one entry per time step.
        time_step: Number of consecutive positions in each input window.
        forecast_length: How many positions after the window's last element
            the target lies.

    Returns:
        A ``(X, Y)`` tuple. ``X[k]`` is the list of *positions* (not the
        items themselves) covered by window ``k``; ``Y[k]`` is the item of
        ``data`` located ``forecast_length`` positions after that window's
        last position. Both lists are empty when ``data`` is too short.
    """
    # Distance from a window's first position to its target position.
    target_offset = time_step + forecast_length - 1
    window_starts = range(len(data) - target_offset)

    X = [list(range(start, start + time_step)) for start in window_starts]
    Y = [data[start + target_offset] for start in window_starts]
    return X, Y

# Each sample looks at `time_window` consecutive yearly graphs (stored as
# positions into `trends`) and targets the graph `forecast_length` steps
# after the last year of that window.
time_window = 10
forecast_length = 5

x, y = create_sequences(trends, time_window, forecast_length)

# Chronological cut: the first 80% of the windows train, the remainder tests.
split_index = int(len(x) * 0.8)

# Deep copies keep both splits independent of the objects stored in `trends`.
x_train = copy.deepcopy(x[:split_index])
y_train = copy.deepcopy(y[:split_index])
x_test = copy.deepcopy(x[split_index:])
y_test = copy.deepcopy(y[split_index:])

print("Size of x_train:", len(x_train))
print("Size of x_test:", len(x_test))
print("Size of y_train:", len(y_train))
print("Size of y_test:", len(y_test))

Size of x_train: 24
Size of x_test: 6
Size of y_train: 24
Size of y_test: 6


## Generating as many negative samples as there are positive samples

In [4]:
import torch
import random
import copy
from torch_geometric.utils import negative_sampling

def add_negative_samples(data, num_nodes=None):
    """Append sampled negative edges to ``data`` and label every edge.

    The graph is modified IN PLACE and also returned. After the call
    ``data.edge_index`` holds the original edges plus the sampled negatives
    in shuffled order, and ``data.y`` holds the matching labels
    (1.0 = original edge, 0.0 = sampled negative).

    Because the negatives end up inside ``data.edge_index``, calling this a
    second time on the same object would count the earlier negatives as
    positives; call it once per graph.

    Args:
        data: Graph object exposing a ``(2, E)`` ``edge_index`` tensor.
        num_nodes: Size of the node universe passed to ``negative_sampling``.
            Defaults to the notebook-level ``num_of_nodes``.

    Returns:
        The same ``data`` object, augmented as described above.
    """
    if num_nodes is None:
        # Backward-compatible default: the global node count built at load time.
        num_nodes = num_of_nodes

    num_pos_samples = data.edge_index.size(1)
    neg_edge_index = negative_sampling(
        data.edge_index,
        num_nodes=num_nodes,
        num_neg_samples=num_pos_samples,
    )
    # The requested count is only approximate for negative_sampling, so the
    # labels are sized from what actually came back rather than from the request.
    num_neg_samples = neg_edge_index.size(1)

    device = data.edge_index.device
    labels = torch.cat([
        torch.ones(num_pos_samples, device=device),
        torch.zeros(num_neg_samples, device=device),
    ])
    combined_edges = torch.cat([data.edge_index, neg_edge_index], dim=1)

    # One shared permutation keeps every edge column aligned with its label.
    perm = torch.randperm(combined_edges.size(1))
    data.edge_index = combined_edges[:, perm]
    data.y = labels[perm]

    return data

# Seed the random generators before sampling so that Restart & Run All draws
# the same negatives and the same shuffles. add_negative_samples calls
# torch.randperm directly; negative_sampling presumably draws from Python's
# `random` and/or NumPy depending on the installed version, so all three are seeded.
NEGATIVE_SAMPLING_SEED = 42
random.seed(NEGATIVE_SAMPLING_SEED)
np.random.seed(NEGATIVE_SAMPLING_SEED)
torch.manual_seed(NEGATIVE_SAMPLING_SEED)

# add_negative_samples rewrites the graph in place. Graphs that already carry
# labels are skipped so that re-running this cell cannot treat previously
# sampled negatives as positives.
for split in (y_train, y_test):
    for i in range(len(split)):
        if getattr(split[i], 'y', None) is None:
            split[i] = add_negative_samples(split[i])

## Decoupled LSTM

In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import mean_squared_error
from tensorflow.keras.callbacks import Callback
import numpy as np
import json

In [6]:
def get_edge_embeddings(node_embeddings, edge_index):
    """Build one feature row per edge by joining its two endpoint embeddings.

    Args:
        node_embeddings: Tensor indexed by node position along dim 0.
        edge_index: Two-row index tensor; row 0 holds source positions and
            row 1 holds destination positions.

    Returns:
        Tensor whose row ``k`` is the source embedding of edge ``k`` followed
        by its destination embedding (concatenated along dim 1).
    """
    source_nodes, target_nodes = edge_index
    endpoint_features = (node_embeddings[source_nodes], node_embeddings[target_nodes])
    return torch.cat(endpoint_features, dim=1)

In [7]:
import gcsfs

# Embedding files already fetched in this kernel session, keyed by bucket path.
_EMBEDDINGS_CACHE = {}


def load_embeddings_from_bucket(
    index,
    base_year=1980,
    embeddings_dir='gs://datasets-dev-ded86f66/benchmarks/scientific_trend_prediction/Tgn_embeddings_d5',
):
    """Load the saved embeddings for year ``base_year + index`` from GCS.

    Each file is downloaded at most once per kernel session. The dataset
    loops ask for the same years repeatedly, so later requests are served
    from memory. The cached object is shared between callers and must not
    be modified in place.

    Args:
        index: Offset of the requested year relative to ``base_year``.
        base_year: Year that corresponds to ``index == 0``.
        embeddings_dir: Bucket folder that holds one ``{year}.pt`` file per year.

    Returns:
        Whatever object was saved in ``{embeddings_dir}/{year}.pt``, with its
        tensors mapped to CPU memory.
    """
    path = f'{embeddings_dir}/{base_year + index}.pt'

    if path not in _EMBEDDINGS_CACHE:
        fs = gcsfs.GCSFileSystem()
        with fs.open(path, 'rb') as f:
            # map_location='cpu' lets files written on a GPU machine load
            # anywhere and keeps the result usable with .numpy() downstream.
            # NOTE(review): torch.load unpickles the file; only point this at
            # trusted buckets.
            _EMBEDDINGS_CACHE[path] = torch.load(f, map_location='cpu')

    return _EMBEDDINGS_CACHE[path]

In [8]:
@torch.no_grad()
def extract_edge_embeddings(index, edge_index):
    """Return NumPy edge features for ``edge_index`` from one stored embedding file.

    Runs entirely with gradient tracking disabled.

    Args:
        index: Offset forwarded to ``load_embeddings_from_bucket`` to select
            which stored embeddings to read.
        edge_index: Two-row index tensor of the edges to featurise.

    Returns:
        NumPy array with one row per edge, as produced by
        ``get_edge_embeddings``.
    """
    stored_node_embeddings = load_embeddings_from_bucket(index)
    edge_features = get_edge_embeddings(stored_node_embeddings, edge_index)
    return edge_features.detach().numpy()

In [9]:
x_train_lstm, y_train_lstm = [], []

# For every training sample, the edges of its target graph are looked up in
# the stored embeddings of each year position in the input window.
for window, target in zip(x_train, y_train):
    yearly_edge_embs = [
        extract_edge_embeddings(year_position, target.edge_index)
        for year_position in window
    ]
    # axis=1 puts the window's years between the edge axis and the feature axis.
    x_train_lstm.append(np.stack(yearly_edge_embs, axis=1))
    y_train_lstm.append(target.y)

In [10]:
# Merge the per-sample blocks into single arrays: features are joined along
# the leading (edge) axis, labels are laid end to end.
# NOTE: this rebinds the list names to arrays, so run this cell once per build.
x_train_lstm = np.concatenate(x_train_lstm, axis=0)
y_train_lstm = np.concatenate(y_train_lstm, axis=0)

In [11]:
x_test_lstm, y_test_lstm = [], []

# Same construction as for training: one block of per-year edge features and
# one label vector per held-out target graph.
for input_positions, target_graph in zip(x_test, y_test):
    features_per_year = []
    for position in input_positions:
        features_per_year.append(extract_edge_embeddings(position, target_graph.edge_index))
    # Stack along a new middle axis so each edge gets one row per input year.
    x_test_lstm.append(np.stack(features_per_year, axis=1))
    y_test_lstm.append(target_graph.y)

In [12]:
# Flatten the per-sample lists into one feature array and one label vector.
# NOTE: this rebinds the list names to arrays, so run this cell once per build.
x_test_lstm = np.concatenate(x_test_lstm, axis=0)
y_test_lstm = np.concatenate(y_test_lstm, axis=0)

In [13]:
# Input shape is read from the training tensor: (steps per sequence, features per step).
sequence_length, feature_dim = x_train_lstm.shape[1], x_train_lstm.shape[2]

# Two stacked LSTM layers followed by a single sigmoid unit that scores each
# edge sequence as present (1) or absent (0).
lstm_model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(sequence_length, feature_dim)),
    LSTM(50),
    Dense(1, activation='sigmoid'),
])

lstm_model.compile(optimizer='adam', loss='binary_crossentropy')

history = lstm_model.fit(
    x_train_lstm,
    y_train_lstm,
    batch_size=1024,
    epochs=3,
    verbose=1,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [14]:
# Sigmoid scores for every test edge sequence.
probs = lstm_model.predict(
    x_test_lstm,
    batch_size=1024,
    verbose=1,
)



In [15]:
from sklearn.metrics import accuracy_score

# Scores strictly above the threshold count as "edge present" (1), the rest as 0.
# yhat keeps the same shape as probs.
threshold = 0.5
yhat = np.greater(probs, threshold).astype(int)

In [16]:
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score
import numpy as np

y_true = y_test_lstm.astype(int)

# Row selectors for the two ground-truth classes.
presence_mask = (y_true == 1)
absence_mask = (y_true == 0)

# Metrics over the whole test set.
total_accuracy = accuracy_score(y_true, yhat)
precision = precision_score(y_true, yhat, zero_division=0)
recall = recall_score(y_true, yhat, zero_division=0)

# Per-class accuracy: share of truly present / truly absent edges predicted
# correctly. Accuracy restricted to the present class is the same quantity as recall.
presence_accuracy = accuracy_score(y_true[presence_mask], yhat[presence_mask])
absence_accuracy = accuracy_score(y_true[absence_mask], yhat[absence_mask])

# Convert each fraction to a whole-number percentage for reporting.
(
    total_accuracy_percentage,
    precision_percentage,
    recall_percentage,
    presence_accuracy_percentage,
    absence_accuracy_percentage,
) = (
    round(score * 100)
    for score in (total_accuracy, precision, recall, presence_accuracy, absence_accuracy)
)

print(f'Overall Accuracy: {total_accuracy_percentage}%')
print(f'Precision: {precision_percentage}%')
print(f'Recall: {recall_percentage}%')
print(f'Presence Accuracy: {presence_accuracy_percentage}%')
print(f'Absence Accuracy: {absence_accuracy_percentage}%')

Overall Accuracy: 84%
Precision: 81%
Recall: 88%
Presence Accuracy: 88%
Absence Accuracy: 79%
