In [2]:
import torch
import torch.nn.functional as F
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from torch_geometric.data import Data
import xxhash

# Loading data from disk

In [27]:
def load_data(year, data_dir, percentile=0.0):
    edges = pd.read_parquet(f'{data_dir}/{year}/{year}_edges.parquet', engine='pyarrow')
    nodes = pd.read_parquet(f'{data_dir}/{year}/{year}_nodes.parquet', engine='pyarrow')
    weight_threshold = edges['weight'].quantile(percentile)
    filtered_edges = edges[edges['weight'] >= weight_threshold]
    return filtered_edges, nodes

In [23]:
data_dir = "gs://datasets-dev-ded86f66/benchmarks/scientific_trend_prediction/parquet_data"
years = range(1980, 2023)

all_node_ids = set()
id_to_label = {}
total = 0
for i in years:
    e, n = load_data(i, data_dir)
    total = total + len(e.drop_duplicates())
    all_node_ids = all_node_ids.union(set(n['node_id'].tolist()))
    keys , vals = n['node_id'].tolist() , n['node_label'].tolist()
    entries = {key: value for key, value in zip(keys, vals)}
    id_to_label.update(entries)

In [29]:
import pandas as pd

data_dir = "gs://datasets-dev-ded86f66/benchmarks/scientific_trend_prediction/parquet_data"
years = range(1980, 2023)

all_node_ids = set()
id_to_label = {}
total_unique_entries = 0

all_edges = []
all_nodes = []

for year in years:
    e, n = load_data(year, data_dir)
    all_edges.append(e)
    all_nodes.append(n)

# Concatenate all dataframes
all_edges_df = pd.concat(all_edges, ignore_index=True)
all_nodes_df = pd.concat(all_nodes, ignore_index=True)

# Drop duplicates to get unique entries
unique_edges_df = all_edges_df.drop_duplicates()
unique_nodes_df = all_nodes_df.drop_duplicates()

# Update total unique entries
total_unique_entries = len(unique_edges_df)

# Update all_node_ids set
all_node_ids.update(unique_nodes_df['node_id'].tolist())

# Update id_to_label dictionary
keys = unique_nodes_df['node_id'].tolist()
vals = unique_nodes_df['node_label'].tolist()
id_to_label.update(dict(zip(keys, vals)))

print(f"Total unique entries: {total_unique_entries}")
print(f"Total unique node IDs: {len(all_node_ids)}")
print(f"Total entries in id_to_label: {len(id_to_label)}")


Total unique entries: 35268983
Total unique node IDs: 36076
Total entries in id_to_label: 36076


In [4]:
import numpy as np
import torch
from torch_geometric.data import Data
import networkx as nx

def featurizer(edges, node_ids, id_to_label):
    label_order = ['phenotype', 'gene', 'compound']
    label_to_index = {label: i for i, label in enumerate(label_order)}

    node_features = np.zeros((len(node_ids), 3), dtype=float)
    out_degree_count = {node: {label: 0 for label in label_order} for node in node_ids}

    for src, dest in zip(edges['source_id'], edges['destination_id']):
        dest_label = id_to_label[dest]
        out_degree_count[src][dest_label] += 1

    for i, node in enumerate(node_ids):
        node_feature_vector = [out_degree_count[node][label] for label in label_order]
        node_features[i] = node_feature_vector

    return torch.tensor(node_features, dtype=torch.float)

In [5]:
node_ids = list(all_node_ids)
node_id_to_index = {node_id: idx for idx, node_id in enumerate(node_ids)}

graphs = []

for year in years:
    edges, _ = load_data(year, data_dir)
    node_feature = featurizer(edges, node_ids, id_to_label)
    edge_index = np.array([edges['source_id'].map(node_id_to_index).values,
                           edges['destination_id'].map(node_id_to_index).values])
    edge_index = torch.tensor(edge_index, dtype=torch.long)
    edge_weights = torch.tensor(edges['weight'].values, dtype=torch.float)
    g = Data(x=node_feature, edge_index=edge_index, edge_attr=edge_weights, y=edge_weights)
    graphs.append(g)

# Normalizing edge weights

In [6]:
def normalize_edge_weights_min_max(graph_list):
    all_weights = []
    for graph in graph_list:
        all_weights.extend(graph.edge_attr.view(-1).tolist())

    min_weight = min(all_weights)
    max_weight = max(all_weights)

    for graph in graph_list:
        edge_attr_normalized = (graph.edge_attr - min_weight) / (max_weight - min_weight)
        graph.edge_attr = edge_attr_normalized
        graph.y = edge_attr_normalized

    return graph_list

In [7]:
graphs = normalize_edge_weights_min_max(graphs)

In [8]:
print(f"Number of graphs: {len(graphs)}")

Number of graphs: 44


In [9]:
from sklearn.model_selection import train_test_split

def create_sequences(data, time_step):
    X, Y = [], []
    for i in range(len(data) - time_step - 1):
        X.append(data[i:(i + time_step)])
        Y.append(data[i + time_step])
    return X, Y

x, y = create_sequences(graphs, 10)

split_index = int(len(x) * 0.8)

x_train, x_test = x[:split_index], x[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

print("Size of x_train:", len(x_train))
print("Size of x_test:", len(x_test))
print("Size of y_train:", len(y_train))
print("Size of y_test:", len(y_test))

Size of x_train: 26
Size of x_test: 7
Size of y_train: 26
Size of y_test: 7


# LSTM Baseline

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import Callback
import numpy as np

In [11]:
def concatenate_edge_weights(grphs):
    reference_edge_index = grphs[0].edge_index
    reference_edge_attr = grphs[0].edge_attr

    edge_dict = {}
    for i in range(reference_edge_index.shape[1]):
        edge = tuple(reference_edge_index[:, i].numpy())
        edge_dict[edge] = [reference_edge_attr[i].item()]

    for grph in grphs[1:]:
        edge_attr = grph.edge_attr
        edge_index = grph.edge_index

        for i in range(edge_index.shape[1]):
            edge = tuple(edge_index[:, i].numpy())
            if edge in edge_dict:
                edge_dict[edge].append(edge_attr[i].item())
            else:
                continue

    max_len = len(grphs)

    concatenated_weights = []
    for edge, weights in edge_dict.items():
        while len(weights) < max_len:
            weights.append(0.0)
        concatenated_weights.append(np.array(weights))

    concatenated_weights_array = np.array(concatenated_weights)

    return concatenated_weights_array

In [12]:
import numpy as np

def concatenate_edge_weights(grphs):
    reference_edge_index = grphs[0].edge_index
    reference_edge_attr = grphs[0].edge_attr

    edge_dict = {}
    for i in range(reference_edge_index.shape[1]):
        edge = tuple(reference_edge_index[:, i].numpy())
        edge_dict[edge] = [reference_edge_attr[i].item()]

    max_len = len(grphs)

    for grph in grphs[1:]:
        edge_attr = grph.edge_attr
        edge_index = grph.edge_index

        current_edges = {tuple(edge_index[:, i].numpy()): edge_attr[i].item() for i in range(edge_index.shape[1])}

        for edge in edge_dict.keys():
            if edge in current_edges:
                edge_dict[edge].append(current_edges[edge])
            else:
                edge_dict[edge].append(0.0)

    concatenated_weights = [np.array(weights) for weights in edge_dict.values()]
    concatenated_weights_array = np.array(concatenated_weights)

    return concatenated_weights_array

In [13]:
from tqdm import tqdm

seq_data_batchs = []

for i in tqdm(range(len(x_train)), desc="Processing batches"):
    weights = concatenate_edge_weights([y_train[i]] + x_train[i])
    seq_data_batchs.append(weights)

Processing batches: 100%|██████████| 26/26 [05:59<00:00, 13.83s/it]


In [14]:
k = 10000

In [15]:
input_dim = k
time_step = 10

model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(time_step, input_dim)))
model.add(LSTM(50, return_sequences=False))
model.add(Dense(input_dim,activation='sigmoid'))

model.compile(optimizer='adam', loss='mean_squared_error')

for batch in seq_data_batchs:
    X, Y = batch[:, 1:], batch[:, 0]

    for j in range(0, X.shape[0], k):
        x, y = X[j:j+k, :], Y[j:j+k]

        if len(x) < k:
            continue

        x = x.transpose(1, 0)
        y = y.reshape(1, -1)

        x = np.expand_dims(x, axis=0)

        model.fit(x, y, epochs=3, batch_size=1,verbose=0)

# Testing

In [16]:
from tqdm import tqdm

test_seq_batches = []

for i in tqdm(range(len(x_test)), desc="Processing batches"):
    weights = concatenate_edge_weights([y_test[i]] + x_test[i])
    test_seq_batches.append(weights)

Processing batches: 100%|██████████| 7/7 [03:33<00:00, 30.49s/it]


In [17]:
from sklearn.metrics import mean_squared_error
import numpy as np

all_true_values = []
all_predictions = []

for batch in test_seq_batches:
    X, Y = batch[:, 1:], batch[:, 0]

    for j in range(0, X.shape[0], k):
        x, y = X[j:j+k, :], Y[j:j+k]

        if len(x) < k:
            continue

        x = x.transpose(1, 0)
        x = np.expand_dims(x, axis=0)
        y = y.reshape(-1, 1)

        yhat = model.predict(x,verbose=0)
        yhat = yhat.reshape(-1, 1)

        all_true_values.extend(y)
        all_predictions.extend(yhat)

all_true_values = np.array(all_true_values)
all_predictions = np.array(all_predictions)

In [18]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(all_true_values, all_predictions)

In [19]:
print(mse)

0.0002880302120263512
