In [1]:
import pandas as pd
import numpy as np
import torch
#!pip install torch-geometric
from torch_geometric.data import Data
from collections import defaultdict
from sklearn.preprocessing import OneHotEncoder
from torch.nn import Embedding
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.loader import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [2]:
repository_dict = np.load("data/gnn_data.npy",allow_pickle=True).item()

In [3]:
#key = "open alex id", value = [embedding, citation relationship array, subtopic, primary topic, github url, institution, year]
repository_dict.keys()
len(repository_dict['W2963873275'][0])

768

In [4]:
#feature engineering

topics = [value[3] for value in repository_dict.values()]       # Primary topic (One-Hot Encoding)
subtopics = [value[2] for value in repository_dict.values()]    # Subtopic (Embedding)
institutions = [value[5] for value in repository_dict.values()] # Institution (Embedding)

#One-Hot Encoding for 'topic' (Assuming 2 unique values, e.g., "topic_A", "topic_B")
onehot_encoder = OneHotEncoder(sparse=False)
topics_encoded = onehot_encoder.fit_transform(np.array(topics).reshape(-1, 1))

#Embedding Encoding for 'subtopic' (46 unique values) and 'institution' (56 unique values)

subtopic_to_index = {sub: idx for idx, sub in enumerate(set(subtopics))}
institution_to_index = {inst: idx for idx, inst in enumerate(set(institutions))}

subtopic_indices = [subtopic_to_index[sub] for sub in subtopics]
institution_indices = [institution_to_index[inst] for inst in institutions]

subtopic_embedding_dim = 16
institution_embedding_dim = 16

subtopic_embedding_layer = Embedding(num_embeddings=len(subtopic_to_index), embedding_dim=subtopic_embedding_dim)
institution_embedding_layer = Embedding(num_embeddings=len(institution_to_index), embedding_dim=institution_embedding_dim)

subtopic_embeddings = subtopic_embedding_layer(torch.tensor(subtopic_indices))
institution_embeddings = institution_embedding_layer(torch.tensor(institution_indices))

#Combine all the features (dropping git url)
final_features = []

for idx, (key, value) in enumerate(repository_dict.items()):
    existing_embedding = torch.tensor(value[0])  # Existing embedding (e.g., value[0])
    subtopic_emb = subtopic_embeddings[idx]      # Subtopic embedding
    institution_emb = institution_embeddings[idx]# Institution embedding
    topic_onehot = torch.tensor(topics_encoded[idx], dtype=torch.float)  # One-hot encoded topic

    combined_features = torch.cat([existing_embedding, subtopic_emb, institution_emb, topic_onehot])
    final_features.append(combined_features)

final_features_tensor = torch.stack(final_features)
np.save('data/final_features_tensor.npy', final_features_tensor.detach().numpy())

In [6]:
#Create the initial edge_index using citation relationships
edge_index = []
edge_weight = []  # To store edge weights

# Create a mapping of repository IDs to indices
node_to_index = {key: idx for idx, key in enumerate(repository_dict.keys())}

# Build edge_index for official citation relationships
for node_id, node_data in repository_dict.items():
    source_idx = node_to_index[node_id]  # Index of the current repository
    citations = node_data[1]  # Citation relationship array

    for cited_repo in citations:
        if cited_repo in node_to_index:  # Only add edges if the cited repo is in the dataset
            target_idx = node_to_index[cited_repo]
            edge_index.append([source_idx, target_idx])  # Directed edge: source -> target
            edge_weight.append(1.0)  # Citation edge weight = 1 (strong connection)

#Compute pairwise cosine similarity between repository embeddings
embeddings = torch.tensor([value[0] for value in repository_dict.values()])  # Embedding matrix
similarity_matrix = F.cosine_similarity(embeddings.unsqueeze(1), embeddings.unsqueeze(0), dim=2)

#Ensure that every node has at least K connections (citations + similarities)
K = 3  # Minimum number of connections

min_sim, max_sim = 0.57, 0.99
for idx, (key, value) in enumerate(repository_dict.items()):
    existing_connections = set([edge[1] for edge in edge_index if edge[0] == idx])  # Current connections

    # Add more connections based on similarity if the node has fewer than K connections
    if len(existing_connections) < K:
        similarity_scores = similarity_matrix[idx]  # Similarity with all other repositories

        # Get the indices of the top K most similar repositories (excluding already connected nodes)
        sorted_similarities = torch.argsort(similarity_scores, descending=True)
        # Add new connections based on similarity, but ignore self and already connected nodes
        count = len(existing_connections)
        for neighbor_idx in sorted_similarities:
            if neighbor_idx != idx and neighbor_idx.item() not in existing_connections:
                edge_index.append([idx, neighbor_idx.item()])
                cosine_similarity_weight = 0 + ((similarity_scores[neighbor_idx] - min_sim) / (max_sim - min_sim)) * 0.8
                edge_weight.append(cosine_similarity_weight)  # Use cosine similarity as weight
                count += 1
            if count >= K:
                break

if len(edge_index) > 0:
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    edge_weight = torch.tensor(edge_weight, dtype=torch.float)
else:
    raise ValueError("The edge_index is empty. Ensure citation relationships are correct.")

# Debugging: Print the constructed edge_index and edge weights
# print("edge_index:", edge_index)
# print("edge_weight:", edge_weight)
np.save('data/edge_index.npy', edge_index.numpy())

In [7]:
def contrastive_loss(embeddings, edge_index):
    print("Embeddings require grad:", embeddings.requires_grad)
    print("Edge index requires grad:", edge_index.requires_grad if edge_index.requires_grad else 'No gradients')

    # Keep the embeddings as part of the computation graph
    node_embeddings = embeddings[edge_index]
    node_i, node_j = node_embeddings[0], node_embeddings[1]

    # Positive pair similarity (for nodes with an edge between them)
    pos_sim = F.cosine_similarity(node_i, node_j, dim=-1)

    # Negative pair similarity (randomly sample unconnected nodes)
    neg_sim = torch.mm(embeddings.detach(), embeddings.detach().t())  # No detaching; negative pairs will also track gradients

    # Mask out similarities of nodes with edges (since we only want unconnected pairs for negatives)
    neg_sim[edge_index[0], edge_index[1]] = float('-inf')  # Mask out connected pairs

    # Use only the highest negative similarities (hard negative mining)
    hardest_neg_sim = torch.max(neg_sim, dim=-1)[0]  # Max similarity for unconnected nodes

    # Loss: maximize positive similarity, minimize negative similarity
    pos_loss = (1 - pos_sim).mean()
    neg_loss = hardest_neg_sim.mean()

    return pos_loss + neg_loss

# Step 2: Create a PyTorch Geometric Data object
# final_features_tensor = the processed node features (from the previous step)
data = Data(x=final_features_tensor, edge_index=edge_index)

# Step 3: Define the GNN model using PyTorch Geometric
class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)  # First GCN layer
        self.conv2 = GCNConv(hidden_channels, out_channels)  # Second GCN layer (output layer)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        # First layer: Apply convolution and ReLU activation
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Second layer: Another convolution (no activation needed)
        x = self.conv2(x, edge_index)
        return x

# Step 4: Instantiate the model and define optimizer and loss function
in_channels = final_features_tensor.shape[1]  # Number of input features (from the processed feature vector)
hidden_channels = 128  # Hidden dimension size (you can adjust this)
out_channels = 128 # Output embedding size (you can adjust this)

model = GCN(in_channels=in_channels, hidden_channels=hidden_channels, out_channels=out_channels)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Step 5: Training the model (unsupervised learning - using reconstruction loss or similar)
# Define a simple training loop (this is an unsupervised example, using embeddings)

def train():
    global edge_index
    optimizer.zero_grad()  # Reset gradients
    model.train()

    out = model(data)  # Forward pass
    #print("Model output (embeddings) require grad:", out.requires_grad)
    edge_index = edge_index.detach() if edge_index.requires_grad else edge_index
    loss = contrastive_loss(out, edge_index)
    loss.backward(retain_graph=True)  # Backpropagate
    optimizer.step()  # Update weights
    optimizer.zero_grad()
    return loss.item()

# Step 6: Training Loop
num_epochs = 100  # Set number of epochs
for epoch in range(num_epochs):
    loss = train()
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss}")

# Step 7: Use the trained GNN model to get node embeddings
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    embeddings = model(data)  # Get embeddings for each node (repository)
    print("Embeddings for each repository:", embeddings)


Embeddings require grad: True
Edge index requires grad: No gradients


RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.