In [2]:
import torch
from torch_geometric.data import Data
import pandas as pd
import pickle
import numpy as np

from torch_geometric.nn import SAGEConv
import torch
import torch_geometric
import torch.nn.functional as F
torch_geometric.set_debug(True)

  from .autonotebook import tqdm as notebook_tqdm


<torch_geometric.debug.set_debug at 0x12ac39690>

# Helper Functions

In [3]:
def read_data(nodes_df_path, edges_df_path, subject_mapping_path):
    nodes_df = pd.read_csv(nodes_df_path)
    edges_df = pd.read_csv(edges_df_path)
    with open(subject_mapping_path, 'rb') as f:
        subject_mapping = pickle.load(f)
    return nodes_df, edges_df, subject_mapping

def get_node_id_mapping(nodes_df):
    node_id_mapping, inverse_node_id_mapping = dict(), dict()
    for i, node_id in enumerate(nodes_df['nodeId']):
        node_id_mapping[i] = node_id
        inverse_node_id_mapping[node_id] = i
    return node_id_mapping, inverse_node_id_mapping

In [104]:
def get_feature_vectors(nodes_df):
    # TODO: Complete this function
    feature_list = []
    for i, feature in enumerate(nodes_df['features']):
        feature = list(map(int, feature[1:-1].split(','))) # turn string into list of numbers
        feature_list.append(torch.tensor(feature, dtype=torch.float32))
    return torch.stack(feature_list)

def get_edges(edges_df, inverse_node_id_mapping):
    # TODO: Complete this function
    edges_list = []
    for i, edge in enumerate(edges_df.to_dict('records')):
        source_node = inverse_node_id_mapping[edge['sourceNodeId']]
        target_node = inverse_node_id_mapping[edge['targetNodeId']]
        # relation = edge['relationshipType'] ### the relation is always CITES
        edges_list.append(torch.tensor([source_node, target_node], dtype=torch.long))
    return torch.stack(edges_list).t().contiguous()

def get_labels(nodes_df, subject_mapping):
    # TODO: Complete this function
    return torch.tensor([subject_mapping[val] for val in nodes_df['subject'].values], dtype=torch.long)

# Graph Generation

In [6]:
nodes_df_path = 'data/nodes.csv'
edges_df_path = 'data/edges.csv'
subject_mapping_path = 'data/subject_mapping.pkl'
nodes_df, edges_df, subject_mapping = read_data(nodes_df_path, edges_df_path, subject_mapping_path)

In [100]:
# Check data sizes:
print("Number of nodes:", nodes_df.shape[0])
print("Dimension of features:", len(nodes_df['features'].iloc[0].split(',')))
print("Number of edges:", edges_df.shape[0])
print("Number of labels:", len(subject_mapping))
print(subject_mapping)

Number of nodes: 2708
Dimension of features: 1433
Number of edges: 10556
Number of labels: 7
{'Neural_Networks': 0, 'Rule_Learning': 1, 'Reinforcement_Learning': 2, 'Probabilistic_Methods': 3, 'Theory': 4, 'Genetic_Algorithms': 5, 'Case_Based': 6}


In [105]:
node_id_mapping, inverse_node_id_mapping = get_node_id_mapping(nodes_df)

# TODO: These functions need to be implemented. You can decide what are the arguments to these functions.
x = get_feature_vectors(nodes_df)
print("x:", x.shape)

edge_index = get_edges(edges_df, inverse_node_id_mapping)
print("edge_index:", edge_index.shape)

y = get_labels(nodes_df, subject_mapping)
print("y:", y.shape)

x: torch.Size([2708, 1433])
edge_index: torch.Size([2, 10556])
y: torch.Size([2708])


In [88]:
with open('data/indices_dict_part2.pkl', 'rb') as f:
    indices_dict = pickle.load(f)

In [106]:
train_mask = torch.tensor([1 if node_id_mapping[i] in indices_dict['train_indices'] else 0 for i in range(x.shape[0])], dtype=torch.bool)
valid_mask = torch.tensor([1 if node_id_mapping[i] in indices_dict['valid_indices'] else 0 for i in range(x.shape[0])], dtype=torch.bool)
test_mask = torch.tensor([1 if node_id_mapping[i] in indices_dict['test_indices'] else 0 for i in range(x.shape[0])], dtype=torch.bool)

In [107]:
data = Data(x=x, y=y, edge_index=edge_index, train_mask=train_mask, valid_mask=valid_mask, test_mask=test_mask)
data

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], valid_mask=[2708], test_mask=[2708])

# GNN Model

In [143]:
# doesnt work!!
torch.manual_seed(42)
np.random.seed(42)

In [215]:
class GraphSAGE(torch.nn.Module):
    def __init__(self, hidden_channels, output_dim, seed=1):
        super().__init__()
        torch.cuda.manual_seed(seed)
        self.conv1 = SAGEConv((-1, -1), hidden_channels, normalize=True)
        self.conv2 = SAGEConv((-1, -1), hidden_channels//2, normalize=True)
        self.conv3 = SAGEConv((-1, -1), output_dim, normalize=True)

    def forward(self, x, edge_index):
        # TODO: Complete this function
        x = self.conv1(x, edge_index)
        x = F.dropout(x, p=0.5, training=self.training)
        # x = self.conv2(x, edge_index)
        # x = F.dropout(x, p=0.5, training=self.training)
        # x = self.conv3(x, edge_index)
        return F.log_softmax(x, dim=1)

# Training

In [227]:
output_dim = len(subject_mapping)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GraphSAGE(x.shape[1], output_dim).to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
epochs = 500

In [226]:
def early_stop(rel_data_mask):
    model.eval()
    preds = model(data.x, data.edge_index).argmax(dim=1)
    correct = (preds[rel_data_mask] == data.y[rel_data_mask]).sum()
    acc = int(correct) / int(rel_data_mask.sum())
    return acc

In [228]:
def train():
    # TODO: Complete this function
    model.train()
    optimizer.zero_grad()
    avg_loss = 0
    prev_val_acc = 0
    
    for epoch in range(1, epochs + 1):
        loss = F.nll_loss(model(data.x, data.edge_index)[train_mask], data.y[train_mask])
        loss.backward()
        optimizer.step()
        avg_loss += loss.item()
        
        if epoch % 10 == 0:
            val_acc = early_stop(valid_mask)
            print('Epoch: {:2d}, Loss: {:.4f}, Accuracy: {:.4f}'.format(epoch, avg_loss / 10, val_acc))
            avg_loss = 0

            # early stopping:
            if prev_val_acc - val_acc > 0.1:
                break
            prev_val_acc = val_acc

In [229]:
train()

Epoch: 10, Loss: 7.1015, Accuracy: 0.5717
Epoch: 20, Loss: 7.0758, Accuracy: 0.5840
Epoch: 30, Loss: 7.1129, Accuracy: 0.5984
Epoch: 40, Loss: 7.0959, Accuracy: 0.6107
Epoch: 50, Loss: 7.0203, Accuracy: 0.6168
Epoch: 60, Loss: 6.8342, Accuracy: 0.6148
Epoch: 70, Loss: 6.7085, Accuracy: 0.6148
Epoch: 80, Loss: 6.7737, Accuracy: 0.6148
Epoch: 90, Loss: 6.6939, Accuracy: 0.6250
Epoch: 100, Loss: 6.6630, Accuracy: 0.6230
Epoch: 110, Loss: 6.6986, Accuracy: 0.6270
Epoch: 120, Loss: 6.7360, Accuracy: 0.6332
Epoch: 130, Loss: 6.6676, Accuracy: 0.6311
Epoch: 140, Loss: 6.6345, Accuracy: 0.6352
Epoch: 150, Loss: 6.6015, Accuracy: 0.6373
Epoch: 160, Loss: 6.5926, Accuracy: 0.6414
Epoch: 170, Loss: 6.6004, Accuracy: 0.6475
Epoch: 180, Loss: 6.5714, Accuracy: 0.6537
Epoch: 190, Loss: 6.5568, Accuracy: 0.6598
Epoch: 200, Loss: 6.5323, Accuracy: 0.6619
Epoch: 210, Loss: 6.4939, Accuracy: 0.6680
Epoch: 220, Loss: 6.4851, Accuracy: 0.6701
Epoch: 230, Loss: 6.4848, Accuracy: 0.6701
Epoch: 240, Loss: 6.

# Evaluation

In [230]:
def evaluate(rel_data_mask):
    model.eval()
    preds = model(data.x, data.edge_index).argmax(dim=1)
    correct = (preds[rel_data_mask] == data.y[rel_data_mask]).sum()
    acc = int(correct) / int(rel_data_mask.sum())
    print(f'Accuracy: {acc:.4f}')

In [231]:
evaluate(data.test_mask)

Accuracy: 0.7646


# TSNE Visualization

In [None]:
# TODO: Complete this function
def visualize():
    pass