In [1]:
# import logging
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from logs import log
import torch
# # from tqdm import tqdm
import networkx as nx
import os
import sys
import torch.nn as nn
import torch_geometric.transforms as T
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv, GCNConv
from torch_geometric.data import InMemoryDataset, Data
from torch_geometric.loader import DataLoader
from torch_geometric.utils import from_networkx, negative_sampling, to_networkx
from sklearn.metrics import roc_auc_score, average_precision_score

### GraphSAGE model version log
    * ver1: first GraphSAGE attempt.
          |_2: use the node embeddings computed on the test-split graph (training + validation edges) for testing and for the uploaded submission.
    * ver2:

## Read data

In [2]:
"""Datasets:
    * id: edge id, 
    * from & to: 'from' node point to 'to' node, 
    * label: connect or not.
    * content: containing each node's attribute.

   Evaluate:
    * AUC: area under ROC curve
    * AP: average precision
"""
# --- Run configuration: data location and artefact names ---------------------
data_path = './dataset1/'
model_version = 'GraphSAGE_ver2'
upload_dataset_info = '_submission'
# Name of the submission scored with embeddings from the training graph.
store_file = f'{model_version}{upload_dataset_info}'
# Name of the "_2" submission scored with embeddings from the test-split graph.
tra_val_store_file = f'{model_version}_2{upload_dataset_info}'
log_file = f'logs/{store_file}.log'
logger = log(path=data_path, file=log_file)

# --- Raw tables --------------------------------------------------------------
df_train = pd.read_csv(f'{data_path}raw/train.csv').sort_values(by='from')
df_test = pd.read_csv(f'{data_path}raw/test.csv')
# content.csv is tab-separated and has no header row.
df_content = pd.read_csv(f'{data_path}raw/content.csv', sep='\t', header=None)
df_upload = pd.read_csv(f'{data_path}raw/upload.csv')

In [3]:
# Quick look at the node-feature table; its first column (0) becomes the index.
print(f'Node feature shape: {df_content.shape}')
tmp_node_feats = df_content.set_index(keys=0)
tmp_node_ids = tmp_node_feats.index.to_numpy().astype(int)

Node feature shape: (877, 1704)


In [4]:
# Display the training edge table (sorted by the 'from' column when it was loaded).
df_train

Unnamed: 0,id,to,from,label
266,E2202,470,0,0
191,E937,689,0,0
891,E2414,742,0,0
1066,E3176,677,0,1
268,E960,260,0,0
...,...,...,...,...
313,E403,466,873,0
894,E343,640,874,1
2105,E2565,290,875,1
1511,E2136,612,875,1


## Datasets

In [5]:
class Graph_dataset(InMemoryDataset):
    """Single-graph dataset built from ``train.csv`` and ``content.csv``.

    Node features come from ``content.csv`` (tab-separated, no header, first
    column used as the index) and the graph contains only the rows of
    ``train.csv`` whose ``label`` equals 1, stored in both directions.

    Args:
        root: Dataset root directory handed to ``InMemoryDataset``.
        transform: Optional transform forwarded to ``InMemoryDataset``.
        pre_transform: Optional transform applied once in ``process`` before
            the graph is saved.
    """

    def __init__(self, root, transform=None, pre_transform=None):
        super().__init__(root, transform, pre_transform)
        # NOTE(review): recent PyTorch releases default ``torch.load`` to
        # ``weights_only=True``, which may refuse pickled PyG objects --
        # confirm against the installed versions.
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """Raw files read by ``process``: the edge list, then node features."""
        return ['train.csv', 'content.csv']

    @property
    def processed_file_names(self):
        """File in which the collated graph is stored."""
        return ['train.pt']

    def download(self):
        """Nothing to download; no download step is implemented."""
        pass

    def process(self):
        """Build the graph from the raw CSV files and save it to disk."""
        # Keep the raw table in a local variable: ``self.data`` is reserved by
        # InMemoryDataset for the collated graph and must not hold a DataFrame.
        edges_df = pd.read_csv(self.raw_paths[0]).sort_values('from')
        node_feats = pd.read_csv(self.raw_paths[1], delimiter='\t', header=None, index_col=0)

        # Node features, rows ordered by node id. [num_nodes, num_node_features]
        # NOTE(review): edge ids are used as row positions into this matrix,
        # which presumably requires ids to be exactly 0..num_nodes-1 -- confirm.
        x = torch.tensor(node_feats.sort_index().values, dtype=torch.float)

        # Only existing links (label == 1) become graph edges.
        pos_edges = edges_df[edges_df['label'] == 1]

        # Going through networkx collapses duplicate / reversed pairs.
        graph = nx.from_pandas_edgelist(pos_edges, 'from', 'to', edge_attr=None)
        edge_pairs = list(graph.edges())
        src = [u for u, _ in edge_pairs]
        dst = [v for _, v in edge_pairs]
        # Store every edge in both directions: (u, v) and (v, u).
        pos_edge_index = torch.tensor([src + dst, dst + src], dtype=torch.long)

        proc_graph = Data(x=x,
                          edge_index=pos_edge_index,
                          y=None)
        # Honour the pre_transform accepted by the constructor.
        if self.pre_transform is not None:
            proc_graph = self.pre_transform(proc_graph)
        print(proc_graph)

        data, slices = self.collate([proc_graph])
        torch.save((data, slices), self.processed_paths[0])

In [6]:
"""Total content.csv nodes = 2708
   Total train.csv nodes = 2704
   Total train.csv positive link nodes = 2590
"""
# NOTE(review): the node counts quoted in the note just above (2708 / 2704 / 2590)
# do not match the 877-row feature table printed earlier in this notebook --
# presumably left over from another dataset; confirm and refresh.
# Instantiate the dataset rooted at `data_path`.
demo = Graph_dataset(data_path)

In [7]:
# Print each graph's summary, node count, feature matrix and feature width,
# then let PyG sanity-check the graph.
for times, data in enumerate(demo, start=1):
    print(data, data.x.size(0), data.x, data.num_node_features, sep='\n')

    # Checks that data.edge_index is consistent with data.x; raises on failure.
    data.validate(raise_on_error=True)

Data(x=[877, 1703], edge_index=[2, 2368])
877
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.]])
1703


## Model

In [8]:
class GraphSAGE(torch.nn.Module):
    """Two-layer SAGEConv encoder with a dot-product link decoder."""

    def __init__(self, in_channels, hidden_channels, out_channels) -> None:
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def encode(self, x, edge_index):
        """Return node embeddings: conv1 -> ReLU -> conv2 (no final activation)."""
        hidden = self.conv1(x, edge_index)
        hidden = F.relu(hidden)
        return self.conv2(hidden, edge_index)

    def decode(self, z, edge_label_index):
        """Score each candidate edge as the dot product of its endpoint embeddings.

        Row 0 of ``edge_label_index`` selects one endpoint of every pair and
        row 1 the other; the result is an unnormalised score per pair.
        """
        src_emb = z[edge_label_index[0]]
        dst_emb = z[edge_label_index[1]]
        return torch.sum(src_emb * dst_emb, dim=-1)

## Training & Validating

In [9]:
def train_link_predictor(model, train_data, val_data, optimizer, criterion, n_epochs=200, log_every=10):
    """Train ``model`` for link prediction with fresh negatives every epoch.

    Each epoch encodes the training graph, samples as many negative edges as
    there are entries in ``train_data.edge_label_index``, scores positives and
    negatives with ``model.decode`` and takes one optimiser step on
    ``criterion(scores, labels)``. The decoder output is passed to
    ``criterion`` without a sigmoid.

    Args:
        model: Object exposing ``encode(x, edge_index)`` and
            ``decode(z, edge_label_index)``.
        train_data: Training split; provides ``x``, ``edge_index``,
            ``edge_label_index`` and ``edge_label``.
        val_data: Validation split passed to ``eval_link_predictor``.
        optimizer: Optimiser over the model parameters.
        criterion: Loss called as ``criterion(scores, labels)``.
        n_epochs: Number of training epochs.
        log_every: Validate and log every ``log_every`` epochs; a falsy value
            disables periodic validation.

    Returns:
        The trained model, left in evaluation mode.
    """
    logger.info('Training Start')
    num_pos = train_data.edge_label_index.size(1)
    for epoch in tqdm(range(1, n_epochs + 1)):
        model.train()
        optimizer.zero_grad()
        z = model.encode(train_data.x, train_data.edge_index)

        # Sample a new set of training negatives for every epoch.
        neg_edge_index = negative_sampling(edge_index=train_data.edge_index,
                                           num_nodes=train_data.num_nodes,
                                           num_neg_samples=num_pos,
                                           method='sparse')

        edge_label_index = torch.cat([train_data.edge_label_index,
                                      neg_edge_index],
                                     dim=-1)
        # Sampled negatives get label 0, appended after the existing labels.
        edge_label = torch.cat([train_data.edge_label,
                                train_data.edge_label.new_zeros(neg_edge_index.size(1))],
                               dim=0)

        out = model.decode(z, edge_label_index).view(-1)

        loss = criterion(out, edge_label)
        loss.backward()
        optimizer.step()

        # Validation metrics are only reported on logging epochs, so only
        # compute them there instead of on every epoch.
        if log_every and epoch % log_every == 0:
            val_auc, val_ap = eval_link_predictor(model, train_data, val_data, None)
            logger.info(f'Epoch: {epoch:03d}, Train Loss: {loss.item():.3f}, Val AUC: {val_auc:.3f}, Val AP: {val_ap:.3f}')

    # Return the model in evaluation mode regardless of the logging schedule.
    model.eval()
    logger.info('Training End --------------------------------')
    return model

def eval_link_predictor(model, train_data, val_data, test_data=None):
    """Compute ROC-AUC and average precision on the labelled edges of ``val_data``.

    Args:
        model: Object exposing ``encode(x, edge_index)`` and
            ``decode(z, edge_label_index)``.
        train_data: Split whose ``x`` / ``edge_index`` are used to compute node
            embeddings when ``test_data`` is ``None``.
        val_data: Split providing ``edge_label_index`` (edges to score) and
            ``edge_label`` (ground truth).
        test_data: Optional split; when given, its ``x`` / ``edge_index`` are
            used for the embeddings instead of those of ``train_data``.

    Returns:
        Tuple ``(auc, ap)``.
    """
    model.eval()
    # Graph used for message passing. Per the original note, the edge_index of
    # test_data includes the positive edges of both train_data and val_data.
    graph = train_data if test_data is None else test_data

    with torch.no_grad():
        z = model.encode(graph.x, graph.edge_index)
        scores = torch.sigmoid(model.decode(z, val_data.edge_label_index).view(-1))

    # Convert once and reuse for both metrics.
    y_true = val_data.edge_label.cpu().numpy()
    y_score = scores.cpu().numpy()

    auc = roc_auc_score(y_true, y_score)
    ap = average_precision_score(y_true, y_score)
    return auc, ap

In [10]:
# Fetch the single graph through indexing rather than the internal `demo.data`
# storage, which recent PyG versions warn against accessing directly.
data = demo[0]
# data = T.NormalizeFeatures()(data)
# NOTE: no random seed is set, so the split, the sampled negatives and the
# resulting metrics change from run to run.
train_data, val_data, test_data = T.RandomLinkSplit(num_val=0.05,
                                                    num_test=0.1,
                                                    is_undirected=True,
                                                    add_negative_train_samples=False
                                                    )(data)
print(data)
print(train_data)
print(val_data)
print(test_data)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The model is moved to `device`, so the graph splits must live there too;
# otherwise the first forward pass fails on a CUDA machine.
train_data, val_data, test_data = (split.to(device) for split in (train_data, val_data, test_data))
model = GraphSAGE(data.num_features, 128, 32).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)
criterion = torch.nn.BCEWithLogitsLoss()

model = train_link_predictor(model, train_data, val_data, optimizer, criterion, n_epochs=300)

# Test edges scored with embeddings from the training graph ...
test_auc, test_ap = eval_link_predictor(model, train_data, test_data, None)
logger.info(f"Test AUC: {test_auc:.3f}, Test AP: {test_ap:.3f}")
# ... and again with embeddings from the test-split graph ("tra_val").
test_auc, test_ap = eval_link_predictor(model, train_data, test_data, test_data)
logger.info(f"tra_val Test AUC: {test_auc:.3f}, Test AP: {test_ap:.3f}")

INFO Training Start


Data(x=[877, 1703], edge_index=[2, 2368])
Data(x=[877, 1703], edge_index=[2, 2134], edge_label=[1067], edge_label_index=[2, 1067])
Data(x=[877, 1703], edge_index=[2, 2134], edge_label=[124], edge_label_index=[2, 124])
Data(x=[877, 1703], edge_index=[2, 2258], edge_label=[250], edge_label_index=[2, 250])


  0%|          | 0/300 [00:00<?, ?it/s]

INFO Epoch: 010, Train Loss: 0.492, Val AUC: 0.857, Val AP: 0.862
INFO Epoch: 020, Train Loss: 0.440, Val AUC: 0.835, Val AP: 0.851
INFO Epoch: 030, Train Loss: 0.422, Val AUC: 0.824, Val AP: 0.850
INFO Epoch: 040, Train Loss: 0.410, Val AUC: 0.821, Val AP: 0.844
INFO Epoch: 050, Train Loss: 0.403, Val AUC: 0.824, Val AP: 0.846
INFO Epoch: 060, Train Loss: 0.384, Val AUC: 0.816, Val AP: 0.843
INFO Epoch: 070, Train Loss: 0.391, Val AUC: 0.815, Val AP: 0.838
INFO Epoch: 080, Train Loss: 0.395, Val AUC: 0.809, Val AP: 0.831
INFO Epoch: 090, Train Loss: 0.400, Val AUC: 0.806, Val AP: 0.842
INFO Epoch: 100, Train Loss: 0.387, Val AUC: 0.811, Val AP: 0.843
INFO Epoch: 110, Train Loss: 0.398, Val AUC: 0.803, Val AP: 0.842
INFO Epoch: 120, Train Loss: 0.381, Val AUC: 0.820, Val AP: 0.853
INFO Epoch: 130, Train Loss: 0.392, Val AUC: 0.819, Val AP: 0.851
INFO Epoch: 140, Train Loss: 0.387, Val AUC: 0.809, Val AP: 0.844
INFO Epoch: 150, Train Loss: 0.385, Val AUC: 0.780, Val AP: 0.828
INFO Epoch

## Uploading

In [11]:
# Edges to score for the submission, plus node features ordered by node id.
test_df = pd.read_csv(f'{data_path}raw/test.csv')
test_feats = pd.read_csv(f'{data_path}raw/content.csv', sep='\t', header=None, index_col=0)
test_x = torch.tensor(test_feats.sort_index().to_numpy(), dtype=torch.float)
test_id = test_df['id'].to_numpy()
# Shape [2, num_test_edges]: row 0 holds the 'from' ids, row 1 the 'to' ids.
test_edge_index = torch.tensor(test_df[['from', 'to']].to_numpy().T)

In [12]:
model.eval()
with torch.no_grad():
    # Node embeddings computed on the training graph.
    z = model.encode(train_data.x, train_data.edge_index)

    # Score on whichever device the embeddings live on, then bring the
    # probabilities back to the CPU so they can be converted to NumPy later.
    out = model.decode(z, test_edge_index.to(z.device)).view(-1)
    out = torch.sigmoid(out).cpu()
    print(out)

tensor([0.5979, 0.5345, 0.9908, 0.7228, 0.6173, 0.9602, 0.3545, 0.3924, 0.6223,
        0.4935, 0.5534, 0.6499, 0.4678, 0.4396, 0.4999, 0.9680, 0.4979, 0.9331,
        0.4666, 0.7545, 1.0000, 0.8338, 0.9932, 0.2889, 0.4950, 0.4839, 0.9437,
        0.5264, 0.5731, 0.8637, 0.2001, 0.9537, 0.9908, 0.5634, 0.5160, 0.6532,
        0.4309, 0.9773, 0.8396, 0.5055, 0.5364, 0.3715, 0.3952, 0.3796, 0.7347,
        0.4578, 0.5158, 0.4834, 0.5701, 0.4808, 0.3883, 0.4608, 0.4819, 0.4704,
        0.9481, 0.4858, 0.9966, 0.4967, 0.5404, 0.0565, 0.9881, 0.4741, 0.4474,
        0.4135, 0.5277, 0.6216, 0.6123, 0.9350, 0.4892, 0.4812, 0.4474, 0.6050,
        0.6940, 0.3169, 0.9752, 0.5625, 0.8489, 0.9954, 0.6031, 0.9354, 0.8832,
        0.3064, 0.6382, 0.4176, 0.9883, 0.4770, 0.9940, 0.3751, 0.3646, 0.9245,
        0.8280, 0.8297, 0.7189, 0.4827, 0.5056, 0.4827, 0.9947, 0.5624, 0.5211,
        0.5477, 0.6429, 0.9744, 0.7631, 0.4999, 0.6316, 0.7329, 0.7871, 0.6502,
        0.9408, 0.9368, 0.3488, 0.5343, 

In [13]:
header = ['id', 'prob']
# Accept either a tensor (first run) or an already-converted array (re-run of
# this cell); .cpu() keeps the conversion valid for CUDA tensors.
out = out.detach().cpu().numpy() if torch.is_tensor(out) else np.asarray(out)
# One row per test edge; probabilities are written as strings, as before.
output_csv = pd.DataFrame({'id': test_id, 'prob': [str(p) for p in out]}, columns=header)
# Make sure the target directory exists before writing.
os.makedirs(data_path+'submission/', exist_ok=True)
output_csv.to_csv(data_path+'submission/'+store_file+'.csv', index=False)

### Inference with training + validation edges (`tra_val`)

In [14]:
# Re-reads the same files as the "Uploading" section above.
test_df = pd.read_csv(f'{data_path}raw/test.csv')
test_feats = pd.read_csv(f'{data_path}raw/content.csv', sep='\t', header=None, index_col=0)
test_x = torch.tensor(test_feats.sort_index().to_numpy(), dtype=torch.float)
test_id = test_df['id'].to_numpy()
# Shape [2, num_test_edges]: row 0 holds the 'from' ids, row 1 the 'to' ids.
test_edge_index = torch.tensor(test_df[['from', 'to']].to_numpy().T)

In [15]:
model.eval()
with torch.no_grad():
    # Node embeddings computed on the test-split graph instead of the training graph.
    z = model.encode(test_data.x, test_data.edge_index)

    # Score on whichever device the embeddings live on, then bring the
    # probabilities back to the CPU so they can be converted to NumPy later.
    out = model.decode(z, test_edge_index.to(z.device)).view(-1)
    out = torch.sigmoid(out).cpu()
    print(out)

tensor([0.5975, 0.5052, 0.9839, 0.6761, 0.6173, 0.9601, 0.3580, 0.3941, 0.6235,
        0.4937, 0.5541, 0.6522, 0.4823, 0.4398, 0.5000, 0.9680, 0.4968, 0.9257,
        0.4652, 0.7542, 1.0000, 0.8318, 0.9931, 0.2889, 0.4958, 0.4851, 0.9299,
        0.5344, 0.5728, 0.8609, 0.2181, 0.9536, 0.9846, 0.5634, 0.5161, 0.6526,
        0.4093, 0.9471, 0.8417, 0.5063, 0.8684, 0.3663, 0.3953, 0.3796, 0.7286,
        0.4578, 0.7610, 0.4958, 0.8948, 0.4786, 0.4826, 0.4614, 0.4819, 0.4704,
        0.9470, 0.4858, 0.9965, 0.4965, 0.5416, 0.0590, 0.9881, 0.4738, 0.4474,
        0.4599, 0.5277, 0.6216, 0.6120, 0.9351, 0.5837, 0.4506, 0.4565, 0.6029,
        0.6940, 0.3169, 0.9752, 0.5636, 0.8468, 0.9952, 0.6413, 0.9339, 0.8322,
        0.2608, 0.5586, 0.4189, 0.9890, 0.4770, 0.9937, 0.3789, 0.3520, 0.9245,
        0.8195, 0.8295, 0.7174, 0.4836, 0.5056, 0.4828, 0.9946, 0.5611, 0.5051,
        0.5477, 0.6453, 0.9744, 0.7512, 0.5034, 0.6315, 0.7321, 0.7832, 0.6502,
        0.9408, 0.9315, 0.3488, 0.5360, 

In [16]:
header = ['id', 'prob']
# Accept either a tensor (first run) or an already-converted array (re-run of
# this cell); .cpu() keeps the conversion valid for CUDA tensors.
out = out.detach().cpu().numpy() if torch.is_tensor(out) else np.asarray(out)
# One row per test edge; probabilities are written as strings, as before.
output_csv = pd.DataFrame({'id': test_id, 'prob': [str(p) for p in out]}, columns=header)
# Make sure the target directory exists before writing.
os.makedirs(data_path+'submission/', exist_ok=True)
output_csv.to_csv(data_path+'submission/'+tra_val_store_file+'.csv', index=False)