In [1]:
# import logging
import os
import random
import sys

import networkx as nx
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric.transforms as T
from sklearn.metrics import roc_auc_score, average_precision_score
from torch_geometric.data import InMemoryDataset, Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import SAGEConv, GCNConv, GraphSAGE
from torch_geometric.utils import from_networkx, negative_sampling, to_networkx
from tqdm.notebook import tqdm
# # from tqdm import tqdm

from logs import log

### GCN model version logs:
    * ver1: GCN first try. (submission.csv)
          |_2: Use the testing node embedding for testing & uploading.
          |_3: (out_dim=64 --> 32, lr=0.005 --> 0.001)
            |_2: Use the testing node embedding for testing & uploading.
    * ver2: Add one more fully-connected layer. (fc_submission.csv)
          |_2: Use the testing node embedding for testing & uploading.
    * ver3: tune hyperparameter (out_dim=64 --> 32, lr=0.005 --> 0.001)
          |_2: Use the testing node embedding for testing & uploading.
    * ver4: normalize data.x (use ver1 framework, lr=0.005, out_dim=64)
          |_2: Use the testing node embedding for testing & uploading.
    * ver5: normalize data.x (use ver1 framework, lr=0.005, out_dim=32)
          |_2: Use the testing node embedding for testing & uploading.
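    * ver6: this notebook (ver1 framework without feature normalization; hidden_dim=128, out_dim=32, lr=0.001, 300 epochs)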
    

## Read data

In [2]:
"""Datasets:
    * id: edge id, 
    * from & to: 'from' node point to 'to' node, 
    * label: connect or not.
    * content: containing each node's attribute.

   Evaluate:
    * AUC: area under ROC curve
    * AP: average precision
"""
data_path = './dataset1/'
model_version = 'GCN_ver6'
upload_dataset_info = '_submission'
store_file = model_version + upload_dataset_info
tra_val_store_file = model_version + '_2' + upload_dataset_info
log_file = 'logs/' + store_file + '.log'
logger = log(path=data_path, file=log_file)

df_train = pd.read_csv(data_path+'raw/train.csv').sort_values('from')
df_test = pd.read_csv(data_path+'raw/test.csv')
df_content = pd.read_csv(data_path+'raw/content.csv', delimiter='\t', header=None)
df_upload = pd.read_csv(data_path+'raw/upload.csv')

In [3]:
print(f'Node feature shape: {df_content.shape}')
tmp_node_feats = df_content.set_index(0)
tmp_node_ids = tmp_node_feats.index.values.astype(int)

Node feature shape: (877, 1704)


In [4]:
df_train

Unnamed: 0,id,to,from,label
266,E2202,470,0,0
191,E937,689,0,0
891,E2414,742,0,0
1066,E3176,677,0,1
268,E960,260,0,0
...,...,...,...,...
313,E403,466,873,0
894,E343,640,874,1
2105,E2565,290,875,1
1511,E2136,612,875,1


## Datasets

In [5]:
class Graph_dataset(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None):
        super(Graph_dataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])
    
    @property
    def raw_file_names(self):
        return ['train.csv', 'content.csv']
    
    @property
    def processed_file_names(self):
        return ['train.pt']
    
    def download(self):
        pass

    def process(self):
        self.data = pd.read_csv(self.raw_paths[0]).sort_values('from')
        node_feats = pd.read_csv(self.raw_paths[1], delimiter='\t', header=None, index_col=0)
        
        # Get node features. [num_nodes, num_node_features]
        x = torch.tensor(node_feats.sort_index().values, dtype=torch.float)
        
        # Get positive data.(label = 1: link)
        pos_data = self.data[self.data['label'] == 1]
        # neg_data = self.data[self.data['label'] == 0]

        # Get edge index.
        graph = nx.from_pandas_edgelist(pos_data, 'from', 'to', edge_attr=None)

        pair1 = [i[0] for i in graph.edges()]
        pair2 = [i[1] for i in graph.edges()]
        pos_edge_index = torch.LongTensor([pair1+pair2,pair2+pair1])

        # Create Data object.
        proc_graph = Data(x=x,
                          edge_index=pos_edge_index,
                          y=None)
        print(proc_graph)

        data, slices = self.collate([proc_graph])
        torch.save((data, slices), self.processed_paths[0])

In [6]:
"""Total content.csv nodes = 2708
   Total train.csv nodes = 2704
   Total train.csv positive link nodes = 2590
"""
demo = Graph_dataset(data_path)

In [7]:
for times, data in enumerate(demo, 1):
    print(data)
    print(data.x.size(0))
    print(data.x)
    print(data.num_node_features)

    # using this to check whether data.edge_index is fulfilled data.x values.
    data.validate(raise_on_error=True)

Data(x=[877, 1703], edge_index=[2, 2368])
877
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.]])
1703


## Model

In [8]:
class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels) -> None:
        super(GCN, self).__init__()
        # self.fc1 = nn.Linear(in_channels, hidden_channels)
        self.conv1 = GCNConv(in_channels, hidden_channels)
        # self.conv1 = GCNConv(hidden_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
    
    def encode(self, x, edge_index):
        # x = F.relu(self.fc1(x))
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x
    
    def decode(self, z, edge_label_index):
        return (z[edge_label_index[0]] * z[edge_label_index[1]]).sum(dim=-1)
    
    # def decode_all(self, z):
    #     prob_adj = torch.matmul(z, z.t()) # z @ z.t()
    #     return(prob_adj > 0).nonzero(as_tuple=False).t()

## Training & Validating

In [9]:
def train_link_predictor(model, train_data, val_data, optimizer, criterion, n_epochs=200):
    logger.info('Training Start')
    for epoch in tqdm(range(1, n_epochs+1)):
        model.train()
        optimizer.zero_grad()
        z = model.encode(train_data.x, train_data.edge_index)

        # sampling training negatives for every training epoch
        neg_edge_index = negative_sampling(edge_index=train_data.edge_index,
                                           num_nodes=train_data.num_nodes,
                                           num_neg_samples=train_data.edge_label_index.size(1),
                                           method='sparse')
        
        edge_label_index = torch.cat([train_data.edge_label_index, 
                                      neg_edge_index], 
                                      dim=-1)


        edge_label = torch.cat([train_data.edge_label,
                                train_data.edge_label.new_zeros(neg_edge_index.size(1))], 
                                dim=0)
        
        out = model.decode(z, edge_label_index).view(-1)

        loss = criterion(out, edge_label)
        loss.backward()
        optimizer.step()

        val_auc, val_ap = eval_link_predictor(model, train_data, val_data, None)
        # print(f'Epoch: {epoch:03d}, Train Loss: {loss:.3f}, Val AUC: {val_auc:.3f}')
        if epoch % 10 == 0:
            # print(f'Epoch: {epoch:03d}, Train Loss: {loss:.3f}, Val AUC: {val_auc:.3f}, Val AP: {val_ap:.3f}')
            logger.info(f'Epoch: {epoch:03d}, Train Loss: {loss:.3f}, Val AUC: {val_auc:.3f}, Val AP: {val_ap:.3f}')

    logger.info('Training End --------------------------------')
    return model

def eval_link_predictor(model, train_data, val_data, test_data=None):
    model.eval()
    with torch.no_grad():
        if test_data == None: 
            z = model.encode(train_data.x, train_data.edge_index)
        else:
            # 'test_data.edge_index' include 'train_data' & 'val_data' pos_edge_index.
            z = model.encode(test_data.x, test_data.edge_index)

        out = model.decode(z, val_data.edge_label_index).view(-1)
        out = torch.sigmoid(out)

        auc = roc_auc_score(val_data.edge_label.cpu().numpy(), out.cpu().numpy())
        ap = average_precision_score(val_data.edge_label.cpu().numpy(), out.cpu().numpy())

    return auc, ap

In [10]:
data = demo.data
# data = T.NormalizeFeatures()(data)
train_data, val_data, test_data = T.RandomLinkSplit(num_val=0.05, 
                                                    num_test=0.1, 
                                                    is_undirected=True, 
                                                    add_negative_train_samples=False
                                                    )(data)
print(data)
print(train_data)
print(val_data)
print(test_data)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN(data.num_features, 128, 32).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)
criterion = torch.nn.BCEWithLogitsLoss()

model = train_link_predictor(model, train_data, val_data, optimizer, criterion, n_epochs=300)

test_auc, test_ap = eval_link_predictor(model, train_data, test_data, None)
logger.info(f"Test AUC: {test_auc:.3f}, Test AP: {test_ap:.3f}")
test_auc, test_ap = eval_link_predictor(model, train_data, test_data, test_data)
logger.info(f"tra_val Test AUC: {test_auc:.3f}, Test AP: {test_ap:.3f}")

INFO Training Start


Data(x=[877, 1703], edge_index=[2, 2368])
Data(x=[877, 1703], edge_index=[2, 2134], edge_label=[1067], edge_label_index=[2, 1067])
Data(x=[877, 1703], edge_index=[2, 2134], edge_label=[124], edge_label_index=[2, 124])
Data(x=[877, 1703], edge_index=[2, 2258], edge_label=[250], edge_label_index=[2, 250])


  0%|          | 0/300 [00:00<?, ?it/s]

INFO Epoch: 010, Train Loss: 0.443, Val AUC: 0.917, Val AP: 0.916
INFO Epoch: 020, Train Loss: 0.404, Val AUC: 0.925, Val AP: 0.921
INFO Epoch: 030, Train Loss: 0.410, Val AUC: 0.924, Val AP: 0.925
INFO Epoch: 040, Train Loss: 0.403, Val AUC: 0.907, Val AP: 0.913
INFO Epoch: 050, Train Loss: 0.391, Val AUC: 0.889, Val AP: 0.900
INFO Epoch: 060, Train Loss: 0.399, Val AUC: 0.894, Val AP: 0.906
INFO Epoch: 070, Train Loss: 0.389, Val AUC: 0.902, Val AP: 0.911
INFO Epoch: 080, Train Loss: 0.389, Val AUC: 0.909, Val AP: 0.919
INFO Epoch: 090, Train Loss: 0.393, Val AUC: 0.895, Val AP: 0.907
INFO Epoch: 100, Train Loss: 0.379, Val AUC: 0.877, Val AP: 0.901
INFO Epoch: 110, Train Loss: 0.387, Val AUC: 0.877, Val AP: 0.899
INFO Epoch: 120, Train Loss: 0.378, Val AUC: 0.876, Val AP: 0.900
INFO Epoch: 130, Train Loss: 0.384, Val AUC: 0.880, Val AP: 0.903
INFO Epoch: 140, Train Loss: 0.386, Val AUC: 0.882, Val AP: 0.900
INFO Epoch: 150, Train Loss: 0.377, Val AUC: 0.880, Val AP: 0.895
INFO Epoch

## Uploading

In [11]:
test_df = pd.read_csv(data_path+'raw/test.csv')
test_feats = pd.read_csv(data_path+'raw/content.csv', delimiter='\t', header=None, index_col=0)
test_x = torch.tensor(test_feats.sort_index().values, dtype=torch.float)
test_id = test_df['id'].values
test_edge_index = torch.tensor(test_df[['from', 'to']].values.T)

In [12]:
model.eval()
with torch.no_grad():
    z = model.encode(train_data.x, train_data.edge_index)

    out = model.decode(z, test_edge_index).view(-1)
    out = torch.sigmoid(out)
    print(out)

tensor([0.6190, 0.9709, 0.9561, 0.7320, 0.3349, 0.7913, 0.6741, 0.8253, 0.9254,
        0.4418, 0.4138, 0.7324, 0.7034, 0.7959, 0.3676, 0.9055, 0.4899, 0.5604,
        0.5313, 0.5169, 0.9977, 0.3090, 0.9334, 0.4100, 0.4352, 0.4613, 0.9584,
        0.4698, 0.6404, 0.7925, 0.3101, 0.9486, 0.9747, 0.9862, 0.8151, 0.7315,
        0.6564, 0.9800, 0.9876, 0.4678, 0.9417, 0.5968, 0.4183, 0.4993, 0.3933,
        0.6218, 0.8290, 0.4588, 0.9973, 0.4077, 0.3325, 0.4372, 0.4382, 0.6675,
        0.8001, 0.3850, 0.9756, 0.5160, 0.3732, 0.7520, 0.9552, 0.5900, 0.2158,
        0.3715, 0.6578, 0.9296, 0.5598, 0.6836, 0.3514, 0.3723, 0.3925, 0.6533,
        0.8393, 0.4605, 0.9320, 0.4045, 0.8981, 0.9748, 0.2287, 0.8354, 0.8890,
        0.4663, 0.7931, 0.8108, 0.9935, 0.4055, 0.9934, 0.5956, 0.9285, 0.9842,
        0.9278, 0.7946, 0.8219, 0.4433, 0.4454, 0.3936, 0.9558, 0.4014, 0.3865,
        0.4205, 0.9784, 0.2887, 0.8600, 0.6646, 0.6539, 0.7229, 0.7422, 0.9721,
        0.9742, 0.9939, 0.4093, 0.4704, 

In [13]:
header = ['id', 'prob']
out = out.numpy()
output_csv = []
for ind, val in enumerate(test_id):
    output_csv.append([val, str(out[ind])])
output_csv = pd.DataFrame(output_csv, columns=header)
output_csv.to_csv(data_path+'submission/'+store_file+'.csv', index=False)

### tra_val inference

In [14]:
test_df = pd.read_csv(data_path+'raw/test.csv')
test_feats = pd.read_csv(data_path+'raw/content.csv', delimiter='\t', header=None, index_col=0)
test_x = torch.tensor(test_feats.sort_index().values, dtype=torch.float)
test_id = test_df['id'].values
test_edge_index = torch.tensor(test_df[['from', 'to']].values.T)

In [15]:
model.eval()
with torch.no_grad():
    z = model.encode(test_data.x, test_data.edge_index)

    out = model.decode(z, test_edge_index).view(-1)
    out = torch.sigmoid(out)
    print(out)

tensor([0.5677, 0.9660, 0.9558, 0.7285, 0.3548, 0.7811, 0.6699, 0.8333, 0.9242,
        0.4389, 0.4180, 0.7299, 0.6954, 0.8245, 0.3685, 0.9055, 0.4942, 0.6369,
        0.5320, 0.5173, 0.9968, 0.3128, 0.9206, 0.4121, 0.4339, 0.4615, 0.9584,
        0.4931, 0.6428, 0.7852, 0.2705, 0.9461, 0.9746, 0.9855, 0.8027, 0.7196,
        0.6123, 0.9796, 0.9813, 0.4686, 0.9416, 0.5255, 0.4363, 0.4991, 0.3836,
        0.6051, 0.8133, 0.4536, 0.9965, 0.4264, 0.3325, 0.4359, 0.4383, 0.6675,
        0.9469, 0.3850, 0.9745, 0.5076, 0.3994, 0.7441, 0.9413, 0.5884, 0.2158,
        0.3721, 0.6603, 0.9187, 0.5565, 0.7165, 0.5099, 0.3726, 0.4020, 0.7131,
        0.8393, 0.4160, 0.9320, 0.4105, 0.9000, 0.9683, 0.2559, 0.8421, 0.8780,
        0.4596, 0.7649, 0.8231, 0.9919, 0.4130, 0.9901, 0.5931, 0.9250, 0.9842,
        0.9297, 0.7936, 0.8069, 0.4485, 0.4790, 0.3941, 0.9565, 0.4023, 0.3934,
        0.4410, 0.9735, 0.3321, 0.8558, 0.6594, 0.6532, 0.7088, 0.7333, 0.9730,
        0.9730, 0.9935, 0.4120, 0.4699, 

In [16]:
header = ['id', 'prob']
out = out.numpy()
output_csv = []
for ind, val in enumerate(test_id):
    output_csv.append([val, str(out[ind])])
output_csv = pd.DataFrame(output_csv, columns=header)
output_csv.to_csv(data_path+'submission/'+tra_val_store_file+'.csv', index=False)