In [1]:
# import logging
import os
import random
import sys

import networkx as nx
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric.transforms as T
from sklearn.metrics import roc_auc_score, average_precision_score
from torch_geometric.data import InMemoryDataset, Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import SAGEConv, GCNConv, APPNP
from torch_geometric.utils import from_networkx, negative_sampling, to_networkx
from tqdm.notebook import tqdm
# # from tqdm import tqdm

from logs import log

### fcAPPNP model version log
* ver1: first fcAPPNP attempt.
  * `_2` variant: node embeddings computed on the test-split graph are used for testing & uploading.
* ver2:

## Read data

In [2]:
"""Datasets:
    * id: edge id, 
    * from & to: 'from' node point to 'to' node, 
    * label: connect or not.
    * content: containing each node's attribute.

   Evaluate:
    * AUC: area under ROC curve
    * AP: average precision
"""
# Root folder that holds the raw CSVs, the logs and the submission files.
data_path = './dataset1/'

# Names shared by the log file and the two submission CSVs written later.
model_version = 'fcAPPNP_ver2'
upload_dataset_info = '_submission'
store_file = f'{model_version}{upload_dataset_info}'
tra_val_store_file = f'{model_version}_2{upload_dataset_info}'
log_file = f'logs/{store_file}.log'
logger = log(path=data_path, file=log_file)

# Raw tables. The training edges are ordered by their 'from' column;
# content.csv is tab-separated and has no header row.
df_train = pd.read_csv(f'{data_path}raw/train.csv').sort_values('from')
df_test = pd.read_csv(f'{data_path}raw/test.csv')
df_content = pd.read_csv(f'{data_path}raw/content.csv', sep='\t', header=None)
df_upload = pd.read_csv(f'{data_path}raw/upload.csv')

In [3]:
# Report the size of the raw content table, then index it by its first
# column (treated as the node id) and keep those ids as an integer array.
print(f'Node feature shape: {df_content.shape}')
tmp_node_feats = df_content.set_index(0)
tmp_node_ids = tmp_node_feats.index.to_numpy().astype(int)

Node feature shape: (877, 1704)


In [4]:
# Display the training edge table (sorted by 'from' when it was loaded).
df_train

Unnamed: 0,id,to,from,label
266,E2202,470,0,0
191,E937,689,0,0
891,E2414,742,0,0
1066,E3176,677,0,1
268,E960,260,0,0
...,...,...,...,...
313,E403,466,873,0
894,E343,640,874,1
2105,E2565,290,875,1
1511,E2136,612,875,1


## Datasets

In [5]:
class Graph_dataset(InMemoryDataset):
    """Single-graph dataset built from ``raw/train.csv`` and ``raw/content.csv``.

    Node features come from ``content.csv`` (rows ordered by the id in its
    first column). Edges are the rows of ``train.csv`` whose ``label`` is 1,
    stored in both directions. The processed graph is cached as ``train.pt``.

    Args:
        root: Dataset root folder, forwarded to ``InMemoryDataset``.
        transform: Optional transform, forwarded to ``InMemoryDataset``.
        pre_transform: Optional pre-transform, forwarded to ``InMemoryDataset``.
    """

    def __init__(self, root, transform=None, pre_transform=None):
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """Raw files expected by the dataset (edge table first, node features second)."""
        return ['train.csv', 'content.csv']

    @property
    def processed_file_names(self):
        """Name of the cached, processed graph file."""
        return ['train.pt']

    def download(self):
        """Nothing to download; this method is a no-op."""
        pass

    def process(self):
        """Build the graph from the raw CSVs and save it to ``processed_paths[0]``.

        Raises:
            ValueError: If a positive edge refers to a node id that is not a
                valid row position of the node-feature matrix.
        """
        # Keep the edge table in a local variable: ``self.data`` is the slot
        # that __init__ fills with the collated graph, not a place for a DataFrame.
        edge_table = pd.read_csv(self.raw_paths[0]).sort_values('from')
        node_feats = pd.read_csv(self.raw_paths[1], delimiter='\t', header=None, index_col=0)

        # Node features, [num_nodes, num_node_features], rows ordered by node id.
        x = torch.tensor(node_feats.sort_index().values, dtype=torch.float)

        # Only rows labelled 1 are links.
        pos_data = edge_table[edge_table['label'] == 1]

        # Going through an undirected networkx graph collapses duplicate pairs.
        graph = nx.from_pandas_edgelist(pos_data, 'from', 'to', edge_attr=None)

        # [2, E] in one pass; reshape keeps the shape valid when there are no edges.
        edges = torch.tensor(list(graph.edges()), dtype=torch.long).reshape(-1, 2).t()
        # Append the reversed pairs so every link appears in both directions.
        pos_edge_index = torch.cat([edges, edges.flip(0)], dim=1)

        # Node ids are used directly as row positions of ``x``; fail early with
        # a clear message instead of caching an inconsistent graph.
        num_nodes = x.size(0)
        if pos_edge_index.numel() > 0:
            lowest = int(pos_edge_index.min())
            highest = int(pos_edge_index.max())
            if lowest < 0 or highest >= num_nodes:
                raise ValueError(
                    f'edge node ids span [{lowest}, {highest}] but only '
                    f'{num_nodes} feature rows are available'
                )

        # Create Data object.
        proc_graph = Data(x=x,
                          edge_index=pos_edge_index,
                          y=None)
        print(proc_graph)

        data, slices = self.collate([proc_graph])
        torch.save((data, slices), self.processed_paths[0])

In [6]:
"""Total content.csv nodes = 2708
   Total train.csv nodes = 2704
   Total train.csv positive link nodes = 2590
"""
# NOTE(review): the node counts quoted in the string just above (2708 / 2704 /
# 2590) do not match this run -- the loaded content.csv has 877 rows and the
# built graph reports 877 nodes. Presumably left over from another dataset;
# confirm and update.
# Instantiate the dataset rooted at data_path.
demo = Graph_dataset(data_path)

In [7]:
# Inspect every graph stored in the dataset (the unused enumerate counter was dropped).
for data in demo:
    print(data)
    print(data.x.size(0))
    print(data.x)
    print(data.num_node_features)

    # Sanity-check the graph, e.g. that edge_index only refers to existing
    # rows of data.x; raises instead of warning when the check fails.
    data.validate(raise_on_error=True)

Data(x=[877, 1703], edge_index=[2, 2368])
877
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.]])
1703


## Model

In [8]:
class fcAPPNP(torch.nn.Module):
    """Linear encoder followed by APPNP propagation, with a dot-product link decoder.

    Args:
        in_channels: Size of each input node feature vector.
        out_channels: Size of each node embedding.
        K: Number of APPNP propagation steps (default 1, the previous fixed value).
        alpha: APPNP ``alpha`` argument (default 0.15, the previous fixed value).
        scale: Factor applied to the L2-normalised embeddings before
            propagation (default 1.8, the previous fixed value).
    """

    def __init__(self, in_channels, out_channels, K=1, alpha=0.15, scale=1.8) -> None:
        super().__init__()
        self.scale = scale
        self.linear1 = nn.Linear(in_channels, out_channels)
        self.propagate = APPNP(K=K, alpha=alpha)

    def encode(self, x, edge_index):
        """Return node embeddings for features ``x`` on the graph ``edge_index``."""
        x = self.linear1(x)
        # Put every embedding on a sphere of radius ``scale`` before propagating.
        x = F.normalize(x, p=2, dim=-1) * self.scale
        x = self.propagate(x, edge_index)
        return x

    def decode(self, z, edge_label_index):
        """Return one logit per column of ``edge_label_index``.

        The logit is the dot product of the two endpoint embeddings taken from ``z``.
        """
        return (z[edge_label_index[0]] * z[edge_label_index[1]]).sum(dim=-1)

    # Disabled helper, kept for reference:
    # def decode_all(self, z):
    #     prob_adj = torch.matmul(z, z.t()) # z @ z.t()
    #     return(prob_adj > 0).nonzero(as_tuple=False).t()

## Training & Validating

In [9]:
def train_link_predictor(model, train_data, val_data, optimizer, criterion, n_epochs=200, log_every=10):
    """Train ``model`` for link prediction and log validation metrics periodically.

    Every epoch encodes the training graph, samples fresh negative edges,
    scores the label edges together with the negatives, and takes one
    optimizer step on ``criterion``.

    Args:
        model: Object exposing ``encode(x, edge_index)`` and ``decode(z, edge_label_index)``.
        train_data: Training split; must provide ``x``, ``edge_index``,
            ``edge_label_index``, ``edge_label`` and ``num_nodes``.
        val_data: Split whose label edges are scored when logging.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss taking ``(logits, labels)``.
        n_epochs: Number of training epochs.
        log_every: Evaluate on ``val_data`` and log every this many epochs.
            A falsy value disables the periodic evaluation.

    Returns:
        The trained ``model`` (the same object that was passed in).
    """
    logger.info('Training Start')
    for epoch in tqdm(range(1, n_epochs+1)):
        model.train()
        optimizer.zero_grad()
        z = model.encode(train_data.x, train_data.edge_index)

        # Draw new negatives every epoch, requesting as many as there are
        # training label edges.
        neg_edge_index = negative_sampling(edge_index=train_data.edge_index,
                                           num_nodes=train_data.num_nodes,
                                           num_neg_samples=train_data.edge_label_index.size(1),
                                           method='sparse')

        edge_label_index = torch.cat([train_data.edge_label_index,
                                      neg_edge_index],
                                     dim=-1)

        # Negatives get label 0; sized from what the sampler actually returned.
        edge_label = torch.cat([train_data.edge_label,
                                train_data.edge_label.new_zeros(neg_edge_index.size(1))],
                               dim=0)

        out = model.decode(z, edge_label_index).view(-1)

        loss = criterion(out, edge_label)
        loss.backward()
        optimizer.step()

        # The metrics are only reported on logging epochs, so only compute
        # them there instead of running a full evaluation every epoch.
        if log_every and epoch % log_every == 0:
            val_auc, val_ap = eval_link_predictor(model, train_data, val_data, None)
            logger.info(f'Epoch: {epoch:03d}, Train Loss: {loss:.3f}, Val AUC: {val_auc:.3f}, Val AP: {val_ap:.3f}')

    # Hand the model back in inference mode, as the per-epoch evaluation used to leave it.
    model.eval()
    logger.info('Training End --------------------------------')
    return model

def eval_link_predictor(model, train_data, val_data, test_data=None):
    """Score the label edges of ``val_data`` and return ``(auc, ap)``.

    Args:
        model: Object exposing ``encode(x, edge_index)`` and ``decode(z, edge_label_index)``.
        train_data: Graph used to compute node embeddings when ``test_data`` is None.
        val_data: Split whose ``edge_label_index`` / ``edge_label`` are evaluated
            (despite the name, callers also pass the test split here).
        test_data: Optional alternative graph to compute node embeddings on.

    Returns:
        Tuple ``(auc, ap)``: ROC AUC and average precision of the sigmoid scores.
    """
    model.eval()
    with torch.no_grad():
        # Identity check rather than ``== None``, so the result never depends
        # on how the data object implements equality.
        graph = train_data if test_data is None else test_data
        # When test_data is given, its edge_index includes the train_data and
        # val_data positive edges.
        z = model.encode(graph.x, graph.edge_index)

        out = model.decode(z, val_data.edge_label_index).view(-1)
        out = torch.sigmoid(out)

        # Convert once and reuse for both metrics.
        labels = val_data.edge_label.cpu().numpy()
        scores = out.cpu().numpy()
        auc = roc_auc_score(labels, scores)
        ap = average_precision_score(labels, scores)

    return auc, ap

In [10]:
# Seed every RNG in play so the random link split, the weight initialisation
# and the per-epoch negative sampling repeat across Restart & Run All.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Take the graph by indexing the dataset rather than reading its internal
# `.data` storage attribute.
data = demo[0]
data = T.NormalizeFeatures()(data)
train_data, val_data, test_data = T.RandomLinkSplit(num_val=0.05,
                                                    num_test=0.1,
                                                    is_undirected=True,
                                                    add_negative_train_samples=False
                                                    )(data)
print(data)
print(train_data)
print(val_data)
print(test_data)

# The model is moved to `device`, so the splits must be moved too; otherwise
# the first forward pass fails on a GPU machine with a device mismatch.
train_data = train_data.to(device)
val_data = val_data.to(device)
test_data = test_data.to(device)

model = fcAPPNP(data.num_features, 64).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.005)
criterion = torch.nn.BCEWithLogitsLoss()

model = train_link_predictor(model, train_data, val_data, optimizer, criterion, n_epochs=300)

# Score the held-out test edges twice: first with embeddings from the training
# graph, then with embeddings from the test-split graph.
test_auc, test_ap = eval_link_predictor(model, train_data, test_data, None)
logger.info(f"Test AUC: {test_auc:.3f}, Test AP: {test_ap:.3f}")
test_auc, test_ap = eval_link_predictor(model, train_data, test_data, test_data)
logger.info(f"tra_val Test AUC: {test_auc:.3f}, Test AP: {test_ap:.3f}")

INFO Training Start


Data(x=[877, 1703], edge_index=[2, 2368])
Data(x=[877, 1703], edge_index=[2, 2134], edge_label=[1067], edge_label_index=[2, 1067])
Data(x=[877, 1703], edge_index=[2, 2134], edge_label=[124], edge_label_index=[2, 124])
Data(x=[877, 1703], edge_index=[2, 2258], edge_label=[250], edge_label_index=[2, 250])


  0%|          | 0/300 [00:00<?, ?it/s]

INFO Epoch: 010, Train Loss: 0.532, Val AUC: 0.850, Val AP: 0.872
INFO Epoch: 020, Train Loss: 0.465, Val AUC: 0.894, Val AP: 0.909
INFO Epoch: 030, Train Loss: 0.445, Val AUC: 0.937, Val AP: 0.940
INFO Epoch: 040, Train Loss: 0.436, Val AUC: 0.957, Val AP: 0.959
INFO Epoch: 050, Train Loss: 0.430, Val AUC: 0.966, Val AP: 0.967
INFO Epoch: 060, Train Loss: 0.422, Val AUC: 0.972, Val AP: 0.973
INFO Epoch: 070, Train Loss: 0.405, Val AUC: 0.973, Val AP: 0.975
INFO Epoch: 080, Train Loss: 0.415, Val AUC: 0.974, Val AP: 0.975
INFO Epoch: 090, Train Loss: 0.417, Val AUC: 0.975, Val AP: 0.977
INFO Epoch: 100, Train Loss: 0.411, Val AUC: 0.972, Val AP: 0.976
INFO Epoch: 110, Train Loss: 0.412, Val AUC: 0.972, Val AP: 0.976
INFO Epoch: 120, Train Loss: 0.402, Val AUC: 0.973, Val AP: 0.977
INFO Epoch: 130, Train Loss: 0.404, Val AUC: 0.974, Val AP: 0.979
INFO Epoch: 140, Train Loss: 0.386, Val AUC: 0.973, Val AP: 0.978
INFO Epoch: 150, Train Loss: 0.398, Val AUC: 0.973, Val AP: 0.978
INFO Epoch

## Uploading

In [11]:
# Read the node pairs to score and the node feature table from the raw folder.
test_df = pd.read_csv(f'{data_path}raw/test.csv')
test_feats = pd.read_csv(
    f'{data_path}raw/content.csv',
    sep='\t',
    header=None,
    index_col=0,
)

# Features ordered by node id, the edge ids, and the pairs as a [2, num_pairs] tensor.
test_x = torch.tensor(test_feats.sort_index().to_numpy(), dtype=torch.float)
test_id = test_df['id'].to_numpy()
test_edge_index = torch.tensor(test_df[['from', 'to']].to_numpy().T)

In [12]:
# Inference only: switch to eval mode and skip gradient tracking.
model.eval()
with torch.no_grad():
    # Node embeddings come from the training-split graph.
    z = model.encode(train_data.x, train_data.edge_index)

    # One logit per test pair, squashed to a probability.
    out = torch.sigmoid(model.decode(z, test_edge_index).view(-1))
    print(out)

tensor([0.4198, 0.9822, 0.9093, 0.7215, 0.7458, 0.8618, 0.9084, 0.4951, 0.8971,
        0.5359, 0.4727, 0.9087, 0.7160, 0.8273, 0.5823, 0.8636, 0.5078, 0.8522,
        0.4287, 0.5022, 0.9995, 0.9559, 0.8286, 0.3855, 0.4303, 0.3476, 0.9180,
        0.4993, 0.7421, 0.7212, 0.3894, 0.9724, 0.9224, 0.9753, 0.9987, 0.5872,
        0.7308, 0.9225, 0.8904, 0.4273, 0.8803, 0.5545, 0.4280, 0.3805, 0.5630,
        0.4926, 0.7022, 0.4164, 0.9833, 0.6401, 0.6991, 0.4797, 0.5197, 0.4002,
        0.8539, 0.4055, 0.9484, 0.4432, 0.6218, 0.9849, 0.8545, 0.5412, 0.4046,
        0.3531, 0.5292, 0.8854, 0.5217, 0.4199, 0.4051, 0.4818, 0.5045, 0.7044,
        0.8740, 0.4987, 0.9508, 0.4643, 0.9937, 0.9025, 0.5694, 0.9503, 0.6443,
        0.3258, 0.6458, 0.7960, 0.9416, 0.7169, 0.9326, 0.5287, 0.7267, 0.9240,
        0.8876, 0.8691, 0.6737, 0.5114, 0.5809, 0.3931, 0.9870, 0.4895, 0.4739,
        0.5042, 0.9155, 0.8724, 0.6815, 0.6339, 0.6443, 0.7530, 0.6472, 0.9660,
        0.9132, 0.9859, 0.3619, 0.4297, 

In [13]:
header = ['id', 'prob']

# Copy the scores to host memory without rebinding `out`: the cell can then
# be re-run, and it also works when `out` lives on a GPU.
probs = torch.as_tensor(out).detach().cpu().numpy()
if len(probs) != len(test_id):
    raise ValueError(f'got {len(probs)} probabilities for {len(test_id)} test ids')

# Build the table in one go; probabilities are written via str(), as before.
output_csv = pd.DataFrame({'id': test_id, 'prob': [str(p) for p in probs]}, columns=header)

# Make sure the target folder exists before writing.
os.makedirs(data_path+'submission/', exist_ok=True)
output_csv.to_csv(data_path+'submission/'+store_file+'.csv', index=False)

### Inference with train + validation edges (`tra_val`)

In [14]:
# Same loading steps as for the first submission: the node pairs to score and
# the node feature table are read again from the raw folder.
test_df = pd.read_csv(f'{data_path}raw/test.csv')
test_feats = pd.read_csv(
    f'{data_path}raw/content.csv',
    sep='\t',
    header=None,
    index_col=0,
)

# Features ordered by node id, the edge ids, and the pairs as a [2, num_pairs] tensor.
test_x = torch.tensor(test_feats.sort_index().to_numpy(), dtype=torch.float)
test_id = test_df['id'].to_numpy()
test_edge_index = torch.tensor(test_df[['from', 'to']].to_numpy().T)

In [15]:
# Inference only: switch to eval mode and skip gradient tracking.
model.eval()
with torch.no_grad():
    # Node embeddings come from the test-split graph this time.
    z = model.encode(test_data.x, test_data.edge_index)

    # One logit per test pair, squashed to a probability.
    out = torch.sigmoid(model.decode(z, test_edge_index).view(-1))
    print(out)

tensor([0.4202, 0.9811, 0.9156, 0.7378, 0.7458, 0.8615, 0.9012, 0.4974, 0.8808,
        0.5361, 0.4725, 0.8912, 0.7153, 0.8273, 0.5822, 0.8551, 0.5078, 0.8497,
        0.4288, 0.4922, 0.9997, 0.9534, 0.8527, 0.3855, 0.4317, 0.3524, 0.9180,
        0.4995, 0.7425, 0.7081, 0.3950, 0.9716, 0.9244, 0.9719, 0.9988, 0.5855,
        0.7255, 0.9220, 0.8850, 0.4308, 0.8803, 0.5543, 0.4335, 0.3825, 0.5550,
        0.4926, 0.7006, 0.4185, 0.9853, 0.6353, 0.6991, 0.4790, 0.5197, 0.4002,
        0.8536, 0.4055, 0.9506, 0.4454, 0.6232, 0.9850, 0.8532, 0.5405, 0.4050,
        0.3562, 0.5288, 0.8854, 0.5216, 0.4272, 0.3854, 0.4818, 0.5046, 0.7240,
        0.8740, 0.4986, 0.9508, 0.4452, 0.9937, 0.9173, 0.5641, 0.9502, 0.6798,
        0.3262, 0.6453, 0.8269, 0.9361, 0.7091, 0.9261, 0.5275, 0.7197, 0.9240,
        0.8795, 0.8549, 0.6548, 0.5110, 0.5751, 0.3932, 0.9879, 0.4895, 0.4739,
        0.5013, 0.9185, 0.8724, 0.6752, 0.6368, 0.6443, 0.7521, 0.6537, 0.9657,
        0.9131, 0.9860, 0.3619, 0.4297, 

In [16]:
header = ['id', 'prob']

# Copy the scores to host memory without rebinding `out`: the cell can then
# be re-run, and it also works when `out` lives on a GPU.
probs = torch.as_tensor(out).detach().cpu().numpy()
if len(probs) != len(test_id):
    raise ValueError(f'got {len(probs)} probabilities for {len(test_id)} test ids')

# Build the table in one go; probabilities are written via str(), as before.
output_csv = pd.DataFrame({'id': test_id, 'prob': [str(p) for p in probs]}, columns=header)

# Make sure the target folder exists before writing.
os.makedirs(data_path+'submission/', exist_ok=True)
output_csv.to_csv(data_path+'submission/'+tra_val_store_file+'.csv', index=False)