In [1]:
import os
import random

import networkx as nx
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric.transforms as T
from sklearn.metrics import roc_auc_score
from torch_geometric.data import InMemoryDataset, Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import SAGEConv, GCNConv, GAE, VGAE, APPNP
from torch_geometric.utils import from_networkx, negative_sampling, to_networkx
from tqdm.notebook import tqdm
# from tqdm import tqdm

from logs import log

### GNAE model version log
    * ver1: copied from VGNAE.ipynb, then changed the Encoder and the model.
          |_2: use the testing node embeddings for testing and uploading.
    * ver2: tuned a hyperparameter (APPNP alpha=0.15).
          |_2: use the testing node embeddings for testing and uploading.

## Read data

In [2]:
"""Datasets:
    * id: edge id, 
    * from & to: 'from' node point to 'to' node, 
    * label: connect or not.
    * content: containing each node's attribute.

   Evaluate:
    * AUC: area under ROC curve
    * AP: average precision
"""
# Root folder of the dataset and the base name shared by the log file and the submission file.
data_path = './dataset1/'
store_file = 'unGNAE_ver3_submission'
log_file = 'logs/' + store_file + '.log'

# Project-local logger; it receives the dataset root and the log file name built above.
logger = log(file=log_file, path=data_path)

# Comma-separated edge tables.
df_train = pd.read_csv(data_path + 'raw/train.csv')
df_test = pd.read_csv(data_path + 'raw/test.csv')
df_upload = pd.read_csv(data_path + 'raw/upload.csv')

# Node attribute table: tab-separated and without a header row.
df_content = pd.read_csv(data_path + 'raw/content.csv', sep='\t', header=None)

In [3]:
# Report the raw size of the content table (this still counts column 0).
print(f'Node feature shape: {df_content.shape}')

# Promote column 0 to the index, then pull the index out as an integer array.
tmp_node_feats = df_content.set_index(0)
tmp_node_ids = tmp_node_feats.index.to_numpy().astype(int)

Node feature shape: (2708, 1434)


In [4]:
# Display the training edge table loaded from raw/train.csv.
df_train

Unnamed: 0,id,to,from,label
0,E10311,2399,2339,0
1,E10255,2397,1144,1
2,E10667,854,1726,0
3,E9395,872,702,0
4,E5926,2450,1312,1
...,...,...,...,...
8681,E1171,1643,1383,0
8682,E4741,1879,1443,1
8683,E9256,171,1711,1
8684,E4322,633,2440,1


## Datasets

In [5]:
class Graph_dataset(InMemoryDataset):
    """Single-graph dataset built from the labelled edge list and the node content table.

    Raw inputs are ``train.csv`` (edges with a ``label`` column) and
    ``content.csv`` (tab-separated, no header, first column used as index).
    The processed graph is written to ``train.pt`` and loaded in ``__init__``.

    Args:
        root: Dataset root folder, forwarded to ``InMemoryDataset``.
        transform: Forwarded unchanged to ``InMemoryDataset``.
        pre_transform: Forwarded unchanged to ``InMemoryDataset``.
    """

    def __init__(self, root, transform=None, pre_transform=None):
        super(Graph_dataset, self).__init__(root, transform, pre_transform)
        # NOTE(review): recent PyTorch releases may default torch.load to
        # weights_only=True, which could reject a pickled Data object -
        # confirm against the installed version.
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """Names of the raw files used by ``process``."""
        return ['train.csv', 'content.csv']

    @property
    def processed_file_names(self):
        """Name of the processed file written by ``process``."""
        return ['train.pt']

    def download(self):
        """Nothing to download; this method is intentionally a no-op."""
        pass

    def process(self):
        """Build one ``Data`` object from the positive edges and save it to disk."""
        # Keep the edge table in a local variable: assigning a DataFrame to
        # `self.data` would overwrite the attribute that `__init__` uses for
        # the collated graph storage.
        edges_df = pd.read_csv(self.raw_paths[0]).sort_values('from')
        node_feats = pd.read_csv(self.raw_paths[1], delimiter='\t', header=None, index_col=0)

        # Get node features. [num_nodes, num_node_features]
        # Rows are ordered by node id.
        # NOTE(review): edge_index below uses the raw ids as row positions, so
        # this presumably expects ids 0..num_nodes-1 - confirm for new data.
        x = torch.tensor(node_feats.sort_index().values, dtype=torch.float)

        # Get positive data.(label = 1: link)
        pos_data = edges_df[edges_df['label'] == 1]

        # Get edge index. The edge list goes through networkx, and every edge
        # is then emitted in both directions.
        graph = nx.from_pandas_edgelist(pos_data, 'from', 'to', edge_attr=None)
        edge_list = list(graph.edges())
        sources = [u for u, _ in edge_list]
        targets = [v for _, v in edge_list]
        pos_edge_index = torch.LongTensor([sources + targets, targets + sources])

        # Create Data object.
        proc_graph = Data(x=x,
                          edge_index=pos_edge_index,
                          y=None)
        print(proc_graph)

        data, slices = self.collate([proc_graph])
        torch.save((data, slices), self.processed_paths[0])

In [6]:
# Instantiate the dataset rooted at data_path; __init__ loads the processed graph file.
demo = Graph_dataset(data_path)

In [7]:
for times, data in enumerate(demo, 1):
    # Print, in order: the graph summary, the node count, the feature matrix,
    # the edge index, the labels and the number of node features.
    for shown in (data,
                  data.x.size(0),
                  data.x,
                  data.edge_index,
                  data.y,
                  data.num_node_features):
        print(shown)

    # Sanity check that data.edge_index is consistent with data.x; raises on failure.
    data.validate(raise_on_error=True)

Data(x=[2708, 1433], edge_index=[2, 8472])
2708
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
tensor([[   1,    4,    4,  ..., 1402, 2362, 1210],
        [ 962, 2062, 1547,  ..., 2690, 2691, 2697]])
None
1433


## Model

In [8]:
class Encoder(torch.nn.Module):
    """Linear projection, L2 normalisation with a fixed scale, then APPNP propagation.

    Args:
        in_channels: Size of each input node feature vector.
        out_channels: Size of each output embedding.
        edge_index: Unused; kept so existing constructor calls keep working.
        K: Number of APPNP propagation steps (default 1, the previous
            hard-coded value).
        alpha: APPNP ``alpha`` argument (default 0.15, the previous
            hard-coded value).
        scaling_factor: Multiplier applied to the L2-normalised embeddings
            (default 1.8, the previous hard-coded value).
    """

    def __init__(self, in_channels, out_channels, edge_index,
                 K=1, alpha=0.15, scaling_factor=1.8):
        super(Encoder, self).__init__()
        self.linear1 = nn.Linear(in_channels, out_channels)
        self.propagate = APPNP(K=K, alpha=alpha)
        self.scaling_factor = scaling_factor

    def forward(self, x, edge_index, not_prop=0):
        """Return node embeddings for features ``x`` on the graph ``edge_index``.

        ``not_prop`` is accepted for signature compatibility and is ignored.
        """
        x = self.linear1(x)
        # Normalise each embedding to unit L2 norm, then rescale.
        x = F.normalize(x, p=2, dim=-1) * self.scaling_factor
        x = self.propagate(x, edge_index)

        return x

## Training

In [9]:
def train_link_predictor(model, train_data, val_data, optimizer, n_epochs=200):
    """Train ``model`` on ``train_data`` and log validation metrics every 10 epochs.

    Args:
        model: Object providing ``train``, ``eval``, ``encode`` and ``recon_loss``.
        train_data: Split whose ``x`` and ``edge_index`` are encoded and whose
            ``edge_index`` is passed to ``recon_loss`` as the positive edges.
        val_data: Split handed to ``eval_link_predictor`` for validation.
        optimizer: Optimizer stepping the model parameters.
        n_epochs: Number of full-graph training steps.

    Returns:
        The same ``model`` object, left in eval mode.
    """
    logger.info('Training Start')
    for epoch in tqdm(range(1, n_epochs+1)):
        model.train()
        optimizer.zero_grad()
        z = model.encode(train_data.x, train_data.edge_index)

        # Only positive edges are passed; no negative edges are supplied here.
        loss = model.recon_loss(z, train_data.edge_index)

        loss.backward()
        optimizer.step()

        # Validation is only reported every 10th epoch, so only compute it
        # then instead of running an extra forward pass on every epoch.
        if epoch % 10 == 0:
            val_auc, val_ap = eval_link_predictor(model, train_data, val_data, None)
            logger.info(f'Epoch: {epoch:03d}, Train Loss: {loss.item():.3f}, Val AUC: {val_auc:.3f}, Val AP: {val_ap:.3f}')

    # Previously the per-epoch evaluation always left the model in eval mode;
    # keep that post-condition regardless of n_epochs.
    model.eval()
    logger.info('Training End --------------------------------')
    return model

def eval_link_predictor(model, train_data, val_data, test_data=None):
    """Score the labelled edges of ``val_data`` and return ``model.test``'s result.

    Args:
        model: Object providing ``eval``, ``encode`` and ``test``.
        train_data: Split whose ``x`` / ``edge_index`` are encoded when
            ``test_data`` is None.
        val_data: Split whose ``pos_edge_label_index`` and
            ``neg_edge_label_index`` are scored.
        test_data: Optional split; when given, its ``x`` / ``edge_index`` are
            encoded instead of ``train_data``'s.

    Returns:
        Whatever ``model.test`` returns (callers unpack it as ``(auc, ap)``).
    """
    model.eval()
    # Identity check: `== None` would defer to a custom __eq__ on the split.
    # 'test_data.edge_index' include 'train_data' & 'val_data' pos_edge_index.
    graph = train_data if test_data is None else test_data
    with torch.no_grad():
        z = model.encode(graph.x, graph.edge_index)

    return model.test(z, val_data.pos_edge_label_index, val_data.neg_edge_label_index)

In [10]:
# Seed the Python, NumPy and PyTorch RNGs so that re-running this cell is
# repeatable (the link split below is random).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Take the single graph through the dataset's indexing interface instead of
# its internal `data` storage, then apply the feature normalisation transform.
data = demo[0]
data = T.NormalizeFeatures()(data)
train_data, val_data, test_data = T.RandomLinkSplit(num_val=0.05, 
                                                    num_test=0.1, 
                                                    split_labels=True, 
                                                    is_undirected=True, 
                                                    add_negative_train_samples=False
                                                    )(data)
print(data)
print(train_data)
print(val_data)
print(test_data)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The model is moved to `device`, so the splits must be moved as well;
# otherwise the first forward pass fails on a CUDA machine.
train_data = train_data.to(device)
val_data = val_data.to(device)
test_data = test_data.to(device)

model = GAE(Encoder(data.num_features, 64, train_data.edge_index)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

model = train_link_predictor(model, train_data, val_data, optimizer, n_epochs=300)
# Test edges are scored on embeddings encoded from test_data's own graph.
test_auc, test_ap = eval_link_predictor(model, train_data, test_data, test_data)
# print(f"Test AUC: {test_auc:.3f}, Test AP: {test_ap:.3f}")
logger.info(f"Test AUC: {test_auc:.3f}, Test AP: {test_ap:.3f}")

INFO Training Start


Data(x=[2708, 1433], edge_index=[2, 8472])
Data(x=[2708, 1433], edge_index=[2, 7204], pos_edge_label=[3602], pos_edge_label_index=[2, 3602])
Data(x=[2708, 1433], edge_index=[2, 7204], pos_edge_label=[211], pos_edge_label_index=[2, 211], neg_edge_label=[211], neg_edge_label_index=[2, 211])
Data(x=[2708, 1433], edge_index=[2, 7626], pos_edge_label=[423], pos_edge_label_index=[2, 423], neg_edge_label=[423], neg_edge_label_index=[2, 423])


  0%|          | 0/300 [00:00<?, ?it/s]

INFO Epoch: 010, Train Loss: 1.133, Val AUC: 0.751, Val AP: 0.776
INFO Epoch: 020, Train Loss: 1.201, Val AUC: 0.679, Val AP: 0.682
INFO Epoch: 030, Train Loss: 1.203, Val AUC: 0.671, Val AP: 0.679
INFO Epoch: 040, Train Loss: 1.195, Val AUC: 0.655, Val AP: 0.664
INFO Epoch: 050, Train Loss: 1.185, Val AUC: 0.680, Val AP: 0.677
INFO Epoch: 060, Train Loss: 1.203, Val AUC: 0.659, Val AP: 0.656
INFO Epoch: 070, Train Loss: 1.184, Val AUC: 0.677, Val AP: 0.675
INFO Epoch: 080, Train Loss: 1.187, Val AUC: 0.665, Val AP: 0.663
INFO Epoch: 090, Train Loss: 1.183, Val AUC: 0.672, Val AP: 0.673
INFO Epoch: 100, Train Loss: 1.182, Val AUC: 0.658, Val AP: 0.658
INFO Epoch: 110, Train Loss: 1.176, Val AUC: 0.674, Val AP: 0.673
INFO Epoch: 120, Train Loss: 1.182, Val AUC: 0.668, Val AP: 0.668
INFO Epoch: 130, Train Loss: 1.179, Val AUC: 0.670, Val AP: 0.669
INFO Epoch: 140, Train Loss: 1.189, Val AUC: 0.662, Val AP: 0.662
INFO Epoch: 150, Train Loss: 1.180, Val AUC: 0.679, Val AP: 0.678
INFO Epoch

In [10]:
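# NOTE: disabled legacy training loop, kept for reference only. It calls
# model.kl_loss() and eval_link_predictor(model, val_data), which does not
# match the current eval_link_predictor(model, train_data, val_data, ...) signature.
# for epoch in tqdm(range(1, 200+1)):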
#     model.train()
#     optimizer.zero_grad()
#     z = model.encode(data.x, data.edge_index)

#     loss = model.recon_loss(z, train_data.pos_edge_label_index)
#     loss = loss + (1/train_data.num_nodes) * model.kl_loss()
#     loss.backward()
#     optimizer.step()

#     val_auc, val_ap = eval_link_predictor(model, val_data)
#     if epoch % 10 == 0:
#         print('Epoch: {:03d}, TRAIN LOSS: {:.4f}, VAL AUC: {:.4f}, VAL AP: {:.4f}'.format(epoch, loss, val_auc, val_ap))

## Uploading

In [11]:
# Edge list to score; its `id` column labels the submission rows.
test_df = pd.read_csv(data_path + 'raw/test.csv')
test_id = test_df['id'].to_numpy()

# Stack the endpoints as a [2, num_edges] tensor: row 0 is 'from', row 1 is 'to'.
test_endpoints = test_df[['from', 'to']].to_numpy()
test_edge_index = torch.tensor(test_endpoints.T)

# Node attribute table (tab-separated, no header, first column as index),
# ordered by node id and converted to a float tensor.
test_feats = pd.read_csv(data_path + 'raw/content.csv', sep='\t', header=None, index_col=0)
test_x = torch.tensor(test_feats.sort_index().to_numpy(), dtype=torch.float)

In [12]:
# Score every edge in test_edge_index using embeddings encoded from test_data's graph.
model.eval()
with torch.no_grad():
    z = model.encode(test_data.x, test_data.edge_index)
    # Keep the edge index on the same device as the embeddings, and bring the
    # scores back to the CPU so later numpy/pandas code works on GPU runs too.
    out = model.decode(z, test_edge_index.to(z.device)).view(-1).cpu()
    print(out)
    # NOTE(review): the printed scores already lie in [0, 1], so the decoder
    # presumably applies a sigmoid itself - confirm before re-enabling this.
    # out = torch.sigmoid(out)

tensor([0.4559, 0.9760, 0.9253, 0.7663, 0.7521, 0.8544, 0.8814, 0.2172, 0.8532,
        0.4659, 0.4615, 0.9066, 0.6769, 0.8192, 0.5639, 0.6941, 0.5068, 0.8609,
        0.4046, 0.7240, 0.9997, 0.6451, 0.8915, 0.3801, 0.4174, 0.4500, 0.9158,
        0.4923, 0.7314, 0.7779, 0.4655, 0.9744, 0.9454, 0.9838, 0.9826, 0.6548,
        0.6930, 0.8715, 0.9438, 0.4547, 0.8844, 0.6759, 0.5002, 0.3781, 0.4889,
        0.3719, 0.7303, 0.5119, 0.9730, 0.4754, 0.5594, 0.5116, 0.4955, 0.5769,
        0.8655, 0.5549, 0.9770, 0.4166, 0.3652, 0.9873, 0.8994, 0.4411, 0.5387,
        0.4174, 0.5706, 0.8665, 0.6413, 0.9499, 0.2844, 0.5174, 0.5094, 0.8362,
        0.8755, 0.4922, 0.9585, 0.3869, 0.9751, 0.9305, 0.6518, 0.7793, 0.5941,
        0.3688, 0.6996, 0.9486, 0.9328, 0.5949, 0.9387, 0.4713, 0.5686, 0.9110,
        0.7375, 0.7933, 0.7143, 0.4767, 0.4765, 0.3759, 0.9228, 0.4603, 0.5729,
        0.5473, 0.7596, 0.8389, 0.5505, 0.8645, 0.6019, 0.5733, 0.6477, 0.9578,
        0.9077, 0.9846, 0.4128, 0.4335, 

In [13]:
header = ['id', 'prob']

# Convert the scores to a NumPy array. This works for tensors on any device
# and is a no-op when the cell is re-run and `out` is already an array.
out = out.detach().cpu().numpy() if torch.is_tensor(out) else np.asarray(out)

# Every test edge needs exactly one score; fail loudly instead of silently
# dropping scores when the lengths disagree.
if len(out) != len(test_id):
    raise ValueError(f'Expected {len(test_id)} predictions, got {len(out)}')

# Probabilities are written as strings, matching the earlier submission format.
output_csv = pd.DataFrame({'id': test_id, 'prob': [str(val) for val in out]}, columns=header)

# Create the output folder on first use so to_csv does not fail on a fresh checkout.
submission_dir = data_path+'submission/'
os.makedirs(submission_dir, exist_ok=True)
output_csv.to_csv(submission_dir+store_file+'.csv', index=False)