In [1]:
import numpy as np
import pandas as pd
import torch
from logs import log
from tqdm.notebook import tqdm
# from tqdm import tqdm
import networkx as nx
import os
import torch.nn as nn
import torch_geometric.transforms as T
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv, GCNConv, GAE, VGAE, APPNP
from torch_geometric.data import InMemoryDataset, Data
from torch_geometric.loader import DataLoader
from torch_geometric.utils import from_networkx, negative_sampling, to_networkx
from sklearn.metrics import roc_auc_score, average_precision_score

### VGNAE model version logs:
    * ver1: VGNAE first try. 
    * ver2: Fix some problems.
    * ver3: Use the training node embedding for validation & testing & uploading.
    * ver4: Use the testing node embedding for testing & uploading.
            (testing edge_index include 'train_data' & 'val_data' pos_edge_index)
      Difference between ver3 & ver4: eval_link_predictor() must be changed in the inference phase and the uploading phase.
    * ver5: Use T.NormalizeFeatures()(data) (normalizes data.x) and train for 1000 epochs.
          |_2: like ver4 using testing node embedding.
    * ver6: set embedding dimension = 64 (128 --> 64), lr=0.005, epoch=300
          |_2: like ver4 using testing node embedding.
    * ver7: (APPNP(K=1, alpha=0)) --> APPNP(K=10, alpha=0.15)
### unVGNAE model version logs:
    * ver1: Change to training on undirected graph. (embedding dimension = 64, lr=0.005)
          |_2: Use the testing node embedding for testing & uploading.
    * ver2: Tune hyperparameter (APPNP alpha=0.15).
          |_2: Use the testing node embedding for testing & uploading.
    * ver3: Tune hyperparameter (APPNP K=10); ver2 performs better.
    * ver4: Run experiment.

## Read data

In [2]:
"""Datasets:
    * id: edge id, 
    * from & to: 'from' node point to 'to' node, 
    * label: connect or not.
    * content: containing each node's attribute.

   Evaluate:
    * AUC: area under ROC curve
    * AP: average precision
"""
# --- Run configuration ------------------------------------------------------
data_path = './dataset1/'
model_version = 'unVGNAE_ver4'
upload_dataset_info = '_submission'

# Names of the artefacts written by this run: the main submission, the
# "_2" variant (embedding computed from the test split), and the log file.
store_file = f'{model_version}{upload_dataset_info}'
tra_val_store_file = f'{model_version}_2{upload_dataset_info}'
log_file = f'logs/{store_file}.log'
logger = log(path=data_path, file=log_file)

# --- Raw CSV inputs ---------------------------------------------------------
raw_dir = f'{data_path}raw/'
# Training edges, ordered by their 'from' column.
df_train = pd.read_csv(f'{raw_dir}train.csv').sort_values(by='from')
df_test = pd.read_csv(f'{raw_dir}test.csv')
# content.csv is tab-separated and has no header row.
df_content = pd.read_csv(f'{raw_dir}content.csv', sep='\t', header=None)
df_upload = pd.read_csv(f'{raw_dir}upload.csv')

In [3]:
# Report the raw table size, then re-index it by column 0 (used here as the
# node id) and keep the ids as an integer array.
print(f'Node feature shape: {df_content.shape}')
tmp_node_feats = df_content.set_index(keys=0)
tmp_node_ids = tmp_node_feats.index.values.astype(int)

Node feature shape: (2708, 1434)


In [4]:
# Preview the first rows of the training edges (df_train was sorted by 'from' when loaded).
df_train.head()

Unnamed: 0,id,to,from,label
3157,E1276,962,1,1
1723,E6057,527,2,0
57,E795,1937,2,0
4492,E9763,839,3,0
4927,E1266,590,3,0


## Datasets

In [5]:
class Graph_dataset(InMemoryDataset):
    """Single-graph dataset built from ``train.csv`` and ``content.csv``.

    The raw files are read from the dataset's raw directory. Only the rows of
    ``train.csv`` with ``label == 1`` become edges; every edge is stored in
    both directions. The processed graph is cached as ``train.pt`` and loaded
    from there on construction.

    Args:
        root: Dataset root directory.
        transform: Optional transform forwarded to ``InMemoryDataset``.
        pre_transform: Optional transform applied once to the graph before it
            is written to the processed file.
    """

    def __init__(self, root, transform=None, pre_transform=None):
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """Raw files expected in the raw directory (edge list, node features)."""
        return ['train.csv', 'content.csv']

    @property
    def processed_file_names(self):
        """File name of the cached, collated graph."""
        return ['train.pt']

    def download(self):
        """Nothing to download; the raw CSV files must already be in place."""
        pass

    def process(self):
        """Build the graph from the raw CSV files and write it to the cache."""
        # Keep the edge table in a local variable: `self.data` is the attribute
        # the base class uses for the collated graph, not a scratch slot.
        edges_df = pd.read_csv(self.raw_paths[0]).sort_values('from')
        node_feats = pd.read_csv(self.raw_paths[1], delimiter='\t', header=None, index_col=0)

        # Node features, [num_nodes, num_node_features], rows ordered by node id.
        # NOTE(review): row position is used as the node index, which assumes
        # the ids in content.csv are exactly 0..N-1 -- confirm for new datasets.
        x = torch.tensor(node_feats.sort_index().values, dtype=torch.float)

        # Positive samples only (label == 1 means the link exists).
        pos_data = edges_df[edges_df['label'] == 1]

        # Going through an undirected networkx graph collapses duplicate and
        # reversed pairs into a single edge.
        graph = nx.from_pandas_edgelist(pos_data, 'from', 'to', edge_attr=None)
        edges = list(graph.edges())
        src = [u for u, _ in edges]
        dst = [v for _, v in edges]

        # Store each edge in both directions: (u, v) and (v, u).
        pos_edge_index = torch.tensor([src + dst, dst + src], dtype=torch.long)

        proc_graph = Data(x=x,
                          edge_index=pos_edge_index,
                          y=None)
        # Honour the pre_transform the constructor accepts.
        if self.pre_transform is not None:
            proc_graph = self.pre_transform(proc_graph)
        print(proc_graph)

        data, slices = self.collate([proc_graph])
        torch.save((data, slices), self.processed_paths[0])

In [6]:
# Build the dataset rooted at data_path; the constructor loads the processed graph file.
demo = Graph_dataset(data_path)

In [7]:
# Print a short summary of every graph in the dataset and sanity-check it.
for times, data in enumerate(demo, start=1):
    summary = (
        data,
        data.x.size(0),
        data.x,
        data.edge_index,
        data.y,
        data.num_node_features,
    )
    print(*summary, sep='\n')

    # Raises if the graph is inconsistent, e.g. edge_index not matching data.x.
    data.validate(raise_on_error=True)

Data(x=[2708, 1433], edge_index=[2, 8472])
2708
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
tensor([[   1,    4,    4,  ..., 1402, 2362, 1210],
        [ 962, 2062, 1547,  ..., 2690, 2691, 2697]])
None
1433


## Model

In [8]:
class Encoder(torch.nn.Module):
    """Two-branch encoder used as the encoder of ``VGAE``.

    Both branches apply their own linear layer to the node features and then
    the shared APPNP propagation. The second branch L2-normalises each node
    vector and rescales it by ``scaling_factor`` before propagating.

    Args:
        in_channels: Size of each input node feature vector.
        out_channels: Size of each output embedding.
        edge_index: Unused; kept so existing call sites keep working.
        scaling_factor: Multiplier applied after the L2 normalisation.
        K: Number of APPNP propagation steps.
        alpha: APPNP teleport probability.
    """

    def __init__(self, in_channels, out_channels, edge_index,
                 scaling_factor=1.8, K=1, alpha=0.15):
        super().__init__()
        self.scaling_factor = scaling_factor
        self.linear1 = nn.Linear(in_channels, out_channels)
        self.linear2 = nn.Linear(in_channels, out_channels)
        self.propagate = APPNP(K=K, alpha=alpha)

    def forward(self, x, edge_index, not_prop=0):
        """Return ``(x, x_)``, the pair ``VGAE`` unpacks as ``(mu, logstd)``.

        Args:
            x: Node feature matrix.
            edge_index: Edges used for propagation.
            not_prop: Unused; kept for signature compatibility.
        """
        # Un-normalised branch (second element of the returned pair).
        x_ = self.linear1(x)
        x_ = self.propagate(x_, edge_index)

        # Normalised branch: unit L2 norm per node, rescaled, then propagated.
        x = self.linear2(x)
        x = F.normalize(x, p=2, dim=1) * self.scaling_factor
        x = self.propagate(x, edge_index)
        return x, x_

## Training

In [9]:
def train_link_predictor(model, train_data, val_data, optimizer, n_epochs=200):
    """Train ``model`` full-batch on ``train_data`` and log validation metrics.

    Each epoch minimises the reconstruction loss on ``train_data.edge_index``
    plus the KL term scaled by ``1 / num_nodes``. Every 10th epoch the model is
    scored on ``val_data`` and the loss, AUC and AP are logged.

    Args:
        model: Model exposing ``encode``, ``recon_loss``, ``kl_loss`` and ``test``.
        train_data: Training split; provides ``x``, ``edge_index``, ``num_nodes``.
        val_data: Validation split with positive and negative label edges.
        optimizer: Optimizer over ``model``'s parameters.
        n_epochs: Number of training epochs.

    Returns:
        The trained model, left in eval mode.
    """
    logger.info('Training Start')
    for epoch in tqdm(range(1, n_epochs+1)):
        model.train()
        optimizer.zero_grad()
        z = model.encode(train_data.x, train_data.edge_index)

        loss = model.recon_loss(z, train_data.edge_index)
        loss = loss + (1/train_data.num_nodes) * model.kl_loss()
        loss.backward()
        optimizer.step()

        # Validate only when the result is logged; scoring every epoch and
        # discarding nine results out of ten is wasted work.
        if epoch % 10 == 0:
            val_auc, val_ap = eval_link_predictor(model, train_data, val_data, None)
            logger.info(f'Epoch: {epoch:03d}, Train Loss: {loss.item():.3f}, Val AUC: {val_auc:.3f}, Val AP: {val_ap:.3f}')

    # The per-epoch evaluation used to leave the model in eval mode on return;
    # keep that guarantee now that evaluation no longer runs every epoch.
    model.eval()
    logger.info('Training End --------------------------------')
    return model

def eval_link_predictor(model, train_data, val_data, test_data=None):
    """Score ``model`` on the label edges of ``val_data``.

    Args:
        model: Model exposing ``eval``, ``encode`` and ``test``.
        train_data: Split whose ``x`` / ``edge_index`` produce the node
            embedding when ``test_data`` is ``None``.
        val_data: Split providing ``pos_edge_label_index`` and
            ``neg_edge_label_index`` to score.
        test_data: Optional split used for the embedding instead of
            ``train_data`` (its ``edge_index`` includes the 'train_data' &
            'val_data' positive edges).

    Returns:
        Whatever ``model.test`` returns for the positive and negative edges.
    """
    # Identity check, not `== None`: equality can be overridden by the object.
    embed_source = train_data if test_data is None else test_data

    model.eval()
    with torch.no_grad():
        z = model.encode(embed_source.x, embed_source.edge_index)

    return model.test(z, val_data.pos_edge_label_index, val_data.neg_edge_label_index)

In [10]:
# Take the single graph via indexing instead of reaching into the dataset's
# internal `data` storage.
data = demo[0]
data = T.NormalizeFeatures()(data)
train_data, val_data, test_data = T.RandomLinkSplit(num_val=0.05, 
                                                    num_test=0.1, 
                                                    split_labels=True, 
                                                    is_undirected=True, 
                                                    add_negative_train_samples=False
                                                    )(data)
print(data)
print(train_data)
print(val_data)
print(test_data)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = VGAE(Encoder(data.num_features, 64, train_data.edge_index)).to(device)
# The splits must sit on the same device as the model, otherwise encode()
# fails as soon as CUDA is available.
train_data, val_data, test_data = (
    split.to(device) for split in (train_data, val_data, test_data)
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

model = train_link_predictor(model, train_data, val_data, optimizer, n_epochs=300)

# First score: test edges evaluated with the embedding from the training graph.
test_auc, test_ap = eval_link_predictor(model, train_data, test_data, None)
logger.info(f"Test AUC: {test_auc:.3f}, Test AP: {test_ap:.3f}")
# Second score: same edges, embedding computed from test_data.edge_index.
test_auc, test_ap = eval_link_predictor(model, train_data, test_data, test_data)
logger.info(f"Test AUC: {test_auc:.3f}, Test AP: {test_ap:.3f}")

INFO Training Start


Data(x=[2708, 1433], edge_index=[2, 8472])
Data(x=[2708, 1433], edge_index=[2, 7204], pos_edge_label=[3602], pos_edge_label_index=[2, 3602])
Data(x=[2708, 1433], edge_index=[2, 7204], pos_edge_label=[211], pos_edge_label_index=[2, 211], neg_edge_label=[211], neg_edge_label_index=[2, 211])
Data(x=[2708, 1433], edge_index=[2, 7626], pos_edge_label=[423], pos_edge_label_index=[2, 423], neg_edge_label=[423], neg_edge_label_index=[2, 423])


  0%|          | 0/400 [00:00<?, ?it/s]

INFO Epoch: 010, Train Loss: 12.633, Val AUC: 0.704, Val AP: 0.741
INFO Epoch: 020, Train Loss: 11.176, Val AUC: 0.711, Val AP: 0.747
INFO Epoch: 030, Train Loss: 9.726, Val AUC: 0.728, Val AP: 0.758
INFO Epoch: 040, Train Loss: 8.050, Val AUC: 0.807, Val AP: 0.818
INFO Epoch: 050, Train Loss: 6.472, Val AUC: 0.946, Val AP: 0.938
INFO Epoch: 060, Train Loss: 5.132, Val AUC: 0.951, Val AP: 0.946
INFO Epoch: 070, Train Loss: 4.545, Val AUC: 0.941, Val AP: 0.937
INFO Epoch: 080, Train Loss: 3.947, Val AUC: 0.941, Val AP: 0.936
INFO Epoch: 090, Train Loss: 3.357, Val AUC: 0.938, Val AP: 0.933
INFO Epoch: 100, Train Loss: 3.064, Val AUC: 0.939, Val AP: 0.934
INFO Epoch: 110, Train Loss: 2.850, Val AUC: 0.938, Val AP: 0.932
INFO Epoch: 120, Train Loss: 2.528, Val AUC: 0.938, Val AP: 0.933
INFO Epoch: 130, Train Loss: 2.440, Val AUC: 0.939, Val AP: 0.934
INFO Epoch: 140, Train Loss: 2.265, Val AUC: 0.939, Val AP: 0.934
INFO Epoch: 150, Train Loss: 2.157, Val AUC: 0.939, Val AP: 0.935
INFO Epo

## Uploading

In [11]:
# Candidate edges to score, plus the node features re-read from disk.
test_df = pd.read_csv(f'{data_path}raw/test.csv')
test_feats = pd.read_csv(f'{data_path}raw/content.csv', sep='\t', header=None, index_col=0)

# Feature rows ordered by node id.
feats_sorted = test_feats.sort_index()
test_x = torch.tensor(feats_sorted.values, dtype=torch.float)

test_id = test_df['id'].values
# Shape [2, num_candidates]: row 0 is 'from', row 1 is 'to'.
edge_pairs = test_df[['from', 'to']].values
test_edge_index = torch.tensor(edge_pairs.T)

In [12]:
# Score the upload edges using the embedding of the training graph.
model.eval()
with torch.no_grad():
    z = model.encode(train_data.x, train_data.edge_index)
    # Index with the candidate pairs on z's device and bring the scores back
    # to the CPU so the result can be converted to NumPy on any device.
    out = model.decode(z, test_edge_index.to(z.device)).view(-1).cpu()
    print(out)
    # No extra torch.sigmoid here: decode() applies the sigmoid by default,
    # so `out` already holds probabilities.

tensor([0.4536, 0.6662, 0.8894,  ..., 0.9388, 0.9649, 0.3684])


In [13]:
header = ['id', 'prob']
# Convert into a new name instead of rebinding `out`, so the cell can be
# re-run; .cpu() makes the conversion work for CUDA tensors as well.
probs = out.detach().cpu().numpy()
# Probabilities are written as their str() form, one row per test edge id.
output_csv = pd.DataFrame(
    {'id': test_id, 'prob': [str(p) for p in probs]},
    columns=header,
)
# to_csv does not create missing directories.
os.makedirs(data_path+'submission/', exist_ok=True)
output_csv.to_csv(data_path+'submission/'+store_file+'.csv', index=False)

### tra_val inference

In [14]:
# Re-read the candidate edges and node features for the tra_val inference run.
test_df = pd.read_csv(f'{data_path}raw/test.csv')
test_feats = pd.read_csv(f'{data_path}raw/content.csv', sep='\t', header=None, index_col=0)

# Feature rows ordered by node id.
ordered_feats = test_feats.sort_index()
test_x = torch.tensor(ordered_feats.values, dtype=torch.float)

test_id = test_df['id'].values
# Shape [2, num_candidates]: row 0 is 'from', row 1 is 'to'.
from_to = test_df[['from', 'to']].values
test_edge_index = torch.tensor(from_to.T)

In [15]:
# Score the upload edges using the embedding computed from test_data.
model.eval()
with torch.no_grad():
    z = model.encode(test_data.x, test_data.edge_index)
    # Index with the candidate pairs on z's device and bring the scores back
    # to the CPU so the result can be converted to NumPy on any device.
    out = model.decode(z, test_edge_index.to(z.device)).view(-1).cpu()
    print(out)
    # No extra torch.sigmoid here: decode() applies the sigmoid by default,
    # so `out` already holds probabilities.

tensor([0.4695, 0.6662, 0.8894,  ..., 0.9388, 0.9611, 0.3684])


In [16]:
header = ['id', 'prob']
# Convert into a new name instead of rebinding `out`, so the cell can be
# re-run; .cpu() makes the conversion work for CUDA tensors as well.
probs = out.detach().cpu().numpy()
# Probabilities are written as their str() form, one row per test edge id.
output_csv = pd.DataFrame(
    {'id': test_id, 'prob': [str(p) for p in probs]},
    columns=header,
)
# to_csv does not create missing directories.
os.makedirs(data_path+'submission/', exist_ok=True)
output_csv.to_csv(data_path+'submission/'+tra_val_store_file+'.csv', index=False)