# NS competition 2: link prediction

In [None]:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline
import random
import gdown
import io
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
!pip install -U dgl==0.8.1 dglgo -f https://data.dgl.ai/wheels/repo.html -q

In [None]:
import dgl
from dgl import function as fn
#from dgl.nn import SAGEConv

In [None]:
# Silence UserWarning and FutureWarning messages for the rest of the notebook
# so library deprecation notices do not clutter the cell outputs.
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

# Import data

In [None]:
# node_feat.txt: one line per node, whitespace-separated float features.
url = 'https://drive.google.com/uc?id=1j5lJ7ySt8gYKgPrEiDqk4p0IvQvh60My'
output = io.BytesIO()
gdown.download(url, output, quiet=True)
output.seek(0)
content = output.read().decode('utf-8')
# splitlines() plus a blank-line filter keeps the last record even when the
# file has no trailing newline; split() tolerates repeated spaces and '\r'.
feats = [
    tuple(float(x) for x in line.split())
    for line in content.splitlines()
    if line.strip()
]
if not feats:
    # The download is quiet, so fail loudly instead of with an IndexError.
    raise RuntimeError('node_feat.txt is empty - the download probably failed')
print(feats[-1])

(0.249337, -0.303307, -0.057047, -0.077672, -0.012694, 0.553903, -0.173514, -0.310603, -0.193071, -0.624521, -0.239897, -0.182226, 0.328767, -0.232109, 0.08823, -0.368747, 0.072896, -0.17389, 0.080462, 0.221433, -0.003497, 0.798653, 0.389034, 0.309438, -0.612104, 0.412809, 0.040785, -0.375882, -0.30351, 0.103221, -0.038266, 0.259194)


In [None]:
# train_edges.txt: one directed edge per line as "<source> <target>".
url = 'https://drive.google.com/uc?id=1NKNa9SbO_ishoJWGX2nvx3xf4rEzwhvN'
output = io.BytesIO()
gdown.download(url, output, quiet=True)
output.seek(0)
content = output.read().decode('utf-8')
# splitlines() plus a blank-line filter keeps the last record even when the
# file has no trailing newline; split() tolerates repeated spaces and '\r'.
edges = [
    tuple(int(x) for x in line.split())
    for line in content.splitlines()
    if line.strip()
]
if not edges:
    # The download is quiet, so fail loudly instead of with an IndexError.
    raise RuntimeError('train_edges.txt is empty - the download probably failed')
print(edges[-1])

(12586, 10728)


In [None]:
# unlabeled_nodes.txt: one candidate node pair per line as "<source> <target>".
# These are the pairs whose edge label has to be predicted for the submission.
url = 'https://drive.google.com/uc?id=1QQa_mgW3qFCQ8ZHKDtDsSfjuhKcP5AOR'
output = io.BytesIO()
gdown.download(url, output, quiet=True)
output.seek(0)
content = output.read().decode('utf-8')
# splitlines() plus a blank-line filter keeps the last record even when the
# file has no trailing newline; split() tolerates repeated spaces and '\r'.
unlabeled_nodes = [
    tuple(int(x) for x in line.split())
    for line in content.splitlines()
    if line.strip()
]
if not unlabeled_nodes:
    # The download is quiet, so fail loudly instead of with an IndexError.
    raise RuntimeError('unlabeled_nodes.txt is empty - the download probably failed')
print(unlabeled_nodes[-1])

(1262, 2232)


In [None]:
# Dataset sizes: number of nodes (one feature row per node), number of
# training edges, and number of unlabeled pairs to predict.
N = len(feats)
print(N)
print(len(edges))
print(len(unlabeled_nodes))

12588
14322
44014


Build the directed graph:

In [None]:
# Node ids are the row indices of `feats`; each node carries its feature
# tuple under the 'feature' attribute. Edges are the known training edges.
G = nx.DiGraph()
G.add_nodes_from(
    (node_id, {'feature': node_feat}) for node_id, node_feat in enumerate(feats)
)
G.add_edges_from(edges)
print(G)

DiGraph with 12588 nodes and 14322 edges


# GNN model

In [None]:
class GCNMessagePassingLayer(nn.Module):
    """One graph-convolution step: linear projection followed by degree-normalised
    sum aggregation over incoming edges (self-loops included).

    Args:
        in_dim: size of the incoming node representation.
        out_dim: size of the produced node representation.
    """

    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.dense = nn.Linear(in_dim, out_dim)

    def forward(self, h, graph):
        """Return the aggregated node representations.

        Args:
            h: node feature tensor with one row per node of `graph`.
            graph: DGL graph to propagate over; it is not modified.
        """
        # add_self_loop() returns a NEW graph, so the scope has to be opened on
        # that copy - scoping the caller's graph would protect nothing that is
        # written below.
        graph = graph.add_self_loop()
        with graph.local_scope():
            # Every node has in-degree >= 1 thanks to the self-loop, so the
            # inverse square root is always finite.
            norm = graph.in_degrees().float().unsqueeze(-1).pow(-0.5)
            graph.ndata['h'] = self.dense(h) * norm

            # copy_u is the current name of the deprecated copy_src built-in.
            graph.update_all(fn.copy_u('h', 'm'),
                             fn.sum('m', 'h'))
            # Read the result while the scope is still active.
            return graph.ndata['h'] * norm

In [None]:
class GCNEncoder(nn.Module):
    """Two stacked GCNMessagePassingLayer blocks with a ReLU in between.

    Args:
        input_dim: size of the raw node features.
        hidden_dim: size of the intermediate representation.
        output_dim: size of the final node embedding.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNMessagePassingLayer(input_dim, hidden_dim)
        self.conv2 = GCNMessagePassingLayer(hidden_dim, output_dim)

    def forward(self, x, adj):
        """Embed every node.

        `adj` is handed unchanged to both layers as their `graph` argument.
        """
        hidden = F.relu(self.conv1(x, adj))
        return self.conv2(hidden, adj)

In [None]:
class MLPDecoder(nn.Module):
    """Scores a node pair from its two embeddings.

    The embeddings are combined with an element-wise (Hadamard) product and
    passed through a two-layer perceptron that ends in a sigmoid.

    Args:
        input_dim: size of each node embedding.
        hidden_dim: width of the hidden layer.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x1, x2):
        """Return one sigmoid score per pair, with a trailing dimension of size 1."""
        pair_repr = x1 * x2
        hidden = self.fc1(pair_repr).relu()
        return self.fc2(hidden).sigmoid()

Negative sampling:

In [None]:
def negative_sampling(edges, n_nodes, n_neg_samples, excluded_edges=None):
    """Draw distinct random node pairs that are not known or candidate edges.

    Args:
        edges: iterable of (source, target) pairs that must not be sampled.
        n_nodes: node ids are drawn uniformly from ``0 .. n_nodes - 1``.
        n_neg_samples: number of pairs to return.
        excluded_edges: further pairs that must not be sampled. When omitted,
            the module-level ``unlabeled_nodes`` list is used if it exists,
            which matches the previous hard-wired behaviour.

    Returns:
        List of ``n_neg_samples`` distinct (i, j) tuples in draw order.

    Raises:
        ValueError: if fewer than ``n_neg_samples`` admissible pairs exist
            (the sampling loop could otherwise never terminate).
    """
    if excluded_edges is None:
        excluded_edges = globals().get('unlabeled_nodes', [])

    # Build the lookup set once; it used to be rebuilt on every iteration.
    forbidden = set(map(tuple, edges))
    forbidden.update(map(tuple, excluded_edges))

    blocked_in_range = sum(
        1 for pair in forbidden
        if len(pair) == 2
        and 0 <= pair[0] < n_nodes
        and 0 <= pair[1] < n_nodes
    )
    available = n_nodes * n_nodes - blocked_in_range
    if n_neg_samples > available:
        raise ValueError(
            f'requested {n_neg_samples} negative samples but only '
            f'{available} admissible pairs exist'
        )

    neg_samples = []
    while len(neg_samples) < n_neg_samples:
        i = random.randint(0, n_nodes - 1)
        j = random.randint(0, n_nodes - 1)
        if (i, j) not in forbidden:
            # Adding the pair here keeps later draws from repeating it.
            forbidden.add((i, j))
            neg_samples.append((i, j))
    return neg_samples

Build the training samples (positive and negative edges) and the input tensors:

In [None]:

#graph = dgl.from_networkx(G)
# Seed the sampler so the negative set - and everything trained on it - is
# reproducible across kernel restarts.
random.seed(42)
# One negative pair per known edge keeps the two classes balanced.
neg_edges = negative_sampling(edges, len(G), len(edges))

pos_samples = torch.tensor(edges, dtype=torch.long)
pos_labels = torch.ones(len(edges), dtype=torch.float)

neg_samples = torch.tensor(neg_edges, dtype=torch.long)
neg_labels = torch.zeros(len(neg_samples), dtype=torch.float)

train_samples = torch.cat([pos_samples, neg_samples], dim=0)
train_labels = torch.cat([pos_labels, neg_labels], dim=0)


# The dense N x N adjacency matrix that used to be built here was never read
# (the encoder consumes the DGL graph), so the allocation was dropped.
features = torch.tensor(feats, dtype=torch.float)

# 80/20 split of the labelled pairs; only used if training is switched to
# s_train/l_train below.
s_train, s_test, l_train, l_test = train_test_split(train_samples, train_labels, test_size=0.2, random_state=42)
#train_samples = s_train
#train_labels = l_train

In [None]:
#train_samples

Training the model:

In [None]:
# Hyperparameters
input_dim = 32          # length of each node feature vector
hidden_dim = 256        # width of the first GCN layer
output_dim = 128        # node embedding size fed to the decoder
mlp_hidden_dim = 256    # hidden width of the decoder MLP
learning_rate = 0.01
num_epochs = 200

# Fix the weight initialisation so runs are repeatable.
torch.manual_seed(42)

encoder = GCNEncoder(input_dim, hidden_dim, output_dim)
decoder = MLPDecoder(output_dim, mlp_hidden_dim)
# The Sequential is only a container that exposes both modules' parameters and
# train/eval switches; encoder and decoder are invoked separately.
model = nn.Sequential(encoder, decoder)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# The decoder already emits a sigmoid probability per pair and the labels are
# 0/1 floats, so binary cross-entropy is the matching loss. CrossEntropyLoss
# expects per-class logits and would treat the whole 1-D prediction vector as
# the class scores of a single sample.
# BCELoss values are O(1): any early-stopping threshold expressed in absolute
# loss units has to be on that scale.
criterion = nn.BCELoss()
graph = dgl.from_networkx(G)

In [None]:
def execute_model(model, train_samples, train_labels, patience=100, min_delta=1e-4):
    """Train the encoder/decoder pair full-batch with early stopping.

    Relies on the notebook-level ``encoder``, ``decoder``, ``optimizer``,
    ``criterion``, ``features``, ``graph`` and ``num_epochs``.

    Args:
        model: module switched to training mode and returned to the caller.
        train_samples: long tensor of shape (n, 2) holding node-id pairs.
        train_labels: float tensor of shape (n,) with the pair labels.
        patience: number of consecutive epochs without sufficient improvement
            after which training stops.
        min_delta: minimum RELATIVE decrease of the best loss that counts as an
            improvement. A relative threshold behaves the same whatever the
            scale of the loss function in use.

    Returns:
        The ``model`` that was passed in.
    """
    model.train()
    wait = 0
    best_loss = None
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        embeddings = encoder(features, graph)
        src_embeddings = embeddings[train_samples[:, 0]]
        dst_embeddings = embeddings[train_samples[:, 1]]
        # squeeze(-1) drops only the decoder's trailing unit dimension, so a
        # one-sample batch keeps shape (1,) and still lines up with the labels.
        train_predictions = decoder(src_embeddings, dst_embeddings).squeeze(-1)

        loss = criterion(train_predictions, train_labels)
        loss.backward()
        optimizer.step()

        # Track a plain float: keeping the tensor would hold on to the whole
        # autograd graph of the best epoch.
        loss_value = loss.item()
        if best_loss is None or loss_value < best_loss - min_delta * abs(best_loss):
            best_loss = loss_value
            wait = 0
        else:
            wait += 1
            if wait >= patience:
                break
    return model

In [None]:
def evaluate_model(model, s_test, l_test):
    """Print and return the accuracy of the current encoder/decoder on labelled pairs.

    Uses the notebook-level ``encoder``, ``decoder``, ``features`` and ``graph``.

    Args:
        model: module switched to evaluation mode before scoring.
        s_test: long tensor of shape (n, 2) holding node-id pairs.
        l_test: tensor of shape (n,) with the 0/1 labels of those pairs.

    Returns:
        Fraction of pairs whose rounded prediction equals the label. The
        balanced accuracy is printed as well.
    """
    model.eval()
    with torch.no_grad():
        node_embeddings = encoder(features, graph)
        src_ids, dst_ids = s_test[:, 0], s_test[:, 1]
        scores = decoder(node_embeddings[src_ids], node_embeddings[dst_ids]).squeeze()
        # Threshold the sigmoid scores at 0.5 via rounding.
        predicted_labels = scores.round().long()
        n_correct = (predicted_labels == l_test).sum().item()
        accuracy = n_correct / l_test.size(0)
        print(f'Training Accuracy: {accuracy}')
        print(balanced_accuracy_score(l_test, predicted_labels))
        return accuracy

In [None]:
# Fit on every labelled pair (positives plus sampled negatives); the
# s_train/l_train split created earlier is not used here.
model = execute_model(model, train_samples, train_labels)

Model evaluation:

In [None]:
# Scores the model on the same pairs it was fitted on, so the numbers below
# describe training fit, not generalisation.
# NOTE(review): s_test/l_test are a subset of train_samples; they only form an
# independent hold-out if training is restricted to s_train/l_train.
evaluate_model(model, train_samples, train_labels)
#evaluate_model(model, s_test, l_test)

Training Accuracy: 0.973676860773635
0.973676860773635


0.973676860773635

In [None]:
# Persist the trained encoder and decoder weights.
torch.save(model.state_dict(), 'trained_model.pth')
# Set `load` to True to restore the weights from the checkpoint instead of
# relying on the model currently in memory.
load = False
if load:
    model.load_state_dict(torch.load('trained_model.pth'))

In [None]:
# Score the unlabeled candidate pairs with the trained encoder/decoder.
model.eval()
with torch.no_grad():
    # Node embeddings computed over the graph built from the training edges.
    embeddings = encoder(features, graph)
    #embeddings = encoder(features, pos_samples.t())
    unlabeled_samples = torch.tensor(unlabeled_nodes, dtype=torch.long)
    # Decode every pair from the embeddings of its two endpoints.
    unlabeled_predictions = decoder(embeddings[unlabeled_samples[:, 0]], embeddings[unlabeled_samples[:, 1]]).squeeze()
    # Round the sigmoid outputs to hard 0/1 labels.
    unlabeled_predictions = torch.round(unlabeled_predictions).long()

In [None]:
# Convert to plain Python ints once: iterating the tensor directly creates a
# 0-d tensor per element, and the old code did that twice.
prediction = list(enumerate(unlabeled_predictions.tolist()))
# One row per unlabeled pair, in input order: row id, predicted label.
with open('submission.csv','w') as f:
    f.write('ID,Edge\n')
    f.writelines(f'{i}, {label}\n' for i, label in prediction)