In [1]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import link_prediction as lp
import numpy as np
from sklearn.model_selection import train_test_split
from dgl.nn.pytorch.conv import SAGEConv, GraphConv, DotGatConv
import networkx as nx

In [2]:
def negative_sampling_etc(H, un_edges, k=1):
    """Sample negative edges that reuse the source node of every edge in ``H``.

    For each of ``k`` passes over the edges of ``H`` the source node ``u`` of
    the edge is kept and a target ``v`` is drawn uniformly at random from the
    nodes of ``H`` until the pair ``(u, v)``

    * is not a self-loop,
    * is not listed in ``H.edges`` (direction-sensitive tuple comparison),
    * has not been sampled before, and
    * is not contained in ``un_edges``.

    Args:
        H: graph-like object exposing ``edges`` (iterable of ``(u, v)`` pairs)
            and ``nodes`` (iterable of node ids).
        un_edges: iterable of ``(u, v)`` pairs that must never be returned.
        k: number of negatives drawn per edge of ``H``.

    Returns:
        list of ``k * len(H.edges)`` distinct ``(u, v)`` tuples in arbitrary
        order (the targets are the numpy scalars produced by the sampler).

    Note:
        Sampling is rejection based: if a source node has no admissible
        target left, the inner loop never terminates.
    """
    all_edges = list(H.edges)
    # Converted once; np.random.choice would otherwise rebuild this array on
    # every single draw.
    node_pool = np.asarray(list(H.nodes))
    # Hash sets make each rejection test O(1) instead of a scan over a list.
    existing = {tuple(edge) for edge in all_edges}
    forbidden = {tuple(edge) for edge in un_edges}
    negative_edges = set()
    for _ in range(k):
        for u, _target in all_edges:
            while True:
                v = np.random.choice(node_pool, size=1)[0]
                candidate = (u, v)
                if u == v:
                    continue
                if candidate in existing or candidate in forbidden:
                    continue
                if candidate in negative_edges:
                    continue
                negative_edges.add(candidate)
                break
    return list(negative_edges)


def negative_sampling(H, un_edges, k=1):
    """Sample negative edges that reuse the source node of every edge in ``H``.

    For each of ``k`` passes over the edges of ``H`` the source node ``u`` is
    kept and a target ``v`` is drawn uniformly at random from the nodes of
    ``H`` until ``(u, v)`` is neither a self-loop, nor listed in ``H.edges``
    (direction-sensitive tuple comparison), nor already sampled.

    Args:
        H: graph-like object exposing ``edges`` (iterable of ``(u, v)`` pairs)
            and ``nodes`` (iterable of node ids).
        un_edges: accepted so that all samplers share one call signature;
            it is NOT consulted, i.e. returned pairs may occur in it.
        k: number of negatives drawn per edge of ``H``. Defaults to 1, which
            reproduces the single pass this function always performed.

    Returns:
        list of ``k * len(H.edges)`` distinct ``(u, v)`` tuples in arbitrary
        order.

    Note:
        Sampling is rejection based: if a source node has no admissible
        target left, the inner loop never terminates.
    """
    all_edges = list(H.edges)
    # Converted once instead of on every np.random.choice call.
    node_pool = np.asarray(list(H.nodes))
    # Hash set makes each rejection test O(1) instead of a scan over the list.
    existing = {tuple(edge) for edge in all_edges}
    negative_edges = set()
    for _ in range(k):
        for u, _target in all_edges:
            while True:
                v = np.random.choice(node_pool, size=1)[0]
                candidate = (u, v)
                if u == v or candidate in existing or candidate in negative_edges:
                    continue
                negative_edges.add(candidate)
                break
    return list(negative_edges)


def negative_sampling_rand(H, un_edges, k=1):
    """Sample negative edges by drawing both endpoints uniformly at random.

    Node pairs ``(u, v)`` are drawn (with replacement) from the nodes of ``H``
    until ``k * len(H.edges)`` distinct pairs have been collected that are not
    self-loops, are not listed in ``H.edges`` (direction-sensitive tuple
    comparison) and are not contained in ``un_edges``.

    Args:
        H: graph-like object exposing ``edges`` (iterable of ``(u, v)`` pairs)
            and ``nodes`` (iterable of node ids).
        un_edges: iterable of ``(u, v)`` pairs that must never be returned.
        k: number of negatives per edge of ``H``.

    Returns:
        list of ``k * len(H.edges)`` distinct ``(u, v)`` tuples in arbitrary
        order (both entries are the numpy scalars produced by the sampler).

    Note:
        Sampling is rejection based: the loop never terminates when fewer
        admissible pairs exist than are requested.
    """
    all_edges = list(H.edges)
    # Converted once instead of on every np.random.choice call.
    node_pool = np.asarray(list(H.nodes))
    # Hash sets make each rejection test O(1) instead of a scan over a list.
    existing = {tuple(edge) for edge in all_edges}
    forbidden = {tuple(edge) for edge in un_edges}
    wanted = k * len(all_edges)
    negative_edges = set()
    while len(negative_edges) < wanted:
        u, v = np.random.choice(node_pool, size=2)
        candidate = (u, v)
        if u == v or candidate in existing or candidate in forbidden:
            continue
        negative_edges.add(candidate)
    return list(negative_edges)

In [3]:
class GraphEncoder(nn.Module):
    """One- or two-layer graph convolution encoder producing node embeddings.

    Args:
        in_feats: size of the input node features.
        h_feats: size of the hidden/output node embeddings.
        conv1: type of the first layer; ``'sage'`` selects ``SAGEConv``, any
            other value selects ``GraphConv``.
        conv2: type of the optional second layer, one of ``'sage'``,
            ``'gat'`` (``DotGatConv`` with one head), ``'gcn'`` or ``'not'``
            (no second layer).

    Raises:
        ValueError: if ``conv2`` is not one of the supported values.
    """

    # Values accepted for ``conv2``; 'not' disables the second layer.
    _CONV2_TYPES = ('sage', 'gat', 'gcn', 'not')

    def __init__(self, in_feats, h_feats, conv1, conv2):
        super(GraphEncoder, self).__init__()
        # Fail at construction time: an unknown type used to leave
        # ``self.conv2`` undefined and only blew up later inside ``forward``.
        if conv2 not in self._CONV2_TYPES:
            raise ValueError(
                "conv2 must be one of {}, got {!r}".format(self._CONV2_TYPES, conv2))
        self.conv2_type = conv2
        # Leftover of a removed third layer; kept so the attribute still exists.
        self.conv3_type = conv2
        if conv1 == 'sage':
            self.conv1 = SAGEConv(in_feats, h_feats, 'gcn')
        else:
            self.conv1 = GraphConv(in_feats, h_feats, weight=True, norm='both')
        if conv2 == 'sage':
            self.conv2 = SAGEConv(h_feats, h_feats, 'gcn')
        elif conv2 == 'gat':
            self.conv2 = DotGatConv(h_feats, h_feats, num_heads=1)
        elif conv2 == 'gcn':
            self.conv2 = GraphConv(h_feats, h_feats, weight=True, norm='both')

    def forward(self, g, in_feat):
        """Encode the nodes of ``g``.

        Args:
            g: graph passed to the convolution layers.
            in_feat: input node features.

        Returns:
            The output of the first layer when ``conv2`` is ``'not'``;
            otherwise the output of the second layer applied to the
            ReLU-activated first-layer output.
        """
        h = self.conv1(g, in_feat)
        if self.conv2_type != 'not':
            h = F.relu(h)
            h = self.conv2(g, h)
            if self.conv2_type == 'gat':
                # Merge everything after the node axis (presumably the
                # attention-head axis of DotGatConv) into one feature axis.
                h = torch.flatten(h, start_dim=1)
        return h


class Predictor(nn.Module):
    """Gated two-layer scorer that turns a pair of node embeddings into a logit.

    Args:
        in_feats: size of the node embeddings.
        h_feats: size of the hidden layer.
    """

    def __init__(self, in_feats, h_feats):
        super().__init__()
        # Hidden projection, its sigmoid gate, and the final scalar read-out.
        self.W1 = nn.Linear(in_feats, h_feats)
        self.P1 = nn.Linear(in_feats, h_feats)
        self.W2 = nn.Linear(h_feats, 1)

    def apply_edges(self, edges):
        """Score a batch of edges from the ``'h'`` features of their endpoints.

        Returns:
            dict with key ``'score'`` holding one logit per edge.
        """
        # Element-wise product of the source and destination embeddings.
        pair = edges.src['h'] * edges.dst['h']
        hidden = F.relu(self.W1(pair))
        gate = torch.sigmoid(self.P1(pair))
        logits = self.W2(hidden * gate)
        # Drop the trailing singleton axis of the read-out layer.
        return {'score': logits.squeeze(1)}

    def forward(self, g, h):
        """Return the edge scores of graph ``g`` given node embeddings ``h``."""
        # local_scope keeps the temporary 'h'/'score' fields off the caller's graph.
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(self.apply_edges)
            edge_scores = g.edata['score']
            return edge_scores

In [8]:
from sklearn.metrics import roc_auc_score


def compute_loss(pos_score, neg_score):
    """Binary cross-entropy (on logits) of positive vs. negative edge scores.

    Args:
        pos_score: logits of the positive edges (target 1).
        neg_score: logits of the negative edges (target 0).

    Returns:
        Scalar tensor with the mean binary cross-entropy over all scores.
    """
    scores = torch.cat([pos_score, neg_score])
    # *_like keeps the targets on the same device and dtype as the scores;
    # plain torch.ones/zeros would always create CPU float32 tensors.
    labels = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
    return F.binary_cross_entropy_with_logits(scores, labels)


def compute_auc(pos_score, neg_score):
    """ROC AUC of positive vs. negative edge scores.

    Args:
        pos_score: scores of the positive edges (label 1).
        neg_score: scores of the negative edges (label 0).

    Returns:
        The value returned by ``roc_auc_score`` for these labels and scores.
    """
    # detach + cpu: Tensor.numpy() refuses tensors that require grad or that
    # do not live in CPU memory.
    scores = torch.cat([pos_score, neg_score]).detach().cpu().numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

In [11]:
import optuna


def train_sigle_model(sample_func):
    """Tune, retrain and apply one link-prediction model for a negative sampler.

    Steps:

    1. Load the graph ``G`` and the node features via ``lp.load_initial_graph()``
       and the edges to be scored via ``lp.get_unlabeled_edges()``.
    2. Draw negative edges with ``sample_func(G, un_edges, k=1)`` and split the
       positive and negative edges 90/10 into train and test parts.
    3. Run a 100-trial optuna study that maximises the test ROC AUC over the
       layer types and hidden sizes; each trial trains for up to 500 full-batch
       epochs and is evaluated every 10 epochs.
    4. Retrain the best configuration on all positive/negative edges for the
       number of epochs at which that trial scored best.
    5. Score the unlabeled edges.

    Args:
        sample_func: callable ``(G, un_edges, k=1) -> list of (u, v)`` that
            produces the negative edges.

    Returns:
        numpy array with one sigmoid probability per unlabeled edge.

    NOTE(review): during the search the message-passing graph is a
    ``nx.DiGraph`` built from the training positives only, whereas the final
    model uses ``dgl.from_networkx(G)``; if ``G`` is undirected the two graphs
    differ in edge direction handling -- confirm this is intended.
    """
    G, node_emb = lp.load_initial_graph()
    positive_edges = list(G.edges())
    un_edges = lp.get_unlabeled_edges()
    negative_edges = sample_func(G, un_edges, k=1)
    train_pos, test_pos = train_test_split(positive_edges, test_size=0.1)
    train_neg, test_neg = train_test_split(negative_edges, test_size=0.1)
    train_pos_g = dgl.graph(train_pos, num_nodes=len(G))
    train_neg_g = dgl.graph(train_neg, num_nodes=len(G))
    test_pos_g = dgl.graph(test_pos, num_nodes=len(G))
    test_neg_g = dgl.graph(test_neg, num_nodes=len(G))
    un_g = dgl.graph(un_edges, num_nodes=len(G))

    # Built once; the features never change between epochs or trials.
    features = torch.tensor(node_emb, dtype=torch.float)
    # Input width taken from the data instead of a hard-coded constant.
    in_feats = features.shape[1]

    # Message passing during the search only sees the training positives, so
    # the held-out edges cannot leak into the embeddings.
    trainG = nx.DiGraph()
    trainG.add_nodes_from(list(range(node_emb.shape[0])))
    trainG.add_edges_from(train_pos)
    train_g = dgl.from_networkx(trainG).add_self_loop()

    def run_training_step(encoder, predictor, optimizer, graph, pos_g, neg_g):
        """Perform one full-batch optimisation step of encoder + predictor."""
        h = encoder(graph, features)
        loss = compute_loss(predictor(pos_g, h), predictor(neg_g, h))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    def objective(trial):
        """Train one sampled configuration and return its best test AUC."""
        conv1 = trial.suggest_categorical("conv1", ["sage", "gcn"])
        conv2 = trial.suggest_categorical("conv2", ["sage", "gat", "gcn", "not"])
        encoder_hidden = trial.suggest_int("encoder_hidden", 32, 1024, log=True)
        predictor_hidden = trial.suggest_int("predictor_hidden", 8, 128, log=True)
        encoder = GraphEncoder(in_feats, encoder_hidden, conv1, conv2)
        predictor = Predictor(encoder_hidden, predictor_hidden)
        optimizer = torch.optim.Adam(
            itertools.chain(encoder.parameters(), predictor.parameters()), lr=1e-3)
        # Maps "number of completed epochs" -> test AUC at that point.
        scores = {}
        for epoch in range(500):
            run_training_step(encoder, predictor, optimizer,
                              train_g, train_pos_g, train_neg_g)
            if epoch % 10 == 9:
                with torch.no_grad():
                    # Re-encode with the weights produced by the step above so
                    # encoder and predictor are evaluated in the same state.
                    h = encoder(train_g, features)
                    pos_score = torch.sigmoid(predictor(test_pos_g, h))
                    neg_score = torch.sigmoid(predictor(test_neg_g, h))
                n_epochs = epoch + 1
                scores[n_epochs] = compute_auc(pos_score, neg_score)
                trial.report(scores[n_epochs], epoch)
                if trial.should_prune():
                    raise optuna.exceptions.TrialPruned()
        best_epochs, best_auc = max(scores.items(), key=lambda item: item[1])
        # Stored as an epoch COUNT so the retraining loop below can use it as is.
        trial.set_user_attr("epochs", best_epochs)
        return best_auc

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100)
    best = study.best_trial

    # Retrain the winning configuration on every labelled edge.
    full_g = dgl.from_networkx(G).add_self_loop()
    full_pos_g = dgl.graph(positive_edges, num_nodes=len(G))
    full_neg_g = dgl.graph(negative_edges, num_nodes=len(G))
    encoder = GraphEncoder(
        in_feats, best.params['encoder_hidden'], best.params['conv1'], best.params['conv2'])
    predictor = Predictor(best.params['encoder_hidden'], best.params['predictor_hidden'])
    optimizer = torch.optim.Adam(
        itertools.chain(encoder.parameters(), predictor.parameters()), lr=1e-3)
    for _ in range(best.user_attrs["epochs"]):
        run_training_step(encoder, predictor, optimizer, full_g, full_pos_g, full_neg_g)
    with torch.no_grad():
        # Embeddings from the fully trained encoder, not from the pass that
        # preceded the last optimizer step.
        h = encoder(full_g, features)
        proba = torch.sigmoid(predictor(un_g, h)).detach().numpy()
    return proba

In [12]:
# One tuned model per entry; each sampler is used twice, giving a four-model
# ensemble whose per-edge probabilities are combined in the cells below.
proba_list = list(map(train_sigle_model, (
    negative_sampling_rand,
    negative_sampling_rand,
    negative_sampling_etc,
    negative_sampling_etc,
)))

[32m[I 2022-04-03 18:00:05,586][0m A new study created in memory with name: no-name-5439c5e9-e2a2-414a-94d5-fb3128040d7a[0m
[32m[I 2022-04-03 18:00:22,477][0m Trial 0 finished with value: 0.836979891297202 and parameters: {'conv1': 'gcn', 'conv2': 'sage', 'encoder_hidden': 83, 'predictor_hidden': 120}. Best is trial 0 with value: 0.836979891297202.[0m
[32m[I 2022-04-03 18:00:46,840][0m Trial 1 finished with value: 0.8479005731221351 and parameters: {'conv1': 'sage', 'conv2': 'gcn', 'encoder_hidden': 202, 'predictor_hidden': 9}. Best is trial 1 with value: 0.8479005731221351.[0m
[32m[I 2022-04-03 18:01:52,052][0m Trial 2 finished with value: 0.8418423473415244 and parameters: {'conv1': 'sage', 'conv2': 'not', 'encoder_hidden': 549, 'predictor_hidden': 66}. Best is trial 1 with value: 0.8479005731221351.[0m
[32m[I 2022-04-03 18:02:04,603][0m Trial 3 finished with value: 0.8269623065913672 and parameters: {'conv1': 'gcn', 'conv2': 'gcn', 'encoder_hidden': 56, 'predictor_hidd

In [33]:
import pandas as pd



# Mean ensemble, thresholded at its own median so that (about) half of the
# edges are labelled 1.
forecast = pd.DataFrame(proba_list).mean(axis=0)
median = forecast.median()
(
    forecast
    .map(lambda score: score >= median)
    .astype(int)
    .to_csv('mean_50.txt', index=False, header=False)
)

In [34]:
# Max ensemble, thresholded at its own median so that (about) half of the
# edges are labelled 1; the median itself is displayed as the cell output.
forecast = pd.DataFrame(proba_list).max(axis=0)
median = forecast.median()
(
    forecast
    .map(lambda score: score >= median)
    .astype(int)
    .to_csv('max_50.txt', index=False, header=False)
)
median

0.31999665

In [36]:
# Threshold 2 lies above every sigmoid output, so this writes an all-zero
# prediction file.
forecast.ge(2).astype(int).to_csv('zero.txt', index=False, header=False)

In [35]:
# Preview of the labels obtained with threshold 2 (displayed as cell output).
forecast.ge(2).astype(int)

0        0
1        0
2        0
3        0
4        0
        ..
44009    0
44010    0
44011    0
44012    0
44013    0
Length: 44014, dtype: int64

In [158]:
import pandas as pd
# Mean ensemble rounded to the nearest label.
(
    pd.DataFrame(proba_list)
    .mean(axis=0)
    .round()
    .astype(int)
    .to_csv('final_pred_mean_sigmoid.txt', index=False, header=False)
)

In [166]:
# Mean ensemble, label 1 when the averaged probability exceeds 0.4.
(
    pd.DataFrame(proba_list)
    .mean(axis=0)
    .map(lambda score: score > 0.4)
    .astype(int)
    .to_csv('final_pred_mean_sigmoid_04.txt', index=False, header=False)
)

In [244]:
# Mean ensemble with a hard-coded cut-off of 0.14185.
(
    pd.DataFrame(proba_list)
    .mean(axis=0)
    .map(lambda score: score > 0.14185)
    .astype(int)
    .to_csv('final_pred_mean_sigmoid_50.txt', index=False, header=False)
)

In [173]:
# Max ensemble, label 1 when any model's probability exceeds 0.5.
(
    pd.DataFrame(proba_list)
    .max(axis=0)
    .gt(0.5)
    .astype(int)
    .to_csv('final_pred_mean_sigmoid_max.txt', index=False, header=False)
)

In [174]:
# Max ensemble, label 1 when the largest probability exceeds 0.4.
(
    pd.DataFrame(proba_list)
    .max(axis=0)
    .map(lambda score: score > 0.4)
    .astype(int)
    .to_csv('final_pred_mean_sigmoid_max_04.txt', index=False, header=False)
)

In [215]:
# Max ensemble with a hard-coded cut-off of 0.32079.
(
    pd.DataFrame(proba_list)
    .max(axis=0)
    .map(lambda score: score > 0.32079)
    .astype(int)
    .to_csv('final_pred_mean_sigmoid_max_50.txt', index=False, header=False)
)

In [113]:
# Rounded mean of the first two models only (the `negative_sampling_rand` pair).
(
    pd.DataFrame(proba_list[:2])
    .mean(axis=0)
    .round()
    .astype(int)
    .to_csv('final_pred_mean_sigmoid_v1.txt', index=False, header=False)
)

In [114]:
# Rounded mean of the last two models only (the `negative_sampling_etc` pair).
(
    pd.DataFrame(proba_list[2:])
    .mean(axis=0)
    .round()
    .astype(int)
    .to_csv('final_pred_mean_sigmoid_v2.txt', index=False, header=False)
)

In [53]:
import pandas as pd


# One prediction file per model: probabilities rounded to the nearest label.
for idx, preds in enumerate(proba_list):
    (
        pd.Series(np.round(preds))
        .astype(int)
        .to_csv(f'final_pred_{idx}.txt', index=False, header=False)
    )

In [56]:
# Mean ensemble rounded to the nearest label.
(
    pd.DataFrame(proba_list)
    .mean(axis=0)
    .round()
    .astype(int)
    .to_csv('final_pred_mean_v2.txt', index=False, header=False)
)

In [70]:
# Number of edges whose averaged probability exceeds 0.9; the callable
# indexer filters the mean Series without computing it a second time.
pd.DataFrame(proba_list).mean(axis=0).loc[lambda avg: avg > 0.9].count()

6431

In [115]:
# Display the raw probabilities: one row per trained model, one column per
# scored edge.
pd.DataFrame(proba_list)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44004,44005,44006,44007,44008,44009,44010,44011,44012,44013
0,3.9e-05,0.000314,3.871243e-07,2e-05,0.002703,4.079205e-09,0.780691,5.511277e-08,0.033222,0.003809,...,0.247177,0.797227,0.872029,3.8e-05,2e-06,0.00018,0.899501,5.8e-05,0.000312,0.163988
1,1.7e-05,7.7e-05,1.575317e-06,5e-06,0.00114,9.342212e-11,0.819498,5.021664e-06,0.018164,3.9e-05,...,0.271288,0.176588,0.969767,2e-05,1.2e-05,0.000368,0.667171,0.001109,0.00101,0.760778
2,0.002196,4.1e-05,3.284911e-05,0.019811,9e-06,3.043145e-05,0.944794,2.235273e-11,0.055784,0.000795,...,0.85252,0.559663,0.942824,7.1e-05,0.000112,0.000739,0.659982,0.000639,0.000403,0.242482
3,0.000492,0.018879,2.709725e-06,0.000649,0.000801,3.206604e-05,0.816199,5.323912e-08,0.08725,0.006023,...,0.966735,0.325599,0.919823,0.001311,0.001053,0.003771,0.73455,0.000516,0.000112,0.575783
