In [28]:
from tqdm.notebook import tqdm
import networkx as nx
import karateclub
from sklearn.decomposition import TruncatedSVD
import numpy as np
import link_prediction as lp
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [65]:
# lp.load_initial_graph() returns a pair. Later cells use G as a graph
# (G.edges()) and node_emb as an array stacked next to learned embeddings.
G, node_emb = lp.load_initial_graph()

In [66]:
# Positive examples: every edge currently present in G.
positive_edges = list(G.edges())
# Negative examples come from the project helper.
# NOTE(review): presumably node pairs that are not edges of G — confirm in link_prediction.
negative_edges = lp.negative_sampling(G)

In [None]:
from scipy import sparse
# Sparse COO copy of node_emb. No other visible cell reads coo_emb.
coo_emb = sparse.coo_matrix(node_emb)

In [61]:
class MeanWalklets(karateclub.Walklets):
    """Walklets variant that averages its stored embeddings instead of using the parent's output.

    NOTE(review): assumes ``self._embedding`` is a sequence of equally shaped
    matrices (presumably one per walklet scale) — confirm against karateclub.
    """

    def get_embedding(self):
        """Return the element-wise mean of ``self._embedding`` taken over its first axis."""
        stacked = np.asarray(self._embedding)
        return stacked.mean(axis=0)


# (label, estimator) pairs evaluated by the sweep further down. The label is
# only used when printing results.
models = [
    (
        "DeepWalk",
        karateclub.DeepWalk(
            walk_number=10,
            walk_length=30,
            dimensions=64,
            window_size=5,
            epochs=5,
            workers=12,
        ),
    ),
    (
        "LaplacianEigenmaps",
        karateclub.LaplacianEigenmaps(dimensions=64),
    ),
    (
        "MeanWalklets",
        MeanWalklets(
            walk_number=10,
            walk_length=30,
            dimensions=64,
            window_size=5,
            epochs=5,
            workers=12,
        ),
    ),
    (
        "Node2Vec",
        karateclub.Node2Vec(
            dimensions=64,
            walk_length=30,
            p=1,
            q=1,
            workers=12,
            epochs=5,
        ),
    ),
    (
        "Role2Vec",
        karateclub.Role2Vec(
            walk_length=30,
            workers=12,
            epochs=5,
            dimensions=64,
        ),
    ),
]

In [44]:
# Two alternative views of the training graph. Both reuse subG's node set and
# the train_edges list, which must already exist in the kernel (they are not
# defined in the cells shown here).

# Directed graph with every training edge (u, v) flipped to (v, u).
invG = nx.DiGraph()
invG.add_nodes_from(subG.nodes)
inv_train_edges = [(v, u) for u, v in train_edges]
invG.add_edges_from(inv_train_edges)

# Undirected graph holding the training edges plus a self-loop on every node.
undirG = nx.Graph()
undirG.add_nodes_from(subG.nodes)
undir_train_edges = train_edges.copy()
# Collect the nodes that already have a self-loop in one pass, so each node is
# checked with a set lookup instead of a scan over the whole edge list.
nodes_with_self_loop = {u for u, v in train_edges if u == v}
undir_train_edges.extend(
    (n, n) for n in subG.nodes if n not in nodes_with_self_loop
)
undirG.add_edges_from(undir_train_edges)

In [63]:
# (label, graph) pairs that each model is fitted on. Only the full graph is
# active.
graphs = [("G", G)]
# Disabled variants, kept for reference:
#   ('subG', subG)
#   ('invG', invG)
#   ('undirG', undirG)

In [64]:
# Edge features built from the pre-loaded node_emb. They do not depend on the
# model being evaluated, so they are computed once up front.
train_node_emb = lp.hadamard_operator(subG, node_emb, train_edges)
test_node_emb = lp.hadamard_operator(subG, node_emb, test_edges)

# For every (model, graph) pair: learn node embeddings, turn them into edge
# features with each operator in lp.operators, append the node_emb features,
# fit a logistic regression and print the accuracy on the test edges.
for model_name, model in models:
    for grap_name, graph in graphs:
        try:
            model.fit(graph)
            emb = model.get_embedding()
            for name, operator in lp.operators.items():
                lr = LogisticRegression(max_iter=1000)
                # NOTE(review): the model is fitted on `graph`, but the operators
                # receive `G` — confirm this is intended when other graphs are enabled.
                train_emb, test_emb = (
                    operator(G, emb, train_edges),
                    operator(G, emb, test_edges),
                )
                train_edge_emb, test_edge_emb = (
                    np.hstack((train_emb, train_node_emb)),
                    np.hstack((test_emb, test_node_emb)),
                )
                lr.fit(train_edge_emb, train_labels)
                preds = lr.predict(test_edge_emb)
                print(model_name, grap_name, name, accuracy_score(test_labels, preds))
        except Exception as err:
            # Best-effort sweep: a failing model/graph pair is reported on one
            # line and the remaining pairs still run.
            print(err, model_name, grap_name)

DeepWalk G average_operator 0.6850558659217877
DeepWalk G hadamard_operator 0.8687150837988827
DeepWalk G weighted_l1_operator 0.710195530726257
DeepWalk G weighted_l2_operator 0.7091480446927374
DeepWalk G neighbor_weighted_l1_operator 0.7908519553072626
DeepWalk G neighbor_weighted_l2_operator 0.7946927374301676
not implemented for directed type LaplacianEigenmaps G


KeyboardInterrupt: 

In [48]:
import optuna
import lightgbm as lgb


def objective(trial):
    """Optuna objective: sample LightGBM and DeepWalk settings, train, and score.

    Reads these notebook globals: ``graph``, ``subG``, ``train_edges``,
    ``test_edges``, ``train_node_emb``, ``test_node_emb``, ``train_labels``
    and ``test_labels``. ``graph`` is not defined here; in this notebook the
    name is bound by the ``for ..., graph in graphs`` loop of the evaluation cell.

    Args:
        trial: object exposing ``suggest_categorical``, ``suggest_float`` and
            ``suggest_int``.

    Returns:
        ``accuracy_score(test_labels, predictions)`` for the classifier
        trained with the sampled settings.
    """
    # Fixed LightGBM settings first, then the searched ones. The suggest_*
    # calls are issued in a fixed order: classifier parameters, then walk parameters.
    booster_cfg = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
    }
    booster_cfg['boosting_type'] = trial.suggest_categorical('boosting_type', ['gbdt', 'goss', 'dart'])
    booster_cfg['reg_alpha'] = trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True)
    booster_cfg['reg_lambda'] = trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True)
    booster_cfg['num_leaves'] = trial.suggest_int('num_leaves', 2, 256)
    booster_cfg['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.4, 1.0)
    booster_cfg['min_child_samples'] = trial.suggest_int('min_child_samples', 5, 100)
    booster_cfg['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 1.0, log=True)
    booster_cfg['n_estimators'] = trial.suggest_int('n_estimators', 100, 3000, log=True)

    # Integer search ranges for the DeepWalk embedding, as (name, low, high).
    walk_ranges = (
        ('walk_number', 5, 20),
        ('walk_length', 10, 128),
        ('dimensions', 32, 128),
        ('window_size', 1, 8),
        ('epochs', 1, 10),
    )
    walk_cfg = {key: trial.suggest_int(key, low, high) for key, low, high in walk_ranges}
    walk_cfg['workers'] = 12

    deepwalk = karateclub.DeepWalk(**walk_cfg)
    deepwalk.fit(graph)
    learned = deepwalk.get_embedding()

    # Edge features: Hadamard features of the learned embedding, followed by
    # the precomputed node_emb edge features.
    train_features = np.hstack((lp.hadamard_operator(subG, learned, train_edges), train_node_emb))
    test_features = np.hstack((lp.hadamard_operator(subG, learned, test_edges), test_node_emb))

    classifier = lgb.LGBMClassifier(**booster_cfg)
    classifier.fit(train_features, train_labels)
    predicted = classifier.predict(test_features).astype(int)
    return accuracy_score(test_labels, predicted)


# Maximise the accuracy returned by `objective` over 100 trials. The study is
# created in memory (see the log line below), so nothing is persisted.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
# Best-scoring trial of the study.
trial = study.best_trial

[32m[I 2022-03-29 11:38:27,206][0m A new study created in memory with name: no-name-74117432-3aa7-4351-b80b-715d2ad4cb21[0m
[32m[I 2022-03-29 11:39:10,381][0m Trial 0 finished with value: 0.7113438045375218 and parameters: {'boosting_type': 'dart', 'reg_alpha': 0.000895140292450531, 'reg_lambda': 1.0318016157668097e-05, 'num_leaves': 144, 'colsample_bytree': 0.7662034958821031, 'min_child_samples': 88, 'learning_rate': 0.04066183462477414, 'n_estimators': 182, 'walk_number': 8, 'walk_length': 110, 'dimensions': 80, 'window_size': 6, 'epochs': 5}. Best is trial 0 with value: 0.7113438045375218.[0m
[32m[I 2022-03-29 11:39:42,880][0m Trial 1 finished with value: 0.7253054101221641 and parameters: {'boosting_type': 'gbdt', 'reg_alpha': 1.3373345257310983e-08, 'reg_lambda': 0.25785068412750667, 'num_leaves': 90, 'colsample_bytree': 0.6831192699677715, 'min_child_samples': 6, 'learning_rate': 0.04000806168075248, 'n_estimators': 358, 'walk_number': 10, 'walk_length': 40, 'dimensions'

KeyboardInterrupt: 

In [51]:
# Hand-entered LightGBM settings used for the final model.
# NOTE(review): presumably copied from an Optuna trial — the source trial is not shown.
lgb_params = dict(
    boosting_type='gbdt',
    reg_alpha=0.00011783704582339319,
    reg_lambda=9.789641091869597e-06,
    num_leaves=150,
    colsample_bytree=0.5837416285607783,
    min_child_samples=40,
    learning_rate=0.0823558007306339,
    n_estimators=1038,
    objective='binary',
    metric='binary_logloss',
    verbosity=-1,
)

# Hand-entered DeepWalk settings used for the final embedding.
graph_params = dict(
    walk_number=8,
    walk_length=121,
    dimensions=79,
    window_size=6,
    epochs=4,
    workers=12,
)

In [54]:
import pandas as pd
# Final model: reload the graph, label every edge of G as 1 and every sampled
# negative pair as 0, and train on all of them.
G, node_emb = lp.load_initial_graph()
positive_edges = list(G.edges)
negative_edges = lp.negative_sampling(G)
new_train_edges = positive_edges + negative_edges

positive_label = np.ones(len(positive_edges))
negative_label = np.zeros(len(negative_edges))
new_labels = np.concatenate((positive_label, negative_label))

# DeepWalk embedding of the full graph with the settings in graph_params.
graph_model = karateclub.DeepWalk(**graph_params)
graph_model.fit(G)
emb = graph_model.get_embedding()

# Learned embedding and node_emb side by side. Training and prediction
# features are built from the same combined matrix.
combined_emb = np.hstack((emb, node_emb))
train_emb = lp.hadamard_operator(G, combined_emb, new_train_edges)
# un_edges is not defined in the cells shown here; it must already exist in the kernel.
un_emb = lp.hadamard_operator(G, combined_emb, un_edges)

lgb_model = lgb.LGBMClassifier(**lgb_params)
lgb_model.fit(train_emb, new_labels)
preds = lgb_model.predict(un_emb).astype(int)

# One predicted label per line, with no index column and no header row.
submission = pd.Series(preds)
submission.to_csv('deepwalk_lgb_pred.txt', index=False, header=False)

In [58]:
# Score lgb_model on test_edges using features built from emb and node_emb
# stacked side by side.
test_emb = lp.hadamard_operator(G, np.hstack((emb, node_emb)), test_edges)
preds = lgb_model.predict(test_emb).astype(int)
# NOTE(review): if lgb_model was fit on every edge of G plus sampled negatives and
# test_edges is drawn from the same graph, this is not a held-out score — confirm.
accuracy_score(test_labels, preds)

0.9643979057591623

In [None]:
# NOTE(review): this cell has no execution count and looks like leftover scratch.
# test_emb is built from emb alone on subG and is not read by the line below.
test_emb = lp.hadamard_operator(subG, emb, test_edges)
# Predicts on un_emb, not on the test_emb computed above — confirm which was intended.
preds = lgb_model.predict(un_emb).astype(int)