In [None]:
import numpy as np
import collections
import os
import pandas as pd
import networkx as nx
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score

In [None]:
def model_logistic(X_train, y_train, X_test, y_test):
    """Fit a default ``LogisticRegression`` and score it on both data splits.

    Precision, accuracy and F1 are computed from hard class predictions
    (``predict``); log loss and ROC-AUC are computed from class probabilities
    (``predict_proba``). With two classes the positive class is
    ``classes_[1]``; with more classes precision is macro-averaged and ROC-AUC
    uses the one-vs-rest scheme.

    Args:
        X_train: Training feature matrix (one row per sample).
        y_train: Training class labels.
        X_test: Test feature matrix.
        y_test: Test class labels.

    Returns:
        A 14-tuple ``(precision_train, precision_test, accuracy_train,
        accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train,
        f1_micro_test, logloss_train, logloss_test, roc_auc_train,
        roc_auc_test, y_pred_train, y_pred_test)``. The last two items are the
        ``predict_proba`` matrices for the train and test features.
    """
    # Logistic regression
    logistic_model = LogisticRegression()
    logistic_model.fit(X_train, y_train)
    classes = logistic_model.classes_
    is_binary = len(classes) == 2

    def _score(X, y_true):
        """Return (precision, accuracy, f1_macro, f1_micro, logloss, roc_auc, proba) for one split."""
        labels = logistic_model.predict(X)
        proba = logistic_model.predict_proba(X)
        if is_binary:
            precision = precision_score(y_true, labels, pos_label=classes[1])
            # Binary ROC-AUC takes the score of the class with the greater label.
            roc_auc = roc_auc_score(y_true, proba[:, 1])
        else:
            precision = precision_score(y_true, labels, average='macro')
            roc_auc = roc_auc_score(y_true, proba, multi_class='ovr', labels=classes)
        return (
            precision,
            accuracy_score(y_true, labels),
            f1_score(y_true, labels, average='macro'),
            f1_score(y_true, labels, average='micro'),
            # `labels` keeps the probability columns aligned even if a split lacks a class.
            log_loss(y_true, proba, labels=classes),
            roc_auc,
            proba,
        )

    (precision_train, accuracy_train, f1_macro_train, f1_micro_train,
     logloss_train, roc_auc_train, y_pred_train) = _score(X_train, y_train)
    (precision_test, accuracy_test, f1_macro_test, f1_micro_test,
     logloss_test, roc_auc_test, y_pred_test) = _score(X_test, y_test)

    return precision_train, precision_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, logloss_train, logloss_test, roc_auc_train, roc_auc_test, y_pred_train, y_pred_test

In [None]:
def model_svm(X_train, y_train, X_test, y_test):
    """Fit an ``SVC`` with probability estimates and score it on both data splits.

    Precision, accuracy and F1 are computed from hard class predictions
    (``predict``); log loss and ROC-AUC are computed from class probabilities
    (``predict_proba``). With two classes the positive class is
    ``classes_[1]``; with more classes precision is macro-averaged and ROC-AUC
    uses the one-vs-rest scheme.

    Args:
        X_train: Training feature matrix (one row per sample).
        y_train: Training class labels.
        X_test: Test feature matrix.
        y_test: Test class labels.

    Returns:
        A 14-tuple ``(precision_train, precision_test, accuracy_train,
        accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train,
        f1_micro_test, logloss_train, logloss_test, roc_auc_train,
        roc_auc_test, y_pred_train, y_pred_test)``. The last two items are the
        ``predict_proba`` matrices for the train and test features.
    """
    # SVC -- `probability=True` is required, otherwise `predict_proba` is unavailable.
    svm_model = svm.SVC(probability=True)
    svm_model.fit(X_train, y_train)
    classes = svm_model.classes_
    is_binary = len(classes) == 2

    def _score(X, y_true):
        """Return (precision, accuracy, f1_macro, f1_micro, logloss, roc_auc, proba) for one split."""
        labels = svm_model.predict(X)
        proba = svm_model.predict_proba(X)
        if is_binary:
            precision = precision_score(y_true, labels, pos_label=classes[1])
            # Binary ROC-AUC takes the score of the class with the greater label.
            roc_auc = roc_auc_score(y_true, proba[:, 1])
        else:
            precision = precision_score(y_true, labels, average='macro')
            roc_auc = roc_auc_score(y_true, proba, multi_class='ovr', labels=classes)
        return (
            precision,
            accuracy_score(y_true, labels),
            f1_score(y_true, labels, average='macro'),
            f1_score(y_true, labels, average='micro'),
            # `labels` keeps the probability columns aligned even if a split lacks a class.
            log_loss(y_true, proba, labels=classes),
            roc_auc,
            proba,
        )

    (precision_train, accuracy_train, f1_macro_train, f1_micro_train,
     logloss_train, roc_auc_train, y_pred_train) = _score(X_train, y_train)
    (precision_test, accuracy_test, f1_macro_test, f1_micro_test,
     logloss_test, roc_auc_test, y_pred_test) = _score(X_test, y_test)

    return precision_train, precision_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, logloss_train, logloss_test, roc_auc_train, roc_auc_test, y_pred_train, y_pred_test

In [None]:
def model_rf(X_train, y_train, X_test, y_test):
    """Fit a default ``RandomForestClassifier`` and score it on both data splits.

    Precision, accuracy and F1 are computed from hard class predictions
    (``predict``); log loss and ROC-AUC are computed from class probabilities
    (``predict_proba``). With two classes the positive class is
    ``classes_[1]``; with more classes precision is macro-averaged and ROC-AUC
    uses the one-vs-rest scheme.

    Args:
        X_train: Training feature matrix (one row per sample).
        y_train: Training class labels.
        X_test: Test feature matrix.
        y_test: Test class labels.

    Returns:
        A 14-tuple ``(precision_train, precision_test, accuracy_train,
        accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train,
        f1_micro_test, logloss_train, logloss_test, roc_auc_train,
        roc_auc_test, y_pred_train, y_pred_test)``. The last two items are the
        ``predict_proba`` matrices for the train and test features.
    """
    # RandomForest
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    classes = rf.classes_
    is_binary = len(classes) == 2

    def _score(X, y_true):
        """Return (precision, accuracy, f1_macro, f1_micro, logloss, roc_auc, proba) for one split."""
        labels = rf.predict(X)
        proba = rf.predict_proba(X)
        if is_binary:
            precision = precision_score(y_true, labels, pos_label=classes[1])
            # Binary ROC-AUC takes the score of the class with the greater label.
            roc_auc = roc_auc_score(y_true, proba[:, 1])
        else:
            precision = precision_score(y_true, labels, average='macro')
            roc_auc = roc_auc_score(y_true, proba, multi_class='ovr', labels=classes)
        return (
            precision,
            accuracy_score(y_true, labels),
            f1_score(y_true, labels, average='macro'),
            f1_score(y_true, labels, average='micro'),
            # `labels` keeps the probability columns aligned even if a split lacks a class.
            log_loss(y_true, proba, labels=classes),
            roc_auc,
            proba,
        )

    (precision_train, accuracy_train, f1_macro_train, f1_micro_train,
     logloss_train, roc_auc_train, y_pred_train) = _score(X_train, y_train)
    (precision_test, accuracy_test, f1_macro_test, f1_micro_test,
     logloss_test, roc_auc_test, y_pred_test) = _score(X_test, y_test)

    return precision_train, precision_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, logloss_train, logloss_test, roc_auc_train, roc_auc_test, y_pred_train, y_pred_test

In [None]:
def model_gbc(X_train, y_train, X_test, y_test):
    """Fit a ``GradientBoostingClassifier`` and score it on both data splits.

    Precision, accuracy and F1 are computed from hard class predictions
    (``predict``); log loss and ROC-AUC are computed from class probabilities
    (``predict_proba``). With two classes the positive class is
    ``classes_[1]``; with more classes precision is macro-averaged and ROC-AUC
    uses the one-vs-rest scheme.

    Args:
        X_train: Training feature matrix (one row per sample).
        y_train: Training class labels.
        X_test: Test feature matrix.
        y_test: Test class labels.

    Returns:
        A 14-tuple ``(precision_train, precision_test, accuracy_train,
        accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train,
        f1_micro_test, logloss_train, logloss_test, roc_auc_train,
        roc_auc_test, y_pred_train, y_pred_test)``. The last two items are the
        ``predict_proba`` matrices for the train and test features.
    """
    # GradientBoosting
    # No 'loss' entry: 'ls' (least squares) is a regressor loss and is rejected
    # by the classifier, so the classifier's default loss is used instead.
    params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
              'learning_rate': 0.01}
    clf = GradientBoostingClassifier(**params)
    clf.fit(X_train, y_train)
    classes = clf.classes_
    is_binary = len(classes) == 2

    def _score(X, y_true):
        """Return (precision, accuracy, f1_macro, f1_micro, logloss, roc_auc, proba) for one split."""
        labels = clf.predict(X)
        proba = clf.predict_proba(X)
        if is_binary:
            precision = precision_score(y_true, labels, pos_label=classes[1])
            # Binary ROC-AUC takes the score of the class with the greater label.
            roc_auc = roc_auc_score(y_true, proba[:, 1])
        else:
            precision = precision_score(y_true, labels, average='macro')
            roc_auc = roc_auc_score(y_true, proba, multi_class='ovr', labels=classes)
        return (
            precision,
            accuracy_score(y_true, labels),
            f1_score(y_true, labels, average='macro'),
            f1_score(y_true, labels, average='micro'),
            # `labels` keeps the probability columns aligned even if a split lacks a class.
            log_loss(y_true, proba, labels=classes),
            roc_auc,
            proba,
        )

    (precision_train, accuracy_train, f1_macro_train, f1_micro_train,
     logloss_train, roc_auc_train, y_pred_train) = _score(X_train, y_train)
    (precision_test, accuracy_test, f1_macro_test, f1_micro_test,
     logloss_test, roc_auc_test, y_pred_test) = _score(X_test, y_test)

    return precision_train, precision_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, logloss_train, logloss_test, roc_auc_train, roc_auc_test, y_pred_train, y_pred_test

In [None]:
%%time
# Calculating embeddings (finding best parameters: 'p' and 'q')
# Source: https://github.com/aditya-grover/node2vec
# Runs the node2vec script once per (q, p) pair taken from `values`, for both
# the training edge list and the 2017 edge list (5 x 5 pairs, two runs each).
# NOTE(review): no --dimensions flag is passed here, so the script's own default
# applies -- confirm that it matches the "d=64" stated below.
values = [0.25,0.5,1,2,4] # d=64
for x in values:
    for y in values:
        # x is passed as --q and y as --p; outputs are written to out{x}_{y}.txt
        # (training graph) and out_x{x}_{y}.txt (2017 graph).
        %run ./node2vec-master/src/main.py --input ./Data/I_scopus_train.txt --output ./Data/emb/scopus/out{x}_{y}.txt --q {x} --p {y} --weighted
        %run ./node2vec-master/src/main.py --input ./Data/I_scopus_2017.txt --output ./Data/emb/scopus/out_x{x}_{y}.txt --q {x} --p {y} --weighted

In [None]:
# Finding the best 'd' parameter for learning (with p=0.5 and q=1)
# Runs the node2vec script once per embedding size in `values_d`, for both the
# training edge list and the 2017 edge list, with q and p held fixed.
values_d = [16,32,64,128,256]
for d in values_d:
    # Outputs are written to out_d_{d}.txt (training graph) and
    # out_x_d_{d}.txt (2017 graph).
    %run ./node2vec-master/src/main.py --input ./Data/I_scopus_train.txt --output ./Data/emb/scopus/out_d_{d}.txt --dimensions {d} --q 1 --p 0.5 --weighted
    %run ./node2vec-master/src/main.py --input ./Data/I_scopus_2017.txt --output ./Data/emb/scopus/out_x_d_{d}.txt --dimensions {d} --q 1 --p 0.5 --weighted

In [None]:
%%time
# Calculating embeddings for key-words co-occurrence network (finding best parameters: 'p' and 'q')
# Runs the node2vec script once per (q, p) pair taken from `values`, for both
# the keywords training edge list and the keywords 2017 edge list.
# NOTE(review): no --dimensions flag is passed here, so the script's own default
# applies -- confirm that it matches the "d=64" stated below.
values = [0.25,0.5,1,2,4] # d=64
for x in values:
    for y in values:
        # x is passed as --q and y as --p; outputs are written to out{x}_{y}.txt
        # (training graph) and out_x{x}_{y}.txt (2017 graph).
        %run ./node2vec-master/src/main.py --input ./Data/I_scopus_keywords_train.txt --output ./Data/emb/scopus_keywords/out{x}_{y}.txt --q {x} --p {y} --weighted
        %run ./node2vec-master/src/main.py --input ./Data/I_scopus_keywords_2017.txt --output ./Data/emb/scopus_keywords/out_x{x}_{y}.txt --q {x} --p {y} --weighted

In [None]:
# Finding the best 'd' parameter for learning (with p=0.25 and q=1)
# Runs the node2vec script once per embedding size in `values_d`, for both the
# keywords training edge list and the keywords 2017 edge list, with q and p
# held fixed.
values_d = [16,32,64,128,256]
for d in values_d:
    # Outputs are written to out_d_{d}.txt (training graph) and
    # out_x_d_{d}.txt (2017 graph).
    %run ./node2vec-master/src/main.py --input ./Data/I_scopus_keywords_train.txt --output ./Data/emb/scopus_keywords/out_d_{d}.txt --dimensions {d} --q 1 --p 0.25 --weighted
    %run ./node2vec-master/src/main.py --input ./Data/I_scopus_keywords_2017.txt --output ./Data/emb/scopus_keywords/out_x_d_{d}.txt --dimensions {d} --q 1 --p 0.25 --weighted

In [None]:
# Define edge embedding functions
def avg_sum(v1, v2):
    """Return the element-wise mean of the two vectors as a numpy array."""
    first = np.array(v1)
    second = np.array(v2)
    return np.add(first, second) / 2

def mult(v1, v2):
    """Return the element-wise product of the two vectors as a numpy array."""
    first = np.array(v1)
    second = np.array(v2)
    return np.multiply(first, second)

def w_l1(v1, v2):
    """Return the element-wise absolute difference of the two vectors."""
    delta = np.subtract(np.array(v1), np.array(v2))
    return np.absolute(delta)

def w_l2(v1, v2):
    """Return the element-wise squared difference of the two vectors."""
    delta = np.subtract(np.array(v1), np.array(v2))
    return np.square(delta)

def nw_l1(v1, v2, graph, n1, n2, embs):
    """Absolute difference between the neighbourhood-averaged vectors of two nodes.

    For each endpoint, the node's own vector is added to the vectors of all its
    graph neighbours and divided by (number of neighbours + 1); the result is
    the element-wise absolute difference of the two averages.

    Args:
        v1: Vector of node ``n1``.
        v2: Vector of node ``n2``.
        graph: Object exposing ``neighbors(node)``.
        n1: First endpoint.
        n2: Second endpoint.
        embs: Mapping from ``int(node)`` to that node's vector.

    Returns:
        numpy array with one entry per vector component.
    """
    def _neighbourhood_mean(node, own_vec):
        # Mean over the node itself and every neighbour returned by the graph.
        nbrs = list(graph.neighbors(node))
        acc = np.zeros(len(own_vec))
        for nb in nbrs:
            acc += np.array(embs[int(nb)])
        return (acc + np.array(own_vec)) / (len(nbrs) + 1)

    mean_1 = _neighbourhood_mean(n1, v1)
    mean_2 = _neighbourhood_mean(n2, v2)
    return np.abs(mean_1 - mean_2)

def nw_l2(v1, v2, graph, n1, n2, embs):
    """Squared difference between the neighbourhood-averaged vectors of two nodes.

    For each endpoint, the node's own vector is added to the vectors of all its
    graph neighbours and divided by (number of neighbours + 1); the result is
    the element-wise squared difference of the two averages.

    Args:
        v1: Vector of node ``n1``.
        v2: Vector of node ``n2``.
        graph: Object exposing ``neighbors(node)``.
        n1: First endpoint.
        n2: Second endpoint.
        embs: Mapping from ``int(node)`` to that node's vector.

    Returns:
        numpy array with one entry per vector component.
    """
    def _neighbourhood_mean(node, own_vec):
        # Mean over the node itself and every neighbour returned by the graph.
        nbrs = list(graph.neighbors(node))
        acc = np.zeros(len(own_vec))
        for nb in nbrs:
            acc += np.array(embs[int(nb)])
        return (acc + np.array(own_vec)) / (len(nbrs) + 1)

    mean_1 = _neighbourhood_mean(n1, v1)
    mean_2 = _neighbourhood_mean(n2, v2)
    return (mean_1 - mean_2) ** 2

In [None]:
%%time
## Loading data ##
Graph_train = nx.read_weighted_edgelist("./Data/scopus/I_scopus_train.txt", delimiter=' ',nodetype=int)
Graph_2017 = nx.read_weighted_edgelist("./Data/scopus/I_scopus_2017.txt", delimiter=' ',nodetype=int)
f_train = open('./Data/scopus/I_scopus_train.txt')
f_2017 = open('./Data/scopus/I_scopus_2017.txt')
edges_train = []
edges_2017 = []
for line in f_train:
    edges_train.append(line.split(' '))
for line in f_2017:
    edges_2017.append(line.split(' '))
f_train.close()
f_2017.close()
for i in range(len(edges_train)):
    edges_train[i][2] = edges_train[i][2][:-1]
for i in range(len(edges_2017)):
    edges_2017[i][2] = edges_2017[i][2][:-1]
f_emb_train = open('./Data/emb/scopus/out_d_'+str(32)+'.txt')
f_emb_2017 = open('./Data/emb/scopus/out_x_d_'+str(32)+'.txt')
emb_train = []
emb_2017 = []
for line in f_emb_train:
    emb_train.append(line.split(' '))
for line in f_emb_2017:
    emb_2017.append(line.split(' '))
f_emb_train.close()
f_emb_2017.close()
for i in range(len(emb_2017)):
    emb_2017[i][len(emb_2017[i])-1] = emb_2017[i][len(emb_2017[i])-1][:-1]
emb_2017_dict = {} 
for i in range(1,len(emb_2017)):
    emb_2017_dict[int(emb_2017[i][0])] = [float(j) for j in emb_2017[i][1:]]
for i in range(len(emb_train)):
    emb_train[i][len(emb_train[i])-1] = emb_train[i][len(emb_train[i])-1][:-1]
emb_train_dict = {} 
for i in range(1,len(emb_train)):
    emb_train_dict[int(emb_train[i][0])] = [float(j) for j in emb_train[i][1:]]
f_emb_train_keywords = open('./Data/emb/scopus_keywords/out_d_'+str(16)+'.txt')
f_emb_2017_keywords = open('./Data/emb/scopus_keywords/out_x_d_'+str(16)+'.txt')
emb_train_keywords = []
emb_2017_keywords = []
for line in f_emb_train_keywords:
    emb_train_keywords.append(line.split(' '))
for line in f_emb_2017_keywords:
    emb_2017_keywords.append(line.split(' '))
f_emb_train_keywords.close()
f_emb_2017_keywords.close()
for i in range(len(emb_2017_keywords)):
    emb_2017_keywords[i][len(emb_2017_keywords[i])-1] = emb_2017_keywords[i][len(emb_2017_keywords[i])-1][:-1]
emb_2017_keywords_dict = {} 
for i in range(1,len(emb_2017_keywords)):
    emb_2017_keywords_dict[int(emb_2017_keywords[i][0])] = [float(j) for j in emb_2017_keywords[i][1:]]
for i in range(len(emb_train_keywords)):
    emb_train_keywords[i][len(emb_train_keywords[i])-1] = emb_train_keywords[i][len(emb_train_keywords[i])-1][:-1]
emb_train_keywords_dict = {} 
for i in range(1,len(emb_train_keywords)):
    emb_train_keywords_dict[int(emb_train_keywords[i][0])] = [float(j) for j in emb_train_keywords[i][1:]]

In [None]:
%%time
## 1 ##
X_train = []
y_train = []
X_test = []
y_test = []
for e in edges_2017:
    emb1 = emb_2017_dict[int(e[0])]
    emb2 = emb_2017_dict[int(e[1])]
    w = round(float(e[2]))
    y_test.append(w)
    res = nw_l2(emb1, emb2, Graph_2017,int(e[0]),int(e[1]),emb_2017_dict)
    X_test.append(res)
for e in edges_train:
    emb1 = emb_train_dict[int(e[0])]
    emb2 = emb_train_dict[int(e[1])]
    w = round(float(e[2]))
    y_train.append(w)
    res = nw_l2(emb1, emb2, Graph_train,int(e[0]),int(e[1]),emb_train_dict)
    X_train.append(res)
precision_train, precision_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, logloss_train, logloss_test, roc_auc_train, roc_auc_test, y_pred_train, y_pred_test  = model_svm(X_train, y_train, X_test, y_test)
print('Precision: ' + str(precision_train))
print('Accuracy: ' + str(accuracy_train))
print('F-1 (macro): ' + str(f1_macro_train))
print('F-1 (micro): ' + str(f1_micro_train))
print('Logloss: ' + str(logloss_train))
print('ROC-AUC: ' + str(roc_auc_train))

In [None]:
%%time
## 3 ##
X_train = []
y_train = []
X_test = []
y_test = []
for e in edges_2017:
    pref_att = len(list(nx.common_neighbors(Graph_2017, int(e[0]),int(e[1]))))
    jac = list(nx.jaccard_coefficient(Graph_2017, [(int(e[0]),int(e[1]))]))[0][2]
    aa = list(nx.adamic_adar_index(Graph_2017, [(int(e[0]),int(e[1]))]))[0][2]
    w = round(float(e[2]))
    y_test.append(w)
    res = np.array([pref_att, jac, aa])
    X_test.append(res)
for e in edges_train:
    pref_att = len(list(nx.common_neighbors(Graph_train, int(e[0]),int(e[1]))))
    jac = list(nx.jaccard_coefficient(Graph_train, [(int(e[0]),int(e[1]))]))[0][2]
    aa = list(nx.adamic_adar_index(Graph_train, [(int(e[0]),int(e[1]))]))[0][2]
    w = round(float(e[2]))
    y_train.append(w)
    res = np.array([pref_att, jac, aa])
    X_train.append(res)
precision_train, precision_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, logloss_train, logloss_test, roc_auc_train, roc_auc_test, y_pred_train, y_pred_test  = model_svm(X_train, y_train, X_test, y_test)
print('Precision: ' + str(precision_train))
print('Accuracy: ' + str(accuracy_train))
print('F-1 (macro): ' + str(f1_macro_train))
print('F-1 (micro): ' + str(f1_micro_train))
print('Logloss: ' + str(logloss_train))
print('ROC-AUC: ' + str(roc_auc_train))

In [None]:
%%time
## 1 + 2 ##
X_train = []
y_train = []
X_test = []
y_test = []
for e in edges_2017:
    emb1 = emb_2017_dict[int(e[0])]
    emb2 = emb_2017_dict[int(e[1])]
    emb1_keywords = emb_2017_keywords_dict[int(e[0])]
    emb2_keywords = emb_2017_keywords_dict[int(e[1])]
    w = round(float(e[2]))
    y_test.append(w)
    res = nw_l2(emb1, emb2, Graph_2017,int(e[0]),int(e[1]),emb_2017_dict)
    res_keywords = nw_l2(emb1_keywords, emb2_keywords, Graph_2017,int(e[0]),int(e[1]),emb_2017_keywords_dict)
    X_test.append(np.concatenate((res, res_keywords)))
for e in edges_train:
    emb1 = emb_train_dict[int(e[0])]
    emb2 = emb_train_dict[int(e[1])]
    emb1_keywords = emb_train_keywords_dict[int(e[0])]
    emb2_keywords = emb_train_keywords_dict[int(e[1])]
    w = round(float(e[2]))
    y_train.append(w)
    res = nw_l2(emb1, emb2, Graph_train,int(e[0]),int(e[1]),emb_train_dict)
    res_keywords = nw_l2(emb1_keywords, emb2_keywords, Graph_train,int(e[0]),int(e[1]),emb_train_keywords_dict)
    X_train.append(np.concatenate((res, res_keywords)))
precision_train, precision_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, logloss_train, logloss_test, roc_auc_train, roc_auc_test, y_pred_train, y_pred_test  = model_svm(X_train, y_train, X_test, y_test)
print('Precision: ' + str(precision_train))
print('Accuracy: ' + str(accuracy_train))
print('F-1 (macro): ' + str(f1_macro_train))
print('F-1 (micro): ' + str(f1_micro_train))
print('Logloss: ' + str(logloss_train))
print('ROC-AUC: ' + str(roc_auc_train))

In [None]:
%%time
## 3 + 4 ##
X_train = []
y_train = []
X_test = []
y_test = []
for e in edges_2017:
    pref_att = len(list(nx.common_neighbors(Graph_2017, int(e[0]),int(e[1]))))
    jac = list(nx.jaccard_coefficient(Graph_2017, [(int(e[0]),int(e[1]))]))[0][2]
    aa = list(nx.adamic_adar_index(Graph_2017, [(int(e[0]),int(e[1]))]))[0][2]
    cl_coef = nx.clustering(Graph_2017, int(e[0])) + nx.clustering(Graph_2017, int(e[1]))
    betw = betw_2017.get(int(e[0])) + betw_2017.get(int(e[0]))
    clos = clos_2017.get(int(e[0])) + clos_2017.get(int(e[0]))
    sp = nx.shortest_path_length(Graph_2017, int(e[0]),int(e[1]))
    w = float(e[2])
    y_test.append(w)
    res = np.array([pref_att, jac, aa, cl_coef, betw, clos, sp])
    X_test.append(res)
for e in edges_train:
    pref_att = len(list(nx.common_neighbors(Graph_train, int(e[0]),int(e[1]))))
    jac = list(nx.jaccard_coefficient(Graph_train, [(int(e[0]),int(e[1]))]))[0][2]
    aa = list(nx.adamic_adar_index(Graph_train, [(int(e[0]),int(e[1]))]))[0][2]
    cl_coef = nx.clustering(Graph_train, int(e[0])) + nx.clustering(Graph_train, int(e[1]))
    betw = betw_train.get(int(e[0])) + betw_train.get(int(e[0]))
    clos = clos_train.get(int(e[0])) + clos_train.get(int(e[0]))
    sp = nx.shortest_path_length(Graph_train, int(e[0]),int(e[1]))
    w = float(e[2])
    y_train.append(w)
    res = np.array([pref_att, jac, aa, cl_coef, betw, clos, sp])
    X_train.append(res)
precision_train, precision_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, logloss_train, logloss_test, roc_auc_train, roc_auc_test, y_pred_train, y_pred_test  = model_svm(X_train, y_train, X_test, y_test)
print('Precision: ' + str(precision_train))
print('Accuracy: ' + str(accuracy_train))
print('F-1 (macro): ' + str(f1_macro_train))
print('F-1 (micro): ' + str(f1_micro_train))
print('Logloss: ' + str(logloss_train))
print('ROC-AUC: ' + str(roc_auc_train))

In [None]:
%%time
## 1 + 4 ##
X_train = []
y_train = []
X_test = []
y_test = []
for e in edges_2017:
    cl_coef = nx.clustering(Graph_2017, int(e[0])) + nx.clustering(Graph_2017, int(e[1]))
    betw = betw_2017.get(int(e[0])) + betw_2017.get(int(e[0]))
    clos = clos_2017.get(int(e[0])) + clos_2017.get(int(e[0]))
    sp = nx.shortest_path_length(Graph_2017, int(e[0]),int(e[1]))
    emb1 = emb_2017_dict[int(e[0])]
    emb2 = emb_2017_dict[int(e[1])]
    nw = nw_l2(emb1, emb2, Graph_2017,int(e[0]),int(e[1]),emb_2017_dict)
    w = float(e[2])
    y_test.append(w)
    res = np.concatenate((np.array([cl_coef, betw, clos, sp]), nw))
    X_test.append(res)
for e in edges_train:
    cl_coef = nx.clustering(Graph_train, int(e[0])) + nx.clustering(Graph_train, int(e[1]))
    betw = betw_train.get(int(e[0])) + betw_train.get(int(e[0]))
    clos = clos_train.get(int(e[0])) + clos_train.get(int(e[0]))
    sp = nx.shortest_path_length(Graph_train, int(e[0]),int(e[1]))
    emb1 = emb_train_dict[int(e[0])]
    emb2 = emb_train_dict[int(e[1])]
    nw = nw_l2(emb1, emb2, Graph_train,int(e[0]),int(e[1]),emb_train_dict)
    w = float(e[2])
    y_train.append(w)
    res = np.concatenate((np.array([cl_coef, betw, clos, sp]), nw))
    X_train.append(res)
precision_train, precision_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, logloss_train, logloss_test, roc_auc_train, roc_auc_test, y_pred_train, y_pred_test  = model_svm(X_train, y_train, X_test, y_test)
print('Precision: ' + str(precision_train))
print('Accuracy: ' + str(accuracy_train))
print('F-1 (macro): ' + str(f1_macro_train))
print('F-1 (micro): ' + str(f1_micro_train))
print('Logloss: ' + str(logloss_train))
print('ROC-AUC: ' + str(roc_auc_train))

In [None]:
%%time
## 1 + 3 ##
X_train = []
y_train = []
X_test = []
y_test = []
for e in edges_2017:
    pref_att = len(list(nx.common_neighbors(Graph_2017, int(e[0]),int(e[1]))))
    jac = list(nx.jaccard_coefficient(Graph_2017, [(int(e[0]),int(e[1]))]))[0][2]
    aa = list(nx.adamic_adar_index(Graph_2017, [(int(e[0]),int(e[1]))]))[0][2]
    emb1 = emb_2017_dict[int(e[0])]
    emb2 = emb_2017_dict[int(e[1])]
    nw = nw_l2(emb1, emb2, Graph_2017,int(e[0]),int(e[1]),emb_2017_dict)
    w = float(e[2])
    y_test.append(w)
    res = np.concatenate((np.array([pref_att, jac, aa]), nw))
    X_test.append(res)
for e in edges_train:
    pref_att = len(list(nx.common_neighbors(Graph_train, int(e[0]),int(e[1]))))
    jac = list(nx.jaccard_coefficient(Graph_train, [(int(e[0]),int(e[1]))]))[0][2]
    aa = list(nx.adamic_adar_index(Graph_train, [(int(e[0]),int(e[1]))]))[0][2]
    emb1 = emb_train_dict[int(e[0])]
    emb2 = emb_train_dict[int(e[1])]
    nw = nw_l2(emb1, emb2, Graph_train,int(e[0]),int(e[1]),emb_train_dict)
    w = float(e[2])
    y_train.append(w)
    res = np.concatenate((np.array([pref_att, jac, aa]), nw))
    X_train.append(res)
precision_train, precision_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, logloss_train, logloss_test, roc_auc_train, roc_auc_test, y_pred_train, y_pred_test  = model_svm(X_train, y_train, X_test, y_test)
print('Precision: ' + str(precision_train))
print('Accuracy: ' + str(accuracy_train))
print('F-1 (macro): ' + str(f1_macro_train))
print('F-1 (micro): ' + str(f1_micro_train))
print('Logloss: ' + str(logloss_train))
print('ROC-AUC: ' + str(roc_auc_train))

In [None]:
%%time
## 1 + 2 + 3 ##
X_train = []
y_train = []
X_test = []
y_test = []
for e in edges_2017:
    pref_att = len(list(nx.common_neighbors(Graph_2017, int(e[0]),int(e[1]))))
    jac = list(nx.jaccard_coefficient(Graph_2017, [(int(e[0]),int(e[1]))]))[0][2]
    aa = list(nx.adamic_adar_index(Graph_2017, [(int(e[0]),int(e[1]))]))[0][2]
    emb1 = emb_2017_dict[int(e[0])]
    emb2 = emb_2017_dict[int(e[1])]
    emb1_keywords = emb_2017_keywords_dict[int(e[0])]
    emb2_keywords = emb_2017_keywords_dict[int(e[1])]
    nw = nw_l2(emb1, emb2, Graph_2017,int(e[0]),int(e[1]),emb_2017_dict)
    nw_keywords = nw_l2(emb1_keywords, emb2_keywords, Graph_2017,int(e[0]),int(e[1]),emb_2017_keywords_dict)
    w = float(e[2])
    y_test.append(w)
    res = np.concatenate((np.array([pref_att, jac, aa]), nw))
    X_test.append(np.concatenate((res, nw_keywords)))
for e in edges_train:
    pref_att = len(list(nx.common_neighbors(Graph_train, int(e[0]),int(e[1]))))
    jac = list(nx.jaccard_coefficient(Graph_train, [(int(e[0]),int(e[1]))]))[0][2]
    aa = list(nx.adamic_adar_index(Graph_train, [(int(e[0]),int(e[1]))]))[0][2]
    emb1 = emb_train_dict[int(e[0])]
    emb2 = emb_train_dict[int(e[1])]
    emb1_keywords = emb_train_keywords_dict[int(e[0])]
    emb2_keywords = emb_train_keywords_dict[int(e[1])]
    w = round(float(e[2]))
    y_train.append(w)
    nw = nw_l2(emb1, emb2, Graph_train,int(e[0]),int(e[1]),emb_train_dict)
    nw_keywords = nw_l2(emb1_keywords, emb2_keywords, Graph_train,int(e[0]),int(e[1]),emb_train_keywords_dict)
    res = np.concatenate((np.array([pref_att, jac, aa]), nw))
    X_train.append(np.concatenate((res, nw_keywords)))
precision_train, precision_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, logloss_train, logloss_test, roc_auc_train, roc_auc_test, y_pred_train, y_pred_test  = model_svm(X_train, y_train, X_test, y_test)
print('Precision: ' + str(precision_train))
print('Accuracy: ' + str(accuracy_train))
print('F-1 (macro): ' + str(f1_macro_train))
print('F-1 (micro): ' + str(f1_micro_train))
print('Logloss: ' + str(logloss_train))
print('ROC-AUC: ' + str(roc_auc_train))

In [None]:
%%time
## 1 + 2 + 4 ##
X_train = []
y_train = []
X_test = []
y_test = []
for e in edges_2017:
    cl_coef = nx.clustering(Graph_2017, int(e[0])) + nx.clustering(Graph_2017, int(e[1]))
    betw = betw_2017.get(int(e[0])) + betw_2017.get(int(e[0]))
    clos = clos_2017.get(int(e[0])) + clos_2017.get(int(e[0]))
    sp = nx.shortest_path_length(Graph_2017, int(e[0]),int(e[1]))
    emb1 = emb_2017_dict[int(e[0])]
    emb2 = emb_2017_dict[int(e[1])]
    emb1_keywords = emb_2017_keywords_dict[int(e[0])]
    emb2_keywords = emb_2017_keywords_dict[int(e[1])]
    nw = nw_l2(emb1, emb2, Graph_2017,int(e[0]),int(e[1]),emb_2017_dict)
    nw_keywords = nw_l2(emb1_keywords, emb2_keywords, Graph_2017,int(e[0]),int(e[1]),emb_2017_keywords_dict)
    w = float(e[2])
    y_test.append(w)
    res = np.concatenate((np.array([cl_coef, betw, clos, sp]), nw))
    X_test.append(np.concatenate((res, nw_keywords)))
for e in edges_train:
    cl_coef = nx.clustering(Graph_train, int(e[0])) + nx.clustering(Graph_train, int(e[1]))
    betw = betw_train.get(int(e[0])) + betw_train.get(int(e[0]))
    clos = clos_train.get(int(e[0])) + clos_train.get(int(e[0]))
    sp = nx.shortest_path_length(Graph_train, int(e[0]),int(e[1]))
    emb1 = emb_train_dict[int(e[0])]
    emb2 = emb_train_dict[int(e[1])]
    emb1_keywords = emb_train_keywords_dict[int(e[0])]
    emb2_keywords = emb_train_keywords_dict[int(e[1])]
    w = round(float(e[2]))
    y_train.append(w)
    nw = nw_l2(emb1, emb2, Graph_train,int(e[0]),int(e[1]),emb_train_dict)
    nw_keywords = nw_l2(emb1_keywords, emb2_keywords, Graph_train,int(e[0]),int(e[1]),emb_train_keywords_dict)
    res = np.concatenate((np.array([cl_coef, betw, clos, sp]), nw))
    X_train.append(np.concatenate((res, nw_keywords)))
precision_train, precision_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, logloss_train, logloss_test, roc_auc_train, roc_auc_test, y_pred_train, y_pred_test  = model_svm(X_train, y_train, X_test, y_test)
print('Precision: ' + str(precision_train))
print('Accuracy: ' + str(accuracy_train))
print('F-1 (macro): ' + str(f1_macro_train))
print('F-1 (micro): ' + str(f1_micro_train))
print('Logloss: ' + str(logloss_train))
print('ROC-AUC: ' + str(roc_auc_train))

In [None]:
%%time
## 1 + 2 + 3 + 4 ##
X_train = []
y_train = []
X_test = []
y_test = []
for e in edges_2017:
    pref_att = len(list(nx.common_neighbors(Graph_2017, int(e[0]),int(e[1]))))
    jac = list(nx.jaccard_coefficient(Graph_2017, [(int(e[0]),int(e[1]))]))[0][2]
    aa = list(nx.adamic_adar_index(Graph_2017, [(int(e[0]),int(e[1]))]))[0][2]
    cl_coef = nx.clustering(Graph_2017, int(e[0])) + nx.clustering(Graph_2017, int(e[1]))
    betw = betw_2017.get(int(e[0])) + betw_2017.get(int(e[0]))
    clos = clos_2017.get(int(e[0])) + clos_2017.get(int(e[0]))
    sp = nx.shortest_path_length(Graph_2017, int(e[0]),int(e[1]))
    emb1 = emb_2017_dict[int(e[0])]
    emb2 = emb_2017_dict[int(e[1])]
    emb1_keywords = emb_2017_keywords_dict[int(e[0])]
    emb2_keywords = emb_2017_keywords_dict[int(e[1])]
    nw = nw_l2(emb1, emb2, Graph_2017,int(e[0]),int(e[1]),emb_2017_dict)
    nw_keywords = nw_l2(emb1_keywords, emb2_keywords, Graph_2017,int(e[0]),int(e[1]),emb_2017_keywords_dict)
    w = float(e[2])
    y_test.append(w)
    res = np.concatenate((np.array([pref_att, jac, aa, cl_coef, betw, clos, sp]), nw))
    X_test.append(np.concatenate((res, nw_keywords)))
for e in edges_train:
    pref_att = len(list(nx.common_neighbors(Graph_train, int(e[0]),int(e[1]))))
    jac = list(nx.jaccard_coefficient(Graph_train, [(int(e[0]),int(e[1]))]))[0][2]
    aa = list(nx.adamic_adar_index(Graph_train, [(int(e[0]),int(e[1]))]))[0][2]
    cl_coef = nx.clustering(Graph_train, int(e[0])) + nx.clustering(Graph_train, int(e[1]))
    betw = betw_train.get(int(e[0])) + betw_train.get(int(e[0]))
    clos = clos_train.get(int(e[0])) + clos_train.get(int(e[0]))
    sp = nx.shortest_path_length(Graph_train, int(e[0]),int(e[1]))
    emb1 = emb_train_dict[int(e[0])]
    emb2 = emb_train_dict[int(e[1])]
    emb1_keywords = emb_train_keywords_dict[int(e[0])]
    emb2_keywords = emb_train_keywords_dict[int(e[1])]
    w = round(float(e[2]))
    y_train.append(w)
    nw = nw_l2(emb1, emb2, Graph_train,int(e[0]),int(e[1]),emb_train_dict)
    nw_keywords = nw_l2(emb1_keywords, emb2_keywords, Graph_train,int(e[0]),int(e[1]),emb_train_keywords_dict)
    res = np.concatenate((np.array([pref_att, jac, aa, cl_coef, betw, clos, sp]), nw))
    X_train.append(np.concatenate((res, nw_keywords)))
precision_train, precision_test, accuracy_train, accuracy_test, f1_macro_train, f1_macro_test, f1_micro_train, f1_micro_test, logloss_train, logloss_test, roc_auc_train, roc_auc_test, y_pred_train, y_pred_test  = model_svm(X_train, y_train, X_test, y_test)
print('Precision: ' + str(precision_train))
print('Accuracy: ' + str(accuracy_train))
print('F-1 (macro): ' + str(f1_macro_train))
print('F-1 (micro): ' + str(f1_micro_train))
print('Logloss: ' + str(logloss_train))
print('ROC-AUC: ' + str(roc_auc_train))