In [1]:
import sys
import datetime
from pathlib import Path
import numpy as np, scipy as sp, networkx as nx
import math, time, os, sys, random
from collections import deque
from collections import Counter
import matplotlib
import matplotlib.pyplot as plt
from sklearn.utils.multiclass import unique_labels
import scipy
import scipy.sparse as sps
from scipy.sparse import coo_matrix
from scipy.sparse.linalg import svds, eigs
import sparsesvd
from sklearn.metrics import confusion_matrix
import sklearn
from sklearn import metrics
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
import sklearn.model_selection
from time import time
from sklearn.decomposition import NMF, DictionaryLearning
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from collections import defaultdict
import pandas as pd
import seaborn as sns
import json
import pickle
from scipy.stats import rankdata
from scipy.stats import kendalltau
from matplotlib import animation as anim
%matplotlib notebook

In [2]:
def read_graph(filename):
    """Read a whitespace-separated edge list and build an undirected graph.

    Each non-blank line must start with two integer node ids; any further
    columns on the line are ignored. Every edge is stored as a
    ``(smaller_id, larger_id)`` tuple before being added to the graph.

    Parameters
    ----------
    filename : str or path-like
        Path of the edge-list text file.

    Returns
    -------
    networkx.Graph
        Graph containing every edge read from the file.

    Raises
    ------
    ValueError
        If one of the first two fields of a line is not an integer.
    IndexError
        If a non-blank line has fewer than two fields.
    """
    with open(filename, 'r') as f:
        lines = f.read().splitlines()

    list_GT = []
    for line in lines:
        # split() with no argument accepts tabs and repeated spaces, which
        # split(' ') would turn into empty fields that int() rejects.
        fields = line.split()
        if not fields:
            # Blank lines (e.g. a trailing empty line) carry no edge.
            continue
        u, v = int(fields[0]), int(fields[1])
        list_GT.append((min(u, v), max(u, v)))

    print("Read in", len(list_GT), "ground truth edges.")
    G = nx.Graph()
    G.add_edges_from(list_GT)
    return G

In [3]:
def read_in_label(filename: str):
    """Read a ``node_id label`` file and report how often each label occurs.

    Each non-blank line must start with two integers: the node id and its
    label. A summary of the label frequencies is printed, in order of first
    appearance of each label.

    Parameters
    ----------
    filename : str
        Path of the label text file.

    Returns
    -------
    dict
        Mapping ``node_id -> label``. If a node id occurs on several lines,
        the last line wins (every line still counts towards the printed
        frequencies).

    Raises
    ------
    ValueError
        If one of the first two fields of a line is not an integer.
    IndexError
        If a non-blank line has fewer than two fields.
    """
    dict_labels = dict()
    dict_counter = Counter()
    with open(filename, 'r') as f:
        lines = f.read().splitlines()
    for line in lines:
        # Parse each line once; split() tolerates tabs and repeated spaces.
        fields = line.split()
        if not fields:
            # Skip blank lines instead of failing on int('').
            continue
        node_id, label = int(fields[0]), int(fields[1])
        dict_labels[node_id] = label
        dict_counter[label] += 1
    print("Read in", len(dict_labels), 'node labels.')
    for key, val in dict_counter.items():
        print(">>> Label", key, 'appears', val, 'times')
    return dict_labels

In [4]:
def kFoldResults(X, y, n_splits=5, random_state=0):
    """Evaluate a logistic-regression classifier with shuffled k-fold CV.

    Parameters
    ----------
    X : array
        Feature matrix; it is indexed with integer index arrays, so a numpy
        array (or something with the same indexing behaviour) is expected.
    y : array
        Labels aligned with the rows of ``X``; indexed the same way.
    n_splits : int, default 5
        Number of folds.
    random_state : int, default 0
        Seed passed to both the fold shuffling and the classifier.

    Returns
    -------
    dict
        ``{"Performance": {fold: {'acc', 'f1_macro', 'f1_micro'}},
        "Prediction": {fold: predicted labels for that fold's test split}}``
        with folds numbered from 0.
    """
    dict_performance_ = dict()
    dict_prediction_ = dict()
    kf = sklearn.model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for idx, (train_index, test_index) in enumerate(kf.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # NOTE(review): `multi_class` is deprecated in recent scikit-learn
        # releases; it is kept so the fitted model is unchanged on the
        # versions this notebook was written for — confirm the target version.
        clf = LogisticRegression(random_state=random_state, solver='lbfgs', multi_class='multinomial', penalty='l2', C=1.0)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        dict_prediction_[idx] = y_pred

        acc = metrics.accuracy_score(y_test, y_pred)
        f1_micro = metrics.f1_score(y_test, y_pred, average='micro')
        f1_macro = metrics.f1_score(y_test, y_pred, average='macro')
        dict_performance_[idx] = {'acc': acc, 'f1_macro': f1_macro, 'f1_micro': f1_micro}
    return {"Performance": dict_performance_, "Prediction": dict_prediction_}

In [5]:
def kmeans_best_result(X, y, n_clusters, n_init=5000, random_state=None):
    """Cluster ``X`` with k-means and score the clustering against ``y``.

    Parameters
    ----------
    X : array-like
        Data passed to ``KMeans.fit``.
    y : array-like
        Reference labels the cluster assignment is compared with.
    n_clusters : int
        Number of clusters.
    n_init : int, default 5000
        Number of k-means++ restarts; KMeans keeps the best run.
    random_state : int or None, default None
        Seed for KMeans. ``None`` keeps the original unseeded behaviour;
        pass an int for reproducible results.

    Returns
    -------
    dict
        ``{'purity': [purity], 'nmi': [nmi]}``, each a one-element list.
    """
    kmeans = KMeans(n_clusters=n_clusters, n_init=n_init, init='k-means++',
                    random_state=random_state).fit(X)

    # NOTE(review): purity_score is not defined in the visible part of this
    # notebook — confirm it is defined before this function is called.
    list_purity = [purity_score(y, kmeans.labels_)]
    list_nmi = [normalized_mutual_info_score(y, kmeans.labels_)]
    return {'purity': list_purity, 'nmi': list_nmi}

In [6]:
def evaluate_classification(X, y):
    """Cross-validate a classifier on ``(X, y)`` and summarise the fold scores.

    Calls ``kFoldResults(X, y)``, prints the mean and standard deviation of
    the per-fold accuracy and macro-F1, and returns the same four numbers.

    Parameters
    ----------
    X, y
        Features and labels, forwarded unchanged to ``kFoldResults``.

    Returns
    -------
    tuple
        ``(acc_mean, acc_std, f1_macro_mean, f1_macro_std)``.
    """
    dict_kfold_result_ = kFoldResults(X, y)

    # Use every fold that was actually produced rather than assuming five,
    # and compute each statistic once for both printing and returning.
    fold_scores = list(dict_kfold_result_['Performance'].values())
    acc_values = [scores['acc'] for scores in fold_scores]
    f1_values = [scores['f1_macro'] for scores in fold_scores]
    acc_mean, acc_std = np.mean(acc_values), np.std(acc_values)
    f1_mean, f1_std = np.mean(f1_values), np.std(f1_values)

    print(">>>>>>>>>")

    print('accuracy')
    print("mean:", '%.4f' % acc_mean)
    print("std:", '%.4f' % acc_std)

    print('f1-macro')
    print("mean:", '%.4f' % f1_mean)
    print("std:", '%.4f' % f1_std)

    print(">>>>>>>>>")

    return (acc_mean, acc_std, f1_mean, f1_std)

In [7]:
# Load the node -> label mapping from the airport_Brazil label file.
dict_labels = read_in_label(filename='./sample-data/labels/airport_Brazil_label.txt')

Read in 131 node labels.
>>> Label 0 appears 32 times
>>> Label 1 appears 32 times
>>> Label 3 appears 35 times
>>> Label 2 appears 32 times


In [8]:
# Build the graph from the brazil-airports edge list.
G = read_graph(filename='./sample-data/Airports/airport_Brazil/brazil-airports.edgelist')


Read in 1074 ground truth edges.


In [9]:
# One-dimensional baseline feature: each node's degree, as a column vector
# whose rows follow the sorted node ids.
dict_degree = dict(G.degree())
list_node_id = sorted(G.nodes())
list_emb = np.array(
    [float(dict_degree[node]) for node in list_node_id]
).reshape(-1, 1)

In [13]:
# Build the label vector here, aligned with list_node_id, so this cell runs
# on a fresh kernel without depending on a cell that appears further down.
y = np.array([dict_labels[i] for i in list_node_id])

# Evaluate the current embedding (one row per node) with k-fold classification.
X = np.array(list_emb).reshape(-1, 1)
rtn = evaluate_classification(X, y)

# Result accumulators; re-running this cell resets them.
list_euclidean = list()
list_cosine = list()
list_best_nmi = list()
list_best_purity = list()
list_acc_mean = list()
list_acc_std = list()
list_f1macro_mean = list()
list_f1macro_std = list()

# rtn is (acc_mean, acc_std, f1_macro_mean, f1_macro_std).
list_acc_mean.append(rtn[0])
list_acc_std.append(rtn[1])
list_f1macro_mean.append(rtn[2])
list_f1macro_std.append(rtn[3])

>>>>>>>>>
accuracy
mean: 0.7413
std: 0.0852
f1-macro
mean: 0.7294
std: 0.0896
>>>>>>>>>


In [11]:
# Label vector whose entries follow the order of list_node_id.
y = np.array([dict_labels[node] for node in list_node_id])

In [14]:
# Display the label vector.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 3, 3, 2, 3, 0, 3, 3, 1, 1, 3, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 2, 1, 3, 3, 0, 2, 1, 1, 1, 0, 1, 2,
       3, 1, 2, 2, 1, 1, 0, 0, 1, 0, 1, 2, 2, 2, 0, 2, 1, 0, 2, 1, 0, 1,
       0, 3, 3, 0, 1, 0, 3, 3, 0, 1, 2, 0, 2, 0, 1, 2, 1, 2, 1, 2, 2, 1,
       3, 2, 2, 2, 2, 1, 1, 3, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 2, 3, 3, 3,
       3, 3, 3, 2, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2, 3, 2, 3, 1, 3, 3, 2])

In [15]:
# Display the current feature matrix.
# NOTE(review): this echoes the whole array; a slice such as X[:10] or
# X.shape may be enough here — confirm.
X

array([[40.],
       [46.],
       [63.],
       [75.],
       [48.],
       [58.],
       [59.],
       [70.],
       [13.],
       [47.],
       [40.],
       [ 1.],
       [ 1.],
       [ 4.],
       [ 1.],
       [36.],
       [ 1.],
       [ 2.],
       [15.],
       [14.],
       [ 1.],
       [30.],
       [16.],
       [16.],
       [41.],
       [81.],
       [ 7.],
       [44.],
       [18.],
       [22.],
       [33.],
       [30.],
       [ 6.],
       [ 9.],
       [ 3.],
       [ 3.],
       [37.],
       [10.],
       [11.],
       [14.],
       [21.],
       [24.],
       [24.],
       [ 9.],
       [ 2.],
       [39.],
       [15.],
       [ 5.],
       [14.],
       [16.],
       [44.],
       [55.],
       [ 8.],
       [28.],
       [17.],
       [10.],
       [ 8.],
       [10.],
       [46.],
       [ 5.],
       [12.],
       [23.],
       [ 4.],
       [23.],
       [17.],
       [19.],
       [31.],
       [28.],
       [12.],
       [25.],
       [14.],
      

In [16]:
def two_hop_histogram(G):
    """Build, for every node, a degree histogram over its length-2 walks.

    For each node ``v``, every walk ``v -> u -> w`` (``u`` a neighbour of
    ``v``, ``w`` a neighbour of ``u``) adds 1.0 to the bin for ``w``'s degree.
    Bin ``k`` (0-based) counts end points of degree ``k + 1``. In an
    undirected graph ``w`` may be ``v`` itself, so walking back to the start
    node is counted as well.

    Parameters
    ----------
    G : graph
        Object providing ``degree()``, ``nodes()`` and ``neighbors(n)`` in
        the style of a networkx graph.

    Returns
    -------
    dict
        Mapping ``node -> list of floats`` of length ``max degree in G``.
        An empty graph yields an empty dict.
    """
    dict_degree = dict(G.degree())
    # default=0 makes an empty graph return {} instead of raising ValueError.
    max_degree = max(dict_degree.values(), default=0)
    dict_histogram = dict()
    for cur_node in G.nodes():
        list_histogram = [0.0] * max_degree
        for cur_n in G.neighbors(cur_node):
            for cur_nn in G.neighbors(cur_n):
                # cur_nn has at least one neighbour (cur_n), so degree >= 1
                # and the index below is never negative.
                list_histogram[dict_degree[cur_nn] - 1] += 1.0
        dict_histogram[cur_node] = list_histogram
    return dict_histogram

In [19]:
# Swap the feature matrix for the two-hop degree histograms (one row per
# node, in list_node_id order) and re-run the classification evaluation.
dict_two_hop = two_hop_histogram(G)
list_emb = np.array([dict_two_hop[node] for node in list_node_id])
X = np.array(list_emb)
rtn = evaluate_classification(X, y)


>>>>>>>>>
accuracy
mean: 0.5499
std: 0.0907
f1-macro
mean: 0.5086
std: 0.1050
>>>>>>>>>


