In [2]:
%pip install node2vec
%pip install chardet

Collecting node2vec
  Downloading node2vec-0.4.6-py3-none-any.whl (7.0 kB)
Collecting networkx<3.0,>=2.5
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m170.3 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: networkx, node2vec
  Attempting uninstall: networkx
    Found existing installation: networkx 3.0
    Uninstalling networkx-3.0:
      Successfully uninstalled networkx-3.0
Successfully installed networkx-2.8.8 node2vec-0.4.6
Note: you may need to restart the kernel to use updated packages.
Collecting chardet
  Downloading chardet-5.1.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m122.8 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: chardet
Successfully installed chardet-5.1.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix, classification_report
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import preprocessing

import numpy as np
import pandas as pd
import networkx as nx

#from stellargraph.data import BiasedRandomWalk
#from stellargraph import StellarGraph

from gensim.models import Word2Vec
from node2vec import Node2Vec as n2v

import warnings
import collections
#from stellargraph import datasets
from IPython.display import display, HTML
import matplotlib.pyplot as plt


%matplotlib inline

In [None]:
#Node2Vec

In [4]:
nodes = pd.read_csv("nodes.csv")
edges = pd.read_csv("edges.csv")

In [5]:
filter1 = edges[edges["from"].isin(nodes["id"].values)]
labeled = filter1[filter1["to"].isin(nodes["id"].values)]
labeled

Unnamed: 0,from,to,amount,timestamp
235,73,165,0.433767,1521405323
1904,1107,1088,1.980085,1500534574
2035,1151,1138,0.200000,1518799619
2036,1151,1138,1.400000,1518808782
2037,1151,1138,2.000000,1518810770
...,...,...,...,...
400800,31737,1490,227.000000,1516734829
400801,31737,1490,1.692172,1516743515
400997,1490,1491,5.670000,1521572468
400998,1490,1491,100.000000,1521572720


In [6]:
edge_short = edges.head(10000)

In [7]:
weighted_edges = [(int(edges.iloc[i]["from"]),int(edges.iloc[i]["to"]), edges.iloc[i]["amount"]) for i in range(len(edges))]

In [8]:
G = nx.DiGraph()
G.add_weighted_edges_from(weighted_edges)

In [9]:
g_emb = n2v(G)

WINDOW = 1 # Node2Vec fit window
MIN_COUNT = 1 # Node2Vec min. count
BATCH_WORDS = 4 # Node2Vec batch words

mdl = g_emb.fit(
    #window=WINDOW,
    #min_count=MIN_COUNT,
    #batch_words=BATCH_WORDS
)

emb_df = (
    pd.DataFrame(
        [mdl.wv.get_vector(str(n)) for n in G.nodes()],
        index = G.nodes
    )
)

emb_df = emb_df.merge(
    nodes[['id', 'label']].set_index('id'),
    left_index = True,
    right_index = True
)

emb_df.head()

Computing transition probabilities:   0%|          | 0/86622 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:07<00:00,  1.27it/s]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,label
2,-0.718576,-0.519408,-0.016065,0.570895,0.419324,0.050625,-0.22782,-0.223709,-0.75538,0.489541,...,0.583235,0.535721,-0.095898,-0.466622,1.217027,-1.601778,-0.463972,-0.111824,-0.370471,0
63,0.614753,-0.71025,-0.242986,-0.595262,0.775654,0.087683,-0.444086,-0.391527,-0.494486,0.349964,...,0.278874,-0.075922,0.569549,-0.75965,2.239523,-0.567779,0.169506,0.212455,0.020004,0
73,-0.411412,-0.603527,0.257598,0.755987,0.525671,0.552528,-0.478562,-0.191992,0.41881,-0.124178,...,0.314988,-1.519728,1.781798,-1.55973,1.359577,-1.15993,-0.222984,0.382843,-0.576944,1
165,2.091327,-0.965111,-0.593696,0.333485,1.185497,1.231085,-1.434017,0.274971,-0.751329,0.640869,...,0.733103,-0.351556,2.179259,-1.836844,1.927362,-0.453871,-0.566027,-0.034057,-0.322893,1
268,0.783622,0.014042,-0.779979,0.569744,2.046768,0.293983,-0.730126,0.349622,-0.84286,1.019686,...,0.714571,-1.25889,0.128615,-0.054563,0.038366,0.343481,0.085668,-0.010404,-0.444765,1


In [10]:
ft_cols = emb_df.drop(columns = ['label']).columns.tolist()
target_col = 'label'

# train test split
x = emb_df[ft_cols].values
y = emb_df[target_col].values

x_train, x_test, y_train, y_test = train_test_split(
    x, 
    y,
    test_size = 0.3
)

# GBC classifier
clf = GradientBoostingClassifier()

# train the model
clf.fit(x_train, y_train)

In [11]:
def clf_eval(clf, x_test, y_test):
    '''
    This function will evaluate a sk-learn multi-class classification model based on its
    x_test and y_test values
    
    params:
        clf (Model) : The model you wish to evaluate the performance of
        x_test (Array) : Result of the train test split
        y_test (Array) : Result of the train test split
    
    returns:
        This function will return the following evaluation metrics:
            - Accuracy Score
            - Matthews Correlation Coefficient
            - Classification Report
            - Confusion Matrix
    
    example:
        clf_eval(
            clf,
            x_test,
            y_test
        )
    '''
    y_pred = clf.predict(x_test)
    y_true = y_test
    
    y_pred = clf.predict(x_test)
    test_acc = accuracy_score(y_test, y_pred)
    print("Testing Accuracy : ", test_acc)
    
    print("MCC Score : ", matthews_corrcoef(y_true, y_pred))
    
    print("Classification Report : ")
    print(classification_report(y_test, clf.predict(x_test)))
    
    print(confusion_matrix(y_pred,y_test))

In [12]:
clf_eval(
    clf,
    x_test,
    y_test
)

Testing Accuracy :  0.8052434456928839
MCC Score :  0.608140303495466
Classification Report : 
              precision    recall  f1-score   support

           0       0.75      0.82      0.78       114
           1       0.85      0.80      0.82       153

    accuracy                           0.81       267
   macro avg       0.80      0.81      0.80       267
weighted avg       0.81      0.81      0.81       267

[[ 93  31]
 [ 21 122]]
