In [1]:
# Data Manipulation and Representation
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import networkx as nx
import pickle

# Mathematics
from random import randint
import math 
import time
from datetime import datetime

# Training Implementation
from sklearn.model_selection import train_test_split
from sklearn.svm import NuSVC
from sklearn.model_selection import KFold

# Validation Implementation
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

# Data Preparation

In [2]:
main_data_dir = "D:/GitCloneProject/Bitcoin-Transaction-Graph-Elliptic-Data-Set/Data/Elliptic Data Set/elliptic_bitcoin_dataset"
features_data_path = main_data_dir + "/elliptic_txs_features.csv"
class_data_path = main_data_dir + "/elliptic_txs_classes.csv"
egdelist_data_path = main_data_dir + "/elliptic_txs_edgelist.csv"

In [3]:
features_df = pd.read_csv(features_data_path)
classes_df = pd.read_csv(class_data_path)
edges_df = pd.read_csv(egdelist_data_path)

In [4]:
features_df.head()

Unnamed: 0,txId,Time step,Local_feature_1,Local_feature_2,Local_feature_3,Local_feature_4,Local_feature_5,Local_feature_6,Local_feature_7,Local_feature_8,...,Aggregate_feature_63,Aggregate_feature_64,Aggregate_feature_65,Aggregate_feature_66,Aggregate_feature_67,Aggregate_feature_68,Aggregate_feature_69,Aggregate_feature_70,Aggregate_feature_71,Aggregate_feature_72
0,1076,48,-0.1685,0.270909,-0.091383,-0.046932,-0.043875,-0.02914,-0.061584,-0.163591,...,0.073047,-0.039637,1.46133,1.461369,0.018279,0.470019,1.216796,1.151607,1.5197,1.521399
1,2534,6,-0.170834,-0.131425,1.018602,0.028105,0.055376,0.054722,-0.061584,-0.163572,...,1.228858,0.379357,0.955101,0.459257,-0.098889,-0.08749,-0.09908,-0.122137,-0.37997,-0.379288
2,3181,34,1.305212,-0.210553,-1.756361,-0.12197,97.30065,-0.113002,-0.061584,1.348765,...,1.34845,1.590664,0.059948,0.113967,-0.098889,1.969527,0.037532,-0.13101,0.006994,0.017772
3,3321,1,-0.169615,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.160199,...,-0.577099,-0.50008,0.241128,0.241406,-0.098889,-0.08749,-0.084674,-0.140597,1.5197,1.521399
4,3889,48,-0.086232,-0.101835,-0.646376,-0.12197,17.046997,-0.113002,-0.061584,-0.074885,...,0.501062,0.36251,0.082065,0.114773,-0.098889,8.948005,1.024948,-0.00957,-0.080708,-0.123601


In [5]:
classes_df.head()

Unnamed: 0,txId,class
0,1076,3
1,2534,2
2,3181,2
3,3321,3
4,3889,3


In [6]:
edges_df.head()

Unnamed: 0,txId1,txId2
0,230425980,5530458
1,232022460,232438397
2,230460314,230459870
3,230333930,230595899
4,232013274,232029206


In [7]:
df_class_feature = pd.merge(classes_df, features_df)
df_class_feature.head()

Unnamed: 0,txId,class,Time step,Local_feature_1,Local_feature_2,Local_feature_3,Local_feature_4,Local_feature_5,Local_feature_6,Local_feature_7,...,Aggregate_feature_63,Aggregate_feature_64,Aggregate_feature_65,Aggregate_feature_66,Aggregate_feature_67,Aggregate_feature_68,Aggregate_feature_69,Aggregate_feature_70,Aggregate_feature_71,Aggregate_feature_72
0,1076,3,48,-0.1685,0.270909,-0.091383,-0.046932,-0.043875,-0.02914,-0.061584,...,0.073047,-0.039637,1.46133,1.461369,0.018279,0.470019,1.216796,1.151607,1.5197,1.521399
1,2534,2,6,-0.170834,-0.131425,1.018602,0.028105,0.055376,0.054722,-0.061584,...,1.228858,0.379357,0.955101,0.459257,-0.098889,-0.08749,-0.09908,-0.122137,-0.37997,-0.379288
2,3181,2,34,1.305212,-0.210553,-1.756361,-0.12197,97.30065,-0.113002,-0.061584,...,1.34845,1.590664,0.059948,0.113967,-0.098889,1.969527,0.037532,-0.13101,0.006994,0.017772
3,3321,3,1,-0.169615,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,-0.577099,-0.50008,0.241128,0.241406,-0.098889,-0.08749,-0.084674,-0.140597,1.5197,1.521399
4,3889,3,48,-0.086232,-0.101835,-0.646376,-0.12197,17.046997,-0.113002,-0.061584,...,0.501062,0.36251,0.082065,0.114773,-0.098889,8.948005,1.024948,-0.00957,-0.080708,-0.123601


In [8]:
selected_ids = df_class_feature.loc[(df_class_feature['class'] != 3), 'txId']
df_edges_selected = edges_df.loc[edges_df['txId1'].isin(selected_ids)]
df_classes_selected = classes_df.loc[classes_df['txId'].isin(selected_ids)]
df_features_selected = features_df.loc[features_df['txId'].isin(selected_ids)]

# Merge Class and features
df_class_feature_selected = pd.merge(df_classes_selected, df_features_selected)
df_class_feature_selected.head()

Unnamed: 0,txId,class,Time step,Local_feature_1,Local_feature_2,Local_feature_3,Local_feature_4,Local_feature_5,Local_feature_6,Local_feature_7,...,Aggregate_feature_63,Aggregate_feature_64,Aggregate_feature_65,Aggregate_feature_66,Aggregate_feature_67,Aggregate_feature_68,Aggregate_feature_69,Aggregate_feature_70,Aggregate_feature_71,Aggregate_feature_72
0,2534,2,6,-0.170834,-0.131425,1.018602,0.028105,0.055376,0.054722,-0.061584,...,1.228858,0.379357,0.955101,0.459257,-0.098889,-0.08749,-0.09908,-0.122137,-0.37997,-0.379288
1,3181,2,34,1.305212,-0.210553,-1.756361,-0.12197,97.30065,-0.113002,-0.061584,...,1.34845,1.590664,0.059948,0.113967,-0.098889,1.969527,0.037532,-0.13101,0.006994,0.017772
2,6418,2,48,-0.125229,7.538599,2.128587,7.381781,0.095076,7.602324,0.547008,...,0.43396,0.168508,-0.197237,0.303447,-0.098889,0.931406,1.041565,0.371499,0.79006,0.779125
3,7952,2,48,0.967162,-0.210553,-1.756361,-0.12197,92.556494,-0.113002,-0.061584,...,0.823581,0.850443,0.321431,0.26881,-0.098889,1.1621,0.485146,0.303509,-0.55352,-0.578865
4,9351,2,48,-0.17297,-0.114281,1.018602,-0.12197,-0.043875,-0.113002,-0.061584,...,-0.569626,0.641566,1.46133,1.461369,-0.098889,0.662264,1.728088,1.539269,1.5197,1.521399


In [9]:
X = df_class_feature_selected.drop(columns=['txId', 'class', 'Time step']) # drop class, text id and time step
y = df_class_feature_selected[['class']]
print(np.unique(y))

# in this case, class 2 corresponds to licit transactions, we chang this to 0 as our interest is the ilicit transactions
y = y['class'].apply(lambda x: 0 if x == 2 else 1 )
print(np.unique(y))

[1 2]
[0 1]


In [10]:
print("Feature Shape: ", X.shape)
print("Label Shape: ", y.shape)

Feature Shape:  (46564, 165)
Label Shape:  (46564,)


# Utils

In [12]:
class utils:
    def __init__(self, confusion_matrix, y_true, y_pred):
        """
            - confusion_matrix: 2x2 numpy array
            - y_true: array of label
            - y_pred: array of output value calculated by model
            - fold_count: number of folds
        """
        
        # Initilize all indicator
        self.TP = confusion_matrix[0][0] # true positive
        self.FN = confusion_matrix[1][0] # false negative
        self.FP = confusion_matrix[1][0] # false positive
        self.TN = confusion_matrix[1][1] # true negative
        self.precision = self.TP/(self.FN +  self.TP) # Precision Score - Positive Predictive Value
        self.recall = self.TP / (self.TP + self.FN)
        self.f1_score =  (2 * self.TP) / (2 * self.TP + self.FP + self.FN)
        self.sensitivity = self.TP / (self.TP + self.FN) # True Positive Rate
        self.specificity = self.TN / (self.TN + self.FP) # False Positive Rate
        self.negative_predictive_value = self.TN / (self.TN + self.FN) # Negative Predictive Value
        self.false_negative_rate = self.FN / (self.FN + self.TP) # False Negative Rate
        self.false_positive_rate = self.FP / (self.FP + self.TN) # False Positive Rate
        self.false_discovery_rate = self.FP / (self.FP + self.TP) # False Discovery Rate
        self.false_omission_rate = self.FN / (self.FN + self.TN) # False Ommision Rate
        self.positive_likelihood_ratio = self.sensitivity / self.false_positive_rate # Positive Likelihood Ratio
        self.negative_likelihood_ratio = self.false_negative_rate / self.specificity # Negative Likelihood Ratio
        self.prevalence_threshold = math.sqrt(self.false_positive_rate) / (math.sqrt(self.sensitivity) + 
                                                                      math.sqrt(self.false_positive_rate)) # Prevalance Threshold
        self.threat_score = self.TP / (self.TN + self.FN + self.FP) # Threat Score
        self.prevalence = (self.TP + self.FN)/(self.TP + self.FN + self.TN + self.FP) # Prevalance 
        #  Matthews correlation coefficient
        self.matthews_correlation_coefficient = (self.TP*self.TN - self.FN*self.FP) / ((self.TP + self.FP)
                                                                                       *(self.TP + self.FN)
                                                                                       *(self.TN + self.FP)
                                                                                       *(self.TN + self.FN))
        self.fowlkes_mallows_index = math.sqrt(self.sensitivity + self.precision) # Fowlkes–Mallows index
        self.informedness = self.sensitivity + self.specificity - 1 # informedness
        self.markedness = self.precision + self.negative_predictive_value - 1 # markedness
        self.diagnostic_odds_ratio = self.positive_likelihood_ratio / self.negative_likelihood_ratio # Diagnostic odds ratio
        self.accuracy = (self.TP + self.TN) / (self.TP + self.TN + self.FP + self.FN)
        self.balanced_accuracy = (self.sensitivity + self.specificity) / 2
        self.roc_auc_macro = roc_auc_score(y_true, y_pred)
        self.roc_auc_micro = roc_auc_score(y_true, y_pred, average = 'micro')
        self.roc_auc_weighted = roc_auc_score(y_true, y_pred, average = 'weighted')
        self.cls_report = classification_report(y_true, y_pred)
        
        # Initilize the structure of output_dicts
        self.confusion_matrix = {
            "TP" : self.TP,
            "TN" : self.TN,
            "FN" : self.FN,
            "FP" : self.FP,
            "precision" : self.precision,
            "recall" : self.recall,
            "f1_score" : self.f1_score,
            "sensitivity" : self.sensitivity,
            "specificity" : self.specificity,
            "negative_predictive_value" : self.negative_predictive_value,
            "false_negative_rate" : self.false_negative_rate,
            "false_positive_rate" : self.false_positive_rate,
            "false_discovery_rate" : self.false_discovery_rate,
            "false_omission_rate" : self.false_omission_rate,
            "Positive_likelihood_ratio" : self.positive_likelihood_ratio,
            "Negative_likelihood_ratio" : self.negative_likelihood_ratio,
            "prevalence_threshold" : self.prevalence_threshold,
            "threat_score" : self.threat_score,
            "Prevalence" : self.prevalence,
            "Matthews_correlation_coefficient" : self.matthews_correlation_coefficient,
            "Fowlkes_Mallows_index" : self.fowlkes_mallows_index,
            "informedness" : self.informedness,
            "markedness" : self.markedness,
            "Diagnostic_odds_ratio" : self.diagnostic_odds_ratio,
            "accuracy" : self.accuracy,
            "balanced_accuracy" : self.balanced_accuracy
        }
        
        self.roc_auc_score = {
            "Macro": self.roc_auc_macro, 
            "Micro": self.roc_auc_micro,
            "Weight": self.roc_auc_weighted
        }
        
        self.sub_dict = {
            "Confusion Matrix" : self.confusion_matrix,
            "ROC_AUC_SCORE" : self.roc_auc_score,
            "Classification Report" : classification_report(y_true.tolist(), y_pred.tolist(), 
                                                            labels = [0, 1], # 0 : Licit, 1 : Illicit
                                                            output_dict = True)
        }
    def get_value(self):
        return self.sub_dict
        
# Test 
y_true = np.array([randint(0,1) for x in range(200)])
y_pred = np.array([randint(0,1) for x in range(200)])
confusion_matrix_test = confusion_matrix(y_true, y_pred)
base_utils = utils(confusion_matrix_test, y_true, y_pred)
base_utils.get_value()

{'Confusion Matrix': {'TP': 49,
  'TN': 50,
  'FN': 63,
  'FP': 63,
  'precision': 0.4375,
  'recall': 0.4375,
  'f1_score': 0.4375,
  'sensitivity': 0.4375,
  'specificity': 0.4424778761061947,
  'negative_predictive_value': 0.4424778761061947,
  'false_negative_rate': 0.5625,
  'false_positive_rate': 0.5575221238938053,
  'false_discovery_rate': 0.5625,
  'false_omission_rate': 0.5575221238938053,
  'Positive_likelihood_ratio': 0.7847222222222222,
  'Negative_likelihood_ratio': 1.27125,
  'prevalence_threshold': 0.5302661370059422,
  'threat_score': 0.2784090909090909,
  'Prevalence': 0.49777777777777776,
  'Matthews_correlation_coefficient': -9.48341686897956e-06,
  'Fowlkes_Mallows_index': 0.9354143466934853,
  'informedness': -0.12002212389380529,
  'markedness': -0.12002212389380529,
  'Diagnostic_odds_ratio': 0.6172839506172839,
  'accuracy': 0.44,
  'balanced_accuracy': 0.43998893805309736},
 'ROC_AUC_SCORE': {'Macro': 0.5028481334553961,
  'Micro': 0.5028481334553961,
  'Weigh

# Training

In [18]:
class Training:
    def __init__(self, fold_count, X, y):
        self.fold_count = fold_count
        self.param_grid = {
            "kernel" : ["linear", "rbf", "sigmoid", "precomputed"],
            "degree" : [1, 2, 3, 5, 7, 9],
            "gamma" : ["scale", "auto"],
            "class_weight" : ["balanced", None]
        }
        self.kf = KFold(n_splits=fold_count)
        self.history = {}
        self.X = X
        self.y = y
    
    def get_fold_value(self):
        return self.kf        
        
    def training(self, kernel, degree, gamma, class_weight, train_case):
    
        # output_dict initilize
        output_dict = {}
    
        # poiter track the index of fold
        fold_index = 0
    
        for train_index, test_index in self.kf.split(self.X):
            print("\tFold: {}".format(fold_index))
            print("\tTRAIN:", train_index, "\n\tTEST:", test_index)
        
            # folding data
            X_train, X_test = self.X.values[train_index], self.X.values[test_index]
            y_train, y_test = self.y.values[train_index], self.y.values[test_index]
    
            # Training
            print("\t\tTraining : {}".format(fold_index), end = " -- ")
            print("Start: {}".format(datetime.now().strftime("%m/%d/%Y, %H:%M:%S")), end=" --- ")
            model_nu_svc = NuSVC(nu = 0.5, kernel = kernel, 
                            degree = degree, 
                            gamma = gamma, 
                            class_weight = class_weight
                           )
            model_nu_svc.fit(X_train,y_train)
            print("End: {}".format(datetime.now().strftime("%m/%d/%Y, %H:%M:%S")))
        
            # Testing
            print("\t\tValidation: {}".format(fold_index), end = " -- ")
            print("Start: {}".format(datetime.now().strftime("%m/%d/%Y, %H:%M:%S")), end="---")
            y_pred = model_nu_svc.predict(X_test)
            print("End: {}".format(datetime.now().strftime("%m/%d/%Y, %H:%M:%S")))
        
            # Evaluation
            cm = confusion_matrix(y_test, y_pred)
            current_utils = utils(cm, y_test, y_pred)
            output_dict["fold_{}".format(fold_index)] = current_utils.get_value()
            
            fold_index += 1
        print("\n")
        return output_dict 
    
    def train(self, path):
        count = 0
        for x in self.param_grid["kernel"]:
            for i in self.param_grid["degree"]:
                for j in self.param_grid["gamma"]:
                    for k in self.param_grid["class_weight"]:
                        print("Traning Case: {}".format(count))
                        self.history["train_{}".format(count)] = {
                            "param" : {
                                "kernel" : x,
                                "degree" : i,
                                "gamma" : j,
                                "class_weight" : k,
                            },
                            "train_fold" : self.training(x, i, j, k, count)
                        }
                        count += 1
        with open(path, 'wb') as f:
            pickle.dump(train.history, f)
        
        return self.history
            

In [19]:
path = 'svm_nuSVC_AF.pkl'
train = Training(5, X, y)
history = train.train(path)

Traning Case: 0
	Fold: 0
	TRAIN: [ 9313  9314  9315 ... 46561 46562 46563] 
	TEST: [   0    1    2 ... 9310 9311 9312]
		Training : 0 -- Start: 08/08/2022, 10:05:42 --- 

ValueError: specified nu is infeasible

In [None]:
with open(path, 'rb') as f:
    loaded_dict = pickle.load(f)
loaded_dict