In [1]:
import numpy as np
import pandas as pd
import seaborn as sns # libreria utile per matrice di confusione
# import plotly.express as px
from collections import Counter
from matplotlib import pyplot as plt
from sklearn import tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score, make_scorer, confusion_matrix, precision_recall_curve
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
#import shap
import lightgbm as lgb
import glob
from sklearn.feature_selection import RFE

In [2]:
plt.rcParams["mathtext.fontset"]

'dejavusans'

In [3]:
plt.rcParams.update({'font.size': 28, 'font.family': 'STIXGeneral', 'mathtext.fontset': 'stix'})

In [4]:
from sklearn.metrics import roc_curve, roc_auc_score

def plot_roc_curve(fpr, tpr, auc_score,label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--') # dashed diagonal
    plt.axis([0, 1, 0, 1])                                    
    plt.xlabel('False Positive Rate (Fall-Out)', fontsize=16) 
    plt.ylabel('True Positive Rate (Recall)', fontsize=16)
    plt.title('AUC Score: {}'.format(auc_score))
    plt.grid(True)                                            
    plt.show()

In [5]:
def best_classifiers_cv(df_cv,data,X_train,y_train,X_test,y_test):
    df_cv['classifier'] = df_cv.param_classifier.apply(lambda x: x.__str__().split('(')[0])
    df_cv['ranking_by_classifier'] = df_cv.sort_values(by='rank_test_score').groupby(['classifier'])['rank_test_score'].cumcount() + 1
    for row in df_cv[df_cv['ranking_by_classifier']==1].iterrows():
        cls = row[1]['param_classifier']
        name_classifier = row[1]['classifier'] 
        print(row[1]['classifier'])
        cls.fit(X_train,y_train.ravel())
        y_pred_train = cls.predict(X_train)
        data['precision_training_{}'.format(name_classifier)] = [precision_score(y_train, y_pred_train)] # TP / (FP + TP)
        data['recall_training_{}'.format(name_classifier)] = [recall_score(y_train, y_pred_train)]	     # TP / (FN + TP)
        data['accuracy_training_{}'.format(name_classifier)] = [accuracy_score(y_train, y_pred_train)]	 # (TP + TN)/ (TP + FN + TN + FP)
        data['f1_training_{}'.format(name_classifier)] = [f1_score(y_train, y_pred_train)]    

        # Confusion matrix 
        y_pred = cls.predict(X_test)
        data['precision_test_{}'.format(name_classifier)] = [precision_score(y_test, y_pred)] # TP / (FP + TP)
        data['recall_test_{}'.format(name_classifier)] = [recall_score(y_test, y_pred)]	     # TP / (FN + TP)
        data['accuracy_test_{}'.format(name_classifier)] = [accuracy_score(y_test, y_pred)]	 # (TP + TN)/ (TP + FN + TN + FP)
        data['f1_test_{}'.format(name_classifier)] = [f1_score(y_test, y_pred)]

        y_pred_proba = cls.predict_proba(X_test)[:, 1]

        fpr, tpr, thresholds = roc_curve(y_test,y_pred_proba)    
        auc_score = roc_auc_score(y_test,y_pred_proba)

        data['auc_score_test_{}'.format(name_classifier)] = auc_score

In [6]:
transaction_frequency_opcode = ['address', 'balance', 'lifetime', 'tx_in', 'tx_out', 'investment_in', 
 'payment_out', 'investment_to_contract/tx_in', 'payment_from_contract/tx_out', 
 '#addresses_paying_contract', '#addresses_paid_by_contract', 'mean_v1', 'sdev_v1', 'mean_v2', 'sdev_v2', 'paid_rate', 
 'paid_one', 'percentage_some_tx_in', 'sdev_tx_in', 'percentage_some_tx_out', 'sdev_tx_out', 'owner_gets_eth_Wo_investing',
 'owner_gets_eth_investing', 'owner_no_eth', 'PUSH', 'INVALID', 'DUP', 'JUMPDEST', 'STOP', 'MSTORE', 'JUMPI', 'REVERT', 'CALLVALUE',
 'ISZERO', 'CODECOPY', 'RETURN', 'LOG', 'SHA3', 'MSTORE8', 'SWAP', 'POP', 'ADD', 'MLOAD', 'AND', 'SUB', 'CALLDATALOAD', 'EXP', 
 'MUL', 'SLOAD', 'EQ', 'JUMP', 'DIV', 'CALLER', 'CALLDATACOPY', 'SSTORE', 'NOT', 'CALL', 'LT', 'GT', 'OR', 'ADDRESS2', 'TIMESTAMP', 
 'GASLIMIT', 'GAS', 'ORIGIN', 'BALANCE3', 'CALLDATASIZE', 'SAR', 'MSIZE', 'CODESIZE', 'COINBASE', 'CREATE2', 'EXTCODESIZE', 'CALLCODE', 'SHL',
 'BLOCKHASH', 'RETURNDATASIZE', 'SHR', 'GETPC', 'DELEGATECALL', 'MOD', 'ADDMOD', 'NUMBER', 'XOR', 'SLT', 'EXTCODECOPY', 'MULMOD', 'CREATE', 'SELFDESTRUCT', 
 'STATICCALL', 'RETURNDATACOPY', 'SGT', 'DIFFICULTY', 'SMOD', 'BYTE', 'SIGNEXTEND', 'CHAINID', 'SELFBALANCE', 'GASPRICE', 'EXTCODEHASH', 'SDIV', 'target']

In [7]:
only_opcode = [ 'address','PUSH', 'INVALID', 'DUP', 'JUMPDEST', 'STOP', 'MSTORE', 'JUMPI', 'REVERT', 'CALLVALUE', 'ISZERO', 'CODECOPY', 'RETURN', 'LOG', 
 'SHA3', 'MSTORE8', 'SWAP', 'POP', 'ADD', 'MLOAD', 'AND', 'SUB', 'CALLDATALOAD', 'EXP', 'MUL', 'SLOAD', 'EQ', 'JUMP', 'DIV', 'CALLER', 'CALLDATACOPY', 
 'SSTORE', 'NOT', 'CALL', 'LT', 'GT', 'OR', 'ADDRESS2', 'TIMESTAMP', 'GASLIMIT', 'GAS', 'ORIGIN', 'BALANCE3', 'CALLDATASIZE', 'SAR', 'MSIZE', 'CODESIZE', 
 'COINBASE', 'CREATE2', 'EXTCODESIZE', 'CALLCODE', 'SHL', 'BLOCKHASH', 'RETURNDATASIZE', 'SHR', 'GETPC', 'DELEGATECALL', 'MOD', 'ADDMOD', 'NUMBER', 'XOR', 
 'SLT', 'EXTCODECOPY', 'MULMOD', 'CREATE', 'SELFDESTRUCT', 'STATICCALL', 'RETURNDATACOPY', 'SGT', 'DIFFICULTY', 'SMOD', 'BYTE', 'SIGNEXTEND', 'CHAINID', 
 'SELFBALANCE', 'GASPRICE', 'EXTCODEHASH', 'SDIV',
 'target']

In [8]:
dataset_path= ("C:/Users/lucap/OneDrive/Desktop/Smart Ponzi/bytecode_opcode_8k.csv")
db = pd.read_csv(dataset_path, header = 0, delimiter = ';')
db = db.fillna(0)
db

Unnamed: 0,address,balance,lifetime,tx_in,tx_out,investment_in,payment_out,investment_to_contract/tx_in,payment_from_contract/tx_out,#addresses_paying_contract,...,DIFFICULTY,SMOD,BYTE,SIGNEXTEND,CHAINID,SELFBALANCE,GASPRICE,EXTCODEHASH,SDIV,target
0,0x0006157838d5a6b33ab66588a6a693a57c869999,0.000691,117,2,0,1,0,0.5000,0.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0x001a589dda0d6be37632925eaf1256986b2c6ad0,29.408568,318,11346,4,9685,4,0.8536,1.0,39,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0x00674045bb7c17f0aa1cde34780d6c51af548728,151.824186,1366,29,12,2,12,0.0690,1.0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0x006bea43baa3f7a6f765f14f10a1a1b08334ef45,0.000000,21,9708,0,0,0,0.0000,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0x00a9f7d093c46d95f0318e4a6ffc6ed68f73044c,0.000000,0,78,1,76,1,0.9744,1.0,65,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7957,0xfd784da5c740c617aafb80399fa81b86e1da99a5,0.000000,461,9964,0,0,0,0.0000,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7958,0xfd7e33bc01a7493b189ddfdcc047500463be573a,0.000000,1,3,0,0,0,0.0000,0.0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7959,0xfd82ffc0d08dcfd902cbf6e48550033b01f919cc,0.000000,48,8,2,2,2,0.2500,1.0,2,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7960,0xfd9683e9f2c62e08b6bf68123e18e527efa8fbbc,0.000000,1483,6,2,2,2,0.3333,1.0,2,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Recall Optimization

Avendo fatto girare già tutto per prova allegerisco il gridsearch lasciando una combinazione di parametri per caso

In [18]:
def get_grid_search_recall(dataset,name):
    data = {}
    data['test'] = name
    X = dataset.iloc[:, 1:-1].values # rimuovo la prima colonna che contiene gli indirizzi
    y = dataset.loc[:, ['target']].values


    print(dataset.target.value_counts())
    print(X.shape,y.shape,dataset.columns)

    # divido in train set (TR) e test set (TS)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state = 42, stratify=y) 

    flat_list_train = [item for sublist in y_train.tolist() for item in sublist]
    print('Number of smart contract in train set::', flat_list_train.count(0))
    print('Number of smart Ponzi in train set::', flat_list_train.count(1))
    flat_list_test = [item for sublist in y_test.tolist() for item in sublist]
    print('Number of smart contract in test set::', flat_list_test.count(0))
    print('Number of smart Ponzi in test set::', flat_list_test.count(1))
    pipeline = Pipeline(steps=[('classifier',RandomForestClassifier(random_state=42))])

    params =[
    #  {
    #      'classifier': [DecisionTreeClassifier(random_state=42)],
    #      'classifier__criterion' : ['entropy', 'gini'],
    #      'classifier__max_depth' : [5, 9],
    #      'classifier__min_samples_split' : [5, 15], # il valore tipico sta fra 1 e 40
    #      'classifier__max_features' : range(6,X_train.shape[1],3),
    #     'classifier__class_weight' : [{0:1, 1:4}, {0:1, 1:4.5}] # pesi, il valore raccomandato e' sum(negative instances)/sum(positive instances)
    # },
#
    # {
    #     'classifier': [RandomForestClassifier(random_state=42)],
    #     'classifier__n_estimators':[150, 200, 250],
    #     'classifier__min_samples_split' : [5,15],
    #     'classifier__criterion':['gini', 'entropy'],
    #     'classifier__class_weight' : [{0:1, 1:5.5}],
    #     'classifier__bootstrap': [True, False]
    #     },
#
    #  {
    #      'classifier': [lgb.LGBMClassifier(boosting_type='gbdt', n_jobs=4, importance_type='split',random_state=42)],
    #      'classifier__learning_rate' : [0.1,0.01],
    #      'classifier__n_estimators' : [80, 100, 120, 140],
    #      'classifier__max_depth' : [10,15,20],
    #      'classifier__colsample_bytree' : [0.5,0.8,1],
    #      'classifier__reg_alpha' : [0, 0.1, 0.2],
    #      'classifier__reg_lambda' : [1,10,15]
    #  }
      {
          'classifier': [DecisionTreeClassifier(random_state=42)],
          'classifier__criterion' : [ 'gini'],
          'classifier__max_depth' : [ 9],
          'classifier__min_samples_split' : [ 15], # il valore tipico sta fra 1 e 40
          'classifier__max_features' : range(6,X_train.shape[1]),
         'classifier__class_weight' : [{0:1, 1:4}] # pesi, il valore raccomandato e' sum(negative instances)/sum(positive instances)
     },

     {
         'classifier': [RandomForestClassifier(random_state=42)],
         'classifier__n_estimators':[200],
         'classifier__min_samples_split' : [15],
         'classifier__criterion':['entropy'],
         'classifier__class_weight' : [{0:1, 1:5.5}],
         'classifier__bootstrap': [True]
         },

      {
          'classifier': [lgb.LGBMClassifier(boosting_type='gbdt', n_jobs=4, importance_type='split',random_state=42)],
          'classifier__learning_rate' : [0.1],
          'classifier__n_estimators' : [120],
          'classifier__max_depth' : [15],
          'classifier__colsample_bytree' : [0.8],
          'classifier__reg_alpha' : [0.1],
          'classifier__reg_lambda' : [1]
      }


         ]
    grid_search = GridSearchCV(pipeline, params, cv=10,
                               scoring='recall',
                               return_train_score=True,n_jobs=4,verbose=2)
    grid_search.fit(X_train, y_train.ravel())
    df_cv = pd.DataFrame(grid_search.cv_results_)
    best_classifiers_cv(df_cv,data,X_train,y_train,X_test,y_test)
    
    data['best hyperparameters'] = [grid_search.best_params_]
    data['best score'] = [grid_search.best_score_]
    
    y_pred_train = grid_search.best_estimator_.predict(X_train)
    data['best_precision_training'] = [precision_score(y_train, y_pred_train)] # TP / (FP + TP)
    data['best_recall_training'] = [recall_score(y_train, y_pred_train)]	     # TP / (FN + TP)
    data['best_accuracy_training'] = [accuracy_score(y_train, y_pred_train)]	 # (TP + TN)/ (TP + FN + TN + FP)
    data['best_f1_training'] = [f1_score(y_train, y_pred_train)]    

    # Confusion matrix 
    y_pred = grid_search.best_estimator_.predict(X_test)
    data['best_precision_test'] = [precision_score(y_test, y_pred)] # TP / (FP + TP)
    data['best_recall_test'] = [recall_score(y_test, y_pred)]	     # TP / (FN + TP)
    data['best_accuracy_test'] = [accuracy_score(y_test, y_pred)]	 # (TP + TN)/ (TP + FN + TN + FP)
    data['best_f1_test'] = [f1_score(y_test, y_pred)]
    return grid_search,data

df_transaction_frequency_opcode

In [15]:
df_transaction_frequency_opcode =  db[transaction_frequency_opcode].copy()
df_transaction_frequency_opcode = df_transaction_frequency_opcode.drop_duplicates(subset='address')

In [19]:
grid_search_transaction_frequency_opcode,data_transaction_frequency_opcode = get_grid_search_recall(df_transaction_frequency_opcode,'frequency_opcode')

target
0.0    7028
1.0     934
Name: count, dtype: int64
(7962, 100) (7962, 1) Index(['address', 'balance', 'lifetime', 'tx_in', 'tx_out', 'investment_in',
       'payment_out', 'investment_to_contract/tx_in',
       'payment_from_contract/tx_out', '#addresses_paying_contract',
       ...
       'DIFFICULTY', 'SMOD', 'BYTE', 'SIGNEXTEND', 'CHAINID', 'SELFBALANCE',
       'GASPRICE', 'EXTCODEHASH', 'SDIV', 'target'],
      dtype='object', length=102)
Number of smart contract in train set:: 5973
Number of smart Ponzi in train set:: 794
Number of smart contract in test set:: 1055
Number of smart Ponzi in test set:: 140
Fitting 2 folds for each of 96 candidates, totalling 192 fits
DecisionTreeClassifier
RandomForestClassifier
LGBMClassifier


In [20]:
df_results = pd.DataFrame(data_transaction_frequency_opcode)
df_results

Unnamed: 0,test,precision_training_DecisionTreeClassifier,recall_training_DecisionTreeClassifier,accuracy_training_DecisionTreeClassifier,f1_training_DecisionTreeClassifier,precision_test_DecisionTreeClassifier,recall_test_DecisionTreeClassifier,accuracy_test_DecisionTreeClassifier,f1_test_DecisionTreeClassifier,auc_score_test_DecisionTreeClassifier,...,best hyperparameters,best score,best_precision_training,best_recall_training,best_accuracy_training,best_f1_training,best_precision_test,best_recall_test,best_accuracy_test,best_f1_test
0,frequency_opcode,0.73161,0.926952,0.951529,0.817778,0.615385,0.8,0.917992,0.695652,0.884516,...,{'classifier': DecisionTreeClassifier(class_we...,0.754408,0.73161,0.926952,0.951529,0.817778,0.615385,0.8,0.917992,0.695652


df_only_opcode

In [24]:
df_only_opcode = db[only_opcode].copy()
grid_search_only_opcode, data_only_opcode = get_grid_search_recall(df_only_opcode,'only_opcode')

target
0.0    7028
1.0     934
Name: count, dtype: int64
(7962, 77) (7962, 1) Index(['address', 'PUSH', 'INVALID', 'DUP', 'JUMPDEST', 'STOP', 'MSTORE',
       'JUMPI', 'REVERT', 'CALLVALUE', 'ISZERO', 'CODECOPY', 'RETURN', 'LOG',
       'SHA3', 'MSTORE8', 'SWAP', 'POP', 'ADD', 'MLOAD', 'AND', 'SUB',
       'CALLDATALOAD', 'EXP', 'MUL', 'SLOAD', 'EQ', 'JUMP', 'DIV', 'CALLER',
       'CALLDATACOPY', 'SSTORE', 'NOT', 'CALL', 'LT', 'GT', 'OR', 'ADDRESS2',
       'TIMESTAMP', 'GASLIMIT', 'GAS', 'ORIGIN', 'BALANCE3', 'CALLDATASIZE',
       'SAR', 'MSIZE', 'CODESIZE', 'COINBASE', 'CREATE2', 'EXTCODESIZE',
       'CALLCODE', 'SHL', 'BLOCKHASH', 'RETURNDATASIZE', 'SHR', 'GETPC',
       'DELEGATECALL', 'MOD', 'ADDMOD', 'NUMBER', 'XOR', 'SLT', 'EXTCODECOPY',
       'MULMOD', 'CREATE', 'SELFDESTRUCT', 'STATICCALL', 'RETURNDATACOPY',
       'SGT', 'DIFFICULTY', 'SMOD', 'BYTE', 'SIGNEXTEND', 'CHAINID',
       'SELFBALANCE', 'GASPRICE', 'EXTCODEHASH', 'SDIV', 'target'],
      dtype='object')
Number o

In [26]:
df_results = pd.concat([df_results,pd.DataFrame(data_only_opcode)])

df_transaction_weighted_opcode

In [12]:
df_transaction_weighted_opcode = db[transaction_frequency_opcode].copy()

In [13]:
y = df_transaction_weighted_opcode.loc[:, ['target']].values
y = pd.DataFrame(y,columns=['target'])

In [14]:
dataset_account = db.iloc[:, 0:24] 
dataset_opcode = db.iloc[:, 24:101] 

In [15]:
dataset_opcode['all_opcode'] = dataset_opcode[list(dataset_opcode.columns)].sum(axis=1) #generate all_opcode
dataset_opcode = dataset_opcode[:].div(dataset_opcode['all_opcode'], axis=0)
dataset_opcode = dataset_opcode.drop(['all_opcode'], axis = 1)

In [16]:
df_transaction_weighted_opcode = pd.concat([dataset_account, dataset_opcode, y], axis=1)
df_transaction_weighted_opcode = df_transaction_weighted_opcode.fillna(0)

In [17]:
df_transaction_weighted_opcode

Unnamed: 0,address,balance,lifetime,tx_in,tx_out,investment_in,payment_out,investment_to_contract/tx_in,payment_from_contract/tx_out,#addresses_paying_contract,...,DIFFICULTY,SMOD,BYTE,SIGNEXTEND,CHAINID,SELFBALANCE,GASPRICE,EXTCODEHASH,SDIV,target
0,0x0006157838d5a6b33ab66588a6a693a57c869999,0.000691,117,2,0,1,0,0.5000,0.0,1,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0x001a589dda0d6be37632925eaf1256986b2c6ad0,29.408568,318,11346,4,9685,4,0.8536,1.0,39,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0x00674045bb7c17f0aa1cde34780d6c51af548728,151.824186,1366,29,12,2,12,0.0690,1.0,2,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0x006bea43baa3f7a6f765f14f10a1a1b08334ef45,0.000000,21,9708,0,0,0,0.0000,0.0,0,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0x00a9f7d093c46d95f0318e4a6ffc6ed68f73044c,0.000000,0,78,1,76,1,0.9744,1.0,65,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7957,0xfd784da5c740c617aafb80399fa81b86e1da99a5,0.000000,461,9964,0,0,0,0.0000,0.0,0,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7958,0xfd7e33bc01a7493b189ddfdcc047500463be573a,0.000000,1,3,0,0,0,0.0000,0.0,0,...,0.000000,0.000000,0.000472,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7959,0xfd82ffc0d08dcfd902cbf6e48550033b01f919cc,0.000000,48,8,2,2,2,0.2500,1.0,2,...,0.000649,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7960,0xfd9683e9f2c62e08b6bf68123e18e527efa8fbbc,0.000000,1483,6,2,2,2,0.3333,1.0,2,...,0.000000,0.000175,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
grid_transaction_weighted_opcode, data_transaction_weighted_opcode= get_grid_search_recall(df_transaction_weighted_opcode,'weighted_opcode')

target
0.0    7028
1.0     934
Name: count, dtype: int64
(7962, 100) (7962, 1) Index(['address', 'balance', 'lifetime', 'tx_in', 'tx_out', 'investment_in',
       'payment_out', 'investment_to_contract/tx_in',
       'payment_from_contract/tx_out', '#addresses_paying_contract',
       ...
       'DIFFICULTY', 'SMOD', 'BYTE', 'SIGNEXTEND', 'CHAINID', 'SELFBALANCE',
       'GASPRICE', 'EXTCODEHASH', 'SDIV', 'target'],
      dtype='object', length=102)
Number of smart contract in train set:: 5973
Number of smart Ponzi in train set:: 794
Number of smart contract in test set:: 1055
Number of smart Ponzi in test set:: 140
Fitting 2 folds for each of 96 candidates, totalling 192 fits
DecisionTreeClassifier
RandomForestClassifier
LGBMClassifier


In [49]:
df_results = pd.concat([df_results,pd.DataFrame(data_transaction_weighted_opcode)])

In [50]:
df_melted = pd.melt(df_results,id_vars='test',value_vars=['precision_test_DecisionTreeClassifier',
       'recall_test_DecisionTreeClassifier',
       'accuracy_test_DecisionTreeClassifier',
       'f1_test_DecisionTreeClassifier',
       'auc_score_test_DecisionTreeClassifier','precision_test_RandomForestClassifier',
       'recall_test_RandomForestClassifier',
       'accuracy_test_RandomForestClassifier',
       'f1_test_RandomForestClassifier',
       'auc_score_test_RandomForestClassifier','precision_test_LGBMClassifier', 'recall_test_LGBMClassifier',
       'accuracy_test_LGBMClassifier', 'f1_test_LGBMClassifier',
       'auc_score_test_LGBMClassifier'],var_name='metric_classifier',value_name='score')

df_melted['classifier'] = df_melted['metric_classifier'].apply(lambda x: x.split('_')[-1])
df_melted['metric'] = df_melted['metric_classifier'].apply(lambda x: x.split('_')[0])

In [51]:
df_pivot_recall = df_melted.pivot(index=['test','classifier'],columns='metric',values='score').sort_values(['test','classifier','auc'])
df_pivot_recall

In [57]:
s_recall = df_pivot.to_latex(float_format="%.3f")
with open('comparison_table.tex','w') as fout:
    fout.write(s)

# AUC Optimization

In [39]:
def get_grid_search_auc_(dataset,name):
    data = {}
    data['test'] = name
    X = dataset.iloc[:, 1:-1].values # rimuovo la prima colonna che contiene gli indirizzi
    y = dataset.loc[:, ['target']].values


    print(dataset.target.value_counts())
    print(X.shape,y.shape,dataset.columns)

    # divido in train set (TR) e test set (TS)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state = 42, stratify=y) 

    flat_list_train = [item for sublist in y_train.tolist() for item in sublist]
    print('Number of smart contract in train set::', flat_list_train.count(0))
    print('Number of smart Ponzi in train set::', flat_list_train.count(1))
    flat_list_test = [item for sublist in y_test.tolist() for item in sublist]
    print('Number of smart contract in test set::', flat_list_test.count(0))
    print('Number of smart Ponzi in test set::', flat_list_test.count(1))
    pipeline = Pipeline(steps=[('classifier',RandomForestClassifier(random_state=42))])

    params =[
   {
          'classifier': [DecisionTreeClassifier(random_state=42)],
          'classifier__criterion' : ['entropy', 'gini'],
          'classifier__max_depth' : [5, 9],
          'classifier__min_samples_split' : [5, 15], # il valore tipico sta fra 1 e 40
          'classifier__max_features' : range(6,X_train.shape[1],3),
         'classifier__class_weight' : [{0:1, 1:4}, {0:1, 1:4.5}] # pesi, il valore raccomandato e' sum(negative instances)/sum(positive instances)
     },

     {
         'classifier': [RandomForestClassifier(random_state=42)],
         'classifier__n_estimators':[150, 200, 250],
         'classifier__min_samples_split' : [5,15],
         'classifier__criterion':['gini', 'entropy'],
         'classifier__class_weight' : [{0:1, 1:5.5}],
         'classifier__bootstrap': [True, False]
         },

      {
          'classifier': [lgb.LGBMClassifier(boosting_type='gbdt', n_jobs=4, importance_type='split',random_state=42)],
          'classifier__learning_rate' : [0.1,0.01],
          'classifier__n_estimators' : [80, 100, 120, 140],
          'classifier__max_depth' : [10,15,20],
          'classifier__colsample_bytree' : [0.5,0.8,1],
          'classifier__reg_alpha' : [0, 0.1, 0.2],
          'classifier__reg_lambda' : [1,10,15]
      }


         ]
    grid_search = GridSearchCV(pipeline, params, cv=10,
                               scoring='roc_auc',
                               return_train_score=True,n_jobs=4,verbose=2)
    grid_search.fit(X_train, y_train.ravel())
    df_cv = pd.DataFrame(grid_search.cv_results_)
    best_classifiers_cv(df_cv,data,X_train,y_train,X_test,y_test)
    
    data['best hyperparameters'] = [grid_search.best_params_]
    data['best score'] = [grid_search.best_score_]
    
    y_pred_train = grid_search.best_estimator_.predict(X_train)
    data['best_precision_training'] = [precision_score(y_train, y_pred_train)] # TP / (FP + TP)
    data['best_recall_training'] = [recall_score(y_train, y_pred_train)]	     # TP / (FN + TP)
    data['best_accuracy_training'] = [accuracy_score(y_train, y_pred_train)]	 # (TP + TN)/ (TP + FN + TN + FP)
    data['best_f1_training'] = [f1_score(y_train, y_pred_train)]    

    # Confusion matrix 
    y_pred = grid_search.best_estimator_.predict(X_test)
    data['best_precision_test'] = [precision_score(y_test, y_pred)] # TP / (FP + TP)
    data['best_recall_test'] = [recall_score(y_test, y_pred)]	     # TP / (FN + TP)
    data['best_accuracy_test'] = [accuracy_score(y_test, y_pred)]	 # (TP + TN)/ (TP + FN + TN + FP)
    data['best_f1_test'] = [f1_score(y_test, y_pred)]
    return grid_search,data

In [64]:
grid_search_transaction_frequency_opcode,data_transaction_frequency_opcode = get_grid_search_auc(df_transaction_frequency_opcode,'frequency_opcode')

target
0.0    7028
1.0     934
Name: count, dtype: int64
(7962, 100) (7962, 1) Index(['address', 'balance', 'lifetime', 'tx_in', 'tx_out', 'investment_in',
       'payment_out', 'investment_to_contract/tx_in',
       'payment_from_contract/tx_out', '#addresses_paying_contract',
       ...
       'DIFFICULTY', 'SMOD', 'BYTE', 'SIGNEXTEND', 'CHAINID', 'SELFBALANCE',
       'GASPRICE', 'EXTCODEHASH', 'SDIV', 'target'],
      dtype='object', length=102)
Number of smart contract in train set:: 5973
Number of smart Ponzi in train set:: 794
Number of smart contract in test set:: 1055
Number of smart Ponzi in test set:: 140
Fitting 2 folds for each of 96 candidates, totalling 192 fits
DecisionTreeClassifier
RandomForestClassifier
LGBMClassifier


In [21]:
df_results = pd.DataFrame(data_transaction_frequency_opcode)

NameError: name 'data_transaction_frequency_opcode' is not defined

In [66]:
grid_search_only_opcode, data_only_opcode = get_grid_search_auc(df_only_opcode,'only_opcode')
df_results = pd.concat([df_results,pd.DataFrame(data_only_opcode)])

target
0.0    7028
1.0     934
Name: count, dtype: int64
(7962, 77) (7962, 1) Index(['address', 'PUSH', 'INVALID', 'DUP', 'JUMPDEST', 'STOP', 'MSTORE',
       'JUMPI', 'REVERT', 'CALLVALUE', 'ISZERO', 'CODECOPY', 'RETURN', 'LOG',
       'SHA3', 'MSTORE8', 'SWAP', 'POP', 'ADD', 'MLOAD', 'AND', 'SUB',
       'CALLDATALOAD', 'EXP', 'MUL', 'SLOAD', 'EQ', 'JUMP', 'DIV', 'CALLER',
       'CALLDATACOPY', 'SSTORE', 'NOT', 'CALL', 'LT', 'GT', 'OR', 'ADDRESS2',
       'TIMESTAMP', 'GASLIMIT', 'GAS', 'ORIGIN', 'BALANCE3', 'CALLDATASIZE',
       'SAR', 'MSIZE', 'CODESIZE', 'COINBASE', 'CREATE2', 'EXTCODESIZE',
       'CALLCODE', 'SHL', 'BLOCKHASH', 'RETURNDATASIZE', 'SHR', 'GETPC',
       'DELEGATECALL', 'MOD', 'ADDMOD', 'NUMBER', 'XOR', 'SLT', 'EXTCODECOPY',
       'MULMOD', 'CREATE', 'SELFDESTRUCT', 'STATICCALL', 'RETURNDATACOPY',
       'SGT', 'DIFFICULTY', 'SMOD', 'BYTE', 'SIGNEXTEND', 'CHAINID',
       'SELFBALANCE', 'GASPRICE', 'EXTCODEHASH', 'SDIV', 'target'],
      dtype='object')
Number o

In [None]:
grid_transaction_weighted_opcode, data_transaction_weighted_opcode= get_grid_search_auc(df_transaction_weighted_opcode,'weighted_opcode')
df_results = pd.concat([df_results,pd.DataFrame(data_transaction_weighted_opcode)])

In [45]:
df_melted = pd.melt(df_results_,id_vars='test',value_vars=['precision_test_DecisionTreeClassifier',
       'recall_test_DecisionTreeClassifier',
       'accuracy_test_DecisionTreeClassifier',
       'f1_test_DecisionTreeClassifier',
       'auc_score_test_DecisionTreeClassifier','precision_test_RandomForestClassifier',
       'recall_test_RandomForestClassifier',
       'accuracy_test_RandomForestClassifier',
       'f1_test_RandomForestClassifier',
       'auc_score_test_RandomForestClassifier','precision_test_LGBMClassifier', 'recall_test_LGBMClassifier',
       'accuracy_test_LGBMClassifier', 'f1_test_LGBMClassifier',
       'auc_score_test_LGBMClassifier'],var_name='metric_classifier',value_name='score')

df_melted['classifier'] = df_melted['metric_classifier'].apply(lambda x: x.split('_')[-1])
df_melted['metric'] = df_melted['metric_classifier'].apply(lambda x: x.split('_')[0])

df_pivot_auc= df_melted.pivot(index=['test','classifier'],columns='metric',values='score').sort_values(['test','classifier','auc'])
df_pivot_auc

KeyError: "The following 'value_vars' are not present in the DataFrame: ['accuracy_test_DecisionTreeClassifier', 'accuracy_test_RandomForestClassifier', 'auc_score_test_DecisionTreeClassifier', 'auc_score_test_RandomForestClassifier', 'f1_test_DecisionTreeClassifier', 'f1_test_RandomForestClassifier', 'precision_test_DecisionTreeClassifier', 'precision_test_RandomForestClassifier', 'recall_test_DecisionTreeClassifier', 'recall_test_RandomForestClassifier']"

In [68]:
s_auc = df_pivot.to_latex(float_format="%.3f")
with open('comparison_table_AUC.tex','w') as fout:
    fout.write(s)