In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, roc_curve, precision_recall_curve, confusion_matrix, f1_score, average_precision_score, cohen_kappa_score
from inspect import signature
from sklearn.externals import joblib
import random
import xgboost as xgb
from xgboost import XGBClassifier



# 1. Prepare Data

In [2]:
# Load the OOB_4 table. The first CSV column becomes its index, which holds the
# names of the features that are dropped from the feature sets further down.
OOB_4= pd.read_csv("../9. XBoost/XGB (AllFeatures - OOB - 4).csv", index_col=0)
OOB_4.shape

(4, 1)

In [3]:
# Load the training features (first CSV column is used as the index); the
# classifiers in the permutation test are fitted on columns of this frame.
X_train_resampled = pd.read_csv("../3. Data Preparation/X_train_resampled.csv", index_col=0)
X_train_resampled.shape

(1848, 30)

In [4]:
# Load the training labels; the 'Class' column is used as the fit target below.
y_train_resampled = pd.read_csv("../3. Data Preparation/y_train_resampled.csv", index_col=0)
y_train_resampled.shape

(1848, 1)

In [5]:
# Load the validation features; the fitted classifiers predict on columns of this frame.
X_val = pd.read_csv("../3. Data Preparation/X_val.csv", index_col=0)
X_val.shape

(56962, 30)

In [6]:
# Load the validation labels; the 'Class' column is the reference for the
# accuracy score. header=0 takes the column names from the first line of the file.
y_val = pd.read_csv("../3. Data Preparation/y_val.csv", index_col=0, header=0)
y_val.shape

(56962, 1)

In [7]:
# Remove the features listed in the index of OOB_4 from both feature sets, so
# the permutation test only samples from the remaining columns.
oob_feature_names = OOB_4.index.values
X_train_resampled = X_train_resampled.drop(oob_feature_names, axis=1)
X_val = X_val.drop(oob_feature_names, axis=1)

In [8]:
# Print the shapes of both feature sets to check the column counts after the drop.
for frame in (X_val, X_train_resampled):
    print(frame.shape)

(56962, 26)
(1848, 26)


In [9]:
# Number of iterations of the permutation-test loop (one model fit per iteration).
dim = 1001

# 2. Permutation Test

In [10]:
# Permutation test: train an XGBoost classifier on `dim` different random
# feature lists (each as long as the removed OOB_4 feature set) and record the
# validation accuracy obtained with each list.
#
# `accuracy_from_features` is a flat, interleaved list
#     [features_0, accuracy_0, features_1, accuracy_1, ...]
# The following cells read it in steps of two, so this layout must be kept.

# Dedicated, seeded generator: the sampled feature lists are the same on every
# "Restart & Run All", and the global `random` state is left untouched.
PERMUTATION_SEED = 42
sampler = random.Random(PERMUTATION_SEED)

# Loop invariants, computed once.
candidate_features = list(X_train_resampled.columns.values)
n_sampled_features = len(OOB_4.index.values)
y_train = y_train_resampled['Class'].values
y_val_true = y_val['Class'].values

accuracy_from_features = []
evaluated_feature_lists = set()  # tuples, for O(1) "already evaluated?" checks

for i in range(0, dim):

    # Draw an ordered feature list that has not been evaluated before.
    feature_list = sampler.sample(candidate_features, n_sampled_features)
    while tuple(feature_list) in evaluated_feature_lists:
        feature_list = sampler.sample(candidate_features, n_sampled_features)
    evaluated_feature_lists.add(tuple(feature_list))
    accuracy_from_features.append(feature_list)

    # Create and train the XGBoost classifier on the sampled columns only.
    clf = xgb.XGBClassifier(n_jobs=-1, n_estimators=300)
    clf.fit(X_train_resampled[feature_list], y_train)

    # Predict on the validation set restricted to the same columns.
    pred_y_val = clf.predict(X_val[feature_list])

    # accuracy_score expects (y_true, y_pred).
    accuracy_from_features.append(accuracy_score(y_val_true, pred_y_val))
    print(i, accuracy_from_features[-1])

0 0.9796882131947614




1 0.9887117727607879
2 0.9604824268810787
3 0.989993328885924
4 0.9748604332713037
5 0.9895544398019732
6 0.964502650890067
7 0.9569010919560409
8 0.9813559917137741
9 0.9745795442575752
10 0.9748955443980197
11 0.9890277729012324
12 0.9870088831150592
13 0.9907833292370353
14 0.974456655314069
15 0.9758786559460693
16 0.9564797584354482
17 0.9867982163547628
18 0.9521435342860153
19 0.9686808749692778
20 0.9730170991187107
21 0.9770197675643412
22 0.9703486534882905
23 0.9789859906604403
24 0.9930304413468628
25 0.9862188827639479
26 0.9765457673536744
27 0.9878691057196025
28 0.9536884238615218
29 0.9792142129840946
30 0.9826551034022681
31 0.9902215512095783
32 0.9736666549629578
33 0.9879393279730346
34 0.9771602120712054
35 0.9941715529651346
36 0.9536884238615218
37 0.9885011060004916
38 0.9630455391313507
39 0.9867982163547628
40 0.9754222112987606
41 0.9865875495944665
42 0.9815842140374285
43 0.9835504371335276
44 0.9652048734243882
45 0.9506337558372249
46 0.9644324286366349


364 0.959516870896387
365 0.9825146588954039
366 0.9815666584740704
367 0.9828482145992065
368 0.9646079842702152
369 0.9879217724096766
370 0.964485095326709
371 0.9892735507882447
372 0.9486148660510516
373 0.9605526491345108
374 0.9872897721287877
375 0.9913977739545662
376 0.9772655454513536
377 0.9629577613145606
378 0.9899055510691338
379 0.9559530915347073
380 0.9759488781995014
381 0.9726659878515501
382 0.9909588848706155
383 0.9893788841683929
384 0.9831291036129349
385 0.9680137635616727
386 0.9683122081387592
387 0.9749482110880938
388 0.966363540606018
389 0.9885362171272076
390 0.9755275446789088
391 0.9639584284259682
392 0.9759488781995014
393 0.9774059899582178
394 0.9912397738843439
395 0.9653277623678944
396 0.9882553281134792
397 0.9867982163547628
398 0.9910993293774797
399 0.9778624346055265
400 0.967574874477722
401 0.9618166496962888
402 0.9872897721287877
403 0.9803904357290826
404 0.9928548857132825
405 0.9645553175801411
406 0.9629928724412766
407 0.956497313

723 0.9896422176187634
724 0.9896246620554053
725 0.9554966468873987
726 0.9784242126329834
727 0.9916084407148625
728 0.9847617710052315
729 0.9759664337628594
730 0.9618868719497209
731 0.9723148765843896
732 0.9577437589972262
733 0.982584881148836
734 0.9745795442575752
735 0.9919244408553071
736 0.9883431059302693
737 0.9706997647554511
738 0.9665215406762403
739 0.9774937677750078
740 0.9668550963800429
741 0.9768266563674028
742 0.9855868824830589
743 0.9729819879919946
744 0.9708402092623152
745 0.9848671043853797
746 0.9909062181805414
747 0.9772831010147116
748 0.9758259892559952
749 0.9711913205294758
750 0.9885011060004916
751 0.9869562164249851
752 0.9726484322881921
753 0.9902566623362944
754 0.9846915487517994
755 0.9932235525438011
756 0.9744215441873529
757 0.9861311049471577
758 0.981900214177873
759 0.9628699834977704
760 0.973175099188933
761 0.9646430953969313
762 0.9875355500158
763 0.9860959938204417
764 0.9797935465749096
765 0.9681015413784628
766 0.97119132052

In [11]:
# Create an empty result table: one column per sampled feature
# (feature_0, feature_1, ...), initialised with "", followed by an
# `accuracy` column initialised with 0.0.
n_feature_columns = len(OOB_4.index.values)

permutation_test_df = pd.DataFrame()
for position in range(n_feature_columns):
    column_name = 'feature_' + str(position)
    permutation_test_df[column_name] = ""

permutation_test_df['accuracy'] = 0.0






In [12]:
# Display the frame to check its columns (no rows have been added yet).
permutation_test_df

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,accuracy


In [13]:
# Extract the accuracies from the interleaved `accuracy_from_features` list
# ([features_0, accuracy_0, features_1, accuracy_1, ...]).
#
# The result deliberately mirrors that layout: the accuracy of run k is stored
# at the even position 2*k (the position of its feature list) and every odd
# position holds the placeholder 0. The cell that fills `permutation_test_df`
# looks accuracies up by exactly those even positions.
accuracy = [0] * len(accuracy_from_features)
accuracy[0::2] = accuracy_from_features[1::2]


In [14]:
# Inspect the list: accuracies sit at even positions, 0 placeholders at odd positions.
accuracy

[0.9796882131947614,
 0,
 0.9887117727607879,
 0,
 0.9604824268810787,
 0,
 0.989993328885924,
 0,
 0.9748604332713037,
 0,
 0.9895544398019732,
 0,
 0.964502650890067,
 0,
 0.9569010919560409,
 0,
 0.9813559917137741,
 0,
 0.9745795442575752,
 0,
 0.9748955443980197,
 0,
 0.9890277729012324,
 0,
 0.9870088831150592,
 0,
 0.9907833292370353,
 0,
 0.974456655314069,
 0,
 0.9758786559460693,
 0,
 0.9564797584354482,
 0,
 0.9867982163547628,
 0,
 0.9521435342860153,
 0,
 0.9686808749692778,
 0,
 0.9730170991187107,
 0,
 0.9770197675643412,
 0,
 0.9703486534882905,
 0,
 0.9789859906604403,
 0,
 0.9930304413468628,
 0,
 0.9862188827639479,
 0,
 0.9765457673536744,
 0,
 0.9878691057196025,
 0,
 0.9536884238615218,
 0,
 0.9792142129840946,
 0,
 0.9826551034022681,
 0,
 0.9902215512095783,
 0,
 0.9736666549629578,
 0,
 0.9879393279730346,
 0,
 0.9771602120712054,
 0,
 0.9941715529651346,
 0,
 0.9536884238615218,
 0,
 0.9885011060004916,
 0,
 0.9630455391313507,
 0,
 0.9867982163547628,
 0,
 0.

In [15]:
# Fill the result table with one row per evaluated feature list.
#
# Row labels are the even positions (0, 2, 4, ...) at which the feature lists
# sit inside the interleaved `accuracy_from_features` list; `accuracy` is
# indexed by the same positions. All rows are collected first and the frame is
# built in one step instead of being grown value by value through `.loc`.
row_labels = list(range(0, len(accuracy_from_features), 2))

rows = [
    list(accuracy_from_features[label]) + [accuracy[label]]
    for label in row_labels
]

permutation_test_df = pd.DataFrame(
    rows,
    index=row_labels,
    columns=list(permutation_test_df.columns),
)


In [16]:
# Display only: reset_index returns a new frame and the result is not assigned,
# so `permutation_test_df` itself keeps its existing row labels.
permutation_test_df.reset_index(drop=True)


Unnamed: 0,feature_0,feature_1,feature_2,feature_3,accuracy
0,V2,V22,Scaled_Time,V3,0.979688
1,V27,V11,V24,V1,0.988712
2,V1,V2,Scaled_Time,V27,0.960482
3,V27,V10,V15,V28,0.989993
4,V25,V9,V19,V28,0.974860
5,V20,V11,V16,V25,0.989554
6,V1,V22,V5,V28,0.964503
7,Scaled_Amount,V20,V23,Scaled_Time,0.956901
8,V22,V3,V23,Scaled_Amount,0.981356
9,V21,V19,V23,V9,0.974580


In [17]:
# Order the results from highest to lowest accuracy.
permutation_test_df = permutation_test_df.sort_values("accuracy", ascending=False)

In [18]:
# Persist the results; the row labels are written as the first CSV column.
permutation_test_df.to_csv("PermutationTest (XGBoost without OOB_4 Features).csv")

# 3. Compare Results

In [19]:
# Re-load the OOB_4 table and read the saved permutation-test results back
# from disk (first CSV column is used as the index in both cases).
OOB_4= pd.read_csv("../9. XBoost/XGB (AllFeatures - OOB - 4).csv", index_col=0)
df_permutation_Test = pd.read_csv("PermutationTest (XGBoost without OOB_4 Features).csv", index_col=0)

In [20]:
# Show every row whose accuracy equals the highest accuracy in the table.
best_accuracy = df_permutation_Test['accuracy'].max()
df_permutation_Test[df_permutation_Test['accuracy'] == best_accuracy]

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,accuracy
968,V10,V26,V11,V9,0.994347


In [21]:
# Show the OOB_4 table for comparison (head() returns at most the first five rows).
OOB_4.head()

Unnamed: 0_level_0,OOB
Name,Unnamed: 1_level_1
V14,0.514744
V17,0.060467
V4,0.036971
V12,0.036124
