In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, roc_curve, precision_recall_curve, confusion_matrix, f1_score, average_precision_score, cohen_kappa_score
from inspect import signature
from sklearn.externals import joblib
import random
import xgboost as xgb
from xgboost import XGBClassifier



# 1. Prepare Data

In [2]:
OOB_4= pd.read_csv("../9. XBoost/XGB (AllFeatures - OOB - 4).csv", index_col=0)
OOB_4.shape

(4, 1)

In [3]:
X_train_resampled = pd.read_csv("../3. Data Preparation/X_train_resampled.csv", index_col=0)
X_train_resampled.shape

(1848, 30)

In [4]:
y_train_resampled = pd.read_csv("../3. Data Preparation/y_train_resampled.csv", index_col=0)
y_train_resampled.shape

(1848, 1)

In [5]:
X_val = pd.read_csv("../3. Data Preparation/X_val.csv", index_col=0)
X_val.shape

(56962, 30)

In [6]:
y_val = pd.read_csv("../3. Data Preparation/y_val.csv", index_col=0, header=0)
y_val.shape

(56962, 1)

In [7]:
dim = 1001

# 2. Permutation Test

In [8]:
accuracy_from_features = []
feature_list = random.sample(list(X_train_resampled.columns.values), len(OOB_4.index.values))

for i in range(0, dim):
    
    accuracy_from_features.append(feature_list)
    
    X_temp = X_train_resampled[feature_list].copy()
    
    # Create a random forest classifier
    clf  = xgb.XGBClassifier(n_jobs=-1, n_estimators=300)

    # Train the classifier
    clf.fit(X_temp, y_train_resampled['Class'].values)
    #Prediction
    X_val_temp = X_val[feature_list].copy()
    pred_y_val = clf.predict(X_val_temp)
    
    accuracy_from_features.append(accuracy_score(pred_y_val, y_val['Class'].values))
    print(i, accuracy_from_features[-1])
    
    while feature_list in accuracy_from_features:
        feature_list = random.sample(list(X_train_resampled.columns.values), len(OOB_4.index.values))
    
   

0 0.9708226536989571




1 0.9768793230574769
2 0.9572873143499175
3 0.9871317720585654
4 0.9840419929075525
5 0.9889399950844423
6 0.987553105579158
7 0.9840244373441944
8 0.993135774727011
9 0.9830939924862189
10 0.9591482040658684
11 0.9841297707243425
12 0.9762648783399459
13 0.9886766616340719
14 0.981900214177873
15 0.9750184333415259
16 0.9915733295881465
17 0.9870088831150592
18 0.9872019943119975
19 0.9683648748288333
20 0.9724377655278957
21 0.9606228713879429
22 0.9824619922053298
23 0.9903093290263685
24 0.9909062181805414
25 0.9898002176889856
26 0.9922053298690355
27 0.966960429760191
28 0.9793195463642428
29 0.9810751027000456
30 0.9914679962079983
31 0.9858502159334293
32 0.9723850988378218
33 0.9889224395210843
34 0.9884484393104175
35 0.9803728801657245
36 0.9890102173378744
37 0.980021768898564
38 0.9733682103858713
39 0.9880095502264667
40 0.9876408833959481
41 0.988132439169973
42 0.9913977739545662
43 0.9885011060004916
44 0.990607773603455
45 0.992451107756048
46 0.9751062111583161
47 0.

365 0.9876935500860222
366 0.9845511042449352
367 0.9646430953969313
368 0.9536533127348057
369 0.9762473227765879
370 0.9745970998209332
371 0.9679786524349566
372 0.9840595484709105
373 0.9838137705838981
374 0.9767037674238966
375 0.993153330290369
376 0.9859731048769355
377 0.9910115515606895
378 0.9895895509286893
379 0.9724377655278957
380 0.9913626628278501
381 0.9646957620870054
382 0.9757382114392051
383 0.9777746567887363
384 0.9907657736736772
385 0.9904848846599488
386 0.985025104455602
387 0.9892559952248867
388 0.9872722165654296
389 0.9667848741266107
390 0.9732277658790071
391 0.9926968856430602
392 0.9903971068431586
393 0.9907657736736772
394 0.9884133281837014
395 0.9873951055089358
396 0.9853762157227626
397 0.9715073206699203
398 0.9814262139672062
399 0.9876935500860222
400 0.9905375513500229
401 0.9830764369228608
402 0.9926266633896281
403 0.9520557564692251
404 0.9846388820617253
405 0.9879568835363927
406 0.9925037744461219
407 0.9757382114392051
408 0.9917664

723 0.9925037744461219
724 0.9914504406446403
725 0.993715108317826
726 0.9599908711070538
727 0.9601313156139181
728 0.9891506618447385
729 0.9603595379375724
730 0.9662933183525859
731 0.9823917699518978
732 0.9828657701625645
733 0.9859906604402935
734 0.9734384326393034
735 0.9859204381868614
736 0.991257329447702
737 0.9927144412064183
738 0.9649942066640919
739 0.991906885291949
740 0.9909413293072574
741 0.9780028791123907
742 0.9902742178996524
743 0.9915031073347144
744 0.98878199501422
745 0.9870264386784172
746 0.9880095502264667
747 0.9826199922755521
748 0.98504266001896
749 0.9748428777079456
750 0.9796179909413293
751 0.979424879744391
752 0.9893613286050349
753 0.9927144412064183
754 0.9908359959271092
755 0.9910115515606895
756 0.9929602190934307
757 0.9783891015062673
758 0.9926968856430602
759 0.9898002176889856
760 0.9870088831150592
761 0.9795653242512552
762 0.9764931006636003
763 0.9915382184614304
764 0.9880271057898248
765 0.9622555387802395
766 0.9615884273726

In [9]:
permutation_test_df = pd.DataFrame()
for i in range(0, len(OOB_4.index.values)):
    permutation_test_df['feature_'+str(i)] = ""
    
permutation_test_df['accuracy'] = 0.0


In [10]:
accuracy = list()
for i in range(0, dim):
    accuracy.insert(i,0)
    
for i in range(0, len(accuracy_from_features),2):
    accuracy.insert(i,accuracy_from_features[i+1])

In [11]:
accuracy

[0.9708226536989571,
 0,
 0.9768793230574769,
 0,
 0.9572873143499175,
 0,
 0.9871317720585654,
 0,
 0.9840419929075525,
 0,
 0.9889399950844423,
 0,
 0.987553105579158,
 0,
 0.9840244373441944,
 0,
 0.993135774727011,
 0,
 0.9830939924862189,
 0,
 0.9591482040658684,
 0,
 0.9841297707243425,
 0,
 0.9762648783399459,
 0,
 0.9886766616340719,
 0,
 0.981900214177873,
 0,
 0.9750184333415259,
 0,
 0.9915733295881465,
 0,
 0.9870088831150592,
 0,
 0.9872019943119975,
 0,
 0.9683648748288333,
 0,
 0.9724377655278957,
 0,
 0.9606228713879429,
 0,
 0.9824619922053298,
 0,
 0.9903093290263685,
 0,
 0.9909062181805414,
 0,
 0.9898002176889856,
 0,
 0.9922053298690355,
 0,
 0.966960429760191,
 0,
 0.9793195463642428,
 0,
 0.9810751027000456,
 0,
 0.9914679962079983,
 0,
 0.9858502159334293,
 0,
 0.9723850988378218,
 0,
 0.9889224395210843,
 0,
 0.9884484393104175,
 0,
 0.9803728801657245,
 0,
 0.9890102173378744,
 0,
 0.980021768898564,
 0,
 0.9733682103858713,
 0,
 0.9880095502264667,
 0,
 0.98

In [12]:
z=0
for i in range(0, len(accuracy_from_features),2):
    for j in range(0, len(accuracy_from_features[i])):
        permutation_test_df.loc[i, 'feature_'+str(j)] = accuracy_from_features[i][j]
        
    
    permutation_test_df.loc[i, 'accuracy'] = accuracy[i]

    #z=z+1

In [13]:
permutation_test_df.reset_index(drop=True)


Unnamed: 0,feature_0,feature_1,feature_2,feature_3,accuracy
0,V3,V1,V27,V25,0.970823
1,V4,V24,V1,V7,0.976879
2,V8,V27,V25,V13,0.957287
3,V4,V12,V13,V15,0.987132
4,V6,V13,V11,Scaled_Time,0.984042
5,Scaled_Amount,V22,V14,V19,0.988940
6,V1,V2,V16,V18,0.987553
7,V21,V16,V24,V15,0.984024
8,V25,V16,V7,V11,0.993136
9,V3,V26,V8,V21,0.983094


In [14]:
permutation_test_df.sort_values("accuracy", ascending=False, inplace=True)

In [15]:
permutation_test_df.to_csv("PermutationTest ( XGBoost with OOB_4 Features).csv")

# 3. Compare Results

In [16]:
OOB_4= pd.read_csv("../9. XBoost/XGB (AllFeatures - OOB - 4).csv", index_col=0)
df_permutation_Test = pd.read_csv("PermutationTest ( XGBoost with OOB_4 Features).csv", index_col=0)

In [17]:
 df_permutation_Test[df_permutation_Test['accuracy'] == df_permutation_Test['accuracy'].max()]

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,accuracy
700,V12,V11,V23,V10,0.994856


In [18]:
OOB_4

Unnamed: 0_level_0,OOB
Name,Unnamed: 1_level_1
V14,0.514744
V17,0.060467
V4,0.036971
V12,0.036124


In [19]:
feature_permutationTest = df_permutation_Test[df_permutation_Test['accuracy'] == df_permutation_Test['accuracy'].max()].values
for e in feature_permutationTest:
    result_feature_perm = list(e[:12])
result_feature_perm

['V12', 'V11', 'V23', 'V10', 0.9948562199360976]

In [20]:
feature_OOB4 = list(OOB_4.index)
feature_OOB4

['V14', 'V17', 'V4', 'V12']

In [21]:

def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value in lst2] 
    return lst3

In [22]:
intersectionFeatures= intersection(result_feature_perm,feature_OOB4)

In [23]:
intersectionFeaturesDF = pd.DataFrame(intersectionFeatures, columns=['Features Intersection'])

In [24]:
intersectionFeaturesDF

Unnamed: 0,Features Intersection
0,V12


In [25]:
intersectionFeaturesDF.to_csv("IntersectionFeaturesXGBoost.csv")