In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, roc_curve, precision_recall_curve, confusion_matrix, f1_score, average_precision_score, cohen_kappa_score
from inspect import signature
from sklearn.externals import joblib
import random
from sklearn import metrics



# 1. Prepare Data

In [2]:

OOB_10=pd.read_csv("../7. Modeling - DecisionTree AdaBoost/AdaBoost (AllFeatures - OOB - 10).csv", index_col=0)

OOB_10.shape

(10, 1)

In [3]:
X_train_resampled = pd.read_csv("../3. Data Preparation/X_train_resampled.csv", index_col=0)
X_train_resampled.shape

(1848, 30)

In [4]:
y_train_resampled = pd.read_csv("../3. Data Preparation/y_train_resampled.csv", index_col=0)
y_train_resampled.shape

(1848, 1)

In [5]:
X_val = pd.read_csv("../3. Data Preparation/X_val.csv", index_col=0)
X_val.shape

(56962, 30)

In [6]:
y_val = pd.read_csv("../3. Data Preparation/y_val.csv", index_col=0, header=0)
y_val.shape

(56962, 1)

In [7]:
dTree = joblib.load('../7. Modeling - DecisionTree AdaBoost/DecisionTree-gini.pkl') 

In [9]:
dim=100

# 2. Permutation Test


In [10]:
accuracy_from_features = []
feature_list = random.sample(list(X_train_resampled.columns.values), len(OOB_10.index.values))

for i in range(0, dim):
    
    accuracy_from_features.append(feature_list)
    
    X_temp = X_train_resampled[feature_list].copy()
    
    # Create a random forest classifier
    clf = AdaBoostClassifier(base_estimator=dTree, n_estimators=101, learning_rate=0.05,
                              algorithm='SAMME', random_state=42)

    # Train the classifier
    clf.fit(X_temp, y_train_resampled['Class'].values)
    
    # Prediction
    X_test_temp = X_val[feature_list].copy()
    pred_y = clf.predict(X_test_temp)
    
    accuracy_from_features.append(metrics.accuracy_score(pred_y, y_val['Class'].values))
    print(i, accuracy_from_features[-1])
    
    while feature_list in accuracy_from_features:
        feature_list = random.sample(list(X_train_resampled.columns.values), len(OOB_10.index.values))

0 0.9942242196552087




1 0.9895544398019732




2 0.9942593307819247




3 0.996892665285629




4 0.9970682209192093




5 0.9951195533864682




6 0.9972964432428637




7 0.9942242196552087




8 0.995628664723851




9 0.9970682209192093




10 0.9978055545802464




11 0.9958393314841473




12 0.99501422000632




13 0.9946631087391594




14 0.995593553597135




15 0.9895017731118991




16 0.9893613286050349




17 0.9955233313437029




18 0.9955408869070609




19 0.9951019978231102




20 0.9967346652154068




21 0.9965766651451845




22 0.9979284435237527




23 0.996243109441382




24 0.9942768863452828




25 0.9960675538078017




26 0.9968048874688389




27 0.997454443313086




28 0.9945402197956532




29 0.9955408869070609




30 0.9962255538780239




31 0.997471998876444




32 0.9946806643025174




33 0.9966819985253327




34 0.9947157754292335




35 0.989993328885924




36 0.9946455531758014




37 0.9943646641620729




38 0.9952775534566904




39 0.9970857764825674




40 0.9979284435237527




41 0.9953828868368386




42 0.9945051086689372




43 0.9978231101436045




44 0.9976299989466662




45 0.9933113303605913




46 0.9939608862048384




47 0.9950668866963941




48 0.9956988869772831




49 0.9955057757803448




50 0.9948211088093817




51 0.995031775569678




52 0.9953126645834065




53 0.9950493311330361




54 0.9978055545802464




55 0.9963659983848882




56 0.993118219163653




57 0.9947157754292335




58 0.9959973315543695




59 0.9967171096520487




60 0.9955408869070609




61 0.9852708823426144




62 0.9964186650749622




63 0.995628664723851




64 0.9958919981742214




65 0.9945226642322952




66 0.9938731083880482




67 0.9959973315543695




68 0.9964713317650363




69 0.9960499982444436




70 0.9891682174080966




71 0.9977879990168884




72 0.9942066640918507




73 0.997454443313086




74 0.9956637758505671




75 0.9966293318352586




76 0.9974017766230119




77 0.9946455531758014




78 0.9952775534566904




79 0.995628664723851




80 0.993715108317826




81 0.9963659983848882




82 0.9942768863452828




83 0.9938731083880482




84 0.9970155542291352




85 0.995628664723851




86 0.9969628875390611




87 0.9966644429619747




88 0.9953126645834065




89 0.9978582212703205




90 0.9936975527544679




91 0.9969277764123451




92 0.9954706646536288




93 0.9929953302201467




94 0.9808293248130332




95 0.9948913310628138




96 0.9952424423299744




97 0.9954179979635547




98 0.9963308872581721




99 0.9973139988062217




In [11]:
permutation_test_df = pd.DataFrame()
for i in range(0, len(OOB_10.index.values)):
    permutation_test_df['feature_'+str(i)] = ""
    
permutation_test_df['accuracy'] = 0.0


In [12]:
accuracy = list()
for i in range(0, dim):
    accuracy.insert(i,0)
    
for i in range(0, len(accuracy_from_features),2):
    accuracy.insert(i,accuracy_from_features[i+1])

In [13]:
accuracy

[0.9942242196552087,
 0,
 0.9895544398019732,
 0,
 0.9942593307819247,
 0,
 0.996892665285629,
 0,
 0.9970682209192093,
 0,
 0.9951195533864682,
 0,
 0.9972964432428637,
 0,
 0.9942242196552087,
 0,
 0.995628664723851,
 0,
 0.9970682209192093,
 0,
 0.9978055545802464,
 0,
 0.9958393314841473,
 0,
 0.99501422000632,
 0,
 0.9946631087391594,
 0,
 0.995593553597135,
 0,
 0.9895017731118991,
 0,
 0.9893613286050349,
 0,
 0.9955233313437029,
 0,
 0.9955408869070609,
 0,
 0.9951019978231102,
 0,
 0.9967346652154068,
 0,
 0.9965766651451845,
 0,
 0.9979284435237527,
 0,
 0.996243109441382,
 0,
 0.9942768863452828,
 0,
 0.9960675538078017,
 0,
 0.9968048874688389,
 0,
 0.997454443313086,
 0,
 0.9945402197956532,
 0,
 0.9955408869070609,
 0,
 0.9962255538780239,
 0,
 0.997471998876444,
 0,
 0.9946806643025174,
 0,
 0.9966819985253327,
 0,
 0.9947157754292335,
 0,
 0.989993328885924,
 0,
 0.9946455531758014,
 0,
 0.9943646641620729,
 0,
 0.9952775534566904,
 0,
 0.9970857764825674,
 0,
 0.997928

In [14]:
z=0
for i in range(0, len(accuracy_from_features),2):
    for j in range(0, len(accuracy_from_features[i])):
        permutation_test_df.loc[i, 'feature_'+str(j)] = accuracy_from_features[i][j]
        
    
    permutation_test_df.loc[i, 'accuracy'] = accuracy[i]

    #z=z+1

In [15]:
permutation_test_df.reset_index(drop=True)


Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,accuracy
0,V28,V19,V5,V1,V20,V12,V2,Scaled_Amount,V14,V25,0.994224
1,V7,V28,V5,V2,V6,V9,V22,V19,V23,V13,0.989554
2,V14,V6,V4,V15,V9,V19,V27,V21,V25,V16,0.994259
3,V22,V16,V2,V18,V27,V1,V23,V13,V12,V17,0.996893
4,V26,Scaled_Amount,V12,V5,V1,V8,V28,V6,Scaled_Time,V3,0.997068
5,V3,V13,V22,V11,V10,V24,V28,V23,V25,V20,0.995120
6,V28,V7,Scaled_Amount,V19,V18,V11,V27,Scaled_Time,V12,V5,0.997296
7,V17,V15,V2,V9,V6,V14,V4,V10,Scaled_Time,V12,0.994224
8,V22,V8,V25,V14,V24,V9,V20,V3,V19,Scaled_Amount,0.995629
9,V11,V5,V21,V12,V22,Scaled_Time,V20,V7,V6,V28,0.997068


In [16]:
permutation_test_df.sort_values("accuracy", ascending=False, inplace=True)

In [17]:
permutation_test_df.to_csv("PermutationTest (with OOB_10 Features).csv")

# 3. Compare Results

In [3]:
OOB_10=pd.read_csv("../7. Modeling - DecisionTree AdaBoost/AdaBoost (AllFeatures - OOB - 10).csv", index_col=0)
df_permutation_Test = pd.read_csv("PermutationTest (with OOB_10 Features).csv", index_col=0)

In [4]:
df_permutation_Test[df_permutation_Test['accuracy'] == df_permutation_Test['accuracy'].max()]

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,accuracy
44,V7,V13,V11,V9,V12,Scaled_Time,V27,V20,V22,V10,0.997928
80,V7,V23,V6,V9,V28,V21,V20,V10,V12,V19,0.997928


In [20]:
OOB_10

Unnamed: 0_level_0,OOB
Name,Unnamed: 1_level_1
V14,0.224588
V4,0.12859
V8,0.070321
V22,0.066843
Scaled_Time,0.063753
V17,0.063116
V19,0.05225
V20,0.051026
Scaled_Amount,0.047106
V7,0.044566


In [21]:
feature_permutationTest = df_permutation_Test[df_permutation_Test['accuracy'] == df_permutation_Test['accuracy'].max()].values
for e in feature_permutationTest:
    result_feature_perm = list(e[:10])
result_feature_perm

['V7', 'V23', 'V6', 'V9', 'V28', 'V21', 'V20', 'V10', 'V12', 'V19']

In [22]:
feature_OOB10 = list(OOB_10.index)
feature_OOB10

['V14',
 'V4',
 'V8',
 'V22',
 'Scaled_Time',
 'V17',
 'V19',
 'V20',
 'Scaled_Amount',
 'V7']

In [23]:

def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value in lst2] 
    return lst3

In [24]:
intersectionFeatures= intersection(result_feature_perm,feature_OOB10)

In [25]:
intersectionFeaturesDF = pd.DataFrame(intersectionFeatures, columns=['Features Intersection'])

In [26]:
intersectionFeaturesDF

Unnamed: 0,Features Intersection
0,V7
1,V20
2,V19


In [27]:
intersectionFeaturesDF.to_csv("IntersectionFeaturesAdaBoostPermutation.csv")