In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, roc_curve, precision_recall_curve, confusion_matrix, f1_score, average_precision_score, cohen_kappa_score
from inspect import signature
from sklearn.externals import joblib
import random
from sklearn import metrics



# 1. Prepare Data

In [2]:

OOB_10=pd.read_csv("../7. Modeling - DecisionTree AdaBoost/AdaBoost (AllFeatures - OOB - 10).csv", index_col=0)

OOB_10.shape

(10, 1)

In [3]:
X_train_resampled = pd.read_csv("../3. Data Preparation/X_train_resampled.csv", index_col=0)
X_train_resampled.shape

(1848, 30)

In [4]:
y_train_resampled = pd.read_csv("../3. Data Preparation/y_train_resampled.csv", index_col=0)
y_train_resampled.shape

(1848, 1)

In [5]:
X_val = pd.read_csv("../3. Data Preparation/X_val.csv", index_col=0)
X_val.shape

(56962, 30)

In [6]:
y_val = pd.read_csv("../3. Data Preparation/y_val.csv", index_col=0, header=0)
y_val.shape

(56962, 1)

In [7]:
X_train_resampled.drop(OOB_10.index.values, inplace=True, axis=1)
X_val.drop(OOB_10.index.values, inplace=True, axis=1)

In [8]:
print(X_val.shape)
print(X_train_resampled.shape)

(56962, 20)
(1848, 20)


In [9]:
dTree = joblib.load('../7. Modeling - DecisionTree AdaBoost/DecisionTree-gini.pkl') 

In [10]:
dim=100

# 2. Permutation Test


In [11]:
accuracy_from_features = []
feature_list = random.sample(list(X_train_resampled.columns.values), len(OOB_10.index.values))

for i in range(0, dim):
    
    accuracy_from_features.append(feature_list)
    
    X_temp = X_train_resampled[feature_list].copy()
    
    # Create a random forest classifier
    clf = AdaBoostClassifier(base_estimator=dTree, n_estimators=101, learning_rate=0.05,
                              algorithm='SAMME', random_state=42)

    # Train the classifier
    clf.fit(X_temp, y_train_resampled['Class'].values)
    
    # Prediction
    X_test_temp = X_val[feature_list].copy()
    pred_y = clf.predict(X_test_temp)
    
    accuracy_from_features.append(metrics.accuracy_score(pred_y, y_val['Class'].values))
    print(i, accuracy_from_features[-1])
    
    while feature_list in accuracy_from_features:
        feature_list = random.sample(list(X_train_resampled.columns.values), len(OOB_10.index.values))

0 0.9980513324672589




1 0.9972262209894316




2 0.9953477757101226




3 0.9953126645834065




4 0.9978055545802464




5 0.9980688880306169




6 0.9977704434535304




7 0.9951371089498262




8 0.9945928864857273




9 0.9984375548611355




10 0.99501422000632




11 0.9956462202872091




12 0.9963484428215301




13 0.9965942207085425




14 0.9955759980337769




15 0.9970331097924933




16 0.995031775569678




17 0.9974368877497279




18 0.997559776693234




19 0.9979635546504687




20 0.9980864435939749




21 0.9973666654962958




22 0.9981917769741231




23 0.9972964432428637




24 0.9976299989466662




25 0.9973491099329378




26 0.9951371089498262




27 0.9974193321863699




28 0.9977879990168884




29 0.9961553316245918




30 0.9977704434535304




31 0.9951371089498262




32 0.9955233313437029




33 0.9960851093711597




34 0.9967346652154068




35 0.9963133316948141




36 0.9963484428215301




37 0.9888522172676522




38 0.9957691092307153




39 0.9979986657771848




40 0.9976124433833081




41 0.9957691092307153




42 0.9936448860643938




43 0.9973491099329378




44 0.9927671078964924




45 0.9979986657771848




46 0.9949088866261718




47 0.9954531090902707




48 0.9938555528246902




49 0.9977704434535304




50 0.9947859976826656




51 0.9957866647940733




52 0.9971033320459254




53 0.995628664723851




54 0.997559776693234




55 0.9959973315543695




56 0.9977002212000983




57 0.9956462202872091




58 0.9982795547909132




59 0.9939608862048384




60 0.9976475545100242




61 0.9932937747972332




62 0.9954531090902707




63 0.9961377760612338




64 0.9954004424001967




65 0.9909237737438994




66 0.9981039991573329




67 0.9959446648642954




68 0.9882377725501211




69 0.9946104420490853




70 0.9954706646536288




71 0.9972437765527896




72 0.9957339981039992




73 0.9959797759910115




74 0.9952775534566904




75 0.9958568870475053




76 0.9976826656367402




77 0.9959446648642954




78 0.9946982198658755




79 0.9945402197956532




80 0.9947157754292335




81 0.9978582212703205




82 0.9951371089498262




83 0.9971559987359995




84 0.9964362206383203




85 0.9953302201467645




86 0.9984902215512096




87 0.995031775569678




88 0.996892665285629




89 0.9979635546504687




90 0.9977879990168884




91 0.9949088866261718




92 0.9955408869070609




93 0.9961377760612338




94 0.9979986657771848




95 0.9964186650749622




96 0.9966468873986166




97 0.994399775288789




98 0.9951722200765423




99 0.9959271093009374




In [12]:
permutation_test_df = pd.DataFrame()
for i in range(0, len(OOB_10.index.values)):
    permutation_test_df['feature_'+str(i)] = ""
    
permutation_test_df['accuracy'] = 0.0


In [13]:
accuracy = list()
for i in range(0, dim):
    accuracy.insert(i,0)
    
for i in range(0, len(accuracy_from_features),2):
    accuracy.insert(i,accuracy_from_features[i+1])

In [14]:
accuracy

[0.9980513324672589,
 0,
 0.9972262209894316,
 0,
 0.9953477757101226,
 0,
 0.9953126645834065,
 0,
 0.9978055545802464,
 0,
 0.9980688880306169,
 0,
 0.9977704434535304,
 0,
 0.9951371089498262,
 0,
 0.9945928864857273,
 0,
 0.9984375548611355,
 0,
 0.99501422000632,
 0,
 0.9956462202872091,
 0,
 0.9963484428215301,
 0,
 0.9965942207085425,
 0,
 0.9955759980337769,
 0,
 0.9970331097924933,
 0,
 0.995031775569678,
 0,
 0.9974368877497279,
 0,
 0.997559776693234,
 0,
 0.9979635546504687,
 0,
 0.9980864435939749,
 0,
 0.9973666654962958,
 0,
 0.9981917769741231,
 0,
 0.9972964432428637,
 0,
 0.9976299989466662,
 0,
 0.9973491099329378,
 0,
 0.9951371089498262,
 0,
 0.9974193321863699,
 0,
 0.9977879990168884,
 0,
 0.9961553316245918,
 0,
 0.9977704434535304,
 0,
 0.9951371089498262,
 0,
 0.9955233313437029,
 0,
 0.9960851093711597,
 0,
 0.9967346652154068,
 0,
 0.9963133316948141,
 0,
 0.9963484428215301,
 0,
 0.9888522172676522,
 0,
 0.9957691092307153,
 0,
 0.9979986657771848,
 0,
 0.9

In [15]:
z=0
for i in range(0, len(accuracy_from_features),2):
    for j in range(0, len(accuracy_from_features[i])):
        permutation_test_df.loc[i, 'feature_'+str(j)] = accuracy_from_features[i][j]
        
    
    permutation_test_df.loc[i, 'accuracy'] = accuracy[i]

    #z=z+1

In [16]:
permutation_test_df.reset_index(drop=True)


Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,accuracy
0,V6,V2,V15,V1,V5,V12,V16,V9,V21,V24,0.998051
1,V2,V27,V16,V11,V21,V23,V13,V26,V5,V24,0.997226
2,V18,V24,V25,V23,V5,V15,V2,V27,V10,V28,0.995348
3,V16,V21,V3,V24,V5,V15,V13,V23,V11,V6,0.995313
4,V16,V10,V12,V26,V2,V23,V18,V28,V27,V6,0.997806
5,V28,V5,V13,V6,V25,V16,V24,V9,V12,V11,0.998069
6,V10,V24,V15,V9,V12,V26,V27,V6,V25,V28,0.997770
7,V13,V25,V3,V15,V1,V6,V26,V10,V2,V9,0.995137
8,V15,V25,V10,V23,V21,V18,V1,V11,V3,V5,0.994593
9,V13,V15,V16,V25,V21,V12,V9,V11,V1,V24,0.998438


In [17]:
permutation_test_df.sort_values("accuracy", ascending=False, inplace=True)

In [18]:
permutation_test_df.to_csv("PermutationTest (without OOB_10 Features).csv")

# 3. Compare Results

In [19]:
OOB_10=pd.read_csv("../7. Modeling - DecisionTree AdaBoost/AdaBoost (AllFeatures - OOB - 10).csv", index_col=0)
df_permutation_Test = pd.read_csv("PermutationTest (without OOB_10 Features).csv", index_col=0)

In [20]:
df_permutation_Test[df_permutation_Test['accuracy'] == df_permutation_Test['accuracy'].max()]

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,accuracy
172,V23,V16,V2,V21,V25,V9,V13,V12,V5,V11,0.99849


In [21]:
OOB_10

Unnamed: 0_level_0,OOB
Name,Unnamed: 1_level_1
V14,0.224588
V4,0.12859
V8,0.070321
V22,0.066843
Scaled_Time,0.063753
V17,0.063116
V19,0.05225
V20,0.051026
Scaled_Amount,0.047106
V7,0.044566


In [22]:
feature_permutationTest = df_permutation_Test[df_permutation_Test['accuracy'] == df_permutation_Test['accuracy'].max()].values
for e in feature_permutationTest:
    result_feature_perm = list(e[:10])
result_feature_perm

['V23', 'V16', 'V2', 'V21', 'V25', 'V9', 'V13', 'V12', 'V5', 'V11']

In [23]:
result_feature_perm

['V23', 'V16', 'V2', 'V21', 'V25', 'V9', 'V13', 'V12', 'V5', 'V11']

In [24]:
intersectionFeaturesDF = pd.DataFrame(result_feature_perm, columns=['Features Intersection'])

In [26]:
intersectionFeaturesDF.to_csv("IntersectionFeaturesAdaBoostPermutationwithoutOOB.csv")