# **Fraud Detection**

## PreProcessing

### Get Data

#### Download Dataset

In [None]:
# Google Drive file ids of the two dataset files fetched below
# (f1id is saved as train.csv, f2id as test.csv).
f1id = '1W0EL3w6qxHRa5ZFPYhYl87Piv7Y94ELG'
f2id = '1mJv80_a17wXnXydZEPGXKFtnQRd5REsc'

# gdown is installed at runtime; it is the downloader used for the Drive links.
!pip3 install gdown

import gdown

# Download the training file into the current working directory.
url = 'https://drive.google.com/uc?id=%s'%(f1id)

output = 'train.csv'
gdown.download(url, output, quiet=False)
# Download the test file; `url` and `output` are reused for the second file.
url = 'https://drive.google.com/uc?id=%s'%(f2id)
output = 'test.csv'
gdown.download(url, output, quiet=False)





Downloading...
From: https://drive.google.com/uc?id=1W0EL3w6qxHRa5ZFPYhYl87Piv7Y94ELG
To: /content/train.csv
99.2MB [00:01, 94.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1mJv80_a17wXnXydZEPGXKFtnQRd5REsc
To: /content/test.csv
33.1MB [00:00, 86.6MB/s]


'test.csv'

#### Import package

In [None]:
# Basic lib

import numpy as np 
import pandas as pd 

# Preprocess
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
!pip install autoviz datasist pandas_profiling 
from autoviz.AutoViz_Class import AutoViz_Class

from pandas_profiling import ProfileReport
import datasist as ds 
from sklearn.feature_selection import RFECV

# Classifier Libraries
from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
import lightgbm as lgb

# Performance Evaluation tools
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.model_selection import TimeSeriesSplit


# imblearn
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.metrics import classification_report_imbalanced

# Utils
import time
import gc
from collections import Counter
import warnings
warnings.filterwarnings("ignore")


Collecting autoviz
  Downloading https://files.pythonhosted.org/packages/89/20/8c8c64d5221cfcbc54679f4f048a08292a16dbad178af7c78541aa3af730/autoviz-0.0.81-py3-none-any.whl
Collecting datasist
  Downloading https://files.pythonhosted.org/packages/af/da/ed4f7d38947c32aaf1f70630413a5ee83a4d8c520add349f8d53c878bce1/datasist-1.5.3-py3-none-any.whl
Installing collected packages: autoviz, datasist
Successfully installed autoviz-0.0.81 datasist-1.5.3
Imported AutoViz_Class version: 0.0.81. Call using:
    from autoviz.AutoViz_Class import AutoViz_Class
    AV = AutoViz_Class()
    AV.AutoViz(filename, sep=',', depVar='', dfte=None, header=0, verbose=0,
                            lowess=False,chart_format='svg',max_rows_analyzed=150000,max_cols_analyzed=30)
Note: verbose=0 or 1 generates charts and displays them in your local Jupyter notebook.
      verbose=2 saves plots in your local machine under AutoViz_Plots directory and does not display charts.


#### Read Data

In [None]:
# Load the raw train / test files downloaded into the working directory.
df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test.csv')

# Stack train on top of test so that cleaning and feature engineering are
# applied to both at once. `axis` is passed by keyword: pandas >= 2.0 only
# accepts `objs` positionally in pd.concat and raises TypeError otherwise.
df_all = pd.concat((df_train, df_test), axis=0)

# Human-readable labels stored as plain attributes on the frames.
df_train.name = 'Training Set'
df_test.name = 'Test Set'
df_all.name = 'All Set' 

dfs = [df_train, df_test]



In [None]:
# Due to RAM limit on colab, need to reduce memory usage
def reduce_mem(df):
    """Downcast the numeric columns of ``df`` in place and report the saving.

    Columns whose dtype equals the default ``int`` are narrowed to the
    smallest of int8 / int16 / int32 whose range strictly contains the
    column's min and max (int64 otherwise). Columns whose dtype equals the
    default ``float`` become float32. Every other column is left untouched.

    Memory usage before and after is printed in MB; nothing is returned.
    """
    bytes_per_mb = 1024 * 1024
    start_mem_usg = df.memory_usage().sum() / bytes_per_mb
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")

    # (exclusive lower bound, exclusive upper bound, target dtype), narrowest first.
    # The strict comparisons mean a column touching a bound goes one size up.
    int_targets = (
        (-128, 127, np.int8),
        (-32768, 32767, np.int16),
        (-2147483648, 2147483647, np.int32),
    )

    for col in df.keys():
        col_dtype = df[col].dtype
        if col_dtype == int:
            col_max = df[col].max()
            col_min = df[col].min()
            narrowed = np.int64
            for lower, upper, candidate in int_targets:
                if lower < col_min and col_max < upper:
                    narrowed = candidate
                    break
            df[col] = df[col].astype(narrowed)
        elif col_dtype == float:
            df[col] = df[col].astype(np.float32)

    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = df.memory_usage().sum() / bytes_per_mb
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")

In [None]:
# Shrink both raw frames in place, printing a separator between the two reports.
for position, frame in enumerate((df_train, df_test)):
    if position > 0:
        print("---"*10)
    reduce_mem(frame)

Memory usage of properties dataframe is : 200.27798461914062  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  85.98898696899414  MB
This is  42.93481739019664 % of the initial size
------------------------------
Memory usage of properties dataframe is : 66.75946807861328  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  28.66310214996338  MB
This is  42.934886952979994 % of the initial size


#### Observe data


In [None]:
# Print the row / column counts of the train and test frames side by side.
ds.structdata.check_train_test_set(df_train, df_test, index=None, col=None)

There are 1141340 training rows and 380447 test rows.
There are 23 training columns and 23 test columns.


In [None]:
# Summary report of the training frame (samples, dtypes, statistics, missing values).
ds.structdata.describe(df_train)


First five data points


Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,fraud_ind,hcefg,insfg,iterm,locdt,loctm,mcc,mchno,ovrlt,scity,stocn,stscd,txkey
0,6881,113261,38038,513.799988,5,0,N,0,N,N,0,5,N,0,33,172652.0,457,59333,N,0,102,0,516056
1,6881,15408,188328,513.799988,5,0,N,0,N,N,0,5,N,0,6,152458.0,457,59333,N,0,102,0,483434
2,6716,157159,29967,1016.109985,5,62,N,5,N,N,0,5,N,0,5,172946.0,247,50436,N,3281,102,0,1407164
3,5975,105985,81305,713.659973,5,62,N,4,N,N,0,5,N,0,6,182129.0,263,93775,N,5817,102,0,1051004
4,6411,94435,49219,1806.48999,3,62,N,4,N,N,0,5,N,0,6,172624.0,339,0,N,5865,102,0,1622153




Random five data points


Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,fraud_ind,hcefg,insfg,iterm,locdt,loctm,mcc,mchno,ovrlt,scity,stocn,stscd,txkey
70388,6769,33082,63023,818.799988,5,62,N,5,N,N,0,5,N,0,17,210726.0,251,77977,N,5817,102,0,365657
481263,3348,128836,187694,555.429993,5,62,Y,8,N,N,0,5,N,0,11,131327.0,289,54828,N,3460,46,0,657560
100303,6823,29240,148985,721.179993,5,62,N,5,N,N,0,5,N,0,19,112257.0,250,31154,N,5920,102,0,1559029
500910,6767,146451,27171,592.25,5,62,Y,8,N,Y,0,5,N,0,17,204217.0,191,20435,N,5817,102,0,1129895
207640,6831,61803,28909,1395.160034,5,62,N,4,N,N,0,5,N,0,38,113214.0,270,26655,N,6055,102,0,855953




Last five data points


Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,fraud_ind,hcefg,insfg,iterm,locdt,loctm,mcc,mchno,ovrlt,scity,stocn,stscd,txkey
1141335,6032,2510,54094,1.38,5,75,Y,2,,,0,6,N,0,30,231435.0,248,78297,N,5817,102,0,196080
1141336,1801,12832,137381,1138.040039,5,75,Y,8,,,0,6,N,0,1,100009.0,201,198,N,1852,98,0,642025
1141337,6322,91008,15189,578.380005,5,75,Y,8,,,0,6,N,0,4,191642.0,209,38222,N,5817,102,0,1478280
1141338,3226,145107,116252,435.320007,5,75,Y,8,,,0,6,N,0,13,102338.0,192,90135,N,1458,102,0,661087
1141339,6769,162168,93598,1.38,5,75,Y,8,,,0,6,N,0,29,234618.0,373,79246,N,5817,102,0,167073




Shape of  data set: (1141340, 23)


Size of  data set: 26250820


Data Types
Note: All Non-numerical features are identified as objects in pandas


Unnamed: 0,Data Type
acqic,int16
bacno,int32
cano,int32
conam,float32
contp,int8
csmcu,int8
ecfg,object
etymd,int8
flbmk,object
flg_3dsmk,object




Numerical Features in Data set
['acqic', 'bacno', 'cano', 'conam', 'contp', 'csmcu', 'etymd', 'fraud_ind', 'hcefg', 'iterm', 'locdt', 'loctm', 'mcc', 'mchno', 'scity', 'stocn', 'stscd', 'txkey']


Categorical Features in Data set


['ecfg', 'flbmk', 'flg_3dsmk', 'insfg', 'ovrlt']



Statistical Description of Columns


Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,etymd,fraud_ind,hcefg,iterm,locdt,loctm,mcc,mchno,scity,stocn,stscd,txkey
count,1141340.0,1141340.0,1141340.0,1141340.0,1141340.0,1141340.0,1141340.0,1141340.0,1141340.0,1141340.0,1141340.0,1141340.0,1141340.0,1141340.0,1141340.0,1141340.0,1141340.0,1141340.0
mean,6006.358,82077.19,108937.9,654.608,4.829436,53.82622,4.149955,0.0133904,4.747995,0.04959784,45.29871,146248.0,297.8248,55891.1,4755.236,95.65174,0.02491457,971323.1
std,1504.925,47387.1,60924.17,402.4778,0.6515574,20.72867,2.394154,0.1149396,1.114225,0.3652489,26.02185,52119.28,77.96604,30837.3,1979.464,18.89329,0.2219759,564163.8
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5982.0,41352.0,57653.0,436.48,5.0,62.0,2.0,0.0,5.0,0.0,23.0,110929.0,250.0,33740.0,3802.0,102.0,0.0,487223.5
50%,6708.0,81844.0,109791.0,591.9,5.0,62.0,4.0,0.0,5.0,0.0,45.0,151509.0,264.0,59333.0,5817.0,102.0,0.0,979731.5
75%,6769.0,123094.0,161922.0,807.53,5.0,62.0,5.0,0.0,5.0,0.0,68.0,185818.0,343.0,79200.0,5817.0,102.0,0.0,1455292.0
max,6884.0,163884.0,213334.0,7208.77,6.0,75.0,10.0,1.0,9.0,8.0,90.0,235959.0,459.0,103307.0,6671.0,107.0,4.0,1958239.0




Description of Categorical Features


Unnamed: 0,count,unique,top,freq
ecfg,1141340,2,N,851201
flbmk,1131935,2,N,1128098
flg_3dsmk,1131935,2,N,1083749
insfg,1141340,2,N,1109626
ovrlt,1141340,2,N,1126179




Unique class Count of Categorical features


Unnamed: 0,Feature,Unique Count
0,ecfg,2
1,flbmk,3
2,flg_3dsmk,3
3,insfg,2
4,ovrlt,2




Missing Values in Data


Unnamed: 0,features,missing_counts,missing_percent
0,acqic,0,0.0
1,bacno,0,0.0
2,cano,0,0.0
3,conam,0,0.0
4,contp,0,0.0
5,csmcu,0,0.0
6,ecfg,0,0.0
7,etymd,0,0.0
8,flbmk,9405,0.8
9,flg_3dsmk,9405,0.8


In [None]:
# Same summary report for the test frame, to compare against the training frame.
ds.structdata.describe(df_test)

First five data points


Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,fraud_ind,hcefg,insfg,iterm,locdt,loctm,mcc,mchno,ovrlt,scity,stocn,stscd,txkey
0,0,134508,45725,465.619995,5,0,N,2,N,N,0,0,N,0,9,105114.0,451,0,N,5817,102,0,4376
1,0,78377,2295,465.619995,5,0,N,2,N,N,0,0,N,0,6,104918.0,451,0,N,5817,102,0,2943
2,0,151054,197751,465.619995,5,0,N,2,N,N,0,0,N,0,7,104917.0,451,0,N,5817,102,0,3387
3,6716,150887,4541,930.309998,5,62,N,4,N,N,0,5,N,0,8,142028.0,329,37560,N,5820,102,0,1765957
4,6882,110356,133430,0.0,5,0,N,0,N,N,0,0,N,0,9,143657.0,459,2461,N,0,102,0,611269




Random five data points


Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,fraud_ind,hcefg,insfg,iterm,locdt,loctm,mcc,mchno,ovrlt,scity,stocn,stscd,txkey
212008,3191,162821,183560,751.130005,4,26,Y,8,N,N,0,5,N,0,15,83133.0,318,68603,N,5932,52,2,664294
309004,6189,70488,118249,683.25,5,62,N,5,N,N,0,5,N,0,49,120729.0,263,94682,N,5817,102,0,1362601
357579,6769,24809,183423,618.669983,6,62,N,2,N,N,0,5,N,0,4,73129.0,374,79663,N,5817,102,0,57611
131659,5631,74708,56847,439.519989,5,71,Y,8,N,N,0,5,N,0,34,235740.0,277,44286,N,5570,36,2,696103
60868,6769,50438,193529,103.480003,5,62,Y,8,N,N,0,5,N,0,74,132212.0,373,79245,N,5817,102,0,350781




Last five data points


Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,fraud_ind,hcefg,insfg,iterm,locdt,loctm,mcc,mchno,ovrlt,scity,stocn,stscd,txkey
380442,5588,95249,112041,1.38,5,75,Y,8,,,0,6,N,0,17,194234.0,296,73998,N,3426,42,0,668463
380443,6215,85771,169553,609.210022,5,75,Y,2,,,0,6,N,0,14,133907.0,289,54828,N,3460,46,0,1846986
380444,6231,139728,168027,574.359985,5,75,N,5,,,0,6,N,0,28,155437.0,245,18405,N,5817,102,0,992751
380445,6032,45406,197460,1.38,5,75,Y,2,,,0,6,N,0,24,215218.0,373,79246,N,5817,102,0,338215
380446,6716,48723,176440,406.589996,5,75,N,5,,,0,6,N,0,13,163603.0,251,69607,N,2310,102,0,1055258




Shape of  data set: (380447, 23)


Size of  data set: 8750281


Data Types
Note: All Non-numerical features are identified as objects in pandas


Unnamed: 0,Data Type
acqic,int16
bacno,int32
cano,int32
conam,float32
contp,int8
csmcu,int8
ecfg,object
etymd,int8
flbmk,object
flg_3dsmk,object




Numerical Features in Data set
['acqic', 'bacno', 'cano', 'conam', 'contp', 'csmcu', 'etymd', 'fraud_ind', 'hcefg', 'iterm', 'locdt', 'loctm', 'mcc', 'mchno', 'scity', 'stocn', 'stscd', 'txkey']


Categorical Features in Data set


['ecfg', 'flbmk', 'flg_3dsmk', 'insfg', 'ovrlt']



Statistical Description of Columns


Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,etymd,fraud_ind,hcefg,iterm,locdt,loctm,mcc,mchno,scity,stocn,stscd,txkey
count,380447.0,380447.0,380447.0,380447.0,380447.0,380447.0,380447.0,380447.0,380447.0,380447.0,380447.0,380447.0,380447.0,380447.0,380447.0,380447.0,380447.0,380447.0
mean,6012.936343,82129.508192,108854.39814,654.72052,4.829164,53.854274,4.146591,0.013332,4.753653,0.049718,45.413119,146373.421875,297.761486,55887.603451,4754.804241,95.649418,0.024676,970537.0
std,1494.871307,47288.644683,60842.000804,402.221497,0.650692,20.699398,2.394575,0.114691,1.104997,0.365251,26.009846,52079.625,77.973075,30779.969444,1980.868202,18.921218,0.220792,564037.6
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,5982.0,41478.0,57612.0,438.209991,5.0,62.0,2.0,0.0,5.0,0.0,23.0,110936.0,250.0,33774.0,3789.0,102.0,0.0,486157.5
50%,6716.0,82041.0,109769.0,591.900024,5.0,62.0,4.0,0.0,5.0,0.0,45.0,151559.0,264.0,59360.0,5817.0,102.0,0.0,979096.0
75%,6769.0,122953.0,161753.0,806.960022,5.0,62.0,5.0,0.0,5.0,0.0,68.0,185858.0,343.0,79200.0,5817.0,102.0,0.0,1454906.0
max,6884.0,163884.0,213334.0,7195.839844,6.0,75.0,10.0,1.0,9.0,8.0,90.0,235959.0,459.0,103307.0,6667.0,107.0,4.0,1958238.0




Description of Categorical Features


Unnamed: 0,count,unique,top,freq
ecfg,380447,2,N,283311
flbmk,377271,2,N,376035
flg_3dsmk,377271,2,N,361118
insfg,380447,2,N,369849
ovrlt,380447,2,N,375357




Unique class Count of Categorical features


Unnamed: 0,Feature,Unique Count
0,ecfg,2
1,flbmk,3
2,flg_3dsmk,3
3,insfg,2
4,ovrlt,2




Missing Values in Data


Unnamed: 0,features,missing_counts,missing_percent
0,acqic,0,0.0
1,bacno,0,0.0
2,cano,0,0.0
3,conam,0,0.0
4,contp,0,0.0
5,csmcu,0,0.0
6,ecfg,0,0.0
7,etymd,0,0.0
8,flbmk,3176,0.8
9,flg_3dsmk,3176,0.8


### Data Cleaning

#### Filling missing value


In [None]:
# Impute the two flag columns that contain missing values with 'N',
# their most frequent value.
for flag_col in ('flbmk', 'flg_3dsmk'):
    df_all[flag_col] = df_all[flag_col].fillna('N')

#### Handle different data type encoding


In [None]:
# Column groups of the combined frame, as reported by datasist.
cat_feats = ds.structdata.get_cat_feats(df_all)  # categorical feature names
num_feats = ds.structdata.get_num_feats(df_all)  # numerical feature names
# Per-feature unique-value counts (a result table, despite the function-like name).
get_unique_counts = ds.structdata.get_unique_counts(df_all)
all_feats = df_all.keys()  # every column label of df_all

In [None]:
get_unique_counts

Unnamed: 0,Feature,Unique Count
0,ecfg,2
1,flbmk,2
2,flg_3dsmk,2
3,insfg,2
4,ovrlt,2


In [None]:
num_feats

['acqic',
 'bacno',
 'cano',
 'conam',
 'contp',
 'csmcu',
 'etymd',
 'fraud_ind',
 'hcefg',
 'iterm',
 'locdt',
 'loctm',
 'mcc',
 'mchno',
 'scity',
 'stocn',
 'stscd',
 'txkey']

In [None]:
cat_feats

['ecfg', 'flbmk', 'flg_3dsmk', 'insfg', 'ovrlt']

In [None]:
all_feats

Index(['acqic', 'bacno', 'cano', 'conam', 'contp', 'csmcu', 'ecfg', 'etymd',
       'flbmk', 'flg_3dsmk', 'fraud_ind', 'hcefg', 'insfg', 'iterm', 'locdt',
       'loctm', 'mcc', 'mchno', 'ovrlt', 'scity', 'stocn', 'stscd', 'txkey'],
      dtype='object')

In [None]:
# Label-encode the categorical (object) features in place.
le = LabelEncoder()
for feat in cat_feats:
    df_all[feat] = le.fit_transform(df_all[feat].astype(str))

# Features that get one-hot encoded:
# 'contp': transaction category
# 'flbmk': fallback transaction flag
# 'ecfg': online transaction flag
# 'flg_3dsmk': 3DS verification flag
# 'hcefg': payment type
# 'insfg': installment transaction flag
# 'ovrlt': over-limit transaction flag
# 'stscd': status code
# 'iterm': number of installments
onehot_feature = [
    'contp', 'flbmk', 'ecfg', 'flg_3dsmk', 'hcefg', 'insfg', 'ovrlt', 
    'stscd', 'iterm'
]

# Features that get a relative-frequency encoding:
# 'csmcu': currency of the transaction location
# 'etymd': transaction type (online / physical)
# 'mcc': merchant category
# 'mchno': merchant name
# 'acqic': acquirer code
# 'bacno': account number
# 'cano': card number
# 'scity': transaction city
# 'stocn': transaction country
freq_feature = [
    'csmcu', 'etymd', 'mcc', 'mchno', 'acqic', 'bacno', 'cano', 'scity',
    'stocn'
]

# '<col>_f' holds the share of rows that carry the same value in '<col>'.
for k in freq_feature:
    df_all[k + '_f'] = df_all[k].map(df_all[k].value_counts(normalize=True))

# Build every dummy block first and concatenate once instead of re-copying
# df_all on each iteration. Columns are named '<col>_<value>' as before.
# dtype is pinned to uint8 because newer pandas defaults to bool dummies.
# The former `shape[0] < 2` special case compared the row count and had no
# effect on the concatenated result, so it is dropped.
dummy_frames = [
    pd.get_dummies(df_all[k], prefix=k, dtype=np.uint8) for k in onehot_feature
]
df_all = pd.concat([df_all] + dummy_frames, axis=1)

In [None]:
def identify_high_risk_fraud(df, feat, threshold):
    """Add ``<feat>_high_risk_fraud``: keep frequent categories, zero out the rest.

    The ``threshold`` most frequent values of ``df[feat]`` keep their original
    value in the new column; every other row (including missing values) gets 0.
    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column ``feat``.
    feat : str
        Name of the source column.
    threshold : int
        How many of the most frequent categories to keep.
    """
    source = df[f'{feat}']
    # value_counts() is sorted by frequency; its *index* holds the category
    # values (its .values are the counts, which must not be used for lookup).
    top_categories = source.value_counts().index[:threshold]
    df[f'{feat}_high_risk_fraud'] = source.where(source.isin(top_categories), 0)

In [None]:
# Add a '<col>_high_risk_fraud' column for every frequency-encoded feature,
# passing 15 as the `threshold` argument.
for feat in freq_feature:
    identify_high_risk_fraud(df_all, feat , 15)

#### Individual feature preprocessing



##### acqic 收單行代碼


In [None]:
# Fraud rate ('mean') and number of transactions ('count') per acquirer code,
# most active acquirers first.
acqic_fraud = df_all.groupby(['acqic'])['fraud_ind']
Df = acqic_fraud.agg(['mean', 'count']).reset_index()
Df = Df.sort_values('count', ascending = False)
Df

Unnamed: 0,acqic,mean,count
5945,6769,0.001040,250889
5895,6716,0.000695,201407
5226,5975,0.000376,124951
6047,6881,0.001741,99348
5411,6189,0.000689,91434
...,...,...,...
4604,5285,0.000000,1
4603,5284,0.000000,1
4602,5283,0.000000,1
1234,1371,0.000000,1


##### bacno 歸戶帳號


In [None]:
# Fraud rate ('mean') and number of transactions ('count') per account,
# most active accounts first.
bacno_fraud = df_all.groupby(['bacno'])['fraud_ind']
Df = bacno_fraud.agg(['mean', 'count']).reset_index()
Df = Df.sort_values('count', ascending = False)
Df


Unnamed: 0,bacno,mean,count
86417,148859,0.0,1117
90097,155157,0.0,938
60000,103279,0.0,747
53019,91313,0.0,681
86727,149363,0.0,593
...,...,...,...
87662,151021,0.0,1
74079,127607,0.0,1
39616,68356,0.0,1
54426,93694,0.0,1


##### locdt & loctm 授權日期與時間


In [None]:
def time2val(x):
    """Map an hour-of-day value to a sine wave with a 24-hour period."""
    return np.sin((x/12-1)*np.pi)


def fn(x):
    """Turn an HHMMSS clock value into its cyclic (sine) encoding.

    ``x`` is truncated to an integer, left-padded with zeros to six digits
    and read as hours, minutes and seconds; the fractional hour of day is
    then passed through ``time2val``.
    """
    digits = format(int(x), '06d')
    hours = float(digits[:2])
    minutes = float(digits[2:4])
    seconds = float(digits[4:])
    return time2val(hours + minutes/60 + seconds/3600)

# Sort by time, because transactions have a time dependency.
# The sort must run while `loctm` still holds the raw HHMMSS clock value:
# the sine encoding applied below is not monotonic over the day, so sorting
# on the encoded column would not give chronological order.
df_all = df_all.sort_values(by=['locdt', 'loctm'])

# Replace the raw clock time with its cyclic encoding.
df_all['loctm'] = df_all['loctm'].apply(fn)

In [None]:
df_all['loctm'].describe()

count    1.521787e+06
mean     3.383419e-01
std      6.073060e-01
min     -1.000000e+00
25%     -1.629672e-01
50%      5.391383e-01
75%      8.836976e-01
max      1.000000e+00
Name: loctm, dtype: float64

In [None]:
def Str_turn_time(str1):
    """Return ``str1`` truncated to an integer and left-padded with '0' to six characters."""
    return str(int(str1)).rjust(6, '0')

# df_all['loctm'] already holds the sine-encoded time (values in [-1, 1]),
# so truncating it to an integer would give Hour == 0 for every row.
# Recover the raw HHMMSS value from the untouched train / test frames,
# keyed on txkey (described in this notebook as the unique transaction id).
raw_loctm = pd.concat(
    [df_train[['txkey', 'loctm']], df_test[['txkey', 'loctm']]]
).set_index('txkey')['loctm']
assert raw_loctm.index.is_unique, "txkey must be unique to look up the raw loctm"

df_all['Hour'] = (
    df_all['txkey']
    .map(raw_loctm)
    .apply(lambda x: Str_turn_time(x)[:2])
    .astype(int)
)

# 'Morning' flags transactions between 08:00 and 21:59.
df_all['Morning'] = 0
df_all.loc[(df_all['Hour'] > 7) & (df_all['Hour'] < 22), 'Morning'] = 1

In [None]:
# Bucket the day index into 30-day "months". Bounds are applied from the
# widest to the narrowest so that each later assignment overrides the
# earlier one for smaller day indices.
for upper_bound, month in ((121, 4), (91, 3), (61, 2), (31, 1)):
    df_all.loc[df_all['locdt'] < upper_bound, 'Month'] = month

# Position of the day inside a 7-day cycle.
df_all['Week'] = df_all['locdt'] % 7

In [None]:
# Derive the '<col>_high_risk_fraud' companion column for both calendar features.
for calendar_feat in ('Month', 'Week'):
    identify_high_risk_fraud(df_all, calendar_feat, 15)

##### conam 交易金額



In [None]:
# Fraud rate per distinct transaction amount, lowest rate first.
conam_fraud = df_all.groupby(['conam'])['fraud_ind']
Df = conam_fraud.agg(['mean']).reset_index()
Df = Df.sort_values('mean', ascending = True)
Df

Unnamed: 0,conam,mean
36480,1101.839966,0.0
47891,1476.729980,0.0
47892,1476.770020,0.0
47893,1476.819946,0.0
47894,1476.859985,0.0
...,...,...
69378,2388.560059,1.0
25880,819.710022,1.0
11252,463.230011,1.0
25862,819.309998,1.0


##### txkey 交易唯一序號



In [None]:
# Number of transactions made by the same account.
df_all['Count_txkey_gb_bacno'] =  df_all.groupby(['bacno'])['txkey'].transform('count')

# Number of transactions made by the same account with the same card.
# Stored under its own name so it no longer overwrites the per-account count.
df_all['Count_txkey_gb_bacno_cano'] =  df_all.groupby(['bacno', 'cano'])['txkey'].transform('count')

# Number of transactions made by the same account within the same day and hour.
df_all['Count_txkey_gb_bacno_locdt_Hour'] =  df_all.groupby(['bacno', 'locdt', 'Hour'])['txkey'].transform('count')

##### scity & stocn 消費城市&國別




In [None]:
# Combine city and country into a single categorical key, then label-encode it.
col, col2 = 'scity', 'stocn'
new_col = col + '_' + col2

city_country = df_all[col].astype(str) + '_' + df_all[col2].astype(str)
df_all[new_col] = city_country

le = LabelEncoder()
df_all[new_col] = le.fit_transform(df_all[new_col].astype(str))

In [None]:
df_all.keys()

Index(['acqic', 'bacno', 'cano', 'conam', 'contp', 'csmcu', 'ecfg', 'etymd',
       'flbmk', 'flg_3dsmk', 'fraud_ind', 'hcefg', 'insfg', 'iterm', 'locdt',
       'loctm', 'mcc', 'mchno', 'ovrlt', 'scity', 'stocn', 'stscd', 'txkey',
       'csmcu_f', 'etymd_f', 'mcc_f', 'mchno_f', 'acqic_f', 'bacno_f',
       'cano_f', 'scity_f', 'stocn_f', 'contp_0', 'contp_1', 'contp_2',
       'contp_3', 'contp_4', 'contp_5', 'contp_6', 'flbmk_0', 'flbmk_1',
       'ecfg_0', 'ecfg_1', 'flg_3dsmk_0', 'flg_3dsmk_1', 'hcefg_0', 'hcefg_1',
       'hcefg_2', 'hcefg_3', 'hcefg_5', 'hcefg_6', 'hcefg_7', 'hcefg_8',
       'hcefg_9', 'insfg_0', 'insfg_1', 'ovrlt_0', 'ovrlt_1', 'stscd_0',
       'stscd_1', 'stscd_2', 'stscd_3', 'stscd_4', 'iterm_0', 'iterm_1',
       'iterm_2', 'iterm_3', 'iterm_4', 'iterm_5', 'iterm_6', 'iterm_7',
       'iterm_8', 'csmcu_high_risk_fraud', 'etymd_high_risk_fraud',
       'mcc_high_risk_fraud', 'mchno_high_risk_fraud', 'acqic_high_risk_fraud',
       'bacno_high_risk_fraud', '

#### numerical feature -> StandardScaler
After processing all the features, scale the numerical features so that the models can fit the training data better.

In [None]:
# Columns that are standardised in the next cell.
# This rebinds `num_feats`, replacing the list obtained from datasist earlier.
num_feats = ['acqic','bacno','cano','conam', 'csmcu','hcefg','iterm', 'locdt','mcc',
 'mchno','scity','stocn', 'stscd','txkey','Month','Week'] 

In [None]:
# Standardise the selected numeric columns, overwriting them in df_all.
# NOTE(review): the scaler is fitted on df_all, i.e. on train and test rows
# together — confirm this is intended.
scaler = StandardScaler()
df_all[num_feats] = scaler.fit_transform(df_all[num_feats])

In [None]:
# Train and test data after preprocessing: a purely positional split of df_all.
# NOTE(review): df_all was re-ordered by sort_values(['locdt', 'loctm']) above,
# so the first len(df_train) rows are the earliest rows of the combined data,
# not necessarily the rows that were loaded from train.csv — confirm intended.
df_train = df_all.iloc[:len(df_train)]
# len(df_train) is unchanged by the line above (the slice has the same length).
df_test = df_all.iloc[len(df_train):]

In [None]:
# Time series split

# X = df_train.drop('fraud_ind', axis=1)
# y = df_train['fraud_ind']

# transaction data has time dependency, so instead of normal train test split, here we use TimeSeriesSplit
# tscv = TimeSeriesSplit(n_splits=5)

# We have sorted the data by time (locdt, loctm) during data preprocessing
# Time dependent data, train set is the first 70% of the original data
# X_train, X_test = np.split(X, [int(.7 *len(X))])
# y_train, y_test = np.split(y, [int(.7 *len(y))])

# # For training
# # X_train, X_test, y_train, y_test = TimeSeriesSplit(X, y, test_size=0.25, random_state=0, stratify=y)

# X_train = X_train.values
# X_test = X_test.values
# y_train = y_train.values
# y_test = y_test.values

# # For prediction
# X_predict = df_test.drop('fraud_ind', axis=1).values
# y_answer = df_test['fraud_ind'].values


### Feature selection




In [None]:
# Split the training frame into target and features.
y = df_train['fraud_ind']
X = df_train.drop(columns='fraud_ind')

# The classes are heavily imbalanced, so the majority class is
# under-sampled with NearMiss before fitting the selection models.
sampler = NearMiss(n_jobs=-1)
X_nearmiss, y_nearmiss = sampler.fit_resample(X, y)

In [None]:
# Shapes before (X, y) and after (X_nearmiss, y_nearmiss) under-sampling.
for data in (X, y, X_nearmiss, y_nearmiss):
    print(data.shape)

(1141340, 89)
(1141340,)
(30566, 89)
(30566,)


In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# Tree ensemble whose feature importances drive the threshold-based selection
# in the following cells. A fixed seed keeps the importances, and therefore
# the selected feature lists, reproducible across runs.
clf = ExtraTreesClassifier(n_estimators=500, random_state=42)
clf = clf.fit(X_nearmiss, y_nearmiss)

In [None]:
# Tree-based pruning: keep the features whose importance reaches 0.01.
sel = SelectFromModel(clf, prefit=True, threshold=0.01)
tree_pruned_features_1 = []
for column, keep in zip(X, sel.get_support()):
    if keep:
        tree_pruned_features_1.append(column)

print(f'\n The selected features are {tree_pruned_features_1}:')
print('\n threshold: 0.01')
print(f'\n The number of selected feature {len(tree_pruned_features_1)}:')


 The selected features are ['bacno', 'cano', 'conam', 'csmcu', 'ecfg', 'etymd', 'locdt', 'loctm', 'mcc', 'mchno', 'scity', 'stocn', 'stscd', 'txkey', 'csmcu_f', 'etymd_f', 'mcc_f', 'mchno_f', 'acqic_f', 'bacno_f', 'cano_f', 'scity_f', 'stocn_f', 'ecfg_0', 'ecfg_1', 'stscd_0', 'stscd_2', 'Week', 'Count_txkey_gb_bacno', 'Count_txkey_gb_bacno_locdt_Hour', 'scity_stocn']:

 threshold: 0.01

 The number of selected feature 31:


In [None]:
# Tree-based pruning: keep the features whose importance reaches 0.02.
sel = SelectFromModel(clf, prefit=True, threshold=0.02)
tree_pruned_features_2 = []
for column, keep in zip(X, sel.get_support()):
    if keep:
        tree_pruned_features_2.append(column)

print(f'\n The selected features are {tree_pruned_features_2}:')
print('\n threshold: 0.02')
print(f'\n The number of selected feature {len(tree_pruned_features_2)}:')


 The selected features are ['bacno', 'cano', 'conam', 'locdt', 'loctm', 'stocn', 'stscd', 'txkey', 'csmcu_f', 'mcc_f', 'mchno_f', 'acqic_f', 'bacno_f', 'cano_f', 'scity_f', 'stocn_f', 'ecfg_0', 'ecfg_1', 'stscd_0', 'stscd_2', 'Week', 'Count_txkey_gb_bacno', 'Count_txkey_gb_bacno_locdt_Hour']:

 threshold: 0.02

 The number of selected feature 23:


In [None]:
# Tree-based pruning: keep the features whose importance reaches 0.03.
sel = SelectFromModel(clf, prefit=True, threshold=0.03)
tree_pruned_features_3 = []
for column, keep in zip(X, sel.get_support()):
    if keep:
        tree_pruned_features_3.append(column)

print(f'\n The selected features are {tree_pruned_features_3}:')
print('\n threshold: 0.03')
print(f'\n The number of selected feature {len(tree_pruned_features_3)}:')


 The selected features are ['bacno', 'cano', 'conam', 'loctm', 'txkey', 'csmcu_f', 'mcc_f', 'mchno_f', 'acqic_f', 'bacno_f', 'cano_f', 'stocn_f', 'Count_txkey_gb_bacno', 'Count_txkey_gb_bacno_locdt_Hour']:

 threshold: 0.03

 The number of selected feature 14:


In [None]:
import os

# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs('./dataAfterPruned', exist_ok=True)

# Persist one train / test pair per importance threshold. Each file holds
# the selected feature columns followed by the target column.
tree_feature_sets = (
    ('tree1', tree_pruned_features_1),
    ('tree2', tree_pruned_features_2),
    ('tree3', tree_pruned_features_3),
)
for tag, features in tree_feature_sets:
    kept_columns = list(features) + ['fraud_ind']
    temp_df_train = df_train[kept_columns]
    temp_df_test = df_test[kept_columns]

    temp_df_train.to_csv(f'./dataAfterPruned/{tag}_train.csv', index=False)
    temp_df_test.to_csv(f'./dataAfterPruned/{tag}_test.csv', index=False)

In [None]:
# This block takes too long to executed.
# To get the dataFrame after feature selection, please use gdown to download the csv file in the next block.

# Feature ranking with recursive feature elimination and cross-validated selection of the best number of features.
# The shuffled folds and the booster are seeded so that repeated runs select
# the same features.
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
xgb = XGBClassifier(n_estimators=600, n_jobs=-1, objective='binary:hinge', random_state=42)
# One feature is removed per step; candidate subsets are compared by F1 score.
rfecv = RFECV(estimator=xgb, step=1, cv=skf, verbose=3, n_jobs = -1, scoring = 'f1')

rfecv.fit(X_nearmiss, y_nearmiss)

Fitting estimator with 73 features.
Fitting estimator with 72 features.
Fitting estimator with 71 features.
Fitting estimator with 70 features.
Fitting estimator with 69 features.
Fitting estimator with 68 features.
Fitting estimator with 67 features.
Fitting estimator with 66 features.
Fitting estimator with 65 features.
Fitting estimator with 64 features.
Fitting estimator with 63 features.
Fitting estimator with 62 features.
Fitting estimator with 61 features.
Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator with 56 features.
Fitting estimator with 55 features.
Fitting estimator with 54 features.
Fitting estimator with 53 features.
Fitting estimator with 52 features.
Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 fe

RFECV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=True),
      estimator=XGBClassifier(base_score=None, booster=None,
                              colsample_bylevel=None, colsample_bynode=None,
                              colsample_bytree=None, gamma=None, gpu_id=None,
                              importance_type='gain',
                              interaction_constraints=None, learning_rate=None,
                              max_delta_step=None, max_depth=None,
                              min_child_weight=None, missing=nan,
                              monotone_constraints=None, n_estimators=600,
                              n_jobs=-1, num_parallel_tree=None,
                              objective='binary:hinge', random_state=None,
                              reg_alpha=None, reg_lambda=None,
                              scale_pos_weight=None, subsample=None,
                              tree_method=None, validate_parameters=None,
                         

In [None]:
# Names of the columns that RFECV marked as selected.
RFECV_pruned_features = []
for column, keep in zip(X, rfecv.support_):
    if keep:
        RFECV_pruned_features.append(column)

print(f'\n The selected features are {RFECV_pruned_features}:')
print(f'\n The number of selected feature {len(RFECV_pruned_features)}:')


 The selected features are ['acqic', 'bacno', 'cano', 'conam', 'contp', 'csmcu', 'ecfg', 'etymd', 'flg_3dsmk', 'hcefg', 'locdt', 'loctm', 'mcc', 'mchno', 'ovrlt', 'scity', 'stocn', 'stscd', 'txkey', 'csmcu_f', 'etymd_f', 'mcc_f', 'mchno_f', 'acqic_f', 'bacno_f', 'cano_f', 'scity_f', 'stocn_f', 'hcefg_5', 'scity_stocn']:

 The number of selected feature 30:


In [None]:
import os

# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs('./dataAfterPruned', exist_ok=True)

# Selected feature columns followed by the target column.
rfecv_columns = list(RFECV_pruned_features) + ['fraud_ind']
temp_df_train = df_train[rfecv_columns]
temp_df_test = df_test[rfecv_columns]

temp_df_train.to_csv('./dataAfterPruned/RFECV_train.csv', index=False)
temp_df_test.to_csv('./dataAfterPruned/RFECV_test.csv', index=False)

In [None]:
# Reload every pruned train / test pair from disk (RFECV first, then the
# three tree-importance thresholds).
df_train_RFECV, df_test_RFECV = (
    pd.read_csv('./dataAfterPruned/RFECV_train.csv'),
    pd.read_csv('./dataAfterPruned/RFECV_test.csv'),
)

df_train_tree1, df_test_tree1 = (
    pd.read_csv('./dataAfterPruned/tree1_train.csv'),
    pd.read_csv('./dataAfterPruned/tree1_test.csv'),
)

df_train_tree2, df_test_tree2 = (
    pd.read_csv('./dataAfterPruned/tree2_train.csv'),
    pd.read_csv('./dataAfterPruned/tree2_test.csv'),
)

df_train_tree3, df_test_tree3 = (
    pd.read_csv('./dataAfterPruned/tree3_train.csv'),
    pd.read_csv('./dataAfterPruned/tree3_test.csv'),
)

In [None]:
# Downcast the processed frames as well; feature engineering added wide columns.
for frame in (df_train, df_test):
    reduce_mem(frame)

Memory usage of properties dataframe is : 269.9398498535156  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  109.93525314331055  MB
This is  40.72583325617449 % of the initial size
Memory usage of properties dataframe is : 89.98011016845703  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  36.64519786834717  MB
This is  40.72588686515447 % of the initial size


#### Compare datasets performance on basic classification model


In [None]:
def evaluateDataset(dataset, model, name, n_splits=5, random_state=None):
    """Cross-validate `model` on `dataset` and print the mean of each metric.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a 'fraud_ind' target column; every other column is used
        as a feature.
    model : estimator
        Object exposing fit / predict / score. fit() is called again on every fold.
    name : str
        Label printed in the report header.
    n_splits : int, default 5
        Number of stratified folds.
    random_state : int or None, default None
        Seed for the fold shuffling. None keeps the previous unseeded behaviour;
        pass an int to make the report reproducible.

    Returns
    -------
    dict
        Metric name -> mean value across the folds (also printed).
    """
    X = dataset.drop('fraud_ind', axis=1).values
    y = dataset['fraud_ind'].values

    fold_scores = {
        metric: []
        for metric in ('accuracy', 'precision', 'recall', 'f1',
                       'f1_micro', 'f1_macro', 'f1_weighted', 'auc')
    }

    skf = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=True)

    for train_index, test_index in skf.split(X, y):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        fold_scores['accuracy'].append(model.score(X_test, y_test))
        fold_scores['precision'].append(precision_score(y_test, y_pred))
        fold_scores['recall'].append(recall_score(y_test, y_pred))
        fold_scores['f1'].append(f1_score(y_test, y_pred))
        fold_scores['f1_micro'].append(f1_score(y_test, y_pred, average='micro'))
        fold_scores['f1_macro'].append(f1_score(y_test, y_pred, average='macro'))
        fold_scores['f1_weighted'].append(f1_score(y_test, y_pred, average='weighted'))
        # AUC is computed from the hard class predictions, not from probabilities.
        fold_scores['auc'].append(roc_auc_score(y_test, y_pred))

    mean_scores = {metric: np.mean(values) for metric, values in fold_scores.items()}

    print(f'Dataset: {name}')
    print('---' * 45)
    print("accuracy: {}".format(mean_scores['accuracy']))
    print("precision: {}".format(mean_scores['precision']))
    print("recall: {}".format(mean_scores['recall']))
    print("f1: {}".format(mean_scores['f1']))
    print("f1 micro: {}".format(mean_scores['f1_micro']))
    print("f1 macro : {}".format(mean_scores['f1_macro']))
    print("f1 weighted: {}".format(mean_scores['f1_weighted']))
    # Report the AUC as well: it was collected per fold but never shown.
    print("auc: {}".format(mean_scores['auc']))
    print('---' * 45)

    return mean_scores

In [None]:
# Candidate feature-selected training sets, paired by position with their labels.
name = ['RFECV', 'tree1', 'tree2', 'tree3']
dataSets = [
    df_train_RFECV,
    df_train_tree1,
    df_train_tree2,
    df_train_tree3,
]

# Baseline XGBoost classifier with default hyperparameters, used to compare the datasets.
xgb = XGBClassifier(
    verbosity=0,
    n_jobs=-1,
    objective='binary:hinge',
)

In [None]:
# Evaluate the baseline model on every candidate dataset.
# The loop variable must not be called `name`: that would rebind the `name` list
# to its last label, and re-running this cell would then iterate over the
# characters of that string instead of the four labels.
for dataset, dataset_name in zip(dataSets, name):
    evaluateDataset(dataset, xgb, dataset_name)

Dataset: RFECV
---------------------------------------------------------------------------------------------------------------------------------------
accuracy: 0.9923475914276201
precision: 0.8410558998861773
recall: 0.5282982195185026
f1: 0.6489091904446447
f1 micro: 0.9923475914276201
f1 macro : 0.8225204131786793
f1 weighted: 0.9914822013498407
---------------------------------------------------------------------------------------------------------------------------------------
Dataset: tree1
---------------------------------------------------------------------------------------------------------------------------------------
accuracy: 0.9931965934778418
precision: 0.8636445665793673
recall: 0.5842429699582283
f1: 0.6969342986034315
f1 micro: 0.9931965934778418
f1 macro : 0.8467469890497303
f1 weighted: 0.9925475816021416
---------------------------------------------------------------------------------------------------------------------------------------
Dataset: tree2
-----------

In [None]:
# tree2 has the highest f1-score (0.6979911044445455),
# so the tree2 dataset is used for hyperparameter tuning.

## Parameter tuning


This dataset is highly imbalanced. To cope with that, we first under-sample the training data and tune the hyperparameters on the under-sampled set; the best hyperparameters found are then used to fit the model on the training data and make predictions on the test set.

In [None]:
# df_train = pd.read_csv('./dataAfterPruned/tree2_train.csv')
# df_test = pd.read_csv('./dataAfterPruned/tree2_test.csv')

# Inorder to start right here, down below is the gdown link to the dataset - tree2, the one we just clean up and went through feature selection.
!pip3 install gdown
import gdown

f1id = '1PGcWwSgWJKc6CJXU3R9y3_AsYObrUT0C'
f2id = '1GiuDe0BTaq8yISEJgWdnTnHhTP_v_1bM'

url = 'https://drive.google.com/uc?id=%s'%(f1id)
output = 'train.csv'
gdown.download(url, output, quiet=False)
url = 'https://drive.google.com/uc?id=%s'%(f2id)
output = 'test.csv'
gdown.download(url, output, quiet=False)



Downloading...
From: https://drive.google.com/uc?id=1PGcWwSgWJKc6CJXU3R9y3_AsYObrUT0C
To: /content/train.csv
408MB [00:02, 166MB/s]
Downloading...
From: https://drive.google.com/uc?id=1GiuDe0BTaq8yISEJgWdnTnHhTP_v_1bM
To: /content/test.csv
136MB [00:01, 97.6MB/s]


'test.csv'

In [None]:
# Load the train/test CSV files from the working directory.
df_train, df_test = (
    pd.read_csv('./train.csv'),
    pd.read_csv('./test.csv'),
)

In [None]:
# Split the training frame into the target and the feature columns.
y = df_train['fraud_ind']
X = df_train.drop('fraud_ind', axis=1)

# The dataset is imbalanced, so under-sample it with NearMiss to get a better
# training result from the models.
near_miss_sampler = NearMiss(n_jobs=-1)
X_nearmiss, y_nearmiss = near_miss_sampler.fit_resample(X, y)

In [None]:
# Compare the under-sampled shapes with the original ones.
for resampled_or_original in (X_nearmiss, y_nearmiss, X, y):
    print(resampled_or_original.shape)

(30566, 23)
(30566,)
(1141340, 23)
(1141340,)


### hyperOpt



In [None]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

# Hyperopt search space for the XGBoost classifier: the learning rate is sampled
# from a continuous range, every other parameter from a short list of options.
space = {
    'learning_rate': hp.uniform('learning_rate', 0.001, 0.5),
    'max_depth': hp.choice('max_depth', [1, 3, 5, 9]),
    'colsample_bytree': hp.choice('colsample_bytree', [0.1, 0.7, 1.0]),
    'subsample': hp.choice('subsample', [0.1, 0.5, 1.0]),
    'min_child_weight': hp.choice('min_child_weight', [0.1, 0.5, 1.0]),
    'n_estimators': hp.choice('n_estimators', [600, 1600, 2100]),
}

def objective(space):
    """Hyperopt objective: mean 3-fold F1 of an XGBoost model on the NearMiss sample.

    Parameters
    ----------
    space : dict
        One sampled point of the search space (hyperparameter name -> value).

    Returns
    -------
    dict
        The result dict hyperopt expects: 'loss' is the negated mean F1 (the
        search minimises the loss while we want to maximise F1) and 'status'
        is STATUS_OK.
    """
    candidate = XGBClassifier(
        objective='binary:hinge',
        learning_rate=space['learning_rate'],
        max_depth=space['max_depth'],
        min_child_weight=space['min_child_weight'],
        n_estimators=space['n_estimators'],
        subsample=space['subsample'],
        colsample_bytree=space['colsample_bytree'],
    )

    fold_f1 = cross_val_score(
        candidate, X_nearmiss, y_nearmiss, cv=3, scoring='f1', n_jobs=-1
    )
    mean_f1 = fold_f1.mean()

    return {'status': STATUS_OK, 'loss': -mean_f1}
    
from hyperopt import space_eval

# Keep a handle on the Trials object so the search history stays inspectable
# after the run instead of being thrown away.
trials = Trials()

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)

# For hp.choice parameters fmin reports the *index* of the chosen option, not
# the option itself; space_eval maps the result back to real hyperparameter
# values that can be passed straight to XGBClassifier.
best_params = space_eval(space, best)

100%|██████████| 50/50 [39:06<00:00, 46.93s/trial, best loss: -0.9221965433460144]


In [None]:
# Show the raw fmin result. For the hp.choice parameters the number shown is the
# index into that parameter's option list (e.g. 'n_estimators': 0), not the
# hyperparameter value itself.
# The commented dicts below are the results recorded from earlier runs.
best

# 20 iters

# {'colsample_bytree': 3,
#  'learning_rate': 0.05315556869068875,
#  'max_depth': 2,
#  'min_child_weight': 0,
#  'n_estimators': 0,
#  'subsample': 3}

# 30 iters
# {'colsample_bytree': 0,
#  'learning_rate': 0.05437163768633186,
#  'max_depth': 3,
#  'min_child_weight': 2,
#  'n_estimators': 0,
#  'subsample': 1}

# 50iters
# {'colsample_bytree': 0,
#  'learning_rate': 0.1282598147882257,
#  'max_depth': 2,
#  'min_child_weight': 1,
#  'n_estimators': 1,
#  'subsample': 2}

{'colsample_bytree': 1,
 'learning_rate': 0.029600019920710423,
 'max_depth': 3,
 'min_child_weight': 2,
 'n_estimators': 0,
 'subsample': 1}

In [None]:
# XGBoost model with hand-copied hyperparameters (the learning rate matches the
# recorded 20-evaluation hyperopt result), trained and scored with the
# notebook's basicTrainAndTest helper.
hyperOPT_xgb_model = XGBClassifier(
    n_estimators=600,
    learning_rate=0.05315556869068875,
    max_depth=9,
    min_child_weight=0.2,
    subsample=1,
    colsample_bytree=1,
    objective='binary:hinge',
    n_jobs=-1,
)

basicTrainAndTest(df_train, df_test, hyperOPT_xgb_model, "hyperOPT_xgb_model")

# for 20 iters
# Before

# Dataset: tree2
# ---------------------------------------------------------------------------------------------------------------------------------------
# accuracy: 0.9932088597613331
# precision: 0.8627229751222109
# recall: 0.5861425883775457
# f1: 0.6979911044445455
# f1 micro: 0.9932088597613331
# f1 macro : 0.8472784606458198
# f1 weighted: 0.9925677730156321
    
# After

# Dataset: hyperOPT_xgb_model
# ---------------------------------------------------------------------------------------------------------------------------------------
# Accuracy: 0.9943724092974843
# Precision: 0.8843430369787569
# Reacall: 0.6648264984227129
# f1-score: 0.7590320765334835
# micro f1-score: 0.9943724092974843
# macro f1-score: 0.87809251806951
# weighted f1-score: 0.9939784067271029
# AUC: 0.8318258366239439

# f1 score improves from 0.6979911044445455 -> 0.7590320765334835 with 20 iter of tuning

In [None]:
# XGBoost model with the hyperparameters of the recorded 30-evaluation hyperopt
# result, trained and scored with the notebook's basicTrainAndTest helper.
hyperOPT_xgb_model_30iters = XGBClassifier(
    n_estimators=600,
    learning_rate=0.05437163768633186,
    max_depth=9,
    min_child_weight=1.0,
    subsample=0.5,
    colsample_bytree=0.1,
    objective='binary:hinge',
    n_jobs=-1,
)

basicTrainAndTest(df_train, df_test, hyperOPT_xgb_model_30iters, "hyperOPT_xgb_model_30iters")
# Before

# Dataset: tree2
# ---------------------------------------------------------------------------------------------------------------------------------------
# Accuracy: 0.9934051260753798
# Precision: 0.8685648547598505
# Reacall: 0.5954258675078864
# f1-score: 0.6979911044445455
# micro f1-score: 0.9934051260753798
# macro f1-score: 0.8515902378613556
# weighted f1-score: 0.992796908911799
# AUC: 0.7971042091452186

# After 

# Dataset: hyperOPT_xgb_model_30iters
# ---------------------------------------------------------------------------------------------------------------------------------------
# Accuracy: 0.9931817046789697
# Precision: 0.8963531669865643
# Reacall: 0.5524447949526814
# f1-score: 0.683581361307636
# micro f1-score: 0.9931817046789697
# macro f1-score: 0.8400675414316088
# weighted f1-score: 0.9923812722757145
# AUC: 0.7757908290447723
# ---------------------------------------------------------------------------------------------------------------------------------------

# f1 score drops from 0.6979911044445455 -> 0.683581361307636 with 30 iter of tuning
# --> overfitting?

Dataset: hyperOPT_xgb_model_30iters
---------------------------------------------------------------------------------------------------------------------------------------
Accuracy: 0.9931817046789697
Precision: 0.8963531669865643
Reacall: 0.5524447949526814
f1-score: 0.683581361307636
micro f1-score: 0.9931817046789697
macro f1-score: 0.8400675414316088
weighted f1-score: 0.9923812722757145
AUC: 0.7757908290447723
---------------------------------------------------------------------------------------------------------------------------------------


In [None]:
# XGBoost model with hyperparameters copied by hand from the recorded
# 50-evaluation hyperopt result, trained and scored with basicTrainAndTest.
# NOTE(review): that recorded result has 'n_estimators': 1, which is index 1 of
# the current option list [600, 1600, 2100], i.e. 1600, yet 600 is used here —
# confirm whether the search space differed at the time or the value was
# mistranscribed.
hyperOPT_xgb_model_50iters = XGBClassifier(
    n_estimators=600,
    learning_rate=0.1282598147882257,
    max_depth=5,
    min_child_weight=0.5,
    subsample=1,
    colsample_bytree=0.1,
    objective='binary:hinge',
    n_jobs=-1,
)

basicTrainAndTest(df_train, df_test, hyperOPT_xgb_model_50iters, "hyperOPT_xgb_model_50iters")

# for 50 iters
# Before

# Dataset: tree2
# ---------------------------------------------------------------------------------------------------------------------------------------
# accuracy: 0.9932088597613331
# precision: 0.8627229751222109
# recall: 0.5861425883775457
# f1: 0.6979911044445455
# f1 micro: 0.9932088597613331
# f1 macro : 0.8472784606458198
# f1 weighted: 0.9925677730156321
    
# After

# Accuracy: 0.9925981805612871
# Precision: 0.8857729138166894
# Reacall: 0.5106466876971609
# f1-score: 0.647823911955978
# micro f1-score: 0.9925981805612871
# macro f1-score: 0.8220418487390271
# weighted f1-score: 0.9916145478129411
# AUC: 0.754878455403692

# f1 score drops from 0.6979911044445455 -> 0.647823911955978 with 50 iter of tuning
# ---> overfitting?

### optuna

In [None]:
# ! pip install optuna
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

In [None]:
def objective(trial, X, y):
    """Optuna objective: hold-out F1 of one sampled XGBoost configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    X : pandas.DataFrame or array-like
        Feature matrix.
    y : pandas.Series or array-like
        Binary target, also used to stratify the split.

    Returns
    -------
    float
        F1 score on the 25% stratified hold-out split of a model fitted on the
        remaining 75% (the study maximises this value).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=0, stratify=y
    )

    # np.asarray accepts pandas objects as well as plain arrays.
    X_train, X_test = np.asarray(X_train), np.asarray(X_test)
    y_train, y_test = np.asarray(y_train), np.asarray(y_test)

    param = {
        "verbosity": 0,
        "objective": "binary:hinge",
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 0.7, log=True),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "n_estimators": trial.suggest_int("n_estimators", 600, 2000),
        # suggest_float(log=True) replaces the deprecated suggest_loguniform.
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.5, log=True),
        "n_jobs": -1,
    }

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    # Build the model only once `param` is complete; constructing it earlier
    # would leave the booster-specific settings above out of the model.
    model = XGBClassifier(**param)
    model.fit(X_train, y_train)

    # Score the fitted model on the untouched hold-out split. (cross_val_score
    # would refit clones on the hold-out data and ignore the fit above.)
    return f1_score(y_test, model.predict(X_test))

In [None]:
# Run the Optuna search on the under-sampled data and report the best trial.
# The sampler is seeded so that a re-run proposes the same sequence of trials.
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
study.optimize(lambda trial: objective(trial, X_nearmiss, y_nearmiss), n_trials=15)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print('Best trial: score {},\nparams {}'.format(study.best_trial.value, study.best_trial.params))
print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))
    

[32m[I 2021-06-10 21:48:29,257][0m A new study created in memory with name: no-name-35543f3c-1b86-4a78-bea4-30186e80955f[0m
[32m[I 2021-06-10 21:48:33,737][0m Trial 0 finished with value: 0.8097690607852991 and parameters: {'booster': 'gblinear', 'lambda': 2.339781663462407e-06, 'alpha': 9.892276268781123e-08, 'subsample': 0.39676370914327214, 'colsample_bytree': 0.6457274902471277, 'n_estimators': 1092, 'learning_rate': 0.0295918571703849}. Best is trial 0 with value: 0.8097690607852991.[0m
[32m[I 2021-06-10 22:09:08,545][0m Trial 1 finished with value: 0.8618510915385216 and parameters: {'booster': 'dart', 'lambda': 5.785692168600777e-05, 'alpha': 3.097851996028351e-05, 'subsample': 0.25794216759352406, 'colsample_bytree': 0.7656156805941858, 'n_estimators': 1070, 'learning_rate': 0.4181695916202503, 'max_depth': 7, 'min_child_weight': 7, 'gamma': 0.008943620709237751, 'grow_policy': 'lossguide', 'sample_type': 'weighted', 'normalize_type': 'tree', 'rate_drop': 1.939700493135

Number of finished trials:  15
Best trial:
Best trial: score 0.90332394803699,
params {'booster': 'dart', 'lambda': 1.313584992794318e-08, 'alpha': 0.03520268400768686, 'subsample': 0.5451488283528434, 'colsample_bytree': 0.9794897257170234, 'n_estimators': 1687, 'learning_rate': 0.01364832787281915, 'max_depth': 3, 'min_child_weight': 10, 'gamma': 0.41976538712494593, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'normalize_type': 'forest', 'rate_drop': 0.49776567113417985, 'skip_drop': 0.5573682745996533}
  Value: 0.90332394803699
  Params: 
    booster: dart
    lambda: 1.313584992794318e-08
    alpha: 0.03520268400768686
    subsample: 0.5451488283528434
    colsample_bytree: 0.9794897257170234
    n_estimators: 1687
    learning_rate: 0.01364832787281915
    max_depth: 3
    min_child_weight: 10
    gamma: 0.41976538712494593
    grow_policy: lossguide
    sample_type: uniform
    normalize_type: forest
    rate_drop: 0.49776567113417985
    skip_drop: 0.5573682745996533


In [None]:
# XGBoost model rebuilt from the best parameters printed by the 15-trial Optuna
# study, then trained and scored with the notebook's basicTrainAndTest helper.
# The regularisation weights use the scikit-learn wrapper's own argument names
# (reg_lambda / reg_alpha) instead of mixing one of them with the raw booster
# alias `alpha`.
Optuna_model_15iters = XGBClassifier(
    objective='binary:hinge',
    n_jobs=4,
    booster='dart',
    reg_lambda=1.313584992794318e-08,
    reg_alpha=0.03520268400768686,
    subsample=0.5451488283528434,
    colsample_bytree=0.9794897257170234,
    n_estimators=1687,
    learning_rate=0.01364832787281915,
    max_depth=3,
    min_child_weight=10,
    gamma=0.41976538712494593,
    grow_policy='lossguide',
    sample_type='uniform',
    normalize_type='forest',
    rate_drop=0.49776567113417985,
    skip_drop=0.5573682745996533)

basicTrainAndTest(df_train, df_test, Optuna_model_15iters, "Optuna_model_15iters")

# f1 score improves from 0.6979911044445455 -> 0.74937609384564835 with 15 iter of tuning

In [None]:
# Display the Study object's repr.
study

<optuna.study.Study at 0x7f9fe8040a00>

In [None]:
import joblib
# Serialise the study to ./study.pkl; the cell displays the list of written file names.
joblib.dump(study, './study.pkl')

['./study.pkl']

In [None]:
# Reload the study saved by the previous cell. joblib.load unpickles the file,
# so it should only be used on files from a trusted source.
study = joblib.load('./study.pkl')
# NOTE(review): this line was annotated "error", but the recorded output below
# shows the list of FrozenTrial objects — confirm whether the problem still occurs.
study.trials # error

[FrozenTrial(number=0, values=[0.8097690607852991], datetime_start=datetime.datetime(2021, 6, 10, 21, 48, 29, 258904), datetime_complete=datetime.datetime(2021, 6, 10, 21, 48, 33, 737060), params={'booster': 'gblinear', 'lambda': 2.339781663462407e-06, 'alpha': 9.892276268781123e-08, 'subsample': 0.39676370914327214, 'colsample_bytree': 0.6457274902471277, 'n_estimators': 1092, 'learning_rate': 0.0295918571703849}, distributions={'booster': CategoricalDistribution(choices=('gbtree', 'gblinear', 'dart')), 'lambda': LogUniformDistribution(high=1.0, low=1e-08), 'alpha': LogUniformDistribution(high=1.0, low=1e-08), 'subsample': LogUniformDistribution(high=0.7, low=0.2), 'colsample_bytree': UniformDistribution(high=1.0, low=0.5), 'n_estimators': IntUniformDistribution(high=2000, low=600, step=1), 'learning_rate': LogUniformDistribution(high=0.5, low=0.005)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=0, state=TrialState.COMPLETE, value=None),
 FrozenTrial(number=1, val

### tpot


In [None]:
# Hold-out labels and features as NumPy arrays.
y_test = df_test['fraud_ind'].values
X_test = df_test.drop('fraud_ind', axis=1).values

In [None]:
# !pip install tpot

# generic optimization
from tpot import TPOTClassifier

# Search for a pipeline on the under-sampled training data, optimising F1.
pipeline_optimizer = TPOTClassifier(generations=8, population_size=100, cv=5,
                                    random_state=42, verbosity=2, early_stop= 12,
                                    scoring = 'f1',  n_jobs=-2)

pipeline_optimizer.fit(X_nearmiss, y_nearmiss)
# Print the hold-out score: score() is not the last expression of the cell, so
# its return value would otherwise be silently discarded.
print(pipeline_optimizer.score(X_test, y_test))
pipeline_optimizer.export('./tpot_exported_pipeline.py')

HBox(children=(HTML(value='Optimization Progress'), FloatProgress(value=0.0, max=900.0), HTML(value='')))


Generation 1 - Current best internal CV score: 0.9233572057518289

Generation 2 - Current best internal CV score: 0.9235482919988012

Generation 3 - Current best internal CV score: 0.9235749460135297

Generation 4 - Current best internal CV score: 0.9248122560501809

Generation 5 - Current best internal CV score: 0.9248122560501809

Generation 6 - Current best internal CV score: 0.9248122560501809

Generation 7 - Current best internal CV score: 0.9248122560501809


TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: ExtraTreesClassifier(BernoulliNB(MinMaxScaler(RFE(input_matrix, criterion=gini, max_features=0.5, n_estimators=100, step=0.7000000000000001)), alpha=1.0, fit_prior=False), bootstrap=False, criterion=entropy, max_features=0.9000000000000001, min_samples_leaf=1, min_samples_split=2, n_estimators=100)


In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# ExtraTrees classifier configured like the final step of the best pipeline TPOT
# reported. That pipeline also contained RFE, MinMaxScaler and BernoulliNB steps
# in front of it; only the ExtraTrees step is reproduced here.
TpotClf = ExtraTreesClassifier(
    n_estimators=100,
    criterion="entropy",
    max_features=0.9000000000000001,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=False,
)

In [None]:
# Train the TPOT-derived classifier on the training frame and score it on the
# test frame with the notebook's basicTrainAndTest helper.
basicTrainAndTest(df_train, df_test, TpotClf, "TpotClf")

# Recorded result (every line is commented out, including the header line,
# which would otherwise be executed as a variable annotation):
# Dataset: TpotClf
# ---------------------------------------------------------------------------------------------------------------------------------------
# Accuracy: 0.995644597013513
# Precision: 0.9093742507791897
# Reacall: 0.7478312302839116
# f1-score: 0.8207292004760359
# micro f1-score: 0.995644597013513
# macro f1-score: 0.9092623599695955
# weighted f1-score: 0.9954349268710131
# AUC: 0.8734121186384594
# ---------------------------------------------------------------------------------------------------------------------------------------

Dataset: TpotClf
---------------------------------------------------------------------------------------------------------------------------------------
Accuracy: 0.995644597013513
Precision: 0.9093742507791897
Reacall: 0.7478312302839116
f1-score: 0.8207292004760359
micro f1-score: 0.995644597013513
macro f1-score: 0.9092623599695955
weighted f1-score: 0.9954349268710131
AUC: 0.8734121186384594
---------------------------------------------------------------------------------------------------------------------------------------


In [None]:

# Second TPOT search with more generations and a larger population.
pipeline_optimizer2 = TPOTClassifier(generations=10, population_size=200, cv=5,
                                    random_state=42, verbosity=2, early_stop= 12,
                                    scoring = 'f1',  n_jobs=-2)

pipeline_optimizer2.fit(X_nearmiss, y_nearmiss)
# Print the hold-out score: score() is not the last expression of the cell, so
# its return value would otherwise be silently discarded.
print(pipeline_optimizer2.score(X_test, y_test))
pipeline_optimizer2.export('./tpot_exported_pipeline2.py')

## Predict & Evaluation



In [None]:
# Use the TPOT-derived ExtraTrees classifier as the final model: it has the
# highest recorded test f1-score (0.82) of the models tried above.
bestmodel = TpotClf

In [None]:
# Training features/labels as NumPy arrays (this rebinds the names X and y).
y = df_train['fraud_ind'].values
X = df_train.drop('fraud_ind', axis=1).values

# For prediction
y_test = df_test['fraud_ind'].values
X_test = df_test.drop('fraud_ind', axis=1).values


In [None]:
# Fit the selected model on the full training set and report its metrics on the
# test set. `bestmodel` is used explicitly: no top-level `model` is defined in
# this part of the notebook, so `model` only worked if it leaked from an
# earlier session.
bestmodel.fit(X, y)
y_pred = bestmodel.predict(X_test)
print('---' * 45)
print(f'Accuracy: {bestmodel.score(X_test, y_test)}')
print(f'Precision: {precision_score(y_test, y_pred)}')
print(f'Reacall: {recall_score(y_test, y_pred)}')
print(f'f1-score: {f1_score(y_test, y_pred)}')
print(f"micro f1-score: {f1_score(y_test, y_pred, average='micro')}")
print(f"macro f1-score: {f1_score(y_test, y_pred, average='macro')}")
print(f"weighted f1-score: {f1_score(y_test, y_pred, average='weighted')}")
# AUC is computed from the hard class predictions, not from probabilities.
print(f'AUC: {roc_auc_score(y_test, y_pred)}')
print('---' * 45)