# **Fraud Detection**

## PreProcessing

### Get Data

#### Download Dataset

In [None]:
f1id = '1SvJ_IiHr-ndJDG_sBf6NCn0lMKUxPIlf'
f2id = '1lZPv46zul32Xbr1qHES66YRzMa-A7MzB'

!pip3 install gdown
import gdown
url = 'https://drive.google.com/uc?id=%s'%(f1id)
output = 'train.csv'
gdown.download(url, output, quiet=False)
url = 'https://drive.google.com/uc?id=%s'%(f2id)
output = 'test.csv'
gdown.download(url, output, quiet=False)



Downloading...
From: https://drive.google.com/uc?id=1SvJ_IiHr-ndJDG_sBf6NCn0lMKUxPIlf
To: /content/train.csv
132MB [00:01, 132MB/s]
Downloading...
From: https://drive.google.com/uc?id=1lZPv46zul32Xbr1qHES66YRzMa-A7MzB
To: /content/test.csv
36.2MB [00:00, 125MB/s] 


'test.csv'

#### Import package

In [None]:
# Basic lib

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches

# Preprocess
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
!pip install autoviz datasist pandas_profiling 
from autoviz.AutoViz_Class import AutoViz_Class


from pandas_profiling import ProfileReport
import datasist as ds 
from sklearn.feature_selection import RFECV

# Classifier Libraries
from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
import lightgbm as lgb

# Performance Evaluation tools
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report


# imblearn
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.metrics import classification_report_imbalanced


# PCA
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD


# Utils
import time
import gc
from collections import Counter
import warnings
warnings.filterwarnings("ignore")


Collecting autoviz
  Downloading https://files.pythonhosted.org/packages/89/20/8c8c64d5221cfcbc54679f4f048a08292a16dbad178af7c78541aa3af730/autoviz-0.0.81-py3-none-any.whl
Collecting datasist
  Downloading https://files.pythonhosted.org/packages/af/da/ed4f7d38947c32aaf1f70630413a5ee83a4d8c520add349f8d53c878bce1/datasist-1.5.3-py3-none-any.whl
Installing collected packages: autoviz, datasist
Successfully installed autoviz-0.0.81 datasist-1.5.3
Imported AutoViz_Class version: 0.0.81. Call using:
    from autoviz.AutoViz_Class import AutoViz_Class
    AV = AutoViz_Class()
    AV.AutoViz(filename, sep=',', depVar='', dfte=None, header=0, verbose=0,
                            lowess=False,chart_format='svg',max_rows_analyzed=150000,max_cols_analyzed=30)
Note: verbose=0 or 1 generates charts and displays them in your local Jupyter notebook.
      verbose=2 saves plots in your local machine under AutoViz_Plots directory and does not display charts.


#### Read Data

In [None]:
# Load the raw files downloaded above.
df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test.csv')
# Stack train and test row-wise. `axis` is passed by keyword because pandas 2.0
# made every concat argument after `objs` keyword-only.
df_all = pd.concat((df_train, df_test), axis=0)

# Human-readable labels attached to each frame.
df_train.name = 'Training Set'
df_test.name = 'Test Set'
df_all.name = 'All Set'

dfs = [df_train, df_test]

In [None]:
# Due to RAM limit on colab, need to reduce memory usage
def reduce_mem(df):
    """Downcast the numeric columns of ``df`` in place and print the memory saved.

    Signed-integer columns are cast to the narrowest of int8/int16/int32 whose
    range (bounds inclusive) holds the column's observed min and max; a column
    is never widened. float64 columns are cast to float32, which keeps roughly
    seven significant digits. Every other column, including pandas extension
    dtypes, is left untouched.

    Args:
        df: pandas DataFrame to shrink; it is modified in place.

    Returns:
        None. The before/after footprint is printed.
    """
    start_mem_usg = df.memory_usage().sum() / (1024*1024)
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")

    for col in df.columns:
        dtype = df[col].dtype
        # Check the dtype kind rather than `dtype == int`, which only matches the
        # platform's default integer width and would skip int32/int16 columns.
        if not isinstance(dtype, np.dtype):
            continue
        if dtype.kind == 'i':
            col_min = df[col].min()
            col_max = df[col].max()
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                if limits.min <= col_min and col_max <= limits.max:
                    # Only cast when it actually shrinks the column.
                    if np.dtype(candidate).itemsize < dtype.itemsize:
                        df[col] = df[col].astype(candidate)
                    break
        elif dtype == np.float64:
            df[col] = df[col].astype(np.float32)

    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")



In [None]:
# Shrink both raw frames in place, with a rule between the two memory reports.
for position, frame in enumerate((df_train, df_test)):
    if position:
        print("---"*10)
    reduce_mem(frame)

#### Observe data


In [None]:
# Report the row and column counts of the training and test sets (datasist helper).
ds.structdata.check_train_test_set(df_train, df_test, index=None, col=None)


There are 1521787 training rows and 421665 test rows.
There are 23 training columns and 22 test columns.


In [None]:
# Print datasist's overview of the training set: sample rows, shape, dtypes,
# summary statistics, categorical summaries and missing-value counts.
ds.structdata.describe(df_train)

First five data points


Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,fraud_ind,hcefg,insfg,iterm,locdt,loctm,mcc,mchno,ovrlt,scity,stocn,stscd,txkey
0,6881,113261,38038,513.799988,5,0,N,0,N,N,0,5,N,0,33,172652.0,457,59333,N,0,102,0,516056
1,0,134508,45725,465.619995,5,0,N,2,N,N,0,0,N,0,9,105114.0,451,0,N,5817,102,0,4376
2,6881,15408,188328,513.799988,5,0,N,0,N,N,0,5,N,0,6,152458.0,457,59333,N,0,102,0,483434
3,6716,157159,29967,1016.109985,5,62,N,5,N,N,0,5,N,0,5,172946.0,247,50436,N,3281,102,0,1407164
4,5975,105985,81305,713.659973,5,62,N,4,N,N,0,5,N,0,6,182129.0,263,93775,N,5817,102,0,1051004




Random five data points


Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,fraud_ind,hcefg,insfg,iterm,locdt,loctm,mcc,mchno,ovrlt,scity,stocn,stscd,txkey
820636,3348,81442,152057,1016.340027,5,62,Y,8,N,N,0,5,N,0,75,1345.0,289,54828,N,3460,46,2,762410
523313,6767,55799,103694,394.070007,5,62,N,5,N,N,0,5,N,0,4,161852.0,251,19501,N,5817,102,0,1121551
685301,0,31292,43211,465.619995,5,0,N,2,N,N,0,0,N,0,23,104956.0,451,0,N,5817,102,0,10795
1185478,6769,17607,121488,242.800003,5,62,N,5,N,N,0,5,N,0,37,150746.0,251,78073,N,5858,102,0,226628
989289,6032,122429,96791,336.25,5,62,N,4,N,N,0,5,N,0,24,192058.0,251,77977,N,5817,102,0,252029




Last five data points


Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,fraud_ind,hcefg,insfg,iterm,locdt,loctm,mcc,mchno,ovrlt,scity,stocn,stscd,txkey
1521782,6322,91008,15189,578.380005,5,75,Y,8,,,0,6,N,0,4,191642.0,209,38222,N,5817,102,0,1478280
1521783,3226,145107,116252,435.320007,5,75,Y,8,,,0,6,N,0,13,102338.0,192,90135,N,1458,102,0,661087
1521784,6769,162168,93598,1.38,5,75,Y,8,,,0,6,N,0,29,234618.0,373,79246,N,5817,102,0,167073
1521785,6032,45406,197460,1.38,5,75,Y,2,,,0,6,N,0,24,215218.0,373,79246,N,5817,102,0,338215
1521786,6716,48723,176440,406.589996,5,75,N,5,,,0,6,N,0,13,163603.0,251,69607,N,2310,102,0,1055258




Shape of  data set: (1521787, 23)


Size of  data set: 35001101


Data Types
Note: All Non-numerical features are identified as objects in pandas


Unnamed: 0,Data Type
acqic,int16
bacno,int32
cano,int32
conam,float32
contp,int8
csmcu,int8
ecfg,object
etymd,int8
flbmk,object
flg_3dsmk,object




Numerical Features in Data set
['acqic', 'bacno', 'cano', 'conam', 'contp', 'csmcu', 'etymd', 'fraud_ind', 'hcefg', 'iterm', 'locdt', 'loctm', 'mcc', 'mchno', 'scity', 'stocn', 'stscd', 'txkey']


Categorical Features in Data set


['ecfg', 'flbmk', 'flg_3dsmk', 'insfg', 'ovrlt']



Statistical Description of Columns


Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,etymd,fraud_ind,hcefg,iterm,locdt,loctm,mcc,mchno,scity,stocn,stscd,txkey
count,1521787.0,1521787.0,1521787.0,1521787.0,1521787.0,1521787.0,1521787.0,1521787.0,1521787.0,1521787.0,1521787.0,1521787.0,1521787.0,1521787.0,1521787.0,1521787.0,1521787.0,1521787.0
mean,6008.003,82090.27,108917.0,654.6267,4.829368,53.83324,4.149114,0.01337572,4.74941,0.04962784,45.32732,146232.0,297.8089,55890.22,4755.128,95.65116,0.02485499,971126.5
std,1502.42,47362.49,60903.63,402.2017,0.6513408,20.72135,2.394259,0.1148774,1.111927,0.3652493,26.01889,52103.93,77.96778,30822.97,1979.815,18.90027,0.2216804,564132.2
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5982.0,41403.0,57635.0,437.06,5.0,62.0,2.0,0.0,5.0,0.0,23.0,110931.0,250.0,33774.0,3795.0,102.0,0.0,486947.5
50%,6716.0,81912.0,109785.0,591.9,5.0,62.0,4.0,0.0,5.0,0.0,45.0,151522.0,264.0,59360.0,5817.0,102.0,0.0,979580.0
75%,6769.0,123059.0,161876.0,807.34,5.0,62.0,5.0,0.0,5.0,0.0,68.0,185827.0,343.0,79200.0,5817.0,102.0,0.0,1455200.0
max,6884.0,163884.0,213334.0,7208.77,6.0,75.0,10.0,1.0,9.0,8.0,90.0,235959.0,459.0,103307.0,6671.0,107.0,4.0,1958239.0




Description of Categorical Features


Unnamed: 0,count,unique,top,freq
ecfg,1521787,2,N,1134512
flbmk,1509206,2,N,1504133
flg_3dsmk,1509206,2,N,1444867
insfg,1521787,2,N,1479475
ovrlt,1521787,2,N,1501536




Unique class Count of Categorical features


Unnamed: 0,Feature,Unique Count
0,ecfg,2
1,flbmk,3
2,flg_3dsmk,3
3,insfg,2
4,ovrlt,2




Missing Values in Data


Unnamed: 0,features,missing_counts,missing_percent
0,acqic,0,0.0
1,bacno,0,0.0
2,cano,0,0.0
3,conam,0,0.0
4,contp,0,0.0
5,csmcu,0,0.0
6,ecfg,0,0.0
7,etymd,0,0.0
8,flbmk,12581,0.8
9,flg_3dsmk,12581,0.8


### Data Cleaning

#### Filling missing value


In [None]:
# Both flag columns are overwhelmingly 'N', so missing entries take that value.
for flag_col in ('flbmk', 'flg_3dsmk'):
    df_train[flag_col] = df_train[flag_col].fillna('N')


#### Handle different data type encoding


In [None]:
# Column groupings computed by datasist helpers.
# presumably cat_feats / num_feats are lists of column names — TODO confirm
# against the datasist documentation.
cat_feats = ds.structdata.get_cat_feats(df_train)
# Note: num_feats is replaced by a hand-written list before scaling.
num_feats = ds.structdata.get_num_feats(df_train)
get_unique_counts = ds.structdata.get_unique_counts(df_train)
# Index of every column currently in the training frame.
all_feats = df_train.keys()


In [None]:
# Label-encode the categorical features as integer codes.
# The encoder is refitted for every column, so only the last fit is retained.
le = LabelEncoder()
for feat in cat_feats:
    df_train[feat] = le.fit_transform(df_train[feat].astype(str))

# Features expanded into one-hot indicator columns:
# 'contp': transaction category
# 'flbmk': fallback transaction flag
# 'ecfg': online transaction flag
# 'flg_3dsmk': 3DS verification flag
# 'hcefg': payment type
# 'insfg': installment transaction flag
# 'ovrlt': over-limit transaction flag
# 'stscd': status code
# 'iterm': number of installments
onehot_feature = [
    'contp', 'flbmk', 'ecfg', 'flg_3dsmk', 'hcefg', 'insfg', 'ovrlt',
    'stscd', 'iterm'
]

# Features encoded by relative frequency:
# 'csmcu': currency of the transaction location
# 'etymd': transaction type (online / physical)
# 'mcc': merchant category
# 'mchno': merchant name
# 'acqic': acquirer code
# 'bacno': account number
# 'cano': card number
# 'scity': transaction city
# 'stocn': transaction country
freq_feature = [
    'csmcu', 'etymd', 'mcc', 'mchno', 'acqic', 'bacno', 'cano', 'scity',
    'stocn'
]

# '<feature>_f' holds the share of rows carrying the same value.
for k in freq_feature:
    df_train[k + '_f'] = df_train[k].map(df_train[k].value_counts(normalize=True))

# Build every indicator block first and attach them with a single concat, so
# the full frame is copied once rather than once per feature.
dummy_frames = [pd.get_dummies(df_train[k], prefix=k) for k in onehot_feature]
df_train = pd.concat([df_train] + dummy_frames, axis=1)



In [None]:
# Keep only the most frequent values of a feature; every other value becomes 0.
def identify_high_risk_fraud(df, feat, threshold):
    """Add ``<feat>_high_risk_fraud`` to ``df``, keeping only the commonest values.

    The new column repeats ``df[feat]`` where the value is one of the
    ``threshold`` most frequent values of that column and is 0 everywhere else.
    Selection is by frequency only; the fraud label is not consulted.

    Args:
        df: DataFrame to modify in place.
        feat: name of the source column.
        threshold: number of most frequent values to keep.

    Returns:
        None.
    """
    # value_counts() is sorted by count, so its index lists the values themselves
    # in order of frequency (its .values would be the counts).
    top_values = df[feat].value_counts().index[:threshold]
    df[f'{feat}_high_risk_fraud'] = df[feat].where(df[feat].isin(top_values), 0)
                                                     

In [None]:
# Derive the top-15 indicator column for every frequency-encoded feature.
for freq_col in freq_feature:
    identify_high_risk_fraud(df_train, freq_col, threshold=15)


#### Individual feature preprocessing



##### acqic 收單行代碼


In [None]:
# Fraud rate and transaction count per acquirer code, busiest acquirers first.
Df = (
    df_train.groupby(['acqic'])['fraud_ind']
    .agg(['mean', 'count'])
    .reset_index()
    .sort_values('count', ascending=False)
)
Df


Unnamed: 0,acqic,mean,count
5945,6769,0.001040,250889
5895,6716,0.000695,201407
5226,5975,0.000376,124951
6047,6881,0.001741,99348
5411,6189,0.000689,91434
...,...,...,...
4604,5285,0.000000,1
4603,5284,0.000000,1
4602,5283,0.000000,1
1234,1371,0.000000,1


##### bacno 歸戶帳號


In [None]:
# Fraud rate and transaction count per account, most active accounts first.
Df = (
    df_train.groupby(['bacno'])['fraud_ind']
    .agg(['mean', 'count'])
    .reset_index()
    .sort_values('count', ascending=False)
)
Df


Unnamed: 0,bacno,mean,count
86417,148859,0.0,1117
90097,155157,0.0,938
60000,103279,0.0,747
53019,91313,0.0,681
86727,149363,0.0,593
...,...,...,...
87662,151021,0.0,1
74079,127607,0.0,1
39616,68356,0.0,1
54426,93694,0.0,1


##### locdt & loctm 授權日期與時間


In [None]:
def Str_turn_time(str1):
    """Render a numeric time value as a string left-padded with zeros to six characters.

    The value is truncated to an integer first; strings that are already six
    characters or longer are returned unchanged.
    """
    digits = str(int(str1))
    return digits.rjust(6, '0')


# Hour of day = first two characters of the zero-padded time string.
df_train['Hour'] = df_train['loctm'].map(lambda t: Str_turn_time(t)[:2]).astype(int)

# 'Morning' is 1 for hours 8 through 21 and 0 otherwise.
df_train['Morning'] = 0
daytime = (df_train['Hour'] > 7) & (df_train['Hour'] < 22)
df_train.loc[daytime, 'Morning'] = 1

        

In [None]:
# Bucket the day index into 30-day "months". Bounds run from widest to
# narrowest so each later assignment overwrites the earlier one; rows with
# locdt >= 121 keep a missing Month.
for upper_bound, month in ((121, 4), (91, 3), (61, 2), (31, 1)):
    df_train.loc[df_train['locdt'] < upper_bound, 'Month'] = month

# Day index modulo 7, computed on the whole column at once rather than row by row.
df_train['Week'] = df_train['locdt'] % 7

In [None]:
# Derive the top-15 indicator column for each engineered time feature.
for time_col in ('Hour', 'Month', 'Week'):
    identify_high_risk_fraud(df_train, time_col, 15)


##### conam 交易金額



In [None]:
# Fraud rate per distinct transaction amount, lowest rate first.
Df = (
    df_train.groupby(['conam'])['fraud_ind']
    .agg(['mean'])
    .reset_index()
    .sort_values('mean', ascending=True)
)
Df


Unnamed: 0,conam,mean
36480,1101.839966,0.0
47891,1476.729980,0.0
47892,1476.770020,0.0
47893,1476.819946,0.0
47894,1476.859985,0.0
...,...,...
69378,2388.560059,1.0
25880,819.710022,1.0
11252,463.230011,1.0
25862,819.309998,1.0


##### txkey 交易唯一序號



In [None]:
# Number of transactions made by the same account.
df_train['Count_txkey_gb_bacno'] = df_train.groupby(['bacno'])['txkey'].transform('count')

# Number of transactions made by the same account with the same card.
# Stored under its own name so it no longer overwrites the per-account count.
df_train['Count_txkey_gb_bacno_cano'] = df_train.groupby(['bacno', 'cano'])['txkey'].transform('count')

# Number of transactions made by the same account on the same day in the same hour.
df_train['Count_txkey_gb_bacno_locdt_Hour'] = df_train.groupby(['bacno', 'locdt', 'Hour'])['txkey'].transform('count')

        

##### scity & stocn 消費城市&國別




In [None]:
new_col = 'scity'+'_'+'stocn'
col = 'scity'
col2 = 'stocn'

# Combined "<city>_<country>" key, built once.
df_train[new_col] = df_train[col].astype(str)+'_'+df_train[col2].astype(str)

# Replace the combined key with integer codes.
le = LabelEncoder()
df_train[new_col] = le.fit_transform(df_train[new_col].astype(str))
    
    

#### numerical feature -> StandardScaler
After processing all the features, scale the numerical features so that the model can fit the training data better.

In [None]:
# Columns to standardise. This hand-written list replaces the num_feats computed
# earlier with datasist; it includes coded identifiers (acquirer, account, card,
# merchant) as well as amounts and the engineered time features.
num_feats = ['acqic','bacno','cano','conam', 'csmcu','hcefg','iterm','locdt','loctm','mcc',
 'mchno','scity','stocn', 'stscd','txkey','Hour','Morning','Month','Week']

In [None]:
# Standardise the selected columns in place.
# NOTE(review): the scaler is fitted on the whole training frame before the
# train/validation split, so held-out rows contribute to the scaling statistics
# — confirm this is acceptable.
scaler = StandardScaler()
df_train[num_feats] = scaler.fit_transform(df_train[num_feats])


In [None]:
# Downcast again now that the engineered and scaled columns have been added.
reduce_mem(df_train)

Memory usage of properties dataframe is : 619.7006006240845  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  264.13475227355957  MB
This is  42.62296212195959 % of the initial size


### Feature selection


In [None]:
X = df_train.drop(columns='fraud_ind')
y = df_train['fraud_ind']

# Stratified 75/25 split, converted straight to NumPy arrays.
X_train, X_test, y_train, y_test = (
    part.values
    for part in train_test_split(X, y, test_size=0.25, random_state=0, stratify=y)
)

# The classes are heavily imbalanced, so the training part is under-sampled
# with NearMiss before it is used to fit models.
X_nearmiss, y_nearmiss = NearMiss(n_jobs=-1).fit_resample(X_train, y_train)


KeyboardInterrupt: ignored

In [None]:
## This block is commented out because it takes too long to execute.
## To get the DataFrame after feature selection, use gdown to download the CSV file in the download cell further below.

## Feature ranking with recursive feature elimination and cross-validated selection of the best number of features.

# skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=True)
# xgb = XGBClassifier(n_estimators=600, n_jobs=-1, objective='binary:hinge')
# rfecv = RFECV(estimator=xgb, step=1, cv=skf, verbose=2, n_jobs = -1, scoring = 'f1')
# rfecv.fit(X_nearmiss, y_nearmiss)


In [None]:
# pruned_features = [f for f, s in zip(X, rfecv.support_) if s]
# print('\n The selected features are {}:'.format(pruned_features))
# print(len(pruned_features))

In [None]:
# new_df = pd.concat([df_train[pruned_features], y], axis=1)
# new_df.to_csv('./df_train_afterPruned.csv', index=False)

In [None]:
# new_df.isnull().sum().max()


0

In [None]:
# df_train = new_df

In [None]:

# pruned_features = [f for f, s in zip(X, rfecv.support_) if s]
# print('\n The selected features are {}:'.format(pruned_features))
# print(len(pruned_features))

In [None]:
# Google Drive id of the precomputed training frame that stands in for the
# commented-out feature-selection cells above.
f = '1HWKK6nFEORrlILI2zJfkHOfkI2xHms_g'
url = 'https://drive.google.com/uc?id=%s'%(f)
output = './train_clean.csv'
# gdown is installed and imported again here, as in the first download cell.
!pip3 install gdown
import gdown
gdown.download(url, output, quiet=False)




Downloading...
From: https://drive.google.com/uc?id=1HWKK6nFEORrlILI2zJfkHOfkI2xHms_g
To: /content/train_clean.csv
419MB [00:02, 203MB/s]


'./train_clean.csv'

In [None]:
# Load the precomputed training frame downloaded above.
df  = pd.read_csv('./train_clean.csv')

In [None]:
# Downcast the numeric columns of the freshly loaded frame to save memory.
reduce_mem(df)

Memory usage of properties dataframe is : 348.3095245361328  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  145.1290397644043  MB
This is  41.66668711046084 % of the initial size


In [None]:
# From here on df_train refers to the downloaded frame, replacing the one built
# in the preprocessing section.
df_train = df

## Parameter tuning


This data set is highly imbalanced. To fit the training data better, we first under-sample the data, train the model on the under-sampled set to find the best hyperparameters, and then use those hyperparameters to fit the model and make predictions on the test set.

In [None]:
X = df_train.drop(columns='fraud_ind')
y = df_train['fraud_ind']

# Stratified 75/25 split; every part is converted to a NumPy array.
X_train, X_test, y_train, y_test = (
    part.values
    for part in train_test_split(X, y, test_size=0.25, random_state=0, stratify=y)
)

# Under-sample the training part with NearMiss.
X_nearmiss, y_nearmiss = NearMiss(n_jobs=-1).fit_resample(X_train, y_train)


In [None]:
# Candidate values for each XGBoost hyperparameter, sampled by the
# RandomizedSearchCV section further down.
params = {
        'learning_rate': [0.01, 0.05, 0.1],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.5, 0.75, 1.0],
        'colsample_bytree': [0.5, 0.7, 1.0],
        'n_estimators': [300, 600, 1000, 2000]
        }

# Base estimator for the search.
# NOTE(review): per the XGBoost docs, 'binary:hinge' makes the model output 0/1
# labels rather than probabilities — confirm that is intended.
xgb = XGBClassifier(objective='binary:hinge')


### hyperOpt

In [None]:
# !pip install hyperopt pyspark
import hyperopt
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

# hyperopt search space: learning_rate is drawn uniformly from [0.01, 0.9];
# every other hyperparameter is picked from a fixed list of options.
space = {'learning_rate': hp.uniform('learning_rate', 0.01, 0.9),
        'max_depth': hp.choice('max_depth', [3, 5, 9, 10]),
        'colsample_bytree': hp.choice('colsample_bytree', [0.2, 0.5, 0.7, 1.0]),
        'subsample': hp.choice ('subsample', [0.2, 0.5, 0.75, 1.0]),
        'min_child_weight' : hp.choice ('min_child_weight', [0.2, 0.5, 0.75, 1.0]),
        'n_estimators' : hp.choice('n_estimators', [600, 1000, 2000,3000])
    }

def objective(space):
    """hyperopt objective: negated mean 5-fold F1 of an XGBoost classifier.

    Args:
        space: dict of sampled hyperparameter values, keyed like the search space.

    Returns:
        dict with the loss to minimise (minus the mean F1) and the trial status.
    """
    hyper_params = dict(
        learning_rate=space['learning_rate'],
        max_depth=space['max_depth'],
        colsample_bytree=space['colsample_bytree'],
        subsample=space['subsample'],
        n_estimators=space['n_estimators'],
        min_child_weight=space['min_child_weight'],
        objective='binary:hinge',
    )
    model = XGBClassifier(**hyper_params)

    fold_scores = cross_val_score(model, X_nearmiss, y_nearmiss, cv=5, scoring='f1', n_jobs=-1)

    # hyperopt minimises its loss, so the score to maximise is returned negated.
    return {'loss': -fold_scores.mean(), 'status': STATUS_OK}

trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 50,
            trials= trials)

# For hp.choice parameters fmin reports the index of the selected option, not
# the option itself; space_eval maps the result back to real values.
best_params = hyperopt.space_eval(space, best)


100%|██████████| 50/50 [2:56:01<00:00, 211.23s/it, best loss: -0.9263311136096428]


In [None]:
 # Display the raw fmin result. Entries for hp.choice parameters are option
 # indices (e.g. 'max_depth': 2 is the third entry of its list), not values.
 best

# 20 iters
#  {'colsample_bytree': 0,
#  'learning_rate': 0.10127250991822791,
#  'max_depth': 2,
#  'min_child_weight': 0,
#  'n_estimators': 1,
#  'subsample': 2}

# 50 iters
# {'colsample_bytree': 1,
#  'learning_rate': 0.01029511061772444,
#  'max_depth': 2,
#  'min_child_weight': 0,
#  'n_estimators': 3,
#  'subsample': 0}

{'colsample_bytree': 1,
 'learning_rate': 0.01029511061772444,
 'max_depth': 2,
 'min_child_weight': 0,
 'n_estimators': 3,
 'subsample': 0}

### optuna

In [None]:
! pip install optuna
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from xgboost import XGBClassifier

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


In [None]:

def objective(trial, X, y):
    """Optuna objective: held-out F1 of an XGBoost model trained on under-sampled data.

    The data is split 75/25 (stratified, fixed seed), the training part is
    under-sampled with NearMiss, one model is fitted with the hyperparameters
    sampled from ``trial`` and scored on the untouched held-out part.

    Args:
        trial: optuna trial used to sample hyperparameters.
        X: feature DataFrame.
        y: label Series aligned with ``X``.

    Returns:
        F1 score of the fitted model on the held-out split.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0, stratify=y)
    X_train = X_train.values
    X_test = X_test.values
    y_train = y_train.values
    y_test = y_test.values

    # Under sampling
    X_nearmiss, y_nearmiss = NearMiss(n_jobs=-1).fit_resample(X_train, y_train)

    param = {
        "verbosity": 0,
        "objective": "binary:hinge",
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 0.7, log=True),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "n_estimators": trial.suggest_int("n_estimators", 600, 2000),
        # "eta" is XGBoost's alias for learning_rate, so only one of the two is sampled.
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.5, log=True),
        "n_jobs": -1
    }

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    # Build the model only after every booster-specific parameter is in place;
    # created earlier, it would never see them.
    model = XGBClassifier(**param)
    model.fit(X_nearmiss, y_nearmiss)

    # Score the fitted model directly. cross_val_score would clone and refit it
    # on folds of the held-out data, discarding the fit above.
    return f1_score(y_test, model.predict(X_test))

In [None]:
# Maximise the objective over 100 trials with the TPE sampler.
study = optuna.create_study(direction='maximize', sampler=TPESampler())
study.optimize(lambda trial: objective(trial, X, y), n_trials=100)

trial = study.best_trial

# Report the number of trials, then the best score and its hyperparameters.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
print('Best trial: score {},\nparams {}'.format(trial.value, trial.params))
print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2021-05-31 03:40:42,531][0m A new study created in memory with name: no-name-8d9a757d-1e8b-4af5-9120-b51a76d475b2[0m
[32m[I 2021-05-31 06:36:03,301][0m Trial 0 finished with value: 0.0 and parameters: {'booster': 'gblinear', 'lambda': 1.942273858343516e-08, 'alpha': 2.9992122570949375e-08, 'subsample': 0.29199785945038964, 'colsample_bytree': 0.9475434501528085, 'n_estimators': 1934, 'learning_rate': 0.024718541715781164}. Best is trial 0 with value: 0.0.[0m
[32m[I 2021-05-31 07:02:08,685][0m Trial 1 finished with value: 0.4144782385484914 and parameters: {'booster': 'gbtree', 'lambda': 4.0414225439998046e-05, 'alpha': 2.286148610638384e-08, 'subsample': 0.3788840775033768, 'colsample_bytree': 0.9701327304408993, 'n_estimators': 1039, 'learning_rate': 0.008084936244943932, 'max_depth': 7, 'min_child_weight': 7, 'eta': 0.0014050505033342646, 'gamma': 3.357684565107275e-07, 'grow_policy': 'depthwise'}. Best is trial 1 with value: 0.4144782385484914.[0m


In [None]:
# Optimization history of the study's trials.
fig = optuna.visualization.plot_optimization_history(study)
fig.show()

In [None]:
# Slice plot: objective value against each sampled hyperparameter.
fig = optuna.visualization.plot_slice(study)
fig.show()


In [None]:
# Parallel-coordinate view of the hyperparameter combinations tried by the study.
fig = optuna.visualization.plot_parallel_coordinate(study)
fig.show()


In [None]:
# Contour plot of the objective over pairs of hyperparameters.
fig = optuna.visualization.plot_contour(study)
fig.show()


### tpot


In [None]:
!pip install tpot
import tpot
# generic optimization
from tpot import TPOTClassifier

# TPOT search configuration: 5 generations of 20 pipelines, scored by 5-fold F1.
# NOTE(review): early_stop=12 is larger than generations=5, so early stopping
# presumably can never trigger — confirm against the TPOT documentation.
pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,
                                    random_state=42, verbosity=2, early_stop= 12,
                                    scoring = 'f1',  n_jobs=-2)

# pipeline_optimizer.fit(X_nearmiss, y_nearmiss)

# print(pipeline_optimizer.score(X_test, y_test))
## pipeline_optimizer.export('tpot_exported_pipeline.py')

ModuleNotFoundError: ignored

### RandomizedSearchCV


In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Seeded so the shuffled folds are the same on every run.
skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

# Pass the splitter itself as `cv`: a generator from skf.split(...) can be
# consumed only once.
random_search = RandomizedSearchCV(xgb, param_distributions=params,
                                   n_iter=5, scoring='f1', n_jobs=7,
                                   cv=skf, verbose=1, random_state=1001)

# The fit is required: the following cells read cv_results_, best_estimator_,
# best_score_ and best_params_, which only exist on a fitted search.
random_search.fit(X_nearmiss, y_nearmiss)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x7ff8aae933c0>,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing...
                                           random_state=None, reg_alpha=None,
                                           reg_lambda=None,
                                           scale_pos_weight=None,
                                           subsample=None, tr

In [None]:
print('\n All results:')
print(random_search.cv_results_)

print('\n Best estimator:')
print(random_search.best_estimator_)
# Best model found by the search; reused by the training section.
xgb_reg = random_search.best_estimator_

print('\n Best Score:')
# best_score_ is already the mean cross-validated value of the search metric.
# The former `* 2 - 1` rescaling converts an AUC to a Gini coefficient and does
# not apply to other metrics.
print(random_search.best_score_)

print('\n Best hyperparameters:')
print(random_search.best_params_)

# Persist the full result table for later inspection.
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('xgb-random-grid-search-results-01.csv', index=False)



 All results:
{'mean_fit_time': array([328.07260108, 153.22582893, 274.45104995, 207.41175866,
        70.10230074]), 'std_fit_time': array([ 0.64311142,  8.8294161 , 11.88589173, 48.43867586, 12.3935348 ]), 'mean_score_time': array([0.19580727, 0.09973316, 0.18410525, 0.13195138, 0.05622759]), 'std_score_time': array([0.02991578, 0.00910633, 0.02679952, 0.04105946, 0.00131081]), 'param_subsample': masked_array(data=[0.5, 0.5, 0.5, 0.5, 0.5],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_n_estimators': masked_array(data=[2000, 1000, 2000, 2000, 600],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_min_child_weight': masked_array(data=[1, 3, 3, 5, 3],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_learning_rate': masked_array(data=[0.05, 0.05, 0.01, 0.05, 0.01],
             mask=[False, False,

## Training

In [None]:
def _positive_class_scores(model, X, y_pred):
    """Return continuous positive-class scores for ROC AUC, or the hard labels as a fallback."""
    if hasattr(model, 'predict_proba'):
        return model.predict_proba(X)[:, 1]
    if hasattr(model, 'decision_function'):
        return model.decision_function(X)
    return y_pred


def trainModel(X_train, y_train, modelToTrain, n_splits=5, random_state=None):
    """Evaluate a binary classifier with shuffled stratified k-fold cross-validation.

    The model is refitted on every fold; per-fold metrics are printed, followed
    by their means over all folds.

    Args:
        X_train: feature matrix (array-like, indexable by row after conversion
            to a NumPy array).
        y_train: binary labels aligned with ``X_train``.
        modelToTrain: estimator exposing ``fit`` and ``predict``. It is fitted in
            place and is left fitted on the last fold's training part.
        n_splits: number of folds.
        random_state: seed for the fold shuffle; ``None`` gives a different
            split on every call.

    Returns:
        dict mapping each metric name to its mean over the folds.
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    accuracy_lst = []
    precision_lst = []
    recall_lst = []
    f1_lst = []
    f1_micro_lst = []
    f1_macro_lst = []
    f1_weighted_lst = []
    auc_lst = []

    skf = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    model = modelToTrain

    for fold, (train_index, test_index) in enumerate(skf.split(X_train, y_train), 1):
        print(f'Fold {fold} start training...')
        train_X = X_train[train_index]
        train_y = y_train[train_index]
        test_X = X_train[test_index]
        test_y = y_train[test_index]

        model.fit(train_X, train_y)
        y_pred = model.predict(test_X)

        # Compute every metric once and reuse it for both printing and averaging.
        accuracy = accuracy_score(test_y, y_pred)
        precision = precision_score(test_y, y_pred)
        recall = recall_score(test_y, y_pred)
        f1 = f1_score(test_y, y_pred)
        f1_micro = f1_score(test_y, y_pred, average='micro')
        f1_macro = f1_score(test_y, y_pred, average='macro')
        f1_weighted = f1_score(test_y, y_pred, average='weighted')
        # ROC AUC is meant to be computed from continuous scores; hard labels
        # are used only when the model offers nothing else.
        auc_value = roc_auc_score(test_y, _positive_class_scores(model, test_X, y_pred))

        print('---' * 45)
        print(f'For fold {fold}:')
        print(f'Accuracy: {accuracy}')
        print(f'Precision: {precision}')
        print(f'Recall: {recall}')
        print(f'f1-score: {f1}')
        print(f"micro f1-score: {f1_micro}")
        print(f"macro f1-score: {f1_macro}")
        print(f"weighted f1-score: {f1_weighted}")
        print(f'AUC: {auc_value}')
        print('---' * 45)

        accuracy_lst.append(accuracy)
        precision_lst.append(precision)
        recall_lst.append(recall)
        f1_lst.append(f1)
        f1_micro_lst.append(f1_micro)
        f1_macro_lst.append(f1_macro)
        f1_weighted_lst.append(f1_weighted)
        auc_lst.append(auc_value)

    summary = {
        'accuracy': np.mean(accuracy_lst),
        'precision': np.mean(precision_lst),
        'recall': np.mean(recall_lst),
        'f1': np.mean(f1_lst),
        'f1_micro': np.mean(f1_micro_lst),
        'f1_macro': np.mean(f1_macro_lst),
        'f1_weighted': np.mean(f1_weighted_lst),
        'auc': np.mean(auc_lst),
    }

    print('---' * 45)
    print("accuracy: {}".format(summary['accuracy']))
    print("precision: {}".format(summary['precision']))
    print("recall: {}".format(summary['recall']))
    print("f1: {}".format(summary['f1']))
    print("f1 micro: {}".format(summary['f1_micro']))
    print("f1 macro : {}".format(summary['f1_macro']))
    print("f1 weighted: {}".format(summary['f1_weighted']))
    print("auc: {}".format(summary['auc']))
    print('---' * 45)

    return summary

Fold 1 start training...
For fold 1:
Accuracy: 0.9941998002348117
Precision: 0.8813409792677548
Reacall: 0.6544382574516868
f1-score: 0.7511278195488722
micro f1-score: 0.9941998002348117
macro f1-score: 0.8740967632193494
weighted f1-score: 0.993776378687661
AUC: 0.8266219216126405
Fold 2 start training...


In [None]:
# Cross-validate the best estimator from the randomized search on the training split.
trainModel(X_train, y_train, xgb_reg)

In [None]:
# NOTE(review): repeats the same cross-validation call; the fold shuffle is
# unseeded, so each run uses a different split.
trainModel(X_train, y_train, xgb_reg)

In [None]:
# NOTE(review): third identical cross-validation call — presumably a leftover
# from repeated experiments; confirm whether it is still needed.
trainModel(X_train, y_train, xgb_reg)