In [1]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from dateutil.parser import parse
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPRegressor
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import ADASYN
from collections import Counter
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier




In [2]:
# Name of the feature-set directory under ./data that holds the CSV splits.
feature_set = 'feature_set_4_normalised'

# Training split. ID_JOIN is dropped before modelling
# (presumably a join key rather than a feature -- TODO confirm).
X_train = (
    pd.read_csv(F'./data/{feature_set}/X_train_full.csv')
    .drop(columns='ID_JOIN')
)
# Labels are flattened to a 1-D array, the shape the estimators' fit() expects.
y_train = pd.read_csv(F'./data/{feature_set}/y_train.csv').to_numpy().ravel()

# Validation split, prepared exactly like the training split.
X_val = (
    pd.read_csv(F'./data/{feature_set}/X_valid_full.csv')
    .drop(columns='ID_JOIN')
)
y_val = pd.read_csv(F'./data/{feature_set}/y_valid.csv').to_numpy().ravel()

In [3]:
# Tally the training labels once; the same tally feeds the report and the weights.
counter = Counter(y_train)
print("Before Sampling: {}".format(counter))

# Per-class sample counts: label 0 is the negative class, label 1 the positive one.
neg, pos = counter.get(0), counter.get(1)
total = neg + pos

# Inverse-frequency weights scaled by total / 2, so that the two classes carry
# the same aggregate weight (count * weight == total / 2 for each class).
weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)
class_weight = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

Before Sampling: Counter({0.0: 727178, 1.0: 19329})
Weight for class 0: 0.51
Weight for class 1: 19.31


In [4]:
# model_LR = LogisticRegression(max_iter=10000,random_state=42,verbose=1,class_weight=class_weight)
# model_LR.fit(X_train,y_train)

# val_res_LR = 1 -model_LR.predict_proba(X_val)[:,0]
# roc_auc_score(y_val,val_res_LR)


In [5]:
# Histogram-based gradient boosting with early stopping: 10% of the training
# data is held out internally and boosting stops after 10 rounds without
# improvement. max_leaf_nodes=None places no cap on the leaves per tree.
model_HGB = HistGradientBoostingClassifier(
    max_iter=1000,
    max_leaf_nodes=None,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=10,
    random_state=42,
    verbose=1,
)
model_HGB.fit(X_train, y_train)

# Score = 1 - P(first class), i.e. the probability mass not assigned to the
# first entry of model_HGB.classes_.
val_res_HGB = 1 - model_HGB.predict_proba(X_val)[:, 0]

# Validation ROC AUC; left as the last expression so the notebook displays it.
roc_auc_score(y_val, val_res_HGB)

Binning 0.656 GB of training data: 2.661 s
Binning 0.073 GB of validation data: 0.113 s
Fitting gradient boosted rounds:
[1/1000] 1 tree, 8107 leaves, max depth = 48, train loss: 0.08687, val loss: 0.09607, in 7.611s
[2/1000] 1 tree, 22755 leaves, max depth = 59, train loss: 0.07834, val loss: 0.09477, in 19.226s
[3/1000] 1 tree, 26252 leaves, max depth = 56, train loss: 0.07103, val loss: 0.09400, in 20.395s
[4/1000] 1 tree, 26192 leaves, max depth = 60, train loss: 0.06456, val loss: 0.09338, in 20.108s
[5/1000] 1 tree, 26233 leaves, max depth = 61, train loss: 0.05876, val loss: 0.09278, in 19.774s
[6/1000] 1 tree, 26183 leaves, max depth = 58, train loss: 0.05362, val loss: 0.09225, in 19.911s
[7/1000] 1 tree, 26154 leaves, max depth = 59, train loss: 0.04902, val loss: 0.09175, in 19.726s
[8/1000] 1 tree, 26152 leaves, max depth = 65, train loss: 0.04492, val loss: 0.09137, in 20.313s
[9/1000] 1 tree, 26112 leaves, max depth = 54, train loss: 0.04119, val loss: 0.09113, in 19.904s

0.7617009913604174

In [6]:
# model_ABC = AdaBoostClassifier(random_state=42,n_estimators=50)
# model_ABC.fit(X_train,y_train)
# val_res_ABC = 1 -model_ABC.predict_proba(X_val)[:,0]
# roc_auc_score(y_val,val_res_ABC)

In [7]:
# Bagging ensemble of 100 default base estimators, trained in parallel on all
# available cores (n_jobs=-1).
model_BC = BaggingClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
model_BC.fit(X_train, y_train)

# Score = 1 - P(first class). Stored under the bagging name: the original
# cell reused the AdaBoost variable name (val_res_ABC) for these scores.
val_res_BC = 1 - model_BC.predict_proba(X_val)[:, 0]
# Backward-compatible alias for any code still reading the old name.
val_res_ABC = val_res_BC

# Validation ROC AUC; left as the last expression so the notebook displays it.
roc_auc_score(y_val, val_res_BC)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed: 16.3min remaining: 48.8min
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed: 17.5min finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    8.2s remaining:   24.8s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    9.1s finished


0.7798406990610358

In [8]:
# Test-set features for the selected feature set; the identifier columns
# (TX_ID, ID_JOIN) are still present here and are dropped at prediction time.
test_df = pd.read_csv(f'./data/{feature_set}/test_features.csv')

In [9]:
# One-row placeholder prediction (TX_FRAUD = 0) for a single hard-coded TX_ID;
# it is appended to each submission frame before alignment with the test
# transaction list (presumably this transaction has no row in test_features --
# TODO confirm).
# The mapping is deliberately NOT named `dict`: binding that name would shadow
# the builtin for every later cell in the kernel.
placeholder_row = {
    'TX_ID': ['ddaa070acea087eae360225e92c1609cea905e43'],
    'TX_FRAUD': [0],
}
df2 = pd.DataFrame(placeholder_row)

In [11]:
# Score the test set with the gradient-boosting model (1 - P(first class)).
# The identifier columns must exist and are dropped strictly. TX_FRAUD is only
# present once a prediction column has been written into test_df, so it is
# dropped leniently: this keeps the cell re-runnable instead of feeding an
# earlier prediction back into the model as a feature.
test_res_HGB = 1 - model_HGB.predict_proba(
    test_df
    .drop(columns=['TX_ID', 'ID_JOIN'])
    .drop(columns=['TX_FRAUD'], errors='ignore')
)[:, 0]

In [12]:
# Store the HGB scores as a TX_FRAUD column on test_df (in-place mutation:
# from here on test_df carries a prediction column alongside the features).
test_df['TX_FRAUD']  = test_res_HGB

In [15]:
# Keep only the transaction id and its predicted fraud score for the submission.
submission_HGB = test_df.loc[:, ['TX_ID', 'TX_FRAUD']]

In [16]:
# Append the one-row placeholder frame (df2) and renumber the row index.
submission_HGB = pd.concat(
    [submission_HGB, df2],
    ignore_index=True,
)

In [17]:
# TX_IDs of the test transactions; their order defines the row order of the
# submission file.
test_trans = pd.read_csv('./data/transactions_test.csv')['TX_ID']

# Align the predictions to that order. set_index is applied without
# inplace=True so submission_HGB is left untouched and the cell can be re-run
# (the in-place version raised KeyError on a second run because TX_ID had
# already been moved into the index).
# Any TX_ID in test_trans that is missing from submission_HGB yields a NaN row.
submission = submission_HGB.set_index('TX_ID').reindex(index=test_trans)

In [18]:
# Output directory (relative to the notebook) for submission files.
location = 'results'

# Write the aligned HGB submission; TX_ID is the index, so it is written as
# the first column.
submission.to_csv(
    F'./{location}/results_4_HGB.csv',
    index=True,
)

In [21]:
# Score the test set with the BAGGING model. The original cell called
# model_HGB here, so the "BC" results were really a second copy of the HGB
# predictions.
# Identifier columns are dropped strictly; TX_FRAUD (a previously written
# prediction column) is dropped only if present.
test_res_BC = 1 - model_BC.predict_proba(
    test_df
    .drop(columns=['TX_ID', 'ID_JOIN'])
    .drop(columns=['TX_FRAUD'], errors='ignore')
)[:, 0]

# Overwrite the prediction column on test_df with the bagging scores.
test_df['TX_FRAUD'] = test_res_BC

# Build the submission frame and append the one-row placeholder (df2).
submission_BC = test_df[['TX_ID', 'TX_FRAUD']]
submission_BC = pd.concat([submission_BC, df2], ignore_index=True)

# Align to the order of the test transactions (test_trans). A non-mutating
# set_index keeps the cell re-runnable.
submission_BC = submission_BC.set_index('TX_ID').reindex(index=test_trans)

location = 'results'
# Write the bagging submission. The original wrote `submission` (the HGB
# frame) to this file instead of submission_BC.
submission_BC.to_csv(F'./{location}/results_4_BC.csv', index=True)