In [1]:
import numpy as np
import pandas as pd
import datetime
import warnings
import gc

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, LearningRateScheduler, EarlyStopping
from tensorflow.keras.layers import Dense, Input, InputLayer, Add, BatchNormalization, Dropout, Concatenate
from tensorflow.keras.utils import plot_model

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import random
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, log_loss
import math
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
import lightgbm
import xgboost as xgb
from lightgbm import LGBMClassifier, log_evaluation

from sklearn.metrics import accuracy_score, r2_score, mean_absolute_error, mean_squared_error

# **<font size=6 color="blue">Contents </font>**

<font size=5>1. EDA</font> <br><br>
<font size=5>2. Feature Engineering</font> <br>
    <font size=4>1) Feature Aggregation</font> <br>
    <font size=4>2) Categorical Encoding</font> <br>
    <font size=4>3) Imputation</font> <br>
    <font size=4>4) Creating Features</font> <br>
    <font size=4>5) Scaling</font> <br> 
     <font size=4>6) Remove highly correlated fields</font> <br> <br>

<font size=5>3. Evaluation Metrics</font> <br><br>
<font size=5>4. Implementation and Validation</font> <br>
<font size=4>1) KNN</font> <br>
<font size=4>2) SVM</font> <br>
<font size=4>3) LGBMClassifier</font> <br>
<font size=4>4) XGBoost</font> <br>
<font size=4>5) Neural Network</font> <br>

In [2]:
# line 1
train_data = pd.read_feather('../input/parquet-files-amexdefault-prediction/train_data.ftr')
train_labels = pd.read_csv('/kaggle/input/amex-default-prediction/train_labels.csv').set_index('customer_ID')
test_data = pd.read_feather('../input/parquet-files-amexdefault-prediction/test_data.ftr')
target = train_labels.target.values
sub = pd.read_csv('/kaggle/input/amex-default-prediction/sample_submission.csv')

# **<font size=6 color="blue">EDA </font>**

In [3]:
# line 2
# Shape of the train data
train_data.shape

(5531451, 190)

In [4]:
# line 3
# Display simple information of the variables 
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5531451 entries, 0 to 5531450
Columns: 190 entries, customer_ID to D_145
dtypes: category(11), float16(176), int64(1), object(2)
memory usage: 2.0+ GB


In [6]:
# line 4
# min and max dates
print('min date: ', train_data['S_2'].min())
print('max date: ', train_data['S_2'].max())

min date:  2017-03-01
max date:  2018-03-31


In [7]:
# line 5
# Generate a simple statistical summary of the DataFrame for Numerical
train_data.describe()

Unnamed: 0,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,D_43,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
count,5485466.0,5531451.0,5531451.0,5529435.0,5531451.0,4510907.0,5529435.0,5529435.0,791314.0,3873055.0,...,194699.0,194699.0,194699.0,5429903.0,5490819.0,5429903.0,944408.0,5429903.0,5490724.0,5429903.0
mean,,,,,,,,,,,...,0.0,0.0,0.0,,,,,,,
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,-0.4589844,0.0,-7.589844,0.0,0.0,-0.6269531,0.0,0.0,-0.000454,1.192093e-07,...,5.960464e-08,0.0,5.960464e-08,0.0,0.0,0.0,-0.014542,0.0,0.0,0.0
25%,0.4802246,0.004528046,0.008865356,0.1053467,0.002895355,0.1273193,0.002872467,0.005226135,0.037506,0.04226685,...,0.009315491,0.002533,0.003517151,0.003026962,0.002555847,0.003026962,0.199341,0.00302887,0.002752304,0.00302887
50%,0.6943359,0.009056091,0.03134155,0.8144531,0.005783081,0.1639404,0.005744934,0.009780884,0.120544,0.08850098,...,0.2539062,0.00507,0.007038116,0.006053925,0.005111694,0.00605011,0.38208,0.006053925,0.005508423,0.006053925
75%,0.8647461,0.2366943,0.1258545,1.001953,0.008659363,0.2580566,0.008613586,0.1550293,0.250977,0.1843262,...,0.2583008,0.007572,0.5014648,0.009078979,0.007663727,0.009078979,0.559082,0.009078979,0.008262634,0.009078979
max,1.009766,5.390625,1.324219,1.009766,3.255859,5.484375,8.992188,1.625,4.191406,10.10938,...,1.759766,1.009766,3.005859,1.009766,1.009766,1.339844,2.228516,1.009766,1.34375,4.828125


In [8]:
# line 6
# Total number of missing values
train_data.isnull().sum().sum()

160858968

In [9]:
# line 7
# Display the number of missing values by variable
train_data.isnull().sum()

customer_ID          0
S_2                  0
P_2              45985
D_39                 0
B_1                  0
                ...   
D_141           101548
D_142          4587043
D_143           101548
D_144            40727
D_145           101548
Length: 190, dtype: int64

In [10]:
# line 8
# Display the number of unique values for each variable
train_data.nunique()

customer_ID    458913
S_2               396
P_2             17205
D_39            11242
B_1             22627
                ...  
D_141            9055
D_142           14075
D_143            8491
D_144           15412
D_145            9228
Length: 190, dtype: int64

In [12]:
# line 9
# Explore target variable
print(train_labels.shape)
train_labels.info()

(458913, 1)
<class 'pandas.core.frame.DataFrame'>
Index: 458913 entries, 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a to fffff1d38b785cef84adeace64f8f83db3a0c31e8d92eaba8b115f71cab04681
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   target  458913 non-null  int64
dtypes: int64(1)
memory usage: 7.0+ MB


In [13]:
# line 10
train_labels['target'].value_counts()

0    340085
1    118828
Name: target, dtype: int64

# **<font size=6 color="blue">Feature Engineering </font>**

# **1. Feature Aggregation** 
Create a new dataset based on aggregated information from EDA

In [14]:
# line 11
# train dataset
trn_num_statements = train_data.groupby('customer_ID').size().sort_index()

train_agg_data = (train_data
                .groupby('customer_ID')
                .tail(1)
                .set_index('customer_ID', drop=True)
                .sort_index()
                .drop(['S_2'], axis='columns'))

train_agg_data['target'] = train_labels.target
train_agg_data['num_statements'] = trn_num_statements

train_agg_data.reset_index(inplace = True, drop = True)

In [15]:
# line 12
# test dataset
tst_num_statements = test_data.groupby('customer_ID').size().sort_index()

test_agg_data = (test_data
                .groupby('customer_ID')
                .tail(1)
                .set_index('customer_ID', drop=True)
                .sort_index()
                .drop(['S_2'], axis='columns'))

# Merge the labels from the labels dataframe
test_agg_data['num_statements'] = tst_num_statements

test_agg_data.reset_index(inplace = True, drop = True)

# **2. Categorical Encoding** 
<font size=3 >One hot encoding </font>

In [16]:
# line 13
# For train data
cat_features = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

trn_not_cat_features = [f for f in train_agg_data.columns if f not in cat_features]
tst_not_cat_features = [f for f in test_agg_data.columns if f not in cat_features]

In [18]:
# line 14
encoder = OrdinalEncoder()
trn_encoded_features = encoder.fit_transform(train_agg_data[cat_features])

In [19]:
# line 15
trn_encoded_features = pd.DataFrame(trn_encoded_features)
train_agg_data = pd.concat([train_agg_data[trn_not_cat_features], trn_encoded_features], axis = 1)

In [20]:
# line 16
# For test data
tst_encoded_features = encoder.transform(test_agg_data[cat_features])
tst_encoded_features = pd.DataFrame(tst_encoded_features)

test_agg_data = pd.concat([test_agg_data[tst_not_cat_features], tst_encoded_features], axis = 1)
test_agg_data.head()

Unnamed: 0,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,D_43,...,1,2,3,4,5,6,7,8,9,10
0,0.568848,0.121399,0.01078,1.009766,0.006924,0.149414,0.000396,0.003576,0.10376,0.007397,...,1.0,0.0,0.0,0.0,1.0,0.0,2.0,2.0,,5.0
1,0.841309,0.126465,0.016556,1.008789,0.009712,0.112183,0.006191,0.011383,,,...,1.0,1.0,0.0,3.0,0.0,1.0,1.0,0.0,,5.0
2,0.697754,0.002724,0.001485,0.810059,0.002621,0.166138,0.004887,0.015945,,0.105286,...,1.0,0.0,0.0,3.0,0.0,0.0,2.0,2.0,0.0,3.0
3,0.513184,0.324707,0.149536,0.205688,0.002277,0.181152,0.005814,0.498535,,0.21167,...,2.0,0.0,0.0,4.0,0.0,1.0,0.0,1.0,,4.0
4,0.254395,0.768066,0.563477,0.038025,0.50293,0.168335,0.009483,0.831055,,0.071899,...,5.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,,4.0


# **3. Imputation** 
<font size=3 > Impute missing values</font>

In [32]:
# line 17
# # Keep the latest statement features for each customer
# train_dataset = train_dataset_.groupby('customer_ID').tail(1).set_index('customer_ID', drop=True).sort_index()

In [33]:
# line 18
# categorical_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

# num_cols = [col for col in train_dataset.columns if col not in categorical_cols + ["target"]]

In [34]:
# line 19
# Delinquency = [d for d in train_dataset.columns if d.startswith('D_')]
# Spend = [s for s in train_dataset.columns if s.startswith('S_')]
# Payment = [p for p in train_dataset.columns if p.startswith('P_')]
# Balance = [b for b in train_dataset.columns if b.startswith('B_')]
# Risk = [r for r in train_dataset.columns if r.startswith('R_')]
# Dict = {'Delinquency': len(Delinquency), 'Spend': len(Spend), 'Payment': len(Payment), 'Balance': len(Balance), 'Risk': len(Risk),}

In [35]:
# line 20
# train_dataset = train_dataset.drop(['S_2','D_66','D_42','D_49','D_73','D_76','R_9','B_29','D_87','D_88','D_106','R_26','D_108','D_110','D_111','B_39','B_42','D_132','D_134','D_135','D_136','D_137','D_138','D_142'], axis=1)

In [36]:
# line 21
# selected_col = np.array(['P_2','S_3','B_2','D_41','D_43','B_3','D_44','D_45','D_46','D_48','D_50','D_53','S_7','D_56','S_9','B_6','B_8','D_52','P_3','D_54','D_55','B_13','D_59','D_61','B_15','D_62','B_16','B_17','D_77','B_19','B_20','D_69','B_22','D_70','D_72','D_74','R_7','B_25','B_26','D_78','D_79','D_80','B_27','D_81','R_12','D_82','D_105','S_27','D_83','R_14','D_84','D_86','R_20','B_33','D_89','D_91','S_22','S_23','S_24','S_25','S_26','D_102','D_103','D_104','D_107','B_37','R_27','D_109','D_112','B_40','D_113','D_115','D_118','D_119','D_121','D_122','D_123','D_124','D_125','D_128','D_129','B_41','D_130','D_131','D_133','D_139','D_140','D_141','D_143','D_144','D_145'])

# for col in selected_col:
#     train_dataset[col] = train_dataset[col].fillna(train_dataset[col].median())

# selcted_col2 = np.array(['D_68','B_30','B_38','D_64','D_114','D_116','D_117','D_120','D_126'])

# for col2 in selcted_col2:
#     train_dataset[col2] =  train_dataset[col2].fillna(train_dataset[col2].mode()[0])

In [38]:
# line 22
# test_dataset = test_dataset_.groupby('customer_ID').tail(1).set_index('customer_ID', drop=True).sort_index()
# test_dataset = test_dataset.drop(['S_2','D_42','D_49','D_66','D_73','D_76','R_9','B_29','D_87','D_88','D_106','R_26','D_108','D_110','D_111','B_39','B_42','D_132','D_134','D_135','D_136','D_137','D_138','D_142'], axis=1)

In [37]:
# line 23
# selected_column = np.array(['P_2','S_3','B_2','D_41','D_43','B_3','D_44','D_45','D_46','D_48','D_50','D_53','S_7','D_56','S_9','S_12','S_17','B_6','B_8','D_52','P_3','D_54','D_55','B_13','D_59','D_61','B_15','D_62','B_16','B_17','D_77','B_19','B_20','D_69','B_22','D_70','D_72','D_74','R_7','B_25','B_26','D_78','D_79','D_80','B_27','D_81','R_12','D_82','D_105','S_27','D_83','R_14','D_84','D_86','R_20','B_33','D_89','D_91','S_22','S_23','S_24','S_25','S_26','D_102','D_103','D_104','D_107','B_37','R_27','D_109','D_112','B_40','D_113','D_115','D_118','D_119','D_121','D_122','D_123','D_124','D_125','D_128','D_129','B_41','D_130','D_131','D_133','D_139','D_140','D_141','D_143','D_144','D_145'])

# for column in selected_column:
#     test_dataset[column] = test_dataset[column].fillna(test_dataset[column].median())

# selected_column2 = np.array(['D_68','B_30','B_38','D_114','D_116','D_117','D_120','D_126'])

# for column2 in selected_column2:
#     test_dataset[column2] =  test_dataset[column2].fillna(test_dataset[column2].mode()[0])

In [21]:
# line 24
# fill missing values with a number (0)
train_agg_data.fillna(value = 0, inplace = True)
test_agg_data.fillna(value = 0, inplace = True)

# **4. Creating Features**

In [39]:
# line 25
features_avg = ['B_1', 'B_2', 'B_3', 'B_4', 'B_5', 'B_6', 'B_8', 'B_9', 'B_10', 'B_11', 'B_12', 'B_13', 'B_14',\
                'B_15', 'B_16', 'B_17', 'B_18', 'B_19', 'B_20', 'B_21', 'B_22', 'B_23', 'B_24', 'B_25', 'B_28',\
                'B_29', 'B_30', 'B_32', 'B_33', 'B_37', 'B_38', 'B_39', 'B_40', 'B_41',  'D_39', 'D_41',\
                'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_47', 'D_48', 'D_50', 'D_51', 'D_53', 'D_54', 'D_55',\
                'D_58', 'D_59', 'D_60', 'D_61', 'D_62', 'D_65', 'D_66', 'D_69', 'D_70', 'D_71', 'D_72', 'D_73',\
                'D_74', 'D_75', 'D_76', 'D_77', 'D_78', 'D_80', 'D_82', 'D_84', 'D_91', 'D_92', 'D_96',\
                'D_103', 'D_104', 'D_108', 'D_112', 'D_113', 'D_114', 'D_115', 'D_117', 'D_118', 'D_119', 'D_120',\
                'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_126', 'D_128', 'D_129', 'D_131', 'D_132', 'D_133',\
                'D_134', 'D_135', 'D_136', 'D_140', 'D_141', 'D_142', 'D_144', 'D_145', 'P_2', 'P_3', 'P_4', 'R_1',\
                'R_2', 'R_3', 'R_7', 'R_8', 'R_9', 'R_10', 'R_11', 'R_14', 'R_15', 'R_16', 'R_17', 'R_20', \
                'R_22',  'R_26', 'R_27', 'S_3', 'S_5', 'S_6', 'S_7', 'S_9', 'S_11', 'S_12', 'S_13', 'S_15', 'S_16',\
                'S_18', 'S_22', 'S_23', 'S_25', 'S_26', 'B_42','D_86','D_94','R_21','R_24']
features_min = ['B_2', 'B_4', 'B_5', 'B_9', 'B_13', 'B_14', 'B_15', 'B_16', 'B_17', 'B_19', 'B_20', 'B_28',\
                'B_29', 'B_36',  'D_41', 'D_42', 'D_45', 'D_46', 'D_48', 'D_50', 'D_51',\
                'D_53', 'D_55', 'D_56', 'D_58', 'D_59', 'D_60', 'D_62', 'D_71', 'D_74', 'D_75', \
                'D_102', 'D_112', 'D_113', 'D_115', 'D_118', 'D_119', 'D_121', 'D_122', 'D_128', 'D_132',\
                'D_141', 'D_144', 'D_145', 'P_2', 'P_3', 'R_1', 'R_27', 'S_3', 'S_5', 'S_7', 'S_9', 'S_11',\
                'S_12', 'S_23', 'S_25','B_42','B_33', 'D_39', 'D_78','D_83','D_70','D_140']
features_max = ['B_1', 'B_2', 'B_3', 'B_4', 'B_5', 'B_6', 'B_7', 'B_8', 'B_9', 'B_10', 'B_12', 'B_13', 'B_14', 'B_15',\
                'B_16', 'B_17', 'B_18', 'B_19', 'B_21', 'B_23', 'B_24', 'B_25', 'B_29', 'B_30', 'B_37', 'B_38',\
                'B_40', 'D_39', 'D_41', 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_47', 'D_48', 'D_49',\
                'D_50', 'D_52', 'D_55', 'D_56', 'D_58', 'D_59', 'D_60', 'D_61', 'D_63', 'D_64', 'D_65', 'D_70', 'D_71',\
                'D_72', 'D_73', 'D_74', 'D_76', 'D_77', 'D_78', 'D_80', 'D_82', 'D_84', 'D_102', 'D_105', 'D_107',\
                'D_110', 'D_112', 'D_115', 'D_116', 'D_117', 'D_118', 'D_119', 'D_121', 'D_122', 'D_123', 'D_124',\
                'D_125', 'D_128', 'D_131', 'D_132', 'D_133', 'D_134', 'D_138',  'D_141',\
                'D_142', 'D_144', 'D_145', 'P_2', 'P_3', 'P_4', 'R_1', 'R_3', 'R_5', 'R_6', 'R_7','R_10', 'R_11', \
                'R_14',  'R_20', 'R_26', 'R_27', 'S_3', 'S_5', 'S_7', 'S_8', 'S_11', 'S_12', 'S_13', 'S_15', 'S_16', \
                'S_22', 'S_23', 'S_24', 'S_25', 'S_26', 'S_27','B_42','B_39','D_91',  'R_8', 'D_136','D_140','D_111','D_126','D_135','B_33','R_17']
features_last = ['B_1', 'B_2', 'B_3', 'B_4', 'B_5', 'B_6', 'B_7', 'B_8', 'B_9', 'B_10', 'B_11', 'B_12', 'B_13', 'B_14', \
                 'B_15', 'B_16', 'B_17', 'B_18', 'B_19', 'B_20', 'B_21', 'B_22', 'B_23', 'B_24', 'B_25', 'B_26', 'B_28',\
                 'B_29', 'B_30', 'B_33', 'B_36', 'B_37', 'B_38', 'B_39', 'B_40', 'D_39', 'D_41',\
                 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_47', 'D_48', 'D_49', 'D_50', 'D_51', 'D_52', 'D_53', 'D_54',\
                 'D_55', 'D_56', 'D_58', 'D_59', 'D_60', 'D_61', 'D_62', 'D_63', 'D_64', 'D_65', 'D_69', 'D_70', 'D_71',\
                 'D_72',  'D_75', 'D_76', 'D_77', 'D_78', 'D_79', 'D_80',  'D_82', 'D_83', 'D_86', 'D_91',\
                 'D_96', 'D_105', 'D_106', 'D_112', 'D_114', 'D_119', 'D_120', 'D_121', 'D_122', 'D_124',  \
                 'D_127', 'D_130', 'D_131', 'D_132', 'D_133', 'D_134', 'D_138', 'D_140', 'D_141', 'D_142', \
                 'D_145', 'P_2', 'P_3', 'P_4', 'R_1', 'R_2', 'R_3',  'R_5', 'R_6', 'R_7', 'R_9', 'R_10',\
                 'R_11', 'R_12', 'R_20', 'R_26', 'R_27', 'S_3', 'S_5', 'S_7', \
                 'S_8', 'S_9', 'S_11', 'S_12', 'S_13', 'S_16', 'S_19', 'S_20', 'S_22', 'S_23', 'S_24', 'S_25', 'S_26',\
                 'S_27','B_32','B_41', 'B_42', 'D_73','D_81','D_125','D_126', 'R_4', 'R_14', 'R_8','R_13','R_15','R_19','S_6']

cid = pd.Categorical(train_agg_data, ordered=True)
df_avg = (train_agg_data
          .groupby(cid)
          .mean()[features_avg]
          .rename(columns={f: f"{f}_avg" for f in features_avg})
         )
df_min = (train_agg_data
          .groupby(cid)
          .min()[features_min]
          .rename(columns={f: f"{f}_min" for f in features_min})
         )
df_max = (train_agg_data
          .groupby(cid)
          .max()[features_max]
          .rename(columns={f: f"{f}_max" for f in features_max})
         )

train_agg_data = (train_agg_data.loc[last, features_last]
      .rename(columns={f: f"{f}_last" for f in features_last})
      .set_index(np.asarray(cid[last]))
     )

train_agg_data = pd.concat([train_agg_data, df_min, df_max,df_avg], axis=1)

cid = pd.Categorical(test_agg_data, ordered=True)
df_avg = (test_agg_data
          .groupby(cid)
          .mean()[features_avg]
          .rename(columns={f: f"{f}_avg" for f in features_avg})
         )
df_min = (test_agg_data
          .groupby(cid)
          .min()[features_min]
          .rename(columns={f: f"{f}_min" for f in features_min})
         )
df_max = (test_agg_data
          .groupby(cid)
          .max()[features_max]
          .rename(columns={f: f"{f}_max" for f in features_max})
         )

test_agg_data = (test_agg_data.loc[last, features_last]
      .rename(columns={f: f"{f}_last" for f in features_last})
      .set_index(np.asarray(cid[last]))
     )

test_agg_data = pd.concat([test_agg_data, df_min, df_max,df_avg], axis=1)

# **5. Scaling** 

In [40]:
# line 26
scaler = StandardScaler()
features = [f for f in test_agg_data.columns if f != 'customer_ID' and f != 'target' ]

train_agg_data_ = scaler.fit_transform(train_agg_data)
test_data_scaled_ = scaler.transform(test_agg_data[features])

# **6. Remove highly correlated fields**

In [41]:
# line 27
# train_dataset_without_target = train_dataset.drop(["target"],axis=1)

# cor_matrix = train_dataset_without_target.corr()
# col_core = set()

# for i in range(len(cor_matrix.columns)):
#     for j in range(i):
#         if(cor_matrix.iloc[i, j] > 0.9):
#             col_name = cor_matrix.columns[i]
#             col_core.add(col_name)

# **<font size=6 color="blue">Evaluation Metrics </font>**

In [29]:
# line 28
#amex metric
def amex_metric(y_true, y_pred, return_components=False) -> float:
    
    def top_four_percent_captured(df) -> float:
        
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum() 
    
    def weighted_gini(df) -> float:
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(df) -> float:
        
        df2 = pd.DataFrame({'target': df.target, 'prediction': df.target})
        df2.sort_values('prediction', ascending=False, inplace=True)
        return weighted_gini(df) / weighted_gini(df2)
    
    df = pd.DataFrame({'target': y_true.ravel(), 'prediction': y_pred.ravel()})
    df.sort_values('prediction', ascending=False, inplace=True)
    g = normalized_weighted_gini(df)
    d = top_four_percent_captured(df)

    if return_components: return g, d, 0.5 * (g + d)
    return 0.5 * (g + d)

# MAE, RMSE, R SQUARED, ACCURACY
def evaluate(y_test , y_pred):
    print("MAE: ",mean_absolute_error(y_test,y_pred) , '\n')
        
    print("RMSE: ",np.sqrt(mean_squared_error(y_test,y_pred)), '\n')
        
    r2 = r2_score(y_test,y_pred)
    print("R Squared: ",r2, '\n')
    
    accuracy = accuracy_score(y_test, y_pred, normalize=True)
    print("Accuracy: ",accuracy, '\n')

# **<font size=6 color="blue">Implementation & Validation</font>**

In [30]:
# line 29
# split dataset
y = train_agg_data['target']
X = train_agg_data.drop('target', axis=1)

x_train, x_test , y_train , y_test = train_test_split(X,y,test_size=0.3,random_state=42) 

# **1. KNN**

In [42]:
# line 30
# instantiate the model with k=5
knn_model = KNeighborsClassifier(n_neighbors=5)

# fit the model to the training set
knn_model.fit(x_train, y_train)

# predict on the test-set
y_pred_knn = knn_model.predict(x_test)

In [7]:
# line 31
print('AMEX METRIC: ',amex_metric_np(y_test.values, y_pred_knn, return_components=False))
print('')
evaluate(y_test , y_pred_knn)


AMEX METRIC:  0.46709616342968685

MAE:  0.13537365307300916 

RMSE:  0.36793158749013266 

R Squared:  0.2945193916954576 

Accuracy:  0.8646263469269908 



In [43]:
# line 32
# make predictions
predictions_knn = knn_5.predict(test_dataset)
output_knn = pd.DataFrame({'customer_ID': sub.customer_ID, 'prediction': predictions_knn})
output_knn.to_csv('submission_knn.csv', index=False)

# **2. SVM**

In [44]:
# line 33
param_grid = { 
    'C': [6,7,8],
}
clf = LinearSVC(class_weight='balanced', dual=False, penalty = "l1", random_state = 42)
svc_random = GridSearchCV(estimator = clf, param_grid = param_grid, cv=2, verbose=0, n_jobs = -1)
svc_random.fit(x_train,y_train)
svc_model = svc_random.best_estimator_

y_pred_svm = svc_model.predict(x_test)

In [8]:
# line 34
print('AMEX METRIC: ',amex_metric(y_test, y_pred, return_components=False))
print('')
evaluate(y_test , y_pred)


AMEX METRIC:  0.5233161525009339

MAE:  0.11808286937668196 

RMSE:  0.343631880617445 

R Squared:  0.3846278605388628 

Accuracy:  0.881917130623318



In [45]:
# line 35
# make predictions
predictions = svc_model.predict(test_dataset[num_columns])
output_svm = pd.DataFrame({'customer_ID': sub.customer_ID, 'prediction': predictions})
output_svm.to_csv('submission_svm.csv', index=False)

# **3. XGBoost**

In [48]:
# line 36
model = xgb.XGBClassifier()
model.fit(x_train, y_train)


XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)



In [49]:
# line 37
y_pred = model.predict(x_test)

In [50]:
# line 38
amex = amex_metric(y_test, y_pred, return_components=False)
print('AMEX METRIC: ', amex)

AMEX METRIC:  0.5736806826400411


In [51]:
# line 39
predictions_xgb = model.predict(test_dataset[num_columns])
output_xgb = pd.DataFrame({'customer_ID': sub.customer_ID, 'prediction': predictions_xgb})
output_xgb.to_csv('submission_svm.csv', index=False)

# **4. LGBMClassifier**

In [2]:
# line 40
features = [f for f in train_agg_data.columns if f != 'customer_ID' and f != 'target' ]

def my_booster(random_state=1, n_estimators=1300):
    return LGBMClassifier(n_estimators=n_estimators,
                          learning_rate=0.03, reg_lambda=50,
                          min_child_samples=2400,
                          num_leaves=94,
                          colsample_bytree=0.19,
                          max_bins=517, random_state=random_state)
      
score_list = []
y_pred_list = []
kf = StratifiedKFold(n_splits=5)
for fold, (idx_tr, idx_va) in enumerate(kf.split(train_agg_data, target)):
    X_tr, X_va, y_tr, y_va, model = None, None, None, None, None
    start_time = datetime.datetime.now()
    X_tr = train_agg_data.iloc[idx_tr][features]
    X_va = train_agg_data.iloc[idx_va][features]
    y_tr = target[idx_tr]
    y_va = target[idx_va]
    
    model = my_booster()
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=UserWarning)
        model.fit(X_tr, y_tr,
                  eval_set = [(X_va, y_va)], 
                  eval_metric=[lgb_amex_metric],
                  callbacks=[log_evaluation(100)])
    X_tr, y_tr = None, None
    y_va_pred = model.predict_proba(X_va, raw_score=True)
    score = amex_metric(y_va, y_va_pred)
    n_trees = model.best_iteration_
    if n_trees is None: n_trees = model.n_estimators
    print(f"Fold {fold} | {str(datetime.datetime.now() - start_time)[-12:-7]} |"
          f" {n_trees:5} trees |"
          f"                Score = {score:.5f}")
    score_list.append(score)
    
    if INFERENCE:
        y_pred_list.append(model.predict_proba(test_agg_data[features], raw_score=True))
            
print(f"OOF Score:                       {np.mean(score_list):.5f}")


[100]	valid_0's binary_logloss: 0.246906	valid_0's amex: 0.764519
[200]	valid_0's binary_logloss: 0.228331	valid_0's amex: 0.780338
[300]	valid_0's binary_logloss: 0.223375	valid_0's amex: 0.785591
[400]	valid_0's binary_logloss: 0.221115	valid_0's amex: 0.78835
[500]	valid_0's binary_logloss: 0.219791	valid_0's amex: 0.790354
[600]	valid_0's binary_logloss: 0.218981	valid_0's amex: 0.791777
[700]	valid_0's binary_logloss: 0.218384	valid_0's amex: 0.792521
[800]	valid_0's binary_logloss: 0.217952	valid_0's amex: 0.793035
[900]	valid_0's binary_logloss: 0.217702	valid_0's amex: 0.793741
[1000]	valid_0's binary_logloss: 0.217524	valid_0's amex: 0.793917
[1100]	valid_0's binary_logloss: 0.217322	valid_0's amex: 0.793403
[1200]	valid_0's binary_logloss: 0.217244	valid_0's amex: 0.793542
[1300]	valid_0's binary_logloss: 0.21716	valid_0's amex: 0.794232
[1400]	valid_0's binary_logloss: 0.217156	valid_0's amex: 0.794385
[1500]	valid_0's binary_logloss: 0.217179	valid_0's amex: 0.794007
Fold 

In [52]:
# line 41
sub_lgb = pd.DataFrame({'customer_ID': test.index,'prediction': np.mean(y_pred_list, axis=0)})
sub_lgb.to_csv('submission_lgb.csv', index=False)

# **5. Neural Network**

In [31]:
# line 42
# feature selection
features = [f for f in train_agg_data.columns if f != 'target' and f != 'customer_ID']

In [32]:
# line 43
import gc
del train_data, test_data
gc.collect()

99

In [35]:
# line 44
# NN model

def nn_model():
    regularization = 4e-4
    activation_func = 'swish'
    inputs = Input(shape = (len(features)))

    x0 = Dense(256,kernel_regularizer = tf.keras.regularizers.l2(regularization), activation = activation_func)(inputs)
    x1 = Dense(128,kernel_regularizer = tf.keras.regularizers.l2(regularization),activation = activation_func)(x0)
    x1 = Dense(64,kernel_regularizer = tf.keras.regularizers.l2(regularization),activation = activation_func)(x1)
    x1 = Dense(32,kernel_regularizer = tf.keras.regularizers.l2(regularization),activation = activation_func)(x1)
    
    x1 = Concatenate()([x1, x0])
    x1 = Dropout(0.1)(x1)
    
    x1 = Dense(16, kernel_regularizer=tf.keras.regularizers.l2(regularization),activation=activation_func,)(x1)
    
    x1 = Dense(1, activation='sigmoid')(x1)
    
    model = Model(inputs, x1)
    
    return model

In [36]:
# line 45
#model parameters
BATCH_SIZE         = 2048
EPOCHS             = 192 
EPOCHS_COSINEDECAY = 192 
DIAGRAMS           = True
USE_PLATEAU        = False
INFERENCE          = False
VERBOSE            = 0 
TARGET             = 'target'

In [37]:
# line 46
# Train model
def fit_model(X_train, y_train, X_val, y_val, run = 0):

    start = 0.01
    start_time = datetime.datetime.now()
    
    X_train = scaler.fit_transform(X_train)

    epochs = EPOCHS    
    lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.7, patience = 4, verbose = VERBOSE)
    es = EarlyStopping(monitor = 'val_loss',patience = 12, verbose = 1, mode = 'min', restore_best_weights = True)
    tm = tf.keras.callbacks.TerminateOnNaN()
    callbacks = [lr, es, tm]
    
    if USE_PLATEAU == False:
        epochs = EPOCHS_COSINEDECAY
        end = 0.0002

        def cosine_decay(epoch):
            if epochs > 1:
                w = (1 + math.cos(epoch / (epochs - 1) * math.pi)) / 2
            else:
                w = 1
            return w * start + (1 - w) * end
        
        lr = LearningRateScheduler(cosine_decay, verbose = 0)
        callbacks = [lr, tm]

    model = nn_model()
    optimizer_func = tf.keras.optimizers.Adam(learning_rate = start)
    loss_func = tf.keras.losses.BinaryCrossentropy()
    model.compile(optimizer = optimizer_func, loss = loss_func)
    
    X_val = scaler.transform(X_val)
    validation_data = (X_val, y_val)
    
    history = model.fit(X_train, 
                        y_train, 
                        validation_data = validation_data, 
                        epochs          = epochs,
                        verbose         = VERBOSE,
                        batch_size      = BATCH_SIZE,
                        shuffle         = True,
                        callbacks       = callbacks
                       )
    
    history_list.append(history.history)
    
    print(f'Training Loss: {history_list[-1]["loss"][-1]:.5f}, Validation Loss: {history_list[-1]["val_loss"][-1]:.5f}')
    callbacks, es, lr, tm, history = None, None, None, None, None
    
    y_val_pred = model.predict(X_val, batch_size = BATCH_SIZE, verbose = VERBOSE).ravel()
    amex_score = amex_metric(y_val.values, y_val_pred, return_components = False)
    
    print(f'Fold {run}.{fold} '
          f' Amex Score: {amex_score:.5f}')
    
    print('')
    
    score_list.append(amex_score)
    
    test_data_scaled = scaler.transform(test_agg_data[features])
    tst_pred = model.predict(test_data_scaled)
    predictions.append(tst_pred)
    
    return model

In [54]:
# line 47
history_list = []
score_list   = []
predictions  = []

kf = KFold(n_splits = 5)

for fold, (trn_idx, val_idx) in enumerate(kf.split(train_agg_data)):
    X_train, X_val = train_agg_data.iloc[trn_idx][features], train_agg_data.iloc[val_idx][features]
    y_train, y_val = train_agg_data.iloc[trn_idx][TARGET], train_agg_data.iloc[val_idx][TARGET]
    
    fit_model(X_train, y_train, X_val, y_val)
    
print(f'OOF AUC: {np.mean(score_list):.5f}')


Training Loss: 0.22557, Validation Loss: 0.22963
Fold 0.0 Amex Score: 0.78236

Training Loss: 0.22550, Validation Loss: 0.22978
Fold 0.1 Amex Score: 0.78053

Training Loss: 0.22623, Validation Loss: 0.22740
Fold 0.2 Amex Score: 0.78152

Training Loss: 0.22650, Validation Loss: 0.22582
Fold 0.3 Amex Score: 0.78385

Training Loss: 0.22691, Validation Loss: 0.22560
Fold 0.4 Amex Score: 0.78634

OOF AUC: 0.78292

