**Importing Libraries**

In [1]:
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.callbacks import EarlyStopping

In [2]:
number_of_transactions = 2

In [3]:
data = pd.read_csv('../input/amex-default-prediction/train_data.csv', nrows=200000).groupby('customer_ID').tail(number_of_transactions).set_index('customer_ID', drop=True).sort_index()
labels = pd.read_csv('../input/amex-default-prediction/train_labels.csv').set_index('customer_ID', drop=True).sort_index()

**Merging the Training data with Labels**

In [4]:
df = pd.merge(data, labels, left_index=True, right_index=True)  

In [5]:
df.head()

Unnamed: 0_level_0,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,2018-02-21,0.937349,0.002425,0.019837,1.008307,0.000607,0.102985,0.006174,0.007454,,...,,,0.001563,0.001976,0.002569,,0.001098,0.006346,0.004204,0
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,2018-03-13,0.934745,0.009119,0.009382,1.007647,0.006104,0.135021,0.001604,0.007174,,...,,,0.007186,0.004234,0.005086,,0.00581,0.00297,0.008533,0
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,2018-02-06,0.878856,0.53629,0.034558,1.005419,0.007248,0.159486,0.006274,0.008733,,...,,,0.006658,0.004627,0.00482,,0.008272,0.002528,0.003584,0
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,2018-03-25,0.880519,0.178126,0.034684,1.004028,0.006911,0.165509,0.005552,0.005068,,...,,,0.00298,0.007479,0.00787,,0.003284,0.003169,0.008514,0
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1,2018-02-09,0.866342,0.001526,0.008318,0.818953,0.006756,,0.009511,0.004373,,...,,,0.008112,0.009543,0.003328,,0.004781,0.007325,0.005606,0


**Descriptive Statistics**

In [6]:
df.describe()

Unnamed: 0,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,D_43,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
count,32759.0,32978.0,32978.0,32975.0,32978.0,26809.0,32975.0,32975.0,4422.0,23348.0,...,1231.0,1231.0,32729.0,32972.0,32729.0,5767.0,32729.0,32972.0,32729.0,32978.0
mean,0.632882,0.1969083,0.140342,0.585763,0.1076098,0.231808,0.09378779,0.1556723,0.178078,0.165073,...,0.018771,0.145233,0.1812338,0.028439,0.1666252,0.3946,0.1811465,0.05212858,0.06346008,0.264085
std,0.266624,0.3694746,0.231423,0.413427,0.2781923,0.204968,0.2873838,0.2537062,0.243863,0.228746,...,0.116713,0.253666,0.3810453,0.151332,0.3501898,0.245088,0.3809258,0.1818579,0.1948796,0.440852
min,-0.35523,7.301857e-07,-0.30933,6e-06,6.44072e-07,-0.240078,6.066678e-07,1.323113e-07,-0.000219,1e-06,...,7e-06,1.4e-05,3.833695e-07,1e-06,1.323148e-07,-0.0094,1.65358e-08,7.164186e-07,4.609489e-08,0.0
25%,0.447735,0.004974273,0.009007,0.068823,0.00300148,0.128562,0.003014322,0.005520861,0.033735,0.043012,...,0.002437,0.003324,0.003060873,0.002539,0.003017133,0.193622,0.003046739,0.002706759,0.003001234,0.0
50%,0.678158,0.009976387,0.034422,0.813246,0.005952563,0.165469,0.005992414,0.01118144,0.115234,0.091015,...,0.004973,0.006791,0.006099273,0.005125,0.006023944,0.380992,0.00610307,0.005460053,0.006053148,0.0
75%,0.859826,0.2710732,0.158221,1.00186,0.008901228,0.266837,0.008974754,0.2178614,0.235525,0.196589,...,0.007673,0.500239,0.009095845,0.007666,0.009105157,0.581989,0.009120776,0.008242189,0.00908765,1.0
max,1.009943,5.036047,1.324053,1.009999,2.507711,2.888382,6.798167,1.625262,3.249918,5.794954,...,1.009288,1.509486,1.01,1.009992,1.174753,1.751388,1.009993,1.342748,4.187805,1.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32978 entries, 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a to 09572fafe01b8bbb560809da84c7d1c8d9e79eb7287bc5a2d1e1a4a474ab5382
Columns: 190 entries, S_2 to target
dtypes: float64(185), int64(2), object(3)
memory usage: 48.1+ MB


In [8]:
df.isnull().sum()

S_2           0
P_2         219
D_39          0
B_1           0
B_2           3
          ...  
D_142     27211
D_143       249
D_144         6
D_145       249
target        0
Length: 190, dtype: int64


# Visualisation

**Count Plot of the Target Variable**

In [9]:
#sns.countplot(x = 'target',data = df)

# Pre-Processing

**Dropping the Transaction Dates**

In [10]:
drop_cols = ['S_2'] 
df.drop(drop_cols, inplace=True, axis=1)

**Creating Training Labels and Data**

In [11]:
df_dummy= pd.get_dummies(df)

In [12]:
y = df_dummy['target']
X = df_dummy.drop('target', axis=1)

**Handling missing values**

Missing values are imputed with the respective column mean.

In [13]:
col_names = X.columns
imputer = SimpleImputer()
X = pd.DataFrame(imputer.fit_transform(X))  
X.columns = col_names

**Standardization**

In [14]:
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns) 

**Splitting into training and validation sets**

In [15]:
#train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 1, shuffle=True, stratify=y, test_size=0.2)
X_train, X_test , y_train , y_test = train_test_split(X, y, random_state = 1, shuffle=True, stratify=y, test_size=0.2)

Memory cleanup to prevent out of memory error.

In [16]:
del data, labels, X, y
gc.collect()

206

# Model Building

**Logistic Regression**

In [17]:
from sklearn.metrics import classification_report,accuracy_score
from sklearn import metrics

In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
clf_lr = LogisticRegression(n_jobs=1, C=1e5)
clf_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=100000.0, n_jobs=1)

In [20]:
%%time
y_pred_val_lr = clf_lr.predict(X_test)
print('Accuracy on Validation set :',accuracy_score(y_test, y_pred_val_lr))
print("\n")
print(classification_report(y_test, y_pred_val_lr))

Accuracy on Validation set : 0.8946331109763493


              precision    recall  f1-score   support

           0       0.92      0.94      0.93      4854
           1       0.81      0.78      0.80      1742

    accuracy                           0.89      6596
   macro avg       0.87      0.86      0.86      6596
weighted avg       0.89      0.89      0.89      6596

CPU times: user 82.1 ms, sys: 63.6 ms, total: 146 ms
Wall time: 37.3 ms


In [21]:
from sklearn.linear_model import SGDClassifier

In [22]:
sgd = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)
sgd.fit(X_train, y_train)

SGDClassifier(alpha=0.001, max_iter=5, random_state=42, tol=None)

In [23]:
%%time
y_pred_val_sgd = sgd.predict(X_test)
print('Accuracy on Validation set :',accuracy_score(y_test, y_pred_val_sgd))
print("\n")
print(classification_report(y_test, y_pred_val_sgd))

Accuracy on Validation set : 0.8779563371740449


              precision    recall  f1-score   support

           0       0.92      0.92      0.92      4854
           1       0.77      0.76      0.77      1742

    accuracy                           0.88      6596
   macro avg       0.84      0.84      0.84      6596
weighted avg       0.88      0.88      0.88      6596

CPU times: user 50.9 ms, sys: 58.1 ms, total: 109 ms
Wall time: 32 ms


**Decision Tree Classifier**

In [24]:
from sklearn.tree import DecisionTreeClassifier

In [25]:
dc = DecisionTreeClassifier(random_state=0)
dc.fit(X_train, y_train)

DecisionTreeClassifier(random_state=0)

In [26]:
%%time
y_pred_val_dc = dc.predict(X_test)
print('Accuracy on Validation set :',accuracy_score(y_test, y_pred_val_dc))
print("\n")
print(classification_report(y_test, y_pred_val_dc))

Accuracy on Validation set : 0.8506670709520922


              precision    recall  f1-score   support

           0       0.90      0.90      0.90      4854
           1       0.72      0.71      0.72      1742

    accuracy                           0.85      6596
   macro avg       0.81      0.81      0.81      6596
weighted avg       0.85      0.85      0.85      6596

CPU times: user 25.3 ms, sys: 0 ns, total: 25.3 ms
Wall time: 24.4 ms


In [27]:
from lightgbm import LGBMClassifier

In [28]:
lgbm = LGBMClassifier()
lgbm.fit(X_train, y_train)

LGBMClassifier()

In [29]:
%%time
y_pred_val_lgbm = lgbm.predict(X_test)


CPU times: user 92 ms, sys: 0 ns, total: 92 ms
Wall time: 32.4 ms


In [None]:
df_subm = pd.read_csv('../input/amex-default-prediction/test_data.csv')

In [None]:
df_subm["prediction"] = y_pred_val_lgbm

In [None]:
result = df_subm[['customer_ID','prediction']]

In [None]:
import base64
from IPython.display import HTML
def download_csv(df, title = "Download CSV file", filename = "output_180474E.csv"):  
    csv = df.to_csv()
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)

In [None]:
download_csv(result)

**Listing Accuracies on Validation Data**

In [None]:
print('\nLogistic Regression :',accuracy_score(y_test, y_pred_val_lr))
print('\nSupport Vector :',accuracy_score(y_test, y_pred_val_sgd))
print('\nDecision Tree :',accuracy_score(y_test, y_pred_val_dc))
print('\nLight GBM Classifier :',accuracy_score(y_test, y_pred_val_lgbm))

# Inferences

In [None]:
preds = clf.predict_proba(X_test)[:, 1]

The best accuracy is obtained using Light GBM Classifier.