In [164]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb


# Notebook display limits.
# Use the fully-qualified option names: the bare 'max_columns' / 'max_rows'
# patterns also match 'styler.render.max_columns' / 'styler.render.max_rows'
# on pandas versions that define them, where pd.set_option raises
# OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [165]:
# Read the three competition files in one pass: labelled rows, unlabelled rows,
# and the sample submission.
train, test, submission = map(
    pd.read_csv,
    ("dataset/train.csv", "dataset/test.csv", "dataset/sample_submission.csv"),
)

In [166]:
# Peek at the first ten labelled rows
train.head(n=10)

Unnamed: 0,customer_id,name,age,gender,owns_car,owns_house,no_of_children,net_yearly_income,no_of_days_employed,occupation_type,total_family_members,migrant_worker,yearly_debt_payments,credit_limit,credit_limit_used(%),credit_score,prev_defaults,default_in_last_6months,credit_card_default
0,CST_115179,ita Bose,46,F,N,Y,0.0,107934.04,612.0,Unknown,1.0,1.0,33070.28,18690.93,73,544.0,2,1,1
1,CST_121920,Alper Jonathan,29,M,N,Y,0.0,109862.62,2771.0,Laborers,2.0,0.0,15329.53,37745.19,52,857.0,0,0,0
2,CST_109330,Umesh Desai,37,M,N,Y,0.0,230153.17,204.0,Laborers,2.0,0.0,48416.6,41598.36,43,650.0,0,0,0
3,CST_128288,Rie,39,F,N,Y,0.0,122325.82,11941.0,Core staff,2.0,0.0,22574.36,32627.76,20,754.0,0,0,0
4,CST_151355,McCool,46,M,Y,Y,0.0,387286.0,1459.0,Core staff,1.0,0.0,38282.95,52950.64,75,927.0,0,0,0
5,CST_123268,Sarah Marsh,46,F,Y,N,0.0,252765.91,2898.0,Accountants,2.0,1.0,37046.86,40245.64,19,937.0,0,0,0
6,CST_127502,Mason,38,M,N,Y,1.0,262389.2,5541.0,High skill tech staff,3.0,0.0,50839.39,41311.08,42,733.0,0,0,0
7,CST_151722,Saba,46,F,Y,Y,1.0,241211.39,1448.0,Core staff,3.0,0.0,30008.46,32209.22,91,906.0,0,0,0
8,CST_133768,Ashutosh,40,F,,Y,0.0,210091.43,11551.0,Laborers,2.0,0.0,21521.89,65037.74,14,783.0,0,0,0
9,CST_111670,David Milliken,39,F,Y,Y,2.0,207109.13,2791.0,High skill tech staff,4.0,0.0,9509.1,28425.52,14,666.0,0,0,0


In [167]:
# Peek at the first ten unlabelled rows
test.head(n=10)

Unnamed: 0,customer_id,name,age,gender,owns_car,owns_house,no_of_children,net_yearly_income,no_of_days_employed,occupation_type,total_family_members,migrant_worker,yearly_debt_payments,credit_limit,credit_limit_used(%),credit_score,prev_defaults,default_in_last_6months
0,CST_142525,Siva,52,F,Y,N,0.0,232640.53,998.0,Unknown,2.0,0.0,14406.73,26524.4,4,779.0,0,0
1,CST_129215,Scott,48,F,N,N,1.0,284396.79,1338.0,Unknown,3.0,0.0,57479.99,68998.72,70,806.0,0,0
2,CST_138443,Victoria,50,F,N,N,1.0,149419.28,1210.0,Unknown,3.0,0.0,21611.01,25187.8,71,528.0,2,0
3,CST_123812,John McCrank,30,F,N,N,1.0,160437.54,503.0,Laborers,2.0,1.0,28990.76,29179.39,9,815.0,0,0
4,CST_144450,Martinne,52,M,N,Y,0.0,233480.37,157.0,Laborers,2.0,1.0,54213.72,82331.82,82,613.0,1,1
5,CST_107341,asil Katz,29,F,N,Y,0.0,145295.56,4794.0,Sales staff,2.0,0.0,40487.68,20346.31,71,720.0,0,0
6,CST_147879,Baker,37,F,N,Y,0.0,144028.05,1069.0,Laborers,1.0,0.0,24609.26,31383.4,29,944.0,0,0
7,CST_156027,Saphir,29,M,N,Y,0.0,126638.58,319.0,Unknown,1.0,0.0,40468.8,25486.65,5,939.0,0,0
8,CST_109067,Sinead Cruise,41,F,N,Y,0.0,81034.46,3012.0,Medicine staff,2.0,0.0,19918.25,13458.33,12,763.0,0,0
9,CST_114556,Conor Humphries,41,M,Y,Y,1.0,102395.81,450.0,Unknown,3.0,1.0,26494.71,24354.25,90,897.0,0,0


In [168]:
# (rows, columns) of the raw train and test frames
tuple(frame.shape for frame in (train, test))

((45528, 19), (11383, 18))

In [169]:
# Per-column non-null counts and dtypes of the training frame
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45528 entries, 0 to 45527
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   customer_id              45528 non-null  object 
 1   name                     45528 non-null  object 
 2   age                      45528 non-null  int64  
 3   gender                   45528 non-null  object 
 4   owns_car                 44981 non-null  object 
 5   owns_house               45528 non-null  object 
 6   no_of_children           44754 non-null  float64
 7   net_yearly_income        45528 non-null  float64
 8   no_of_days_employed      45065 non-null  float64
 9   occupation_type          45528 non-null  object 
 10  total_family_members     45445 non-null  float64
 11  migrant_worker           45441 non-null  float64
 12  yearly_debt_payments     45433 non-null  float64
 13  credit_limit             45528 non-null  float64
 14  credit_limit_used(%)  

In [170]:
# Percentage of missing values per training column
train.isnull().sum().div(train.shape[0]).mul(100)

customer_id                0.000000
name                       0.000000
age                        0.000000
gender                     0.000000
owns_car                   1.201458
owns_house                 0.000000
no_of_children             1.700053
net_yearly_income          0.000000
no_of_days_employed        1.016957
occupation_type            0.000000
total_family_members       0.182305
migrant_worker             0.191091
yearly_debt_payments       0.208663
credit_limit               0.000000
credit_limit_used(%)       0.000000
credit_score               0.017572
prev_defaults              0.000000
default_in_last_6months    0.000000
credit_card_default        0.000000
dtype: float64

In [171]:
# Summary statistics of the numeric training columns
train.describe()

Unnamed: 0,age,no_of_children,net_yearly_income,no_of_days_employed,total_family_members,migrant_worker,yearly_debt_payments,credit_limit,credit_limit_used(%),credit_score,prev_defaults,default_in_last_6months,credit_card_default
count,45528.0,44754.0,45528.0,45065.0,45445.0,45441.0,45433.0,45528.0,45528.0,45520.0,45528.0,45528.0,45528.0
mean,38.993411,0.420655,200655.6,67609.289293,2.158081,0.179111,31796.965311,43548.42,52.23502,782.791257,0.06071,0.05054,0.081203
std,9.54399,0.724097,669074.0,139323.524434,0.911572,0.38345,17269.727234,148784.7,29.37691,100.619746,0.264629,0.219059,0.273149
min,23.0,0.0,27170.61,2.0,1.0,0.0,2237.47,4003.14,0.0,500.0,0.0,0.0,0.0
25%,31.0,0.0,126345.8,936.0,2.0,0.0,19231.14,23973.81,27.0,704.0,0.0,0.0,0.0
50%,39.0,0.0,171714.9,2224.0,2.0,0.0,29081.65,35688.04,54.0,786.0,0.0,0.0,0.0
75%,47.0,1.0,240603.8,5817.0,3.0,0.0,40561.15,53435.76,78.0,867.0,0.0,0.0,0.0
max,55.0,9.0,140759000.0,365252.0,10.0,1.0,328112.86,31129970.0,99.0,949.0,2.0,1.0,1.0


In [172]:
# Number of distinct values per training column
train.nunique()

customer_id                45528
name                        4010
age                           33
gender                         3
owns_car                       2
owns_house                     2
no_of_children                10
net_yearly_income          45502
no_of_days_employed         7874
occupation_type               19
total_family_members          10
migrant_worker                 2
yearly_debt_payments       45251
credit_limit               45371
credit_limit_used(%)         100
credit_score                 450
prev_defaults                  3
default_in_last_6months        2
credit_card_default            2
dtype: int64

In [173]:
# Percentage of missing values per test column
test.isnull().sum().div(test.shape[0]).mul(100)

customer_id                0.000000
name                       0.000000
age                        0.000000
gender                     0.000000
owns_car                   1.159624
owns_house                 0.000000
no_of_children             1.669156
net_yearly_income          0.000000
no_of_days_employed        0.922428
occupation_type            0.000000
total_family_members       0.272336
migrant_worker             0.228411
yearly_debt_payments       0.193271
credit_limit               0.000000
credit_limit_used(%)       0.000000
credit_score               0.026355
prev_defaults              0.000000
default_in_last_6months    0.000000
dtype: float64

In [174]:
# Class balance of the target
train.credit_card_default.value_counts()

0    41831
1     3697
Name: credit_card_default, dtype: int64

In [175]:
# Gender levels present in the training data
train.gender.value_counts()

F      29957
M      15570
XNA        1
Name: gender, dtype: int64

In [176]:
# Gender levels present in the test data
test.gender.value_counts()

F      7443
M      3939
XNA       1
Name: gender, dtype: int64

In [177]:
# Share (in percent of all training rows) of each occupation type
train['occupation_type'].value_counts().div(train.shape[0]).mul(100)

Unknown                  31.407046
Laborers                 17.865929
Sales staff              10.378229
Core staff                8.921982
Managers                  6.958355
Drivers                   6.033650
High skill tech staff     3.694430
Accountants               3.237568
Medicine staff            2.800474
Security staff            2.251362
Cooking staff             1.981198
Cleaning staff            1.460640
Private service staff     0.850026
Low-skill Laborers        0.738007
Waiters/barmen staff      0.445879
Secretaries               0.437094
Realty agents             0.221842
HR staff                  0.171323
IT staff                  0.144966
Name: occupation_type, dtype: float64

In [178]:
# Share (in percent of all test rows) of each occupation type
test['occupation_type'].value_counts().div(test.shape[0]).mul(100)

Unknown                  31.151717
Laborers                 18.193798
Sales staff              10.331196
Core staff                8.943161
Managers                  7.080734
Drivers                   6.175876
High skill tech staff     3.557937
Accountants               3.206536
Medicine staff            2.793640
Security staff            2.213828
Cooking staff             1.827286
Cleaning staff            1.440745
Private service staff     0.957568
Low-skill Laborers        0.535887
Waiters/barmen staff      0.412896
Secretaries               0.368971
Realty agents             0.289906
HR staff                  0.272336
IT staff                  0.245981
Name: occupation_type, dtype: float64

In [179]:
# Missing-value imputation.
# Every fill value is learned from the training frame only and then applied to
# both frames, so train and test are imputed with the same statistic and nothing
# is estimated from the test set.
mode_cols = ['owns_car', 'migrant_worker', 'no_of_children', 'total_family_members']
mean_cols = ['no_of_days_employed', 'yearly_debt_payments', 'credit_score']

# Mode imputation
for col in mode_cols:
    fill_value = train[col].mode()[0]  # computed once, before train is modified
    train[col] = train[col].fillna(fill_value)
    test[col] = test[col].fillna(fill_value)

# Mean imputation
for col in mean_cols:
    fill_value = train[col].mean()  # training mean, reused for the test frame
    train[col] = train[col].fillna(fill_value)
    test[col] = test[col].fillna(fill_value)

In [180]:
# Tag every row with its origin so the frames can be separated again after
# they have been stacked for shared preprocessing.
for frame, label in ((train, "train"), (test, "test")):
    frame['indicator'] = label

In [181]:
# Merge train and test data: stack the train features (target column removed)
# on top of the test rows so both go through identical preprocessing.
# pd.concat replaces DataFrame.append, which is deprecated and no longer
# available in current pandas. As with append, each frame keeps its own row
# labels (no ignore_index), which later cells rely on.
train_features = train.loc[:, train.columns != "credit_card_default"]
data = pd.concat([train_features, test], sort=False)

In [182]:
# Labels for the training rows, kept separately from the feature frame
target = train.credit_card_default
target.shape

(45528,)

In [183]:
# Shapes after tagging and stacking: train, test, combined
tuple(frame.shape for frame in (train, test, data))

((45528, 20), (11383, 19), (56911, 19))

### Data Preprocessing

In [184]:
# Columns of the stacked train + test feature frame
data.columns

Index(['customer_id', 'name', 'age', 'gender', 'owns_car', 'owns_house',
       'no_of_children', 'net_yearly_income', 'no_of_days_employed',
       'occupation_type', 'total_family_members', 'migrant_worker',
       'yearly_debt_payments', 'credit_limit', 'credit_limit_used(%)',
       'credit_score', 'prev_defaults', 'default_in_last_6months',
       'indicator'],
      dtype='object')

In [185]:
# Remove the per-customer identifier columns (id and name) from the features
identifier_cols = ['customer_id', 'name']
df = data.drop(columns=identifier_cols)

In [186]:
# Remaining columns once the identifier columns are gone
df.columns

Index(['age', 'gender', 'owns_car', 'owns_house', 'no_of_children',
       'net_yearly_income', 'no_of_days_employed', 'occupation_type',
       'total_family_members', 'migrant_worker', 'yearly_debt_payments',
       'credit_limit', 'credit_limit_used(%)', 'credit_score', 'prev_defaults',
       'default_in_last_6months', 'indicator'],
      dtype='object')

In [187]:
# Expand each categorical feature into one indicator column per level
cat_columns = [
    'gender',
    'owns_car',
    'owns_house',
    'occupation_type',
    'migrant_worker',
]
df = pd.get_dummies(data=df, columns=cat_columns)

In [188]:
# Column list after one-hot encoding
df.columns

Index(['age', 'no_of_children', 'net_yearly_income', 'no_of_days_employed',
       'total_family_members', 'yearly_debt_payments', 'credit_limit',
       'credit_limit_used(%)', 'credit_score', 'prev_defaults',
       'default_in_last_6months', 'indicator', 'gender_F', 'gender_M',
       'gender_XNA', 'owns_car_N', 'owns_car_Y', 'owns_house_N',
       'owns_house_Y', 'occupation_type_Accountants',
       'occupation_type_Cleaning staff', 'occupation_type_Cooking staff',
       'occupation_type_Core staff', 'occupation_type_Drivers',
       'occupation_type_HR staff', 'occupation_type_High skill tech staff',
       'occupation_type_IT staff', 'occupation_type_Laborers',
       'occupation_type_Low-skill Laborers', 'occupation_type_Managers',
       'occupation_type_Medicine staff',
       'occupation_type_Private service staff',
       'occupation_type_Realty agents', 'occupation_type_Sales staff',
       'occupation_type_Secretaries', 'occupation_type_Security staff',
       'occupatio

In [189]:
# Undo the stacking: recover each split through the marker column, then
# discard the marker so only model features remain.
train = df.loc[df['indicator'] == "train"].drop(columns=['indicator'])
test = df.loc[df['indicator'] == "test"].drop(columns=['indicator'])

In [190]:
# Both splits must end up with the same number of feature columns
tuple(frame.shape for frame in (train, test))

((45528, 39), (11383, 39))

### Model Building

In [194]:
## Test Train Split
# Hold out 20% of the labelled rows; despite its name, `test_df` is this
# held-out validation part, not the competition test set.
split_parts = train_test_split(train, target, train_size=0.80, random_state=0)
train_df, test_df, target_train, target_val = split_parts


In [195]:
# Shapes of the four pieces produced by the split
tuple(part.shape for part in (train_df, test_df, target_train, target_val))

((36422, 39), (9106, 39), (36422,), (9106,))

In [196]:
# Logistic Regression baseline, library-default hyper-parameters
lr = LogisticRegression()
lr.fit(train_df, target_train)
# The message previously read "Logistic Forest"; this cell fits logistic regression.
print("Fitting of Logistic Regression finished")

Fitting of Logistic Forest finished


In [197]:
# Class predictions for the held-out validation rows
lr_predictions = lr.predict(X=test_df)
print("Predictions finished")

Predictions finished


In [198]:
from sklearn.metrics import (accuracy_score, f1_score, log_loss, classification_report)

# Validation metrics for the logistic-regression predictions
lr_f1 = f1_score(target_val, lr_predictions)
lr_accuracy = accuracy_score(target_val, lr_predictions)
lr_report = classification_report(target_val, lr_predictions)

print("f1 score: {}".format(lr_f1))
print("Accuracy: {}".format(lr_accuracy))
print("="*80)
print(lr_report)

f1 score: 0.7065779748706577
Accuracy: 0.9564023720623764
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      8372
           1       0.77      0.65      0.71       734

    accuracy                           0.96      9106
   macro avg       0.87      0.82      0.84      9106
weighted avg       0.95      0.96      0.95      9106



In [199]:
# Random Forest
seed = 123   # fixed random seed for reproducibility
# Random Forest hyper-parameters, unpacked into RandomForestClassifier
rf_params = {
    'n_jobs': -1,
    'n_estimators': 1000,
#     'warm_start': True, 
    'max_depth': 4,
    'min_samples_leaf': 2,
    # 'max_features' was previously listed twice (0.3, then 'sqrt'). A dict
    # literal keeps only the last value, so 'sqrt' was always the effective
    # setting; the dead 0.3 entry is removed.
    'max_features': 'sqrt',
    'random_state': seed,
    'verbose': 0
}

In [201]:
# Build the forest from the parameter dict and train it on the training split
rf = RandomForestClassifier(**rf_params)
rf.fit(X=train_df, y=target_train)
print("Fitting of Random Forest finished")

Fitting of Random Forest finished


In [202]:
# Class predictions for the held-out validation rows
rf_predictions = rf.predict(X=test_df)
print("Predictions finished")

Predictions finished


In [203]:
# Validation metrics for the random-forest predictions
rf_f1 = f1_score(target_val, rf_predictions)
rf_accuracy = accuracy_score(target_val, rf_predictions)
rf_report = classification_report(target_val, rf_predictions)

print("f1 score: {}".format(rf_f1))
print("Accuracy: {}".format(rf_accuracy))
print("="*80)
print(rf_report)

f1 score: 0.8707692307692307
Accuracy: 0.9815506259609049
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      8372
           1       1.00      0.77      0.87       734

    accuracy                           0.98      9106
   macro avg       0.99      0.89      0.93      9106
weighted avg       0.98      0.98      0.98      9106



In [204]:
# Gradient Boosting hyper-parameters, unpacked into GradientBoostingClassifier
gb_params = {
    'n_estimators': 1500,
    'learning_rate': 0.25,
    'max_depth': 4,
    'min_samples_leaf': 2,
    'subsample': 1,
    # 'max_features' was previously listed twice (0.9, then 'sqrt'). A dict
    # literal keeps only the last value, so 'sqrt' was always the effective
    # setting; the dead 0.9 entry is removed.
    'max_features': 'sqrt',
    'random_state': seed,
    'verbose': 0
}

In [206]:
# Train the gradient-boosting model (fit returns the estimator itself)
gb = GradientBoostingClassifier(**gb_params).fit(train_df, target_train)
# Predict the held-out validation rows
gb_predictions = gb.predict(X=test_df)
print("Predictions have finished")

Predictions have finished


In [207]:
# Validation metrics for the gradient-boosting predictions
gb_f1 = f1_score(target_val, gb_predictions)
gb_accuracy = accuracy_score(target_val, gb_predictions)
gb_report = classification_report(target_val, gb_predictions)

print("f1 score: {}".format(gb_f1))
print("Accuracy: {}".format(gb_accuracy))
print("="*80)
print(gb_report)

f1 score: 0.8608321377331419
Accuracy: 0.9786953656929497
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      8372
           1       0.91      0.82      0.86       734

    accuracy                           0.98      9106
   macro avg       0.95      0.91      0.92      9106
weighted avg       0.98      0.98      0.98      9106



In [208]:
# LightGBM settings collected in one mapping.
# NOTE(review): the LightGBM cell visible below builds LGBMClassifier() without
# passing this mapping - confirm whether it is meant to be used.
lgb_params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric={'auc'},
    num_leaves=96,
    learning_rate=0.01,
    feature_fraction=0.5,
    bagging_fraction=0.5,
    bagging_freq=1,
    verbose=1,
    min_data_in_leaf=1,
    max_bin=255,
    lambda_l1=0.00002,
    lambda_l2=0.00001,
    min_gain_to_split=0.001,
)

In [210]:
import lightgbm as lgb
seed = 123
# From the next line on, `lgb` names the classifier rather than the lightgbm
# module; the import at the top of this cell is what keeps the cell re-runnable.
lgb = lgb.LGBMClassifier()
lgb.fit(X=train_df, y=target_train)
# Predict the held-out validation rows
lgb_predictions = lgb.predict(X=test_df)
print("Predictions have finished")

Predictions have finished


In [211]:
# Validation metrics for the LightGBM predictions
lgb_f1 = f1_score(target_val, lgb_predictions)
lgb_accuracy = accuracy_score(target_val, lgb_predictions)
lgb_report = classification_report(target_val, lgb_predictions)

print("f1 score: {}".format(lgb_f1))
print("Accuracy: {}".format(lgb_accuracy))
print("="*80)
print(lgb_report)

f1 score: 0.8712288447387786
Accuracy: 0.9807819020426093
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      8372
           1       0.95      0.81      0.87       734

    accuracy                           0.98      9106
   macro avg       0.97      0.90      0.93      9106
weighted avg       0.98      0.98      0.98      9106



In [213]:
## Cat Boosting
seed = 123
# Constructor settings gathered first, then unpacked into the classifier
catboost_options = dict(
    iterations=7000,
    learning_rate=0.03,
    verbose=500,
    eval_metric='AUC',
)
cb = CatBoostClassifier(**catboost_options)
cb.fit(train_df, target_train)
# Predict the held-out validation rows
cb_predictions = cb.predict(test_df)
print("Predictions have finished")

0:	total: 37ms	remaining: 4m 18s
500:	total: 4.54s	remaining: 58.8s
1000:	total: 8.71s	remaining: 52.2s
1500:	total: 13.1s	remaining: 47.9s
2000:	total: 17.3s	remaining: 43.1s
2500:	total: 21.5s	remaining: 38.7s
3000:	total: 25.7s	remaining: 34.3s
3500:	total: 30s	remaining: 29.9s
4000:	total: 34.1s	remaining: 25.6s
4500:	total: 38.4s	remaining: 21.3s
5000:	total: 42.5s	remaining: 17s
5500:	total: 46.5s	remaining: 12.7s
6000:	total: 50.5s	remaining: 8.4s
6500:	total: 54.5s	remaining: 4.18s
6999:	total: 58.1s	remaining: 0us
Predictions have finished


In [214]:
# Validation metrics for the CatBoost predictions
cb_f1 = f1_score(target_val, cb_predictions)
cb_accuracy = accuracy_score(target_val, cb_predictions)
cb_report = classification_report(target_val, cb_predictions)

print("f1 score: {}".format(cb_f1))
print("Accuracy: {}".format(cb_accuracy))
print("="*80)
print(cb_report)

f1 score: 0.8571428571428571
Accuracy: 0.9782560948824951
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      8372
           1       0.91      0.81      0.86       734

    accuracy                           0.98      9106
   macro avg       0.95      0.90      0.92      9106
weighted avg       0.98      0.98      0.98      9106



In [215]:
## Xg Boosting
seed = 123
# `xgb` is rebound here from the xgboost module alias to the estimator;
# later cells use it as the fitted model.
xgb = XGBClassifier(n_estimators=100)
xgb.fit(X=train_df, y=target_train)
# Predict the held-out validation rows
xgb_predictions = xgb.predict(test_df)
print("Predictions have finished")



Predictions have finished


In [216]:
# Validation metrics for the XGBoost predictions
xgb_f1 = f1_score(target_val, xgb_predictions)
xgb_accuracy = accuracy_score(target_val, xgb_predictions)
xgb_report = classification_report(target_val, xgb_predictions)

print("f1 score: {}".format(xgb_f1))
print("Accuracy: {}".format(xgb_accuracy))
print("="*80)
print(xgb_report)

f1 score: 0.8581871345029239
Accuracy: 0.9786953656929497
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      8372
           1       0.93      0.80      0.86       734

    accuracy                           0.98      9106
   macro avg       0.95      0.90      0.92      9106
weighted avg       0.98      0.98      0.98      9106



In [219]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated macro-F1 on the training split, one estimator at a
# time. CatBoost is left out of this comparison.
cv_options = {'cv': 10, 'scoring': "f1_macro"}

rf_scores = cross_val_score(rf, train_df, target_train, **cv_options)
print("Mean rf:", rf_scores.mean(),'\n')

lgb_scores = cross_val_score(lgb, train_df, target_train, **cv_options)
print("Mean lgb:", lgb_scores.mean(),'\n')

xgb_scores = cross_val_score(xgb, train_df, target_train, **cv_options)
print("Mean xgb:", xgb_scores.mean(),'\n')

lr_scores = cross_val_score(lr, train_df, target_train, **cv_options)
print("Mean lr:", lr_scores.mean(),'\n')

Mean rf: 0.9249599779091622 

Mean lgb: 0.9208697570362065 

0:	total: 30.6ms	remaining: 3m 34s
500:	total: 4.17s	remaining: 54.1s
1000:	total: 8.32s	remaining: 49.8s
1500:	total: 12.4s	remaining: 45.5s
2000:	total: 16.6s	remaining: 41.4s
2500:	total: 20.5s	remaining: 36.9s
3000:	total: 24.6s	remaining: 32.8s
3500:	total: 28.9s	remaining: 28.9s
4000:	total: 32.9s	remaining: 24.7s
4500:	total: 36.9s	remaining: 20.5s
5000:	total: 40.7s	remaining: 16.3s
5500:	total: 44.4s	remaining: 12.1s
6000:	total: 48.1s	remaining: 8s
6500:	total: 51.6s	remaining: 3.96s
6999:	total: 55.4s	remaining: 0us
0:	total: 31.8ms	remaining: 3m 42s
500:	total: 4.47s	remaining: 58s


KeyboardInterrupt: 

In [220]:

# Feature importances reported by the fitted XGBoost model, largest first,
# indexed by feature name.
importances = (
    pd.DataFrame({
        'Feature': train.columns,
        'Importance': xgb.feature_importances_,
    })
    .sort_values(by='Importance', ascending=False)
    .set_index('Feature')
)
importances

Unnamed: 0_level_0,Importance
Feature,Unnamed: 1_level_1
credit_score,0.59263
credit_limit_used(%),0.08935
occupation_type_Cooking staff,0.018641
occupation_type_Drivers,0.016432
gender_F,0.015931
total_family_members,0.015306
gender_M,0.015204
prev_defaults,0.015156
occupation_type_Security staff,0.014719
migrant_worker_0.0,0.013891


In [227]:
# Score the competition test set with the random forest.
# Select the columns the model was fitted on explicitly: this cell adds the
# prediction column to `test`, so passing the whole frame would fail on a second
# run (one column more than the model was trained with).
feature_cols = list(train_df.columns)
test['credit_card_default'] = rf.predict(test[feature_cols])

In [228]:
# Distribution of the predicted classes on the test set
test.credit_card_default.value_counts()

0    10731
1      652
Name: credit_card_default, dtype: int64

In [249]:
# Build the submission from a fresh read of the test file.
submission_df = pd.read_csv("dataset/test.csv")
# Assign by position rather than by index label. Label alignment silently
# inserts NaN wherever the two indexes differ, whereas a positional assignment
# raises on a length mismatch. Assumes `test` is still in file order - the
# preprocessing cells visible in this notebook do not reorder rows.
submission_df['credit_card_default'] = test['credit_card_default'].to_numpy()

In [250]:
# Keep only the customer id and its predicted label
submission_df = submission_df.loc[:, ['customer_id', 'credit_card_default']]

In [256]:
# Write the submission file without the row index
submission_df.to_csv(path_or_buf='submission_rf_baseline.csv', index=False)