# **Credit Card Risk Assessment**
In this project, we will apply a Machine Learning algorithm to predict whether an individual holding a credit card will default or not.

### Loading the necessary libraries

In [2]:
from datetime import datetime

import numpy as np
import pandas as pd

### Loading the data

In [3]:
# Read the dataset from the CSV file and preview the first rows
df = pd.read_csv("Credit_default_dataset.csv")
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [4]:
# Let's check the shape of the data (rows, columns)
df.shape

(30000, 25)

### Feature Engineering

In [5]:
# As the ID feature is just a unique identifier of each record, we will drop it.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises a KeyError.
df = df.drop(columns = ["ID"], errors = "ignore")

In [6]:
# PAY_0 is followed by PAY_2 ... PAY_6, so call it PAY_1 to keep the numbering consistent
df = df.rename(columns = {"PAY_0" : "PAY_1"})

In [7]:
# Let's check our data after dropping ID and renaming PAY_0
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,120000.0,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,90000.0,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,50000.0,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,50000.0,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [34]:
# Let's count the records per target class to see whether the dataset is balanced or not
df["default.payment.next.month"].value_counts()

0    23364
1     6636
Name: default.payment.next.month, dtype: int64

The dataset is imbalanced: only about 22% of the records (6,636 out of 30,000) are defaulters. Accuracy alone can therefore be misleading, and a metric such as ROC AUC should be considered as well.

In [8]:
# Let's see the unique values of the feature EDUCATION and how often each occurs
df["EDUCATION"].value_counts()

2    14030
1    10585
3     4917
5      280
4      123
6       51
0       14
Name: EDUCATION, dtype: int64

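Here, we consider that 1 refers to school, 2 refers to high school, 3 refers to college and 4 refers to university. Classes 0, 5 and 6 lack a description, so we will assume that they are from university and hence belong to class 4. (Note: the dataset's published data dictionary appears to label these codes differently — 1 = graduate school, 2 = university, 3 = high school, 4 = others — so the labels above should be verified against it; the grouping of 0, 5 and 6 into class 4 is unaffected.)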

In [9]:
# Collapse the undocumented codes 0, 5 and 6 into class 4.
# Unlike an exhaustive .map() dictionary, .replace() leaves every other code
# untouched instead of silently turning an unlisted value into NaN.
df["EDUCATION"] = df["EDUCATION"].replace({0: 4, 5: 4, 6: 4})

In [10]:
# Let's see the unique values of the feature MARRIAGE and how often each occurs
df["MARRIAGE"].value_counts()

2    15964
1    13659
3      323
0       54
Name: MARRIAGE, dtype: int64

We can see that very few records belong to classes 0 and 3 compared to classes 1 and 2. So we will map class 0 to class 3.

In [12]:
# Merge the rare code 0 into class 3.
# .replace() only touches code 0, so any other value is kept as-is rather than
# becoming NaN the way an unlisted key would with .map().
df["MARRIAGE"] = df["MARRIAGE"].replace({0: 3})

In [13]:
# Split the frame into the feature matrix (X) and the target vector (y)

y = df.loc[:, "default.payment.next.month"]
X = df.drop(columns = ["default.payment.next.month"])

### Model Building and Hyperparameter Optimization

In [15]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost

In [16]:
# Candidate values for each hyperparameter of the search space

params = dict(
    learning_rate=[0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    max_depth=[3, 4, 5, 6, 8, 10, 12, 15],
    min_child_weight=[1, 3, 5, 7],
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4],
    colsample_bytree=[0.3, 0.4, 0.5, 0.7],
)

In [22]:
# Let's define a small stopwatch helper to measure the model training time

def timer(start_time = None):
  """Start a stopwatch, or report the time elapsed since it was started.

  Parameters
  ----------
  start_time : datetime, optional
      When omitted (None), the stopwatch is started. Otherwise it must be the
      value returned by an earlier ``timer()`` call.

  Returns
  -------
  datetime or None
      The current time when called without ``start_time``; ``None`` after
      printing the elapsed time when ``start_time`` is given.
  """
  if start_time is None:
    return datetime.now()

  elapsed_seconds = (datetime.now() - start_time).total_seconds()
  thour, temp_sec = divmod(elapsed_seconds, 3600)
  tmin, tsec = divmod(temp_sec, 60)
  # %i truncates each value to a whole number, so no extra rounding is needed
  print("\n Time taken: %i hours %i minutes and %i seconds." %(thour, tmin, tsec))

In [23]:
# Let's initialize the XGBoost classifier with its default hyperparameters
classifier = xgboost.XGBClassifier()

In [24]:
# Let's initialize the randomized search: 5 sampled candidates, 5-fold CV, scored by ROC AUC.
# random_state is fixed so the same candidates are sampled on every run and
# the reported best parameters are reproducible.
random_search = RandomizedSearchCV(
    classifier,
    param_distributions = params,
    n_iter = 5,
    scoring = "roc_auc",
    n_jobs = -1,
    cv = 5,
    verbose = 3,
    random_state = 42,
)

In [25]:
# Model training: run the randomized search on the full data (X, y) and time it
from datetime import datetime

start_time = timer(None) # no start time is passed, so timer() returns the current time
random_search.fit(X,y)
timer(start_time)  # prints the time elapsed since start_time

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  1.6min finished



 Time taken: 0 hours 1 minutes and 39 seconds.


In [26]:
# Let's see the best estimator found by the search
random_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.4, gamma=0.0,
              learning_rate=0.3, max_delta_step=0, max_depth=3,
              min_child_weight=3, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [28]:
# Let's see the best hyperparameter combination found by the search
random_search.best_params_

{'colsample_bytree': 0.4,
 'gamma': 0.0,
 'learning_rate': 0.3,
 'max_depth': 3,
 'min_child_weight': 3}

In [30]:
# Take the best estimator straight from the randomized search instead of
# hand-copying its printed configuration, so this cell always matches the
# search result even when the search is re-run.
# (cross_val_score clones the estimator it is given, so the fit performed by
# the search is not reused during cross-validation.)
classifier = random_search.best_estimator_

In [31]:
# Let's evaluate the classifier with 10-fold cross-validation.
# The metric is spelled out: accuracy is what a classifier's default scorer reports.
from sklearn.model_selection import cross_val_score
score = cross_val_score(estimator = classifier, X = X, y = y, scoring = "accuracy", cv = 10)

In [32]:
# Let's see the score of each of the 10 CV folds
score

array([0.80633333, 0.80733333, 0.81533333, 0.80666667, 0.818     ,
       0.82533333, 0.836     , 0.832     , 0.82933333, 0.825     ])

In [33]:
# Let's calculate the mean of the fold scores
score.mean()

0.8201333333333333

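The model achieved a mean 10-fold cross-validation accuracy of about 82%. Note that always predicting the majority class (non-defaulter) would already reach about 78% accuracy on this dataset, so this is a modest improvement over that baseline.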