In [1]:
# Import Libraries

import pandas as pd
import pickle

# Import Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Import Model Evaluation and Utilities
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer, recall_score, classification_report

In [2]:
# Fetch Data
raw_data = pd.read_csv('../data/processed/raw_data.csv')
std_data = pd.read_csv('../data/processed/standardized_data.csv')

# Convert user-profile to numbers
raw_y = raw_data['user_profile'].map({'Payer': 0, 'Defaulter': 1})
std_y = std_data['user_profile'].map({'Payer': 0, 'Defaulter': 1})

# Drop user_profile from the data
raw_X = raw_data.drop('user_profile', axis=1)
std_X = std_data.drop('user_profile', axis=1)

In [3]:
# Train-test split
raw_X_train, raw_X_test, raw_y_train, raw_y_test = train_test_split(raw_X, raw_y, test_size=0.2, stratify= raw_y, random_state=42)
std_X_train, std_X_test, std_y_train, std_y_test = train_test_split(std_X, std_y, test_size=0.2, stratify= std_y, random_state=42)

In [4]:
print("Train Data Shape: ", raw_X_train.shape)
print("Test Data Shape: ", raw_X_test.shape)

Train Data Shape:  (1808511, 18)
Test Data Shape:  (452128, 18)


# Logistic Regression

In [51]:
X_train, y_train, X_test, y_test = std_X_train, std_y_train, std_X_test, std_y_test

In [52]:
# Model Training
clf = LogisticRegression()
clf.fit(X_train, y_train)

In [53]:
# Save the model
pickle.dump(clf, open('../models/logistic_regression.pkl', 'wb'))

In [41]:
# Model Evaluation
y_pred = clf.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, y_pred)*100)
print('Classification Report: \n ', classification_report(y_test, y_pred))

Accuracy:  87.99764668412486
Classification Report: 
                precision    recall  f1-score   support

           0       0.88      1.00      0.94    391014
           1       0.96      0.12      0.21     61114

    accuracy                           0.88    452128
   macro avg       0.92      0.56      0.57    452128
weighted avg       0.89      0.88      0.84    452128



### Observations:

- Although, the accuracy and precision/recall of payers seems quite high; but as we decided, the most crucial for us is the **Recall of Defaulters**
- From this model, the recall of defaulters = **12%**

# Naive Bayes

In [54]:
X_train, y_train, X_test, y_test = raw_X_train, raw_y_train, raw_X_test, raw_y_test

In [55]:
# Model Training
clf = GaussianNB()
clf.fit(X_train, y_train)

In [56]:
# Save the model
pickle.dump(clf, open('../models/naive_bayes.pkl', 'wb'))

In [44]:
# Model Evaluation
y_pred = clf.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, y_pred)*100)
print('Classification Report: \n ', classification_report(y_test, y_pred))

Accuracy:  82.63788130794819
Classification Report: 
                precision    recall  f1-score   support

           0       0.89      0.91      0.90    391014
           1       0.34      0.31      0.32     61114

    accuracy                           0.83    452128
   macro avg       0.62      0.61      0.61    452128
weighted avg       0.82      0.83      0.82    452128



### Observations:

- Although, the accuracy and precision/recall of payers seems have reduced in comparison to Logistic Regression
- The performance of this model is much better in detecting **Defaulter** with recall of Defaulters = **31%**

# Decission Trees

In [57]:
X_train, y_train, X_test, y_test = std_X_train, std_y_train, std_X_test, std_y_test

In [47]:
# Grid Search for Hyperparameter Tuning
param_grid = {
    'criterion': ['gini'],
    'max_depth': [5, 10, 20, 30, 50, 100],
    'min_samples_leaf': [2, 4, 8],
}

clf = DecisionTreeClassifier(random_state=42)

# Grid search based on recall of `Defaulter`
# # Custom scorer to optimize recall for class 1
recall_scorer = make_scorer(recall_score, average='macro')
grid_search = GridSearchCV(clf, param_grid, scoring=recall_scorer, cv=5, n_jobs=-1, verbose=10)
grid_search.fit(X_train, y_train)

# Print best parameters
print(f"Best Parameters: {grid_search.best_params_}")

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 4/5; 1/18] START criterion=gini, max_depth=5, min_samples_leaf=2............
[CV 3/5; 1/18] START criterion=gini, max_depth=5, min_samples_leaf=2............[CV 4/5; 2/18] START criterion=gini, max_depth=5, min_samples_leaf=4............

[CV 3/5; 2/18] START criterion=gini, max_depth=5, min_samples_leaf=4............
[CV 2/5; 1/18] START criterion=gini, max_depth=5, min_samples_leaf=2............
[CV 1/5; 2/18] START criterion=gini, max_depth=5, min_samples_leaf=4............
[CV 2/5; 2/18] START criterion=gini, max_depth=5, min_samples_leaf=4............
[CV 1/5; 1/18] START criterion=gini, max_depth=5, min_samples_leaf=2............
[CV 5/5; 2/18] START criterion=gini, max_depth=5, min_samples_leaf=4............
[CV 5/5; 1/18] START criterion=gini, max_depth=5, min_samples_leaf=2............
[CV 2/5; 1/18] END criterion=gini, max_depth=5, min_samples_leaf=2;, score=0.555 total time=   7.4s
[CV 4/5; 1/18] END criterion=

In [58]:
# Model Training
clf = DecisionTreeClassifier(**grid_search.best_params_, random_state=42)
clf.fit(X_train, y_train)

In [59]:
# Save the model
pickle.dump(clf, open('../models/decision_trees.pkl', 'wb'))

In [49]:
# Model Evaluation
y_pred = clf.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, y_pred)*100)
print('Classification Report: \n ', classification_report(y_test, y_pred))

Accuracy:  83.03533512633591
Classification Report: 
                precision    recall  f1-score   support

           0       0.89      0.92      0.90    391014
           1       0.33      0.25      0.29     61114

    accuracy                           0.83    452128
   macro avg       0.61      0.59      0.60    452128
weighted avg       0.81      0.83      0.82    452128



### Observations:

- Although, the accuracy and precision/recall of payers seems have reduced in comparison to Logistic Regression
- The performance of this model is not better in detecting **Defaulter** as compared to **Naive Bayes** with recall of Defaulters = **25%**


# KNN

In [14]:
X_train, y_train, X_test, y_test = std_X_train, std_y_train, std_X_test, std_y_test

In [7]:
# Model Training
clf = KNeighborsClassifier()

# Grid Search for Hyperparameter Tuning
param_grid = {
    'n_neighbors': [50, 100, 400, 700, 1000]
}

recall_scorer = make_scorer(recall_score, average='macro')
grid_search = GridSearchCV(clf, param_grid, scoring=recall_scorer, cv=5, n_jobs=-1, verbose=10)
grid_search.fit(X_train, y_train)

# Print best parameters
print(f"Best Parameters: {grid_search.best_params_}")

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 3/5; 2/5] START n_neighbors=100.............................................
[CV 3/5; 1/5] START n_neighbors=50..............................................
[CV 1/5; 2/5] START n_neighbors=100.............................................
[CV 1/5; 1/5] START n_neighbors=50..............................................[CV 4/5; 2/5] START n_neighbors=100.............................................

[CV 4/5; 1/5] START n_neighbors=50..............................................
[CV 5/5; 1/5] START n_neighbors=50..............................................
[CV 2/5; 2/5] START n_neighbors=100.............................................
[CV 2/5; 1/5] START n_neighbors=50..............................................
[CV 5/5; 2/5] START n_neighbors=100.............................................
[CV 5/5; 1/5] END ...............n_neighbors=50;, score=0.558 total time=75.2min
[CV 1/5; 3/5] START n_neighbors=400..............

In [None]:
# Model Training
clf = KNeighborsClassifier(**grid_search.best_params_)
clf.fit(X_train, y_train)

In [16]:
# Save the model
pickle.dump(clf, open('../models/knn.pkl', 'wb'))

In [17]:
# Model Evaluation
y_pred = clf.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, y_pred)*100)
print('Classification Report: \n ', classification_report(y_test, y_pred))

Accuracy:  87.99609844999647
Classification Report: 
                precision    recall  f1-score   support

           0       0.88      1.00      0.94    391014
           1       0.93      0.12      0.21     61114

    accuracy                           0.88    452128
   macro avg       0.91      0.56      0.57    452128
weighted avg       0.89      0.88      0.84    452128



### Observations:

- Although, the accuracy and precision/recall of payers seems quite high; but as we decided, the most crucial for us is the **Recall of Defaulters**
- From this model, the recall of defaulters = **12%**


# Random Forest

In [5]:
X_train, y_train, X_test, y_test = std_X_train, std_y_train, std_X_test, std_y_test

In [6]:
# Model Training
clf = RandomForestClassifier(random_state=42)

# Grid Search for Hyperparameter Tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}
recall_scorer = make_scorer(recall_score, average='macro')
grid_search = GridSearchCV(clf, param_grid, scoring=recall_scorer, cv=5, n_jobs=4, verbose=10)
grid_search.fit(X_train, y_train)

# Print best parameters
print(f"Best Parameters: {grid_search.best_params_}")

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5; 1/36] START max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50
[CV 2/5; 1/36] START max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50
[CV 3/5; 1/36] START max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50
[CV 4/5; 1/36] START max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50
[CV 3/5; 1/36] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.567 total time= 1.9min
[CV 5/5; 1/36] START max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50
[CV 2/5; 1/36] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.564 total time= 1.9min
[CV 1/5; 2/36] START max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100
[CV 1/5; 1/36] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.566 total time= 1.9mi



[CV 3/5; 4/36] START max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50
[CV 2/5; 4/36] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50;, score=0.563 total time= 1.8min
[CV 4/5; 4/36] START max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50
[CV 3/5; 4/36] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50;, score=0.567 total time= 1.8min
[CV 5/5; 4/36] START max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50
[CV 4/5; 3/36] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.564 total time= 7.6min
[CV 1/5; 5/36] START max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100
[CV 4/5; 4/36] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50;, score=0.564 total time= 1.9min
[CV 2/5; 5/36] START max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100
[CV 5/5; 4/36] END max_dept

In [7]:
# Model Training
clf = RandomForestClassifier(**grid_search.best_params_, random_state=42)
clf.fit(X_train, y_train)

In [8]:
# Save the model
pickle.dump(clf, open('../models/random_forest.pkl', 'wb'))

In [9]:
# Model Evaluation
y_pred = clf.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, y_pred)*100)
print('Classification Report: \n ', classification_report(y_test, y_pred))

Accuracy:  87.77226803029231
Classification Report: 
                precision    recall  f1-score   support

           0       0.88      0.99      0.93    391014
           1       0.76      0.14      0.23     61114

    accuracy                           0.88    452128
   macro avg       0.82      0.57      0.58    452128
weighted avg       0.86      0.88      0.84    452128



### Observations:

- Although, the accuracy and precision/recall of payers seems quite high; but as we decided, the most crucial for us is the **Recall of Defaulters**
- From this model, the recall of defaulters = **14%**
- Model overall seems to be performing bad in comparison with KNN and Logistic Regression


# XGBoost

In [10]:
X_train, y_train, X_test, y_test = std_X_train, std_y_train, std_X_test, std_y_test

In [15]:
# Model Training
clf = XGBClassifier(random_state=42)

# Grid Search for Hyperparameter Tuning
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [3, 5, None],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}
recall_scorer = make_scorer(recall_score, average='macro')
grid_search = GridSearchCV(clf, param_grid, scoring=recall_scorer, cv=5, n_jobs=4, verbose=10)
grid_search.fit(X_train, y_train)

# Print best parameters
print(f"Best Parameters: {grid_search.best_params_}")

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV 4/5; 1/108] START colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=10, subsample=0.8
[CV 1/5; 1/108] START colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=10, subsample=0.8
[CV 3/5; 1/108] START colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=10, subsample=0.8
[CV 2/5; 1/108] START colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=10, subsample=0.8
[CV 4/5; 1/108] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=10, subsample=0.8;, score=0.500 total time=   1.9s
[CV 3/5; 1/108] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=10, subsample=0.8;, score=0.500 total time=   1.9s
[CV 2/5; 1/108] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=10, subsample=0.8;, score=0.500 total time=   1.9s
[CV 1/5; 1/108] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=



[CV 1/5; 3/108] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8;, score=0.556 total time=   3.2s
[CV 2/5; 3/108] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8;, score=0.554 total time=   3.2s
[CV 3/5; 3/108] START colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8
[CV 4/5; 3/108] START colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8
[CV 1/5; 4/108] START colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=1.0[CV 5/5; 3/108] START colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8

[CV 4/5; 3/108] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8;, score=0.556 total time=   3.1s
[CV 3/5; 3/108] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=50, subsample=0.8;, score=0.557 total time=   3.1s
[CV 2/5; 4/1

In [16]:
# Model Training
clf = XGBClassifier(**grid_search.best_params_, random_state=42)
clf.fit(X_train, y_train)

In [17]:
# Save the model
pickle.dump(clf, open('../models/xgb.pkl', 'wb'))

In [18]:
# Model Evaluation
y_pred = clf.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, y_pred)*100)
print('Classification Report: \n ', classification_report(y_test, y_pred))

Accuracy:  88.04674782362517
Classification Report: 
                precision    recall  f1-score   support

           0       0.88      1.00      0.94    391014
           1       0.96      0.12      0.22     61114

    accuracy                           0.88    452128
   macro avg       0.92      0.56      0.58    452128
weighted avg       0.89      0.88      0.84    452128



### Observations:

- Although, the accuracy and precision/recall of payers seems quite high; but as we decided, the most crucial for us is the **Recall of Defaulters**
- From this model, the recall of defaulters = **12%**
- Among all the models with high accuracy and low recall of defaulters, XGBoost is performing better.