In [50]:
# Import Libraries

import pandas as pd
import pickle

# Import Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier (pip install xgboost)

# Import Model Evaluation and Utilities
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer, recall_score, classification_report

In [37]:
# Fetch Data
raw_data = pd.read_csv('../data/processed/raw_data.csv')
std_data = pd.read_csv('../data/processed/standardized_data.csv')

# Convert user-profile to numbers
raw_y = raw_data['user_profile'].map({'Payer': 0, 'Defaulter': 1})
std_y = std_data['user_profile'].map({'Payer': 0, 'Defaulter': 1})

# Drop user_profile from the data
raw_X = raw_data.drop('user_profile', axis=1)
std_X = std_data.drop('user_profile', axis=1)

In [38]:
# Train-test split
raw_X_train, raw_X_test, raw_y_train, raw_y_test = train_test_split(raw_X, raw_y, test_size=0.2, stratify= raw_y, random_state=42)
std_X_train, std_X_test, std_y_train, std_y_test = train_test_split(std_X, std_y, test_size=0.2, stratify= std_y, random_state=42)

# Logistic Regression

In [51]:
X_train, y_train, X_test, y_test = std_X_train, std_y_train, std_X_test, std_y_test

In [52]:
# Model Training
clf = LogisticRegression()
clf.fit(X_train, y_train)

In [53]:
# Save the model
pickle.dump(clf, open('../models/logistic_regression.pkl', 'wb'))

In [41]:
# Model Evaluation
y_pred = clf.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, y_pred)*100)
print('Classification Report: \n ', classification_report(y_test, y_pred))

Accuracy:  87.99764668412486
Classification Report: 
                precision    recall  f1-score   support

           0       0.88      1.00      0.94    391014
           1       0.96      0.12      0.21     61114

    accuracy                           0.88    452128
   macro avg       0.92      0.56      0.57    452128
weighted avg       0.89      0.88      0.84    452128



### Observations:

- Although, the accuracy and precision/recall of payers seems quite high; but as we decided, the most crucial for us is the **Recall of Defaulters**
- From this model, the recall of defaulters = **12%**

# Naive Bayes

In [54]:
X_train, y_train, X_test, y_test = raw_X_train, raw_y_train, raw_X_test, raw_y_test

In [55]:
# Model Training
clf = GaussianNB()
clf.fit(X_train, y_train)

In [56]:
# Save the model
pickle.dump(clf, open('../models/naive_bayes.pkl', 'wb'))

In [44]:
# Model Evaluation
y_pred = clf.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, y_pred)*100)
print('Classification Report: \n ', classification_report(y_test, y_pred))

Accuracy:  82.63788130794819
Classification Report: 
                precision    recall  f1-score   support

           0       0.89      0.91      0.90    391014
           1       0.34      0.31      0.32     61114

    accuracy                           0.83    452128
   macro avg       0.62      0.61      0.61    452128
weighted avg       0.82      0.83      0.82    452128



### Observations:

- Although, the accuracy and precision/recall of payers seems have reduced in comparison to Logistic Regression
- The performance of this model is much better in detecting **Defaulter** with recall of Defaulters = **31%**

# Decission Trees

In [57]:
X_train, y_train, X_test, y_test = std_X_train, std_y_train, std_X_test, std_y_test

In [47]:
# Grid Search for Hyperparameter Tuning
param_grid = {
    'criterion': ['gini'],
    'max_depth': [5, 10, 20, 30, 50, 100],
    'min_samples_leaf': [2, 4, 8],
}

clf = DecisionTreeClassifier(random_state=42)

# Grid search based on recall of `Defaulter`
# # Custom scorer to optimize recall for class 1
recall_scorer = make_scorer(recall_score, average='macro')
grid_search = GridSearchCV(clf, param_grid, scoring=recall_scorer, cv=5, n_jobs=-1, verbose=10)
grid_search.fit(X_train, y_train)

# Print best parameters
print(f"Best Parameters: {grid_search.best_params_}")

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 4/5; 1/18] START criterion=gini, max_depth=5, min_samples_leaf=2............
[CV 3/5; 1/18] START criterion=gini, max_depth=5, min_samples_leaf=2............[CV 4/5; 2/18] START criterion=gini, max_depth=5, min_samples_leaf=4............

[CV 3/5; 2/18] START criterion=gini, max_depth=5, min_samples_leaf=4............
[CV 2/5; 1/18] START criterion=gini, max_depth=5, min_samples_leaf=2............
[CV 1/5; 2/18] START criterion=gini, max_depth=5, min_samples_leaf=4............
[CV 2/5; 2/18] START criterion=gini, max_depth=5, min_samples_leaf=4............
[CV 1/5; 1/18] START criterion=gini, max_depth=5, min_samples_leaf=2............
[CV 5/5; 2/18] START criterion=gini, max_depth=5, min_samples_leaf=4............
[CV 5/5; 1/18] START criterion=gini, max_depth=5, min_samples_leaf=2............
[CV 2/5; 1/18] END criterion=gini, max_depth=5, min_samples_leaf=2;, score=0.555 total time=   7.4s
[CV 4/5; 1/18] END criterion=

In [58]:
# Model Training
clf = DecisionTreeClassifier(**grid_search.best_params_, random_state=42)
clf.fit(X_train, y_train)

In [59]:
# Save the model
pickle.dump(clf, open('../models/decision_trees.pkl', 'wb'))

In [49]:
# Model Evaluation
y_pred = clf.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, y_pred)*100)
print('Classification Report: \n ', classification_report(y_test, y_pred))

Accuracy:  83.03533512633591
Classification Report: 
                precision    recall  f1-score   support

           0       0.89      0.92      0.90    391014
           1       0.33      0.25      0.29     61114

    accuracy                           0.83    452128
   macro avg       0.61      0.59      0.60    452128
weighted avg       0.81      0.83      0.82    452128



### Observations:

- Although, the accuracy and precision/recall of payers seems have reduced in comparison to Logistic Regression
- The performance of this model is not better in detecting **Defaulter** as compared to **Naive Bayes** with recall of Defaulters = **25%**