# 1. Load data

In [17]:
import pandas as pd

data = pd.read_csv('data/clean/applications.csv')
data.info()

<class 'pandas.DataFrame'>
RangeIndex: 25128 entries, 0 to 25127
Data columns (total 50 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                25128 non-null  int64  
 1   car                               25128 non-null  int64  
 2   realty                            25128 non-null  int64  
 3   cnt_children                      25128 non-null  float64
 4   income                            25128 non-null  float64
 5   mobile_phone                      25128 non-null  int64  
 6   work_phone                        25128 non-null  int64  
 7   phone                             25128 non-null  int64  
 8   email                             25128 non-null  int64  
 9   cnt_fam_members                   25128 non-null  float64
 10  age                               25128 non-null  float64
 11  work_experience                   25128 non-null  float64
 12  bad_debt       

## 1.2. Inspect outcome variable balance

In [29]:
data.iloc[:, -1].value_counts()

status
1    25007
0      121
Name: count, dtype: int64

Due to class imbalance, we will use stratified sampling when splitting the data into training and test sets.

# 2. Split data into training and test sets

In [18]:
from sklearn.model_selection import train_test_split

X = data.iloc[:, 1:-1].values
y = data.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

## 2.1. Use SMOTE to handle class imbalance

In [19]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=3)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# 3. Train models

## 3.1. Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression

log_classifier = LogisticRegression()
log_classifier.fit(X_train_resampled, y_train_resampled)
log_y_pred = log_classifier.predict(X_test)

## 3.2. Random Forest Classifier

In [24]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(n_estimators=300, criterion='entropy', random_state=3)
rf_classifier.fit(X_train_resampled, y_train_resampled)
rf_y_pred = rf_classifier.predict(X_test)

## 3.3. XGBoost Classifier

In [22]:
from xgboost import XGBClassifier

xgb_classifier = XGBClassifier(random_state=3)
xgb_classifier.fit(X_train_resampled, y_train_resampled)
xgb_y_pred = xgb_classifier.predict(X_test)

# 4. Review models performance

In [27]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score

accuracy_log = cross_val_score(estimator=log_classifier, X=X_train_resampled, y=y_train_resampled, cv=10)
print("Logistic Regression CV Accuracy: %.2f%% +/- %.2f%%" % (accuracy_log.mean()*100, accuracy_log.std()*100))
print("Logistic Regression Accuracy:", accuracy_score(y_test, log_y_pred))
print(confusion_matrix(y_test, log_y_pred))
print('-'*10)
accuracy_rf = cross_val_score(estimator=rf_classifier, X=X_train_resampled, y=y_train_resampled, cv=10)
print("Random Forest CV Accuracy: %.2f%% +/- %.2f%%" % (accuracy_rf.mean()*100, accuracy_rf.std()*100))
print("Random Forest Accuracy:", accuracy_score(y_test, rf_y_pred))
print(confusion_matrix(y_test, rf_y_pred))
print('-'*10)
accuracy_xgb = cross_val_score(estimator=xgb_classifier, X=X_train_resampled, y=y_train_resampled, cv=10)
print("XGBoost CV Accuracy: %.2f%% +/- %.2f%%" % (accuracy_xgb.mean()*100, accuracy_xgb.std()*100))
print("XGBoost Accuracy:", accuracy_score(y_test, xgb_y_pred))
print(confusion_matrix(y_test, xgb_y_pred))

Logistic Regression CV Accuracy: 99.78% +/- 0.10%
Logistic Regression Accuracy: 0.9974134500596896
[[  23    1]
 [  12 4990]]
----------
Random Forest CV Accuracy: 99.89% +/- 0.33%
Random Forest Accuracy: 0.9964186231595702
[[   6   18]
 [   0 5002]]
----------
XGBoost CV Accuracy: 99.97% +/- 0.03%
XGBoost Accuracy: 0.9994031038599284
[[  24    0]
 [   3 4999]]


# 5. Save the better model

In [28]:
xgb_classifier.save_model('model_a01.json')