# MSA 2023 Phase 2 - Part 2

In [1]:
import sklearn
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pickle
%matplotlib inline

## 1. Load and split preprocessed data

In [2]:
# load data and split train test data
df = pd.read_csv('../0. Resources/datasets/credit_risk_preprocessed.csv', encoding = 'utf-8')

# split data
x_train, x_test, y_train, y_test = train_test_split(df[[c for c in df.columns if c != 'class']], df['class'], test_size = 0.33, random_state = 30)

In [3]:
print('x_train:', x_train.shape)
print('x_test:', x_test.shape)

x_train: (670, 80)
x_test: (330, 80)


## 2. Choose an algorithm

In [4]:
# the data includes categorical data and numerical data. ensemble model like random forest and xgboost would be appropriate
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score, roc_auc_score, recall_score, accuracy_score

## 3. Train and test a model

In [5]:
# train and test a model
# hyper parameter search

model_rf = RandomForestClassifier(n_estimators = 50)
model_xgb = XGBClassifier(n_estimators = 50)
model_dt = DecisionTreeClassifier()
model_lr = LogisticRegression()
model_svc = SVC(kernel = 'rbf')

# model_rf.fit(x_train, y_train)
# model_xgb.fit(x_train, y_train)
# model_dt.fit(x_train, y_train)
# model_lr.fit(x_train, y_train)
# model_svc.fit(x_train, y_train)

# y_pred_rf = model_rf.predict(x_test)
# y_pred_xgb = model_xgb.predict(x_test)
# y_pred_dt = model_dt.predict(x_test)
# y_pred_lr = model_lr.predict(x_test)
# y_pred_svc = model_svc.predict(x_test)

models = {'random forest': model_rf, 'xgboost': model_xgb, 'decision tree': model_dt, 'logistic regression': model_lr, 'svm': model_svc}

for m in models.keys():
    print('-----------------------------------------')
    print(m)
    scores = cross_validate(models[m], x_train, y_train, scoring=['accuracy', 'precision', 'recall'], cv=5)
    print('Accuracy: %.4f'%np.mean(scores['test_accuracy']), 'Precision: %.4f'%np.mean(scores['test_precision']), 'Recall: %.4f'%np.mean(scores['test_recall']))



-----------------------------------------
random forest
Accuracy: 0.7537 Precision: 0.6557 Recall: 0.3464
-----------------------------------------
xgboost
Accuracy: 0.7134 Precision: 0.5106 Recall: 0.4078
-----------------------------------------
decision tree
Accuracy: 0.6881 Precision: 0.4752 Recall: 0.4997
-----------------------------------------
logistic regression
Accuracy: 0.7388 Precision: 0.5612 Recall: 0.4686
-----------------------------------------
svm
Accuracy: 0.7522 Precision: 0.6384 Recall: 0.3567


In [6]:
# Choose Logistic Regression as the model
model_lr = LogisticRegression()

model_lr.fit(x_train, y_train)

y_test_pred = model_lr.predict(x_test)

## 4. Evaluate the model 

In [7]:
# check the result
print('logistic regression: ', 'accuracy: %.4f'%accuracy_score(y_test, y_test_pred),  'recall: %.4f'%recall_score(y_test, y_test_pred), 
      ' f1: %.4f'%f1_score(y_test, y_test_pred), ' auc: %.4f'%roc_auc_score(y_test, y_test_pred))

logistic regression:  accuracy: 0.7485 recall: 0.4712  f1: 0.5414  auc: 0.6736


In [8]:
# saving model
with open('model_lr.pkl', 'wb') as f:
    pickle.dump(model_lr, f)

## 5. Summary