In [8]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [9]:
df_model = pd.read_csv("data/prepared_data.csv")
df_model.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Risk
0,22,0,2,1,0,1,5951,48,0
1,45,1,2,0,0,0,7882,42,1
2,53,1,2,0,0,0,4870,24,0
3,35,1,3,2,0,1,6948,36,1
4,28,1,3,1,0,1,5234,30,0


In [10]:
X = df_model.iloc[:, :-1]
y = df_model.iloc[:, -1]

In [14]:
y.head()

0    0
1    1
2    0
3    1
4    0
Name: Risk, dtype: int64

In [13]:
X.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration
0,22,0,2,1,0,1,5951,48
1,45,1,2,0,0,0,7882,42
2,53,1,2,0,0,0,4870,24
3,35,1,3,2,0,1,6948,36
4,28,1,3,1,0,1,5234,30


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y, random_state=42)

In [16]:
X_train.shape, X_test.shape

((417, 8), (105, 8))

In [41]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import joblib

In [19]:
def train_model(model, param_grid, X_train, y_train, X_test, y_test):
    grid = GridSearchCV(model, param_grid = param_grid, cv = 5, scoring = 'accuracy', n_jobs = -1)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    return best_model, acc, grid.best_params_

In [20]:
dt = DecisionTreeClassifier(random_state = 42, class_weight = 'balanced')
dt_param_grid = {
    "max_depth": [3, 5, 7, 10, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

In [21]:
best_dt, acc_dt, params_dt = train_model(dt, dt_param_grid, X_train, y_train, X_test, y_test)

In [25]:
print("Decision Tree Accuracy: ", acc_dt)
print("Best parameters: ", params_dt)

Decision Tree Accuracy:  0.6
Best parameters:  {'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 10}


In [27]:
rf = RandomForestClassifier(random_state = 42, class_weight = 'balanced', n_jobs = -1)
rf_param_grid = {
    "n_estimators": [10, 20, 50, 100],
    "max_depth": [3, 5, 7, 10, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

In [28]:
best_rf, acc_rf, params_rf = train_model(rf, rf_param_grid, X_train, y_train, X_test, y_test)

In [30]:
print("Random Forest Accuracy: ", acc_rf)
print("Best parameters: ", params_rf)

Random Forest Accuracy:  0.6476190476190476
Best parameters:  {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}


In [34]:
et = ExtraTreesClassifier(random_state = 42, class_weight = 'balanced', n_jobs = -1)
et_param_grid = {
    "n_estimators": [10, 20, 50, 100, 200],
    "max_depth": [3, 5, 7, 10, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

In [35]:
best_et, acc_et, params_et = train_model(et, et_param_grid, X_train, y_train, X_test, y_test)

In [36]:
print("ExtraTrees Accuracy: ", acc_et)
print("Best parameters: ", params_et)

ExtraTrees Accuracy:  0.6285714285714286
Best parameters:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}


In [37]:
xgb = XGBClassifier(random_state = 42, scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum(), use_label_encoder=False , eval_metric="logloss")

In [38]:
xgb_param_grid = {
    "n_estimators": [10, 20, 50, 100],
    "max_depth": [3, 5, 7, 10, None],
    "learning_rate": [0.01, 0.1, 0.2, 0.3, 0.4, 0.5],
    "subsample": [0.1, 0.6, 0.7, 0.8, 0.9],
    "colsample_bytree": [0.1, 0.6, 0.7, 0.8, 0.9],
}

In [39]:
best_xgb, acc_xgb, params_xgb = train_model(xgb, xgb_param_grid, X_train, y_train, X_test, y_test)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [40]:
print("XGB accuracy", acc_xgb)
print("Best params", params_xgb)

XGB accuracy 0.6952380952380952
Best params {'colsample_bytree': 0.7, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.7}


In [42]:
joblib.dump(best_xgb, "best_model/xgb_model.pkl")

['xgb_model.pkl']