In [13]:
import numpy as np
import pandas as pd
import xgboost as xgb
from feature_engine.encoding import OneHotEncoder, RareLabelEncoder
from feature_engine.imputation.categorical import CategoricalImputer
from sklearn.ensemble import ExtraTreesRegressor, RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.feature_extraction import DictVectorizer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from preprocessing.preprocessors import (BinaryEncoder, CleanStrings,
                                         ColumnDropperTransformer,
                                         CustomColumnMapping, CustomRowMapping,
                                         DivideColumnValue,
                                         MultiplyColumnValue,
                                         PrefixStringEncoding)


In [14]:
# Reading the data
df = pd.read_csv('dataset/loan_prediction.csv')

df.columns = [cols.lower() for cols in df]

id_col = 'loan_id'
target = 'loan_status'

clean_strings_cols = [cols for cols in df.columns if df[cols].dtype == 'object']

data_mapping_cols = ['graduate', 'property_area']


thousand_multiplier_cols = ['loan_amount']

month_divider_cols = ['loan_amount_term']

month_encoder_cols = ['loan_amount_term']

month_encoder_prefix = 'month'

binary_encoder_cols = ['married', 'graduate', 'self_employed']

binary_encoder_value='yes'

gender_encoder_cols = ['gender']

gender_encoder_value = 'male'

column_mappings = {'applicantincome':'applicant_income', 
                   "coapplicantincome":"co_application_income", 
                   "loanamount":"loan_amount",
                   "education":"graduate"}

data_mappings = { 'graduate': {'graduate': 'yes', 'not_graduate_graduate': 'no'},
  'property_area': { 'rural': 'rural',
                     'semiurban': 'semi_urban',
                     'urban': 'urban'}}

In [35]:
preprocessor_pipeline = Pipeline(
    steps=[
        ('clean_strings', 
         CleanStrings(column_list=clean_strings_cols)
        ),
        ('column_mappings',
         CustomColumnMapping(column_mapping=column_mappings)
        ),
        ('thousand_multiplier', 
         MultiplyColumnValue(column_list=thousand_multiplier_cols, 
                             multiply_by=1000)
        ),
        ('month_divider',
         DivideColumnValue(column_list=month_divider_cols, 
                           divide_by=12)
        ),
        ('prefix_month_encoder',
         PrefixStringEncoding(column_list=month_encoder_cols,
                              string_val=month_encoder_prefix)
        ),
        ('data_mappings', 
         CustomRowMapping(column_list=data_mapping_cols, 
                          column_value_mapping=data_mappings)
        ),
        ('binary_common_encoder', 
         BinaryEncoder(column_list=binary_encoder_cols, 
                       value=binary_encoder_value)
        ),
        ('gender_encoder', 
         BinaryEncoder(column_list=gender_encoder_cols, 
                       value=gender_encoder_value)
        )
    ])

In [4]:
preprocessed_data = preprocessor_pipeline.fit_transform(df)

In [5]:
feature_cols = [cols for cols in preprocessed_data if cols not in [id_col, target]]
cat_cols = [cols for cols in feature_cols if preprocessed_data[cols].dtype == 'object']
X = preprocessed_data[feature_cols].copy()
y = preprocessed_data[target].map({'y':1, 'n':0})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

In [6]:
transformer_pipeline = Pipeline(
    steps=[
        ('cat_imputer',
          CategoricalImputer(fill_value='frequent',
                             variables=cat_cols)
        ),   
        ('rare_label_encoding',
         RareLabelEncoder(tol=0.01,
                          n_categories=8,
                          variables=cat_cols)
        ),
        ('ohe_encoding',
         OneHotEncoder(variables=cat_cols)
        ),
        ('scaling_data',
            MinMaxScaler()
        ),
        ('knn_imputer',
         KNNImputer(add_indicator=True)
        )
    ]
)

In [7]:
transformed_data_train = transformer_pipeline.fit_transform(X_train, y_train)
transformed_data_test = transformer_pipeline.transform(X_test)



In [8]:
X_train_transformed = pd.DataFrame(data=transformed_data_train, columns=transformer_pipeline.get_feature_names_out())
X_test_transformed = pd.DataFrame(data=transformed_data_test, columns=transformer_pipeline.get_feature_names_out())

In [12]:
# Decision Tree
d_param_grid = {
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8,10,20],
    'min_samples_leaf' : [1,3,5,10,20],
    'criterion' : ['gini', 'entropy'],
    'random_state' : [1], 
    'class_weight' : ['balanced', None]
}
d_clf = DecisionTreeClassifier()


# Logistic Regression
lr_param_grid = {
    "C":np.logspace(-3,3,7), 
    "max_iter": [500, 1000,2000, 5000],
    'class_weight' : ['balanced', None],
    'random_state' : [1]
    } 
    
lr_clf = LogisticRegression()


# Random Forest
rf_param_grid = { 
    'n_estimators': [100,200,300],
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8,10,20],
    'min_samples_leaf' : [1,3,5,10,20],
    'criterion' : ['gini', 'entropy'],
    'random_state' : [1], 
    'class_weight' : ['balanced', None]
}

rf_clf = RandomForestClassifier(n_jobs=-1)

# Xgboost
xgb_params = {
    'eta': [0.05, 0.1, 0.2],
    'max_depth': [4,5,6,7,8,10,20],
    'min_child_weight': [1,3,5,10,20],
    'n_estimators': [5, 10, 20, 50],
    'objective':['binary:logistic'],
    'seed': [1],
    'verbosity': [1]
}

xgb_clf = xgb.XGBClassifier()

In [13]:
# Training the models
%time

d_clf_cv = GridSearchCV(estimator=d_clf, param_grid=d_param_grid, cv=5, scoring='roc_auc')
d_clf_cv.fit(X_train_transformed, y_train)

print("Decision tree optimised")

lr_clf_cv = GridSearchCV(estimator=lr_clf, param_grid=lr_param_grid, cv=5, scoring='roc_auc')
lr_clf_cv.fit(X_train_transformed, y_train)

print("Logistic regression optimised")

rf_clf_cv = GridSearchCV(estimator=rf_clf, param_grid=rf_param_grid, cv=5, scoring='roc_auc')
rf_clf_cv.fit(X_train_transformed, y_train)

print("Random Forest optimised")

xgb_clf_cv = GridSearchCV(estimator=xgb_clf, param_grid=xgb_params, cv=5, scoring='roc_auc')
xgb_clf_cv.fit(X_train_transformed, y_train)

print("xgboost classifier optimised")

lr_best_params = lr_clf_cv.best_params_
d_best_params = d_clf_cv.best_params_
rf_best_params = rf_clf_cv.best_params_
xgb_best_params = xgb_clf_cv.best_params_

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.15 µs
Decision tree optimised
Logistic regression optimised
Random Forest optimised
xgboost classifier optimised


In [14]:
# Training the best models
lr_best_clf = LogisticRegression(**lr_best_params)
d_best_clf = DecisionTreeClassifier(**d_best_params)
rf_best_clf = RandomForestClassifier(**rf_best_params)
xgb_best_clf = xgb.XGBClassifier(**xgb_best_params)

lr_best_clf.fit(X_train_transformed, y_train)
d_best_clf.fit(X_train_transformed, y_train)
rf_best_clf.fit(X_train_transformed, y_train)
xgb_best_clf.fit(X_train_transformed, y_train)

In [15]:
# Evaluating all the models
# Evaluate the models
def evaluate_roc(model, X_val, y_val):
    """Evaluation function to return recall"""

    predictions = model.predict_proba(X_val)[:,1]
    roc_auc = roc_auc_score(y_val, predictions)
    return roc_auc

def evaluate_accuracy(model, X_val, y_val):
    """Evaluation function to return recall"""

    predictions = model.predict(X_val)
    acc = accuracy_score(y_val, predictions)
    return acc


d_roc_auc = evaluate_roc(d_best_clf, X_val=X_test_transformed, y_val=y_test)
lr_roc_auc = evaluate_roc(lr_best_clf, X_val=X_test_transformed, y_val=y_test)
rf_roc_auc = evaluate_roc(rf_best_clf, X_val=X_test_transformed, y_val=y_test)
xgb_roc_auc = evaluate_roc(xgb_best_clf, X_val=X_test_transformed, y_val=y_test)

d_acc = evaluate_accuracy(d_best_clf, X_val=X_test_transformed, y_val=y_test)
lr_acc = evaluate_accuracy(lr_best_clf, X_val=X_test_transformed, y_val=y_test)
rf_acc = evaluate_accuracy(rf_best_clf, X_val=X_test_transformed, y_val=y_test)
xgb_acc = evaluate_accuracy(xgb_best_clf, X_val=X_test_transformed, y_val=y_test)

d_roc_auc, lr_roc_auc, rf_roc_auc, xgb_roc_auc, d_acc, lr_acc, rf_acc, xgb_acc

(0.6339285714285714,
 0.6797619047619048,
 0.7119047619047619,
 0.7464285714285714,
 0.7580645161290323,
 0.7096774193548387,
 0.7258064516129032,
 0.7580645161290323)

In [16]:
# Confusion Matrix
d_cm = confusion_matrix(y_true=y_test, y_pred=d_best_clf.predict(X_test_transformed), normalize='true')
lr_cm = confusion_matrix(y_true=y_test, y_pred=lr_best_clf.predict(X_test_transformed), normalize='true')
rf_cm = confusion_matrix(y_true=y_test, y_pred=rf_best_clf.predict(X_test_transformed), normalize='true')
xgb_cm = confusion_matrix(y_true=y_test, y_pred=xgb_best_clf.predict(X_test_transformed), normalize='true')

print(f"Decision Tree Confusion Matrix \n {d_cm}")
print()

print(f"Logistic Regression Confusion Matrix \n {lr_cm}")
print()

print(f"Random Forest Confusion Matrix \n {rf_cm}")
print()

print(f"Xgboost Confusion Matrix \n {xgb_cm}")
print()

Decision Tree Confusion Matrix 
 [[0.3        0.7       ]
 [0.02380952 0.97619048]]

Logistic Regression Confusion Matrix 
 [[0.55       0.45      ]
 [0.21428571 0.78571429]]

Random Forest Confusion Matrix 
 [[0.4        0.6       ]
 [0.11904762 0.88095238]]

Xgboost Confusion Matrix 
 [[0.35       0.65      ]
 [0.04761905 0.95238095]]



In [18]:
# Models and scores dict
model_performances = {
    "decision_tree" : {
        "model" : d_best_clf,
        "roc_auc" : d_roc_auc,
        "acc": d_acc
    },
    "xgboost" : {
        "model" : xgb_best_clf,
        "roc_auc" : xgb_roc_auc,
        "acc": xgb_acc
    },
    "random_forest" : {
        "model" : rf_best_clf,
        "roc_auc" : rf_roc_auc,
        "acc": rf_acc
    },
    "logistic_regression" : {
        "model" : lr_best_clf,
        "roc_auc" : lr_roc_auc,
        "acc": lr_acc
    }
} 

best_model = sorted(model_performances.items(), reverse=True, key=lambda score: score[1]['roc_auc'])[0]

if best_model[0] == 'xgboost':
    print(f"Using bentoml xgboost framework")
else:
    print(f"Using bentoml scikit learn framework")

model = best_model[1]['model']
print(model)

Using bentoml xgboost framework
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False, eta=0.2,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.200000003,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=10,
              missing=nan, monotone_constraints='()', n_estimators=50, n_jobs=0,
              num_parallel_tree=1, predictor='auto', ...)


In [39]:
# Test data for api
import json
test_data = df.drop([id_col, target],1).iloc[1].to_dict()
print(json.dumps(test_data, indent=2))

{
  "gender": "Male",
  "married": "Yes",
  "dependents": "1",
  "education": "Graduate",
  "self_employed": "No",
  "applicantincome": 4583,
  "coapplicantincome": 1508.0,
  "loanamount": 128.0,
  "loan_amount_term": 360.0,
  "credit_history": 1.0,
  "property_area": "Rural"
}


  test_data = df.drop([id_col, target],1).iloc[1].to_dict()
