In [2]:
# importing necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, fbeta_score, recall_score, precision_score
from sklearn.svm import SVC

from utils.Model_utils import GridThreshSearch, param_maker

In [18]:
# Turn off worning when calculating precicion
import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)



In [3]:
data = pd.read_csv('Derived_data/Data_model.csv')

In [4]:
data.head()

Unnamed: 0,client_gender,vehicle_production_year,age,had_car_loan,had_other_loans,loan_type,loan_initial_term,loan_to_value_ratio,annual_percentage_rate,monthly_interest_rate,...,branch_branch 2,branch_branch 3,branch_branch 4,branch_branch 5,branch_branch 6,branch_branch 7,default_status,log_income,log_vehical_value,log_loan_amount
0,1,2002,37.0,0,0,0,36,0.2,35.875763,3.374,...,0,1,0,0,0,0,1,7.101676,9.169518,7.56008
1,1,1995,36.0,0,0,0,54,0.777778,36.079749,3.374,...,0,1,0,0,0,0,1,0.0,8.776476,8.525161
2,1,1999,57.0,0,0,1,18,0.454545,42.292028,3.374,...,0,0,0,1,0,0,1,6.579251,8.571681,7.783224
3,0,2000,65.0,0,0,0,36,0.886364,35.580055,3.374,...,0,1,0,0,0,0,1,0.0,9.264829,9.144201
4,1,2008,35.0,0,0,1,54,0.75,37.090166,3.374,...,1,0,0,0,0,0,1,0.0,8.946375,8.658693


In [5]:
# Defining targetn and features

X = data[['client_gender', 'vehicle_production_year',
       'age', 'had_car_loan', 'had_other_loans', 'loan_type', 'loan_initial_term',
       'loan_to_value_ratio', 'annual_percentage_rate',
       'monthly_interest_rate', 'region_region 1',
       'region_region 10', 'region_region 11', 'region_region 12',
       'region_region 2', 'region_region 3', 'region_region 4',
       'region_region 5', 'region_region 6', 'region_region 7',
       'region_region 8', 'region_region 9', 'branch_branch 2',
       'branch_branch 3', 'branch_branch 4', 'branch_branch 5',
       'branch_branch 6', 'branch_branch 7', 'log_income',
       'log_vehical_value', 'log_loan_amount']]

y = data['default_status']

In [6]:
# Spliting to train and test sets

x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42, test_size=0.1)

In [7]:
# Spliting into train and validation sets

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, stratify=y_train, random_state=42, test_size=0.1)

In [8]:
# Standar scaler to scale the data and polinomial features 

scaler = StandardScaler()
poly = PolynomialFeatures(degree=3)

In [9]:
# Scaling the dataset
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)

In [10]:
# Deriving second order polinomials
x_train_scaled_poly = poly.fit_transform(x_train_scaled)

To predict the probability of default we will try 3 models:
* Linear Regression
* Random Forest
* Support Vector Classifier

In [11]:
# Parameter grid to try

logreg_params = param_maker({'C': [0.01, 0.1, 1, 10]})
svc_params =    param_maker({'C': [0.01, 0.1, 1, 10]})
forest_params = param_maker({
                            'max_depth': [30, 40],  
                            'min_samples_split': [5, 10, 15], 
                            'min_samples_leaf': [2, 4, 6],
                            })

In [12]:
# Static parameters

logreg_static_pars = {'class_weight' : 'balanced', 'max_iter' : 10000, 'random_state' : 4}
svc_static_pars = {'gamma' : 'scale', 'probability': True, 'class_weight': 'balanced', 'kernel' : 'rbf', 'random_state' : 4}
forest_statis_pars = {'random_state' : 4, 'n_estimators' : 1000, 'class_weight' : 'balanced'}

### Fitting Logistic Regression

In [20]:
log_model = GridThreshSearch(model=LogisticRegression, params=logreg_params, static_params=logreg_static_pars)

In [21]:
log_model.fit(x_train=x_train_scaled, x_val=x_val_scaled, y_train=y_train, y_val=y_val)

<utils.Model_utils.GridThreshSearch at 0x2a7713932c0>

In [22]:
# Best model
log_model.best_estimator

In [23]:
# Metric results:
print(f'f-beta: {log_model.best_fb}')
print(f'Precision: {log_model.best_prec}')
print(f'Recall: {log_model.best_rec}')

f-beta: 0.4330392943063352
Precision: 0.1393548387096774
Recall: 0.9152542372881356


### Fitting Random Forest

In [24]:
forest_model = GridThreshSearch(model=RandomForestClassifier, params=forest_params, static_params=forest_statis_pars)

In [25]:
forest_model.fit(x_train = x_train, x_val=x_val, y_train=y_train, y_val=y_val)

<utils.Model_utils.GridThreshSearch at 0x2a773a738f0>

In [None]:
# Best model
forest_model.best_estimator

In [None]:
# Metric results:
print(f'f-beta: {forest_model.best_fb}')
print(f'Precision: {forest_model.best_prec}')
print(f'Recall: {forest_model.best_rec}')

### Fitting Support Vector Classifier

In [20]:
svc_model = GridThreshSearch(model=SVC, params=svc_params, static_params=svc_static_pars)

In [21]:
svc_model.fit(x_train=x_train_scaled, x_val=x_val_scaled, y_train=y_train, y_val=y_val)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

<Model_utils.GridThreshSearch at 0x17e5afd5a60>

In [None]:
# Best model
svc_model.best_estimator

In [22]:
# Metric results:
print(f'f-beta: {svc_model.best_fb}')
print(f'Precision: {svc_model.best_prec}')
print(f'Recall: {svc_model.best_rec}')

0.4448938321536906