In [1]:
%load_ext autoreload
%autoreload 2

In [63]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import seaborn as sns
import seaborn.objects as so
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict



from src.DataProcessor import DataProcessor


In [38]:
PATH = r'/Users/michaelschaid/GitHub/credit_modeling/data/loans_2007.csv'
DATA = (DataProcessor(PATH, target = 'loan_status')
        .load_data()
        .clean_data()
        .processes_dummies()
        .extract_features_and_target()
        )

In [64]:

def fit_lr(max_iter=200, class_weight=None):
    lr = LogisticRegression(max_iter=max_iter, class_weight=class_weight)
    X = DATA.feature_data
    y_true = DATA.target_data
    y_pred = cross_val_predict(lr,X,y_true, cv=3)
    return y_pred, y_true
y_pred, y_true  = fit_lr(max_iter=200)

In [65]:
def calculate_rates(y_pred: pd.Series, y_true: pd.Series)-> float:
    """## Summary
    takes in predicted and acutal values, calculate true postive, true neg, false pos, and false neg
    returns true postve rate and false postve rate
    `

    Args:
        y_pred (pd.Series): predicted y values
        y_true (pd.Seres): true y values

    Returns:
        tuple: true postve rate:float and false postve rate: float
    """
    true_pos = sum((y_pred == 1) & (y_true == 1))
    false_pos = sum((y_pred == 1) & (y_true == 0))
    true_neg = sum((y_pred == 0) & (y_true == 0))
    false_neg = sum((y_pred == 0) & (y_true == 1))
    
    false_pos_rate = false_pos/(false_pos + true_neg)
    true_pos_rate = true_pos/(true_pos + false_neg)
    
    return true_pos_rate, false_pos_rate
    
    
tpr, fpr = calculate_rates(y_pred, y_true)
print(f'true positive rate: {tpr}, \n false postive rate {fpr}')

true positive rate: 0.9985812025096951, 
 false postive rate 0.9962070927365826


Model is predictingly nearlly all ones, due to imbalance in the class
We will test both: 

*  oversampling and undersampling to ensure that the classifier gets input that has a balanced number of each class.
  
* Tell the classifier to penalize misclassifications of the less prevalent class more than the other class.


## Balanced penalty
> penality calculated automatically to be ~6 for misclassification

In [66]:
bal_y, bal_true_y = fit_lr(class_weight = 'balanced')
tpr_bal, fpr_bal = calculate_rates(bal_y, bal_true_y)
print(f'true positive rate: {tpr_bal}, \n false postive rate {fpr_bal}')

true positive rate: 0.5581234038528234, 
 false postive rate 0.3699981035463683


tpr is reduced by ~ 50%, and fpr is reduced by ~ 66%

> manual penalty set to 10:1

In [67]:
penalty={
    0:10, 
    1:1
}

man_pred_y, man_true_y = fit_lr(max_iter = 200, class_weight = penalty)
tpr_man, fpr_man = calculate_rates(man_pred_y, man_true_y)
print(f'true positive rate: {tpr_man}, \n false postive rate {fpr_man}')


true positive rate: 0.16940442034240313, 
 false postive rate 0.09444339085909349


fpr has been reduced to ~9%, will revist if necessary but first test random forest

## Random Forest

In [73]:
def fit_forest(n_estimators=400, class_weight=None, random_state=1):
    rf = RandomForestClassifier(max_iter=n_estimators,random_state=random_state, class_weight=class_weight)
    X = DATA.feature_data
    y_true = DATA.target_data
    y_pred = cross_val_predict(rf,X,y_true, cv=3)
    return y_pred, y_true

rf_y_pred, rf_y_true  = fit_lr()

In [74]:
tpr_rf, fpr_rf = calculate_rates(rf_y_pred, rf_y_true)
print(f'true positive rate: {tpr_man}, \n false postive rate {fpr_man}')

true positive rate: 0.16940442034240313, 
 false postive rate 0.09444339085909349


# Summary
with the current data set, we have reached a minimumn fpr of 9.44%. If this is unsatisfactory, we will need to strategies feature engineering or additional modeling methods to improve performance 