Logistic Regression
=========

Libraries:

In [2]:
# remove warnings
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt; plt.style.use("ggplot")

from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, precision_recall_fscore_support, auc, accuracy_score

#### Read Data:

In [5]:
# Input file ("^"-separated) and sampling settings, kept together so they are easy to change.
# NOTE(review): DATA_PATH is an absolute, machine-specific path; point it at your local copy.
DATA_PATH = "/media/juanan/DATA/loan_data_analysis/data/loans_processed.csv"
N_SAMPLES = 100000
SEED = 4290  # fixed seed: the same rows are drawn on every run, so outputs are reproducible

data = (
    pd.read_csv(DATA_PATH, sep = "^")
      .sample(n=N_SAMPLES, random_state=SEED)
)

In [6]:
# Preview the first rows of the sampled frame.
data.head()

Unnamed: 0,addr_state,annual_inc,application_type,avg_cur_bal,bc_open_to_buy,bc_util,delinq_2yrs,delinq_amnt,disbursement_method,dti,...,revol_util,sub_grade,tax_liens,term,tot_coll_amt,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,verification_status
758903,CA,57000.0,Individual,51460.0,329.0,98.7,0.0,0.0,Cash,26.75,...,87.4,E2,0.0,36 months,0.0,480978.0,51290.0,25200.0,34478.0,Verified
100808,WI,18000.0,Individual,1554.0,1299.0,72.9,0.0,0.0,Cash,24.47,...,48.0,D2,0.0,36 months,0.0,20608.0,13987.0,4800.0,10408.0,Not Verified
39088,FL,35000.0,Individual,1054.0,3228.0,28.3,0.0,0.0,Cash,7.03,...,19.9,C3,0.0,60 months,0.0,11600.0,5269.0,4500.0,4000.0,Verified
58201,NJ,240000.0,Individual,47342.0,3609.0,42.7,0.0,0.0,Cash,4.01,...,15.9,D3,0.0,36 months,0.0,687640.0,27521.0,6300.0,35040.0,Source Verified
125772,CA,140000.0,Individual,61103.0,5815.0,73.4,0.0,0.0,Cash,24.04,...,70.9,C4,0.0,36 months,0.0,643932.0,93376.0,21900.0,149273.0,Verified


In [7]:
# (rows, columns) of the sampled frame.
data.shape

(100000, 48)

#### Pre-processing:

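Scale the numeric variables with `RobustScaler` (centers on the median and scales by the interquartile range):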

In [8]:
# Columns to scale: every numeric/bool column except the target.
# `loan_status` holds the class labels and must not pass through the scaler.
# Uses the public `select_dtypes` API rather than the private `_get_numeric_data`.
numeric_variables = (
    data.select_dtypes(include=["number", "bool"])
        .columns
        .drop("loan_status", errors="ignore")
)

In [9]:
# Median / inter-quartile-range scaler; the arguments below are the library defaults, spelled out.
scaler = RobustScaler(
    with_centering=True,
    with_scaling=True,
    quantile_range=(25.0, 75.0),
)

In [10]:
# Learn the per-column center and scale from the numeric columns.
scaler.fit(data.loc[:, numeric_variables])

RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)

In [11]:
# Overwrite the numeric columns with their scaled values.
# Note: this mutates `data` in place, so re-running the cell scales already-scaled values again.
scaled_values = scaler.transform(data[numeric_variables])
data[numeric_variables] = scaled_values

In [12]:
# Preview after scaling: numeric columns hold scaled values, text columns are unchanged.
data.head()

Unnamed: 0,addr_state,annual_inc,application_type,avg_cur_bal,bc_open_to_buy,bc_util,delinq_2yrs,delinq_amnt,disbursement_method,dti,...,revol_util,sub_grade,tax_liens,term,tot_coll_amt,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,verification_status
758903,CA,-0.177778,Individual,3.009498,-0.352356,0.779193,0.0,0.0,Cash,0.791457,...,0.913279,E2,0.0,36 months,0.0,1.962802,0.407105,0.596939,0.159036,Verified
100808,WI,-1.044444,Individual,-0.311093,-0.251296,0.231423,0.0,0.0,Cash,0.600503,...,-0.154472,D2,0.0,36 months,0.0,-0.374283,-0.500511,-0.443878,-0.396712,Not Verified
39088,FL,-0.666667,Individual,-0.344362,-0.050322,-0.715499,0.0,0.0,Cash,-0.860134,...,-0.915989,C3,0.0,60 months,0.0,-0.420012,-0.712628,-0.459184,-0.544665,Verified
58201,NJ,3.888889,Individual,2.735499,-0.010627,-0.409766,0.0,0.0,Cash,-1.113065,...,-1.02439,D3,0.0,36 months,0.0,3.011929,-0.171217,-0.367347,0.172012,Source Verified
125772,CA,1.666667,Individual,3.651114,0.219207,0.242038,0.0,0.0,Cash,0.564489,...,0.466125,C4,0.0,36 months,0.0,2.790043,1.431095,0.428571,2.809517,Verified


In [13]:
#data = data.loc[:,data.columns != 'last_fico_range_high']

Categorical variables — replace each category by the mean `loan_status` of its rows (mean target encoding):

In [14]:
# Names of the columns stored with dtype `object` (the text/categorical columns).
object_columns = data.select_dtypes(include=["object"])
categorical_variables = object_columns.columns

In [15]:
def categorical_to_numeric(variable, frame=None, target='loan_status'):
    """Mean-target-encode one categorical column.

    Each category of ``variable`` is replaced by the mean of ``target`` over
    the rows that belong to that category.

    Parameters
    ----------
    variable : str
        Name of the categorical column to encode.
    frame : pandas.DataFrame, optional
        Frame holding both ``variable`` and ``target``. Defaults to the
        notebook-level ``data`` frame, so the original one-argument call
        keeps working.
    target : str, optional
        Name of the numeric target column. Defaults to ``'loan_status'``.

    Returns
    -------
    pandas.Series
        Encoded values aligned with ``frame``'s index. Rows whose category is
        missing (NaN) receive the overall mean of ``target`` instead of
        raising ``KeyError``.

    Notes
    -----
    The category means are computed from the same rows they are applied to,
    so the encoded column carries target information. For an unbiased
    validation score the means should be learned on the training folds only.
    """
    if frame is None:
        frame = data

    category_means = frame.groupby(variable)[target].mean()

    # ``groupby`` drops NaN keys, so missing categories have no entry in
    # ``category_means``; ``map`` leaves those rows as NaN.
    encoded = frame[variable].map(category_means)

    # Fall back to the global target mean for rows without a category mean.
    return encoded.fillna(frame[target].mean())

In [16]:
# Overwrite each categorical column of `data` with the numeric encoding
# returned by `categorical_to_numeric` for that column.
for variable in categorical_variables:
    data[variable] = categorical_to_numeric(variable)

In [17]:
# Preview after encoding: the former text columns now hold numeric values.
data.head()

Unnamed: 0,addr_state,annual_inc,application_type,avg_cur_bal,bc_open_to_buy,bc_util,delinq_2yrs,delinq_amnt,disbursement_method,dti,...,revol_util,sub_grade,tax_liens,term,tot_coll_amt,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,verification_status
758903,0.201377,-0.177778,0.207613,3.009498,-0.352356,0.779193,0.0,0.0,0.207042,0.791457,...,0.913279,0.384487,0.0,0.166491,0.0,1.962802,0.407105,0.596939,0.159036,0.242382
100808,0.179862,-1.044444,0.207613,-0.311093,-0.251296,0.231423,0.0,0.0,0.207042,0.600503,...,-0.154472,0.295874,0.0,0.166491,0.0,-0.374283,-0.500511,-0.443878,-0.396712,0.153999
39088,0.219215,-0.666667,0.207613,-0.344362,-0.050322,-0.715499,0.0,0.0,0.207042,-0.860134,...,-0.915989,0.234722,0.0,0.336205,0.0,-0.420012,-0.712628,-0.459184,-0.544665,0.242382
58201,0.212695,3.888889,0.207613,2.735499,-0.010627,-0.409766,0.0,0.0,0.207042,-1.113065,...,-1.02439,0.305691,0.0,0.166491,0.0,3.011929,-0.171217,-0.367347,0.172012,0.220686
125772,0.201377,1.666667,0.207613,3.651114,0.219207,0.242038,0.0,0.0,0.207042,0.564489,...,0.466125,0.248523,0.0,0.166491,0.0,2.790043,1.431095,0.428571,2.809517,0.242382


In [18]:
# Shape check after pre-processing (columns were overwritten in place).
data.shape

(100000, 48)

Target:

In [19]:
# Feature matrix: every column except the target.
X = data.drop(columns='loan_status', errors='ignore')

In [20]:
# Target vector.
y = data.loc[:, 'loan_status']

In [15]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=4290)

Prior — share of class 0 (`loan_status == 0`), i.e. the accuracy of always predicting class 0:

In [21]:
# Fraction of rows whose target equals 0.
# A boolean mean avoids the label-vs-position ambiguity of `value_counts()[0]`
# and yields 0.0 (not a KeyError) if class 0 is absent.
(y == 0).mean()

0.79254

__Logistic Regression__:

Tuning regularization:

In [25]:
# Grid of candidate values for C: 20 powers of ten, from 1e-10 up to 1e9.
exponents = np.arange(-10, 10)
parameters = {'C': 10.0 ** exponents}

GridSearch with Cross Validation:

In [27]:
# Base estimator: L2-penalised logistic regression (other settings left at library defaults).
log_reg = LogisticRegression(penalty='l2')

In [36]:
# Grid search over the values in `parameters`, scored by ROC AUC with 5-fold cross-validation.
log_reg_classifier = GridSearchCV(
    estimator=log_reg,
    param_grid=parameters,
    scoring="roc_auc",
    cv=5,
)

In [37]:
# Run the search: every candidate C is cross-validated on (X, y).
log_reg_classifier.fit(X, y)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([1.e-10, 1.e-09, 1.e-08, 1.e-07, 1.e-06, 1.e-05, 1.e-04, 1.e-03,
       1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05,
       1.e+06, 1.e+07, 1.e+08, 1.e+09])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

Best estimator and result:

In [38]:
# Estimator configured with the best-scoring C found by the search.
log_reg_classifier.best_estimator_

LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [39]:
# Cross-validated ROC AUC of the best candidate.
log_reg_classifier.best_score_

0.7162357408033346