In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

## Define Train Data Set

In [3]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [4]:
# delete not relevant column that will not inluence our predictions 
train_df = train_df.drop(columns=['Unnamed: 0', 'index'])
train_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,dti,delinq_2yrs,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,29.99,0.0,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,11.26,2.0,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,11.28,0.0,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,18.08,0.0,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,27.77,0.0,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [5]:
# Convert categorical data to numeric and separate target feature for training data
X = pd.get_dummies(train_df)
X.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,loan_status_low_risk,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,0.0,39728.0,...,True,True,False,True,True,False,True,False,True,False
1,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,0.0,9585.0,...,True,True,False,True,True,False,True,False,True,False
2,20000.0,0.124,448.95,197000.0,11.28,0.0,0.0,12.0,0.0,16708.0,...,True,True,False,True,True,False,True,False,True,False
3,3000.0,0.124,100.22,45000.0,18.08,0.0,0.0,12.0,1.0,8809.0,...,True,True,False,True,True,False,True,False,True,False
4,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,0.0,65420.0,...,True,True,False,True,True,False,True,False,True,False


In [16]:
# define y_train
y_train = X['loan_status_high_risk']

In [6]:
# Drop the loan_status (target column) to create the X data
X_train = X.drop(['loan_status_high_risk', 'loan_status_low_risk'], axis=1)
X_train.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,0.0,39728.0,...,False,True,False,True,True,False,True,False,True,False
1,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,0.0,9585.0,...,False,True,False,True,True,False,True,False,True,False
2,20000.0,0.124,448.95,197000.0,11.28,0.0,0.0,12.0,0.0,16708.0,...,False,True,False,True,True,False,True,False,True,False
3,3000.0,0.124,100.22,45000.0,18.08,0.0,0.0,12.0,1.0,8809.0,...,False,True,False,True,True,False,True,False,True,False
4,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,0.0,65420.0,...,False,True,False,True,True,False,True,False,True,False


## Define Test Data Set

In [7]:
test_df = test_df.drop(columns=['Unnamed: 0', 'index'])
test_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,dti,delinq_2yrs,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,19.75,0.0,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,11.52,2.0,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,6.74,0.0,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,12.13,0.0,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,3600.0,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,16.08,0.0,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [8]:
# Convert categorical data to numeric and separate target feature for testing data
test_data = pd.get_dummies(test_df)

test_data.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,loan_status_high_risk,loan_status_low_risk,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,9471.0,...,False,True,True,False,True,True,False,True,False,True
1,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,1280.0,...,False,True,True,False,True,True,False,True,False,True
2,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,4757.0,...,False,True,True,False,True,True,False,True,False,True
3,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,12731.0,...,False,True,True,False,True,True,False,True,False,True
4,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,10413.0,...,False,True,True,False,True,True,False,True,False,True


In [9]:
y_test = test_data['loan_status_high_risk'] 
X_test = test_data.drop(['loan_status_high_risk', 'loan_status_low_risk'], axis=1)

In [10]:
# Check if testing and training set have identical columns
the_diff = list(set(X_train.columns) - set(X_test.columns))
the_diff

['debt_settlement_flag_Y']

In [11]:
# Check what values we have in training set
X_train['debt_settlement_flag_Y'].unique()

array([False,  True])

In [12]:
# generate randomly values for test set
import random
dsf_Y = []
for x in range(0, len(X_test)):
    dsf_Y.append(random.randint(0,1))

In [13]:
# add missing dummy variables to testing set
X_test['debt_settlement_flag_Y'] = dsf_Y

## Logistic Resression and Random Forest on the Unscaled Data

### Predictions on what type of Model will perform better
* LR will do worse on unscaled data because features values range is too wide
* Random Forest (RF) being an ensemble-based learning algorithm should do better as it will form number of trees to get clearer model and weighs certain features more important than others

In [14]:
# Train the Logistic Regression model on the unscaled data and print the model score
lr = LogisticRegression(max_iter=400)

In [17]:
# Fit the model with train data
lr.fit(X_train, y_train)
print(f"Training Data Score: {lr.score(X_train, y_train)}")
print(f"Testing Data Score: {lr.score(X_test, y_test)}")

Training Data Score: 0.6886699507389162
Testing Data Score: 0.5499787324542748


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
# Train a Random Forest Classifier model and print the model score
rf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train, y_train)
print(f'Training Score: {rf.score(X_train, y_train)}')
print(f'Testing Score: {rf.score(X_test, y_test)}')

Training Score: 1.0
Testing Score: 0.6429179072735006


### Conclusion
* LR scored 0.547
* RF scored 0.644
* As we predicted RF scored better than LR on unscaled data

## Find Best Parameters with GridSearchCV

### We want to tune our models with hyperparameters to improve models score. 
* Expected: the LR should do better with tuned parameters
* RF model might do better with tuned parameters

In [19]:
from sklearn.model_selection import GridSearchCV

## Best Parameters for LR Model

In [20]:
param_grid = {'C': [1, 5, 10, 50],
             'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
param_grid

{'C': [1, 5, 10, 50],
 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}

In [25]:
grid_lr = GridSearchCV(lr, param_grid, verbose=3, refit="accuracy")

In [26]:
grid_lr.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


[CV 1/5] END .............C=1, solver=newton-cg;, score=0.667 total time=  19.9s


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


[CV 2/5] END .............C=1, solver=newton-cg;, score=0.678 total time=  22.2s


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


[CV 3/5] END .............C=1, solver=newton-cg;, score=0.709 total time=  20.2s


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


[CV 4/5] END .............C=1, solver=newton-cg;, score=0.710 total time=  21.5s


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


[CV 5/5] END .............C=1, solver=newton-cg;, score=0.713 total time=  20.2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END .................C=1, solver=lbfgs;, score=0.627 total time=   0.7s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END .................C=1, solver=lbfgs;, score=0.680 total time=   0.7s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END .................C=1, solver=lbfgs;, score=0.702 total time=   0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END .................C=1, solver=lbfgs;, score=0.705 total time=   0.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END .................C=1, solver=lbfgs;, score=0.684 total time=   0.5s
[CV 1/5] END .............C=1, solver=liblinear;, score=0.671 total time=   1.1s
[CV 2/5] END .............C=1, solver=liblinear;, score=0.675 total time=   1.5s
[CV 3/5] END .............C=1, solver=liblinear;, score=0.713 total time=   1.2s
[CV 4/5] END .............C=1, solver=liblinear;, score=0.715 total time=   1.5s
[CV 5/5] END .............C=1, solver=liblinear;, score=0.721 total time=   1.5s




[CV 1/5] END ...................C=1, solver=sag;, score=0.574 total time=   1.7s




[CV 2/5] END ...................C=1, solver=sag;, score=0.545 total time=   1.7s




[CV 3/5] END ...................C=1, solver=sag;, score=0.605 total time=   1.7s




[CV 4/5] END ...................C=1, solver=sag;, score=0.605 total time=   1.7s




[CV 5/5] END ...................C=1, solver=sag;, score=0.602 total time=   1.8s




[CV 1/5] END ..................C=1, solver=saga;, score=0.571 total time=   1.8s




[CV 2/5] END ..................C=1, solver=saga;, score=0.546 total time=   1.8s




[CV 3/5] END ..................C=1, solver=saga;, score=0.594 total time=   1.8s




[CV 4/5] END ..................C=1, solver=saga;, score=0.592 total time=   1.8s




[CV 5/5] END ..................C=1, solver=saga;, score=0.588 total time=   1.8s


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


[CV 1/5] END .............C=5, solver=newton-cg;, score=0.666 total time=  28.1s


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


[CV 2/5] END .............C=5, solver=newton-cg;, score=0.677 total time=  28.7s


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


[CV 3/5] END .............C=5, solver=newton-cg;, score=0.709 total time=  21.4s


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


[CV 4/5] END .............C=5, solver=newton-cg;, score=0.711 total time=  20.6s


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


[CV 5/5] END .............C=5, solver=newton-cg;, score=0.713 total time=  23.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END .................C=5, solver=lbfgs;, score=0.625 total time=   0.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END .................C=5, solver=lbfgs;, score=0.678 total time=   0.7s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END .................C=5, solver=lbfgs;, score=0.711 total time=   0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END .................C=5, solver=lbfgs;, score=0.700 total time=   0.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END .................C=5, solver=lbfgs;, score=0.691 total time=   0.9s
[CV 1/5] END .............C=5, solver=liblinear;, score=0.672 total time=   1.7s
[CV 2/5] END .............C=5, solver=liblinear;, score=0.674 total time=   1.5s
[CV 3/5] END .............C=5, solver=liblinear;, score=0.712 total time=   1.4s
[CV 4/5] END .............C=5, solver=liblinear;, score=0.712 total time=   1.2s
[CV 5/5] END .............C=5, solver=liblinear;, score=0.725 total time=   1.2s




[CV 1/5] END ...................C=5, solver=sag;, score=0.574 total time=   1.7s




[CV 2/5] END ...................C=5, solver=sag;, score=0.545 total time=   1.8s




[CV 3/5] END ...................C=5, solver=sag;, score=0.605 total time=   1.8s




[CV 4/5] END ...................C=5, solver=sag;, score=0.605 total time=   1.7s




[CV 5/5] END ...................C=5, solver=sag;, score=0.602 total time=   1.7s




[CV 1/5] END ..................C=5, solver=saga;, score=0.571 total time=   1.8s




[CV 2/5] END ..................C=5, solver=saga;, score=0.547 total time=   1.8s




[CV 3/5] END ..................C=5, solver=saga;, score=0.594 total time=   1.8s




[CV 4/5] END ..................C=5, solver=saga;, score=0.592 total time=   1.8s




[CV 5/5] END ..................C=5, solver=saga;, score=0.588 total time=   1.8s


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


[CV 1/5] END ............C=10, solver=newton-cg;, score=0.665 total time=  22.3s




[CV 2/5] END ............C=10, solver=newton-cg;, score=0.678 total time=  25.6s


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


[CV 3/5] END ............C=10, solver=newton-cg;, score=0.709 total time=  23.1s


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


[CV 4/5] END ............C=10, solver=newton-cg;, score=0.709 total time=  18.9s


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


[CV 5/5] END ............C=10, solver=newton-cg;, score=0.713 total time=  24.8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END ................C=10, solver=lbfgs;, score=0.620 total time=   0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END ................C=10, solver=lbfgs;, score=0.685 total time=   0.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END ................C=10, solver=lbfgs;, score=0.706 total time=   0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END ................C=10, solver=lbfgs;, score=0.699 total time=   0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END ................C=10, solver=lbfgs;, score=0.688 total time=   0.3s
[CV 1/5] END ............C=10, solver=liblinear;, score=0.674 total time=   1.2s
[CV 2/5] END ............C=10, solver=liblinear;, score=0.676 total time=   0.9s
[CV 3/5] END ............C=10, solver=liblinear;, score=0.712 total time=   1.5s
[CV 4/5] END ............C=10, solver=liblinear;, score=0.711 total time=   1.1s
[CV 5/5] END ............C=10, solver=liblinear;, score=0.726 total time=   1.0s




[CV 1/5] END ..................C=10, solver=sag;, score=0.574 total time=   1.7s




[CV 2/5] END ..................C=10, solver=sag;, score=0.545 total time=   1.7s




[CV 3/5] END ..................C=10, solver=sag;, score=0.605 total time=   1.7s




[CV 4/5] END ..................C=10, solver=sag;, score=0.605 total time=   1.8s




[CV 5/5] END ..................C=10, solver=sag;, score=0.602 total time=   1.7s




[CV 1/5] END .................C=10, solver=saga;, score=0.571 total time=   1.8s




[CV 2/5] END .................C=10, solver=saga;, score=0.547 total time=   1.8s




[CV 3/5] END .................C=10, solver=saga;, score=0.594 total time=   1.8s




[CV 4/5] END .................C=10, solver=saga;, score=0.592 total time=   1.8s




[CV 5/5] END .................C=10, solver=saga;, score=0.588 total time=   1.8s


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


[CV 1/5] END ............C=50, solver=newton-cg;, score=0.665 total time=  20.2s


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


[CV 2/5] END ............C=50, solver=newton-cg;, score=0.678 total time=  17.8s


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


[CV 3/5] END ............C=50, solver=newton-cg;, score=0.709 total time=  26.5s


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


[CV 4/5] END ............C=50, solver=newton-cg;, score=0.709 total time=  22.4s


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


[CV 5/5] END ............C=50, solver=newton-cg;, score=0.713 total time=  22.2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END ................C=50, solver=lbfgs;, score=0.625 total time=   0.8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END ................C=50, solver=lbfgs;, score=0.682 total time=   0.7s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END ................C=50, solver=lbfgs;, score=0.705 total time=   0.7s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END ................C=50, solver=lbfgs;, score=0.698 total time=   0.8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END ................C=50, solver=lbfgs;, score=0.693 total time=   0.9s
[CV 1/5] END ............C=50, solver=liblinear;, score=0.674 total time=   1.3s
[CV 2/5] END ............C=50, solver=liblinear;, score=0.677 total time=   1.2s
[CV 3/5] END ............C=50, solver=liblinear;, score=0.711 total time=   1.3s
[CV 4/5] END ............C=50, solver=liblinear;, score=0.714 total time=   1.1s
[CV 5/5] END ............C=50, solver=liblinear;, score=0.724 total time=   1.2s




[CV 1/5] END ..................C=50, solver=sag;, score=0.574 total time=   1.7s




[CV 2/5] END ..................C=50, solver=sag;, score=0.545 total time=   1.7s




[CV 3/5] END ..................C=50, solver=sag;, score=0.605 total time=   1.7s




[CV 4/5] END ..................C=50, solver=sag;, score=0.605 total time=   1.7s




[CV 5/5] END ..................C=50, solver=sag;, score=0.602 total time=   1.7s




[CV 1/5] END .................C=50, solver=saga;, score=0.571 total time=   1.8s




[CV 2/5] END .................C=50, solver=saga;, score=0.546 total time=   1.9s




[CV 3/5] END .................C=50, solver=saga;, score=0.594 total time=   1.9s




[CV 4/5] END .................C=50, solver=saga;, score=0.592 total time=   1.9s




[CV 5/5] END .................C=50, solver=saga;, score=0.588 total time=   1.8s


In [27]:
print(grid_lr.best_params_)

{'C': 50, 'solver': 'liblinear'}


In [28]:
from sklearn.metrics import accuracy_score

predictions = grid_lr.predict(X_test)

print(f" LR model with hyperparams accuracy: {accuracy_score(y_test, predictions):.3f}")

 LR model with hyperparams accuracy: 0.572


## Best Parameters for RF Model

In [29]:
param_grid1 = {'max_features': ['sqrt', 'log2'],
              'n_estimators': [10, 100, 200]
             }
param_grid1

{'max_features': ['sqrt', 'log2'], 'n_estimators': [10, 100, 200]}

In [30]:
grid_rf = GridSearchCV(rf, param_grid1, verbose=3, refit="accuracy")

In [31]:
grid_rf.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END max_features=sqrt, n_estimators=10;, score=0.630 total time=   0.3s
[CV 2/5] END max_features=sqrt, n_estimators=10;, score=0.693 total time=   0.2s
[CV 3/5] END max_features=sqrt, n_estimators=10;, score=0.764 total time=   0.2s
[CV 4/5] END max_features=sqrt, n_estimators=10;, score=0.764 total time=   0.2s
[CV 5/5] END max_features=sqrt, n_estimators=10;, score=0.766 total time=   0.2s
[CV 1/5] END max_features=sqrt, n_estimators=100;, score=0.657 total time=   2.2s
[CV 2/5] END max_features=sqrt, n_estimators=100;, score=0.733 total time=   2.3s
[CV 3/5] END max_features=sqrt, n_estimators=100;, score=0.784 total time=   2.4s
[CV 4/5] END max_features=sqrt, n_estimators=100;, score=0.799 total time=   2.3s
[CV 5/5] END max_features=sqrt, n_estimators=100;, score=0.814 total time=   2.3s
[CV 1/5] END max_features=sqrt, n_estimators=200;, score=0.661 total time=   4.6s
[CV 2/5] END max_features=sqrt, n_estimator

In [32]:
print(grid_rf.best_params_)

{'max_features': 'sqrt', 'n_estimators': 200}


In [33]:
predictionsRF = grid_rf.predict(X_test)

print(f" RF model with hyperparams accuracy: {accuracy_score(y_test, predictionsRF):.3f}")

 RF model with hyperparams accuracy: 0.650


### Conclusion
* We did not get significant improvement after tuning models, so it is not what we expected.
* GridSearch for LR Model with adjusted hyperparameters did slightly better than LR - 0.575 vs 0.547
* GridSearch for RF Model with hyperparams did not significantly improve comparing to RF - 0.648 vs 0.644

## Scaled data

### Predictions for scaled data
* LR Model should perfrom better on scaled data, as it will reduce the impact of initial wide range of values.
* RF is not that sensitive to the range of data point, so we are not expecting better score for RF.

In [34]:
# Scale the data
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [35]:
# Train the Logistic Regression model on the scaled data and print the model score
lr.fit(X_train_scaled, y_train)

In [36]:
print(f"Training Data Score: {lr.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {lr.score(X_test_scaled, y_test)}")

Training Data Score: 0.7078817733990148
Testing Data Score: 0.6614206720544449


In [37]:
# Train a Random Forest Classifier model on the scaled data and print the model score
rf.fit(X_train_scaled, y_train)

In [38]:
print(f"Training Data Score: {rf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {rf.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.5599744789451297


### Conclusion
* LR perfromed better on scaled data as it was expected - 0.66 comparing to 0.575
* RF scored less on scaled data - 0.56 vs 0.648, so as expected it did not improve, but event went lower.