#  Lending Club - Interest Rate Prediction Using Linear Regression, L1, L2, ElasticNet and Randomized Grid Search Hyperparameter tuning.


Here in this jupyter notebook, we bulid models to predict **'Interest_Rate'** with matric **'int_rate'**.

Importing the data and other libraries.

In [25]:
# Data structure packages
import pandas as pd
import numpy as np

# Machine Learning packages
import time
import math
from scipy import stats
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from PIL import Image

# Visualization packages
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("ggplot")

dataset optimised for Regression 

In [5]:
# load data
loan = pd.read_csv("LendingData.csv", low_memory=False)

In [6]:
# Should exclude other response features before modeling
no_other_response_loan = loan.drop(['sub_grade','int_rate'], axis = 1)

In [7]:
no_other_response_loan.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,887340,887341,887342,887343,887344,887345,887346,887347,887348,887349
Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,887369,887370,887371,887372,887373,887374,887375,887376,887377,887378
loan_amnt,5000,2500,2400,10000,3000,5000,7000,3000,5600,5375,...,4000,7500,10850,12000,11575,10000,24000,13000,12000,20000
term,36,60,36,36,60,36,60,36,60,60,...,36,36,36,36,36,36,36,60,60,36
installment,162.87,59.83,84.33,339.31,67.79,156.46,170.08,109.43,152.39,121.45,...,126.59,240.22,399.04,398.52,404.61,332.1,797.03,316.07,317.86,664.2
emp_length,10,1,10,10,1,3,8,9,4,1,...,10,8,0,1,10,8,10,5,1,10
annual_inc,24000,30000,12252,49200,80000,36000,47004,48000,40000,15000,...,50000,40000,32000,63000,25400,31000,79000,35000,64400,100000
issue_d,Dec-2011,Dec-2011,Dec-2011,Dec-2011,Dec-2011,Dec-2011,Dec-2011,Dec-2011,Dec-2011,Dec-2011,...,Jan-2015,Jan-2015,Jan-2015,Jan-2015,Jan-2015,Jan-2015,Jan-2015,Jan-2015,Jan-2015,Jan-2015
dti,27.65,1,8.72,20,17.94,11.2,23.51,5.35,5.55,18.08,...,12.63,25.63,29.44,23.69,27.08,28.69,3.9,30.9,27.19,10.83
delinq_2yrs,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
open_acc,3,3,2,10,15,9,7,4,11,2,...,11,10,9,13,9,9,5,9,17,8


In [8]:
# exclude Loan payment related features which are unavailable when we try to predict Interest Rate
pyment_related = ['total_pymnt' ,'total_pymnt_inv','out_prncp']

In [9]:
# exclude target label related features
no_other_response_loan = no_other_response_loan.drop(pyment_related, axis = 1)
y = loan.int_rate.values

Categorical Feature Transformation
One-hot encoding 

In [10]:
# Collect column names, check missing value 
one_hot_column = []
dummy_column = []

for each in no_other_response_loan:
    if no_other_response_loan[each].dtype == 'object':
        # Check NA values again
        print(each, no_other_response_loan[each].isnull().sum())
        if len(no_other_response_loan[each].unique()) <= 2:
            one_hot_column.append(each)
        elif len(no_other_response_loan[each].unique()) > 2:
            dummy_column.append(each)

issue_d 0


In [11]:
# One-hot encoding for categorical variables containing two levels
le = LabelEncoder()
le_count = 0

for col in one_hot_column:
    print(col)
    le.fit(no_other_response_loan[col])
    no_other_response_loan[col] = le.transform(no_other_response_loan[col])
            
    # numbe of columns that were label encoded
    le_count += 1
            
print('%d columns were label encoded.' % le_count)

0 columns were label encoded.


### 2.2 Dummy features

In [104]:
# Create dummy features
sum_len = 0
for col in dummy_column:
    sum_len += len(no_other_response_loan[col].unique())
    print(col, len(no_other_response_loan[col].unique()))
    
print('In total:', sum_len - len(dummy_column), 'will be additionally added.')

issue_d 103
In total: 102 will be additionally added.


Data preprocessing
Split data into train and test sets

In [105]:
# Seperate features into x (independent features) and y (dependent feature)
print(no_other_response_loan.shape)
print(y.shape)

(887350, 34)
(887350,)


In [106]:
#Correct dummy transformation
x = pd.get_dummies(no_other_response_loan, columns = dummy_column, sparse=True)
print(x.shape)

(887350, 136)


In [107]:
# Split data into train and test (80% & 20%)
x_train, x_test, y_train, y_test = train_test_split(x, y, 
                                                    test_size = 0.2, random_state = 1)

In [108]:
x_train.head()


Unnamed: 0.1,Unnamed: 0,loan_amnt,term,installment,emp_length,annual_inc,dti,delinq_2yrs,open_acc,total_acc,...,issue_d_Oct-2015,issue_d_Sep-2007,issue_d_Sep-2008,issue_d_Sep-2009,issue_d_Sep-2010,issue_d_Sep-2011,issue_d_Sep-2012,issue_d_Sep-2013,issue_d_Sep-2014,issue_d_Sep-2015
688870,688899,15000.0,60,361.67,1,50000,15.6,0.0,7.0,7.0,...,0,0,0,0,0,0,0,0,0,0
400426,400455,3125.0,36,107.78,4,43000,9.77,0.0,10.0,24.0,...,0,0,0,0,0,0,0,0,0,0
276320,276349,25000.0,60,533.03,10,62000,12.68,0.0,8.0,19.0,...,0,0,0,0,0,0,0,0,0,0
344799,344828,20050.0,60,476.89,10,60000,16.62,0.0,8.0,24.0,...,0,0,0,0,0,0,0,0,0,0
844750,844779,11925.0,36,438.58,1,600000,2.27,1.0,10.0,33.0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Before modeling, standardize each feature
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

x_train_std = scaler.fit_transform(x_train)
x_test_std = scaler.transform(x_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  import sys


In [109]:
print(x_train_std.shape)
print(x_test_std.shape)
print(len(y_train))
print(len(y_test))

(709880, 136)
(177470, 136)
709880
177470


# Functions

In [29]:
def mean_absolute_percentage_error(y_true, y_pred): 
    '''
    Calculates mean absolute error of the true and predicted values.
    '''
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    count = 0
    sum = 0
    for a, p in zip(y_true, y_pred):
        if(a!=0):
            sum+=(abs(a-p)/a)
            count+=1
    return (sum/count) * 100

In [58]:
def view_metrics(y_test, predictions, algorithm, samples, total_samples):
    '''
    Plots the true and predicted values and prints RMS, MAE and MAPE metrics.

    
     fig = plt.figure(figsize=(18,5))
     plt.scatter(x=[i for i in range(total_samples)], y=[x for x in predictions], label='Predicted values')
     plt.scatter(x=[i for i in range(total_samples)], y=[x[0] for x in y_test.values], label='True values')
     plt.title('True and Predicted values for test dataset - %s' % algorithm)
     plt.legend()
     plt.show()
     '''

   # RMS =  np.sqrt(mean_squared_error(y_test, predictions, multioutput='raw_values'))
   # MAE = mean_absolute_error(y_test, predictions, multioutput='raw_values')
    MAPE = mean_absolute_percentage_error(y_test, predictions)
   # print("Root Mean Squared Error:", RMS)
   # print("Mean Absolute Error", MAE) 
    print("Mean Absolute Percentage Error", MAPE)
    return MAPE

## 4. Modeling
- 4.1 Linear Regression
- 4.2 Linear Regression L1
- 4.3 Linear Regression L2
- 4.4 Linear Regression ElasticNet    
    - 4.1.1 Randomized Grid Search - Linear Regression
    - 4.1.2 Model performance evaluation: MAPE


### 4.1 Linear Regression

In [59]:
lin_reg_model = LinearRegression() # build model
lin_reg_model= lin_reg_model.fit(x_train, y_train) # train model
R2 = lin_reg_model.score(x_train, y_train) # coefficient of determination
print(lin_reg_model.coef_)

[-5.39898281e-07 -1.42755688e-03  4.79720040e-01  4.61873065e-02
 -6.50949724e-03 -3.25398448e-06  1.45259312e-02  2.28629829e-01
  1.74671901e-02 -2.61559884e-02  6.14363390e-01  3.02354350e+00
  4.09804228e+00  3.95628971e+00  3.36179008e+00  3.54960154e+00
 -7.17273849e-01  7.79668237e-01  4.40523508e-01  1.18212795e+00
 -2.61565613e-01  6.55778816e-01  9.01195523e-01  7.26078193e-01
  2.31852928e+00  5.73880370e-01  2.29276402e+00  3.27512568e+00
  2.32389911e+00  2.83067285e+00  2.42323885e+00  2.79361765e+00
  1.70954517e+00  3.89822494e+00  1.37578076e-02 -5.06099020e-01
 -2.40315311e+00 -3.00307255e-01  1.20656793e+00  1.09282229e+00
  1.74773066e-01  7.29984852e-01  2.35070574e+00 -2.56737408e-01
 -1.81495361e+00 -1.49842316e+00  3.69597364e-01  7.57026062e-01
  7.99740696e-01  2.54637169e-02  4.66864330e-02  7.75658030e-01
  7.55761018e-02 -2.14771514e+00 -4.03269195e-01  8.07180527e-01
  6.26316587e-01 -2.11981141e-01 -1.58090184e-01  5.06929542e-02
  2.59954599e-01 -3.24598

In [60]:
lin_reg_model_predictions = lin_reg_model.predict(x_test) # make predictions

In [61]:
lin_reg_model_metrics = view_metrics(y_test, lin_reg_model_predictions,'Linear Regression', samples = 50, total_samples=2000)


Mean Absolute Percentage Error 19.477674847707945


# 4.2 L1

In [89]:
from sklearn import linear_model
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score
reg = linear_model.Lasso(alpha=0.1)
#reg.fit([[0, 0], [1, 1]], [0, 1])
reg=Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
reg.fit([[0, 0], [1, 1]], [0, 1])
#reg.predict([[1, 1]])

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [113]:
prediction = reg.predict([[1, 1]])
#prediction = LR.predict(x_test_std)

In [114]:
# Print out important metrics
# Mean squared error:, Variance score:

print("Mean squared error: %.4f" % mean_squared_error(y_test, prediction))
print('Variance score: %.4f' % r2_score(y_test, prediction))

Mean Absolute Percentage Error 19.477674847707945


# 4.3 L2

In [97]:
from sklearn import linear_model
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score

reg = linear_model.Ridge(alpha=.5)
#reg.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1]) 
reg=Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)
reg.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1])

Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [102]:
prediction = reg.predict([[1, 1]])

In [112]:
print("Mean squared error: %.4f" % mean_squared_error(y_test, prediction))
print('Variance score: %.4f' % r2_score(y_test, prediction))
MAPE = mean_absolute_percentage_error(y_test, prediction)
print("Mean Absolute Percentage Error", MAPE)

Mean Absolute Percentage Error 19.477674847707945


# 4.4 ElasticNet

In [82]:
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score

LR = ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=0, selection='cyclic', tol=0.0001, warm_start=False)
LR.fit(x_train_std, y_train)

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=0, selection='cyclic', tol=0.0001, warm_start=False)

In [83]:
prediction = LR.predict(x_test_std)

In [86]:
# Print out important metrics
# Mean squared error: 14.80, Variance score: 0.34
print("Mean squared error: %.4f" % mean_squared_error(y_test, prediction))
print('Variance score: %.4f' % r2_score(y_test, prediction))
MAPE = mean_absolute_percentage_error(y_test, prediction)
print("Mean Absolute Percentage Error", MAPE)

Mean squared error: 15.9322
Variance score: 0.1671
Mean Absolute Percentage Error 28.527828817705476


### 4.1.1 Randomized Grid Search - Linear Regression

In [23]:
from sklearn.model_selection import RandomizedSearchCV
# Record the randomized grid searching time
start_time = time.time()

# Construct a decision tree regression model
LR = ElasticNet()

# Construct ranges for each parameter 
param_grid = {'alpha': np.linspace(0.1,20,20),
              
             'l1_ratio': np.linspace(0,1,10)
             }

# Do the randomized grid searching 10 times 
# Find the optimal combination of parameters such that the minimum mean squared error attained
grid_obj = RandomizedSearchCV(LR, param_distributions=param_grid, cv=5, scoring='neg_mean_squared_error',
                              n_iter = 10)
grid_obj.fit(x_train_std, y_train)

print('execution time', time.time() - start_time)



execution time 2875.7467744350433


In [73]:
# Call back the optimal combination of parameters 
LR_best = grid_obj.best_estimator_
LR_best

ElasticNet(alpha=0.1, copy_X=True, fit_intercept=True,
      l1_ratio=0.5555555555555556, max_iter=1000, normalize=False,
      positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [74]:
grid_search_result = pd.DataFrame({'rank': grid_obj.cv_results_['rank_test_score'].tolist(),
                                   'params': grid_obj.cv_results_['params'],
                                   'mean_test_score': grid_obj.cv_results_['mean_test_score'],
                                   'mean_train_score': grid_obj.cv_results_['mean_train_score'],
                                   'std_test_score': grid_obj.cv_results_['std_test_score'],
                                   'std_train_score': grid_obj.cv_results_['std_train_score']
                                  })

grid_search_result



Unnamed: 0,rank,params,mean_test_score,mean_train_score,std_test_score,std_train_score
0,5,"{'l1_ratio': 0.7777777777777777, 'alpha': 16.8...",-19.219067,-19.21901,0.060158,0.015043
1,5,"{'l1_ratio': 0.4444444444444444, 'alpha': 5.33...",-19.219067,-19.21901,0.060158,0.015043
2,2,"{'l1_ratio': 0.0, 'alpha': 0.1}",-11.86412,-11.520919,0.692999,0.07035
3,5,"{'l1_ratio': 0.7777777777777777, 'alpha': 14.7...",-19.219067,-19.21901,0.060158,0.015043
4,5,"{'l1_ratio': 0.2222222222222222, 'alpha': 14.7...",-19.219067,-19.21901,0.060158,0.015043
5,3,"{'l1_ratio': 0.0, 'alpha': 7.43157894736842}",-17.520431,-17.519324,0.059571,0.015012
6,4,"{'l1_ratio': 0.4444444444444444, 'alpha': 3.24...",-18.657239,-18.657165,0.060159,0.015634
7,5,"{'l1_ratio': 0.8888888888888888, 'alpha': 6.38...",-19.219067,-19.21901,0.060158,0.015043
8,5,"{'l1_ratio': 0.7777777777777777, 'alpha': 20.0}",-19.219067,-19.21901,0.060158,0.015043
9,1,"{'l1_ratio': 0.5555555555555556, 'alpha': 0.1}",-11.781343,-11.474272,0.608948,0.067936


### 4.1.2 Model performance evaluation: MSE and R-squared

In [87]:
LR_best.fit(x_train_std, y_train)

# Mean squared error: 2.12, Variance score: 0.91

# Make a prediction based on test data
prediction = LR_best.predict(x_test_std)

# MSE (mean squared error)
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, prediction))
# R-square 
print('Variance score: %.2f' % r2_score(y_test, prediction))

#MAPE
MAPE = mean_absolute_percentage_error(y_test, prediction)
print("Mean Absolute Percentage Error", MAPE)

Mean squared error: 11.59
Variance score: 0.39
Mean Absolute Percentage Error 23.74286307370759
