# Gia Gillis

## Loan Interest Rate Analysis Part 3 of 3

### Create, apply, and score models.

Import necessary libraries.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_regression

Function to split train and test data and scale data

In [45]:
def split_and_scale(train_x, train_y, test_size):
    x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, test_size=test_size)

    #Scaling data
    """
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train, y_train)
    x_test = scaler.transform(x_test)
    """
    return x_train, x_test, y_train, y_test

Load data, drop Loan Id and Borrower Id columns, and change dates strings to date objects.

In [49]:
loans=pd.read_csv(r'data\clean_loan_interest_rates_2.csv', parse_dates=True)
loans['Loan Date']=pd.to_datetime(loans['Loan Date'], format='%Y-%m-%d')
loans['Credit Line Date']=pd.to_datetime(loans['Credit Line Date'], format='%Y-%m-%d')
backup=loans.copy()

In [50]:
loans.columns

Index(['Interest Rate', 'Requested', 'Annual Income', 'Loan Date', 'Ratio',
       'Late Payments', 'Credit Line Date', 'Months Del', 'Months PR',
       'Derog Recs',
       ...
       'State_TN.1', 'State_TX.1', 'State_UT.1', 'State_VA.1', 'State_VT.1',
       'State_WA.1', 'State_WI.1', 'State_WV.1', 'State_WY.1', 'Status_W.1'],
      dtype='object', length=179)

In [51]:
loans.shape

(287001, 179)

Separate data into features x and label y.

In [53]:
#Dropping data columns for now
loans=loans.drop(['Loan Date', 'Credit Line Date'], axis=1)

In [54]:
X = loans.iloc[:,1:]
y = loans.iloc[:,0]

In [55]:
X.shape

(287001, 176)

In [56]:
y.shape

(287001,)

In [57]:
X.head()

Unnamed: 0,Requested,Annual Income,Ratio,Late Payments,Months Del,Months PR,Derog Recs,Credit Lines,Number of Payments_ 60 MONTHS,Loan Grade_B,...,State_TN.1,State_TX.1,State_UT.1,State_VA.1,State_VT.1,State_WA.1,State_WI.1,State_WV.1,State_WY.1,Status_W.1
0,25000.0,85000.0,19.48,0.0,0.0,0.0,0.0,42.0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,7000.0,65000.0,14.29,0.0,0.0,0.0,0.0,7.0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,25000.0,70000.0,10.5,0.0,41.0,0.0,0.0,17.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1200.0,54000.0,5.47,0.0,64.0,0.0,0.0,31.0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,10800.0,32000.0,11.63,0.0,58.0,0.0,0.0,40.0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
y.head()

0    0.1189
1    0.1071
2    0.1699
3    0.1311
4    0.1357
Name: Interest Rate, dtype: float64

#### Separate data into test and train and scale data.

In [65]:
X_train, X_test, y_train, y_test = split_and_scale(X, y, .20)

### Models

#### Build, apply, and score models.

#### Linear Regression

In [72]:
lm=LinearRegression()
lm.fit(X_train, y_train)
y_pred=lm.predict(X_test)
mse=mean_squared_error(y_test, y_pred)
print('Linear Regression Mean Squared Error ', mse)
print('Linear Regression ROOT Mean Squared Error ', np.sqrt(mse))

Linear Regression Mean Squared Error  0.00015239600071529684
Linear Regression ROOT Mean Squared Error  0.012344877509124861
Linear Regression R2  0.920451289340929


In [78]:
from sklearn.model_selection import cross_val_score
cvs=cross_val_score(lm, X, y, cv=4)
cvs.mean()

0.9004581315926914

#### Ridge Regression

In [80]:
r=Ridge(alpha=.1)
r.fit(X_train, y_train)
y_pred=r.predict(X_test)
mse=mean_squared_error(y_test, y_pred)
print('Ridge Mean Squared Error ', mse)
print('Ridge ROOT Mean Squared Error ', np.sqrt(mse))

Ridge Mean Squared Error  0.00015239480012040966
Ridge ROOT Mean Squared Error  0.01234482888177919


In [81]:
cvs=cross_val_score(r, X, y, cv=4)
print(cvs.mean())

0.90045805626066


The model with a lower MSE is a better fit for the data.  The two models are very similar.

#### Lasso Regression

In [82]:
l=Lasso()
l.fit(X_train, y_train)
y_pred=l.predict(X_test)
mse=mean_squared_error(y_test, y_pred)
print('Lasso Mean Squared Error ', mse)
print('Lasso ROOT Mean Squared Error ', np.sqrt(mse))
cvs=cross_val_score(l, X, y, cv=4)
print(cvs.mean())

Lasso Mean Squared Error  0.0018439210963400192
Lasso ROOT Mean Squared Error  0.04294090236988528
-0.0014725186889326673


Scores for Lasso Regression are worse than Linear Regression and Ridge, so it won't be explored further

### Feature Selection

#### Use feature selection to see if a subset of the features gives better scores.

In [83]:
X.shape

(287001, 176)

In [84]:
y.shape

(287001,)

In [85]:
k=90
kbest= SelectKBest(f_regression, k=k)
select_train_x=kbest.fit_transform(X, y)

In [86]:
select_train_x.shape

(287001, 90)

In [87]:
x_train, x_test, y_train, y_test = split_and_scale(select_train_x, train_y, .20)

#### Ridge Regression with Feature Selection

In [89]:
r=Ridge()
r.fit(x_train, y_train)
y_pred=r.predict(x_test)
print('features k ', k)
mse=mean_squared_error(y_test, y_pred)
print('Ridge Mean Squared Error ', mse)
print('Ridge ROOT Mean Squared Error ', np.sqrt(mse))
cvs=cross_val_score(l, X, y, cv=4)
print(cvs.mean())

features k  90
Ridge Mean Squared Error  0.00015206543196393056
Ridge ROOT Mean Squared Error  0.012331481336965586
-0.0014725186889326673


Number of features of 120 yields the highest r2 value for Ridge regression, but again, the r2 still didn't get above .82.

In [31]:
from sklearn.model_selection import cross_val_score
scores=cross_val_score(model, x_train, y_train, cv=10, scoring='neg_mean_squared_error')
np.sqrt(-scores)

#### ElasticNet with Feature Selection

In [91]:
e=ElasticNet()
e.fit(x_train, y_train)
y_pred=e.predict(x_test)
print('features k ', k)
mse=mean_squared_error(y_test, y_pred)
print('EN Mean Squared Error ', mse)
print('EN ROOT Mean Squared Error ', np.sqrt(mse))
cvs=cross_val_score(l, X, y, cv=4)
print(cvs.mean())

features k  90
EN Mean Squared Error  0.0018492892392467368
EN ROOT Mean Squared Error  0.043003363115537104
-0.0014725186889326673


In [92]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor

#### DecisionTreeRegressor with Feature Selection

In [93]:
dt=DecisionTreeRegressor()
dt.fit(x_train, y_train)
y_pred=dt.predict(x_test)
print('features k ', k)
mse=mean_squared_error(y_test, y_pred)
print('Mean Squared Error ', mse)
print ('ROOT mse', np.sqrt(mse))

features k  90
Mean Squared Error  0.00029440222696006655
ROOT mse 0.017158153366841858


#### GradientBoosting Regressor with Feature Selection.

In [94]:
gb=GradientBoostingRegressor()
gb.fit(x_train, y_train)
y_pred=gb.predict(x_test)
print('features k ', k)
mse=mean_squared_error(y_test, y_pred)
print('Mean Squared Error ', mse)
print ('ROOT mse', np.sqrt(mse))

features k  90
Mean Squared Error  0.00016566797560851903
ROOT mse 0.012871207231977854
