# Gia Gillis

## Loan Interest Rate Analysis Part 3 of 3

### Create, apply, and score models.

Import necessary libraries.

In [69]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_regression

Function to replace null values.

In [70]:
def replace_null_values(x, strategy):
    imputer = SimpleImputer(missing_values=np.nan,strategy=strategy)
    imputer = imputer.fit(x)
    x = imputer.transform(x)
    x = pd.DataFrame.from_records(x)
    return x

In [71]:
def split_and_scale(train_x, train_y, test_size):
    x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, test_size=test_size)

    #Scaling data
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train, y_train)
    x_test = scaler.transform(x_test)
    return x_train, x_test, y_train, y_test

Load data, drop Loan Id and Borrower Id columns, and change dates strings to date objects.

In [72]:
clean_loans=pd.read_csv(r'C:\Users\Gia\Downloads\Analyst_Test\Analyst_Test\clean_loan_interest_rates.csv', parse_dates=True)
clean_loans['Loan Date']=pd.to_datetime(clean_loans['Loan Date'], format='%Y-%m-%d')
clean_loans['Credit Line Date']=pd.to_datetime(clean_loans['Credit Line Date'], format='%Y-%m-%d')

In [73]:
clean_loans.columns

Index(['Interest Rate', 'Requested', 'Funded', 'Investor Funded',
       'Number of Payments', 'Loan Grade', 'Loan Subgrade', 'Job',
       'Years Employed', 'Home', 'Annual Income', 'Income Verified',
       'Loan Date', 'Loan Cat', 'State', 'Ratio', 'Late Payments',
       'Credit Line Date', 'Months Del', 'Months PR', 'Derog Recs',
       'Credit Lines', 'Status'],
      dtype='object')

Dropping all rows with null values resulted in less than half of the data, so this is unused and null values need to be replaced instead.

In [13]:
#clean_loans.dropna(inplace=True)

In [74]:
clean_loans.isnull().sum()

Interest Rate             0
Requested                 1
Funded                    1
Investor Funded           1
Number of Payments        1
Loan Grade            51851
Loan Subgrade         51851
Job                   20251
Years Employed        14793
Home                  51960
Annual Income         51734
Income Verified           1
Loan Date                 1
Loan Cat                  1
State                     1
Ratio                     1
Late Payments             1
Credit Line Date          1
Months Del                0
Months PR                 0
Derog Recs                1
Credit Lines              1
Status                    1
dtype: int64

In [81]:
indexNames = clean_loans[clean_loans['Loan Grade'].isnull()].index
indexNames

Int64Index([   147,    164,    194,    204,    206,    215,    216,    217,
               242,    245,
            ...
            338772, 338784, 338800, 338812, 338831, 338833, 338836, 338838,
            338839, 338843],
           dtype='int64', length=51851)

In [82]:
# Delete these row indexes from dataFrame
clean_loans.drop(indexNames, inplace=True)

In [102]:
clean_loans.reindex()

Unnamed: 0,Interest Rate,Requested,Funded,Investor Funded,Number of Payments,Loan Grade,Loan Subgrade,Job,Years Employed,Home,...,Loan Cat,State,Ratio,Late Payments,Credit Line Date,Months Del,Months PR,Derog Recs,Credit Lines,Status
0,0.1189,25000.0,25000.0,19080.0,36 months,B,B4,,< 1 year,RENT,...,debt_consolidation,CA,19.48,0.0,1994-02-01,0.0,0.0,0.0,42.0,f
1,0.1071,7000.0,7000.0,673.0,36 months,B,B5,Cnn,< 1 year,RENT,...,credit_card,NY,14.29,0.0,2000-10-01,0.0,0.0,0.0,7.0,f
2,0.1699,25000.0,25000.0,24725.0,36 months,D,D3,Web Programmer,1 year,RENT,...,debt_consolidation,NY,10.50,0.0,2000-06-01,41.0,0.0,0.0,17.0,f
3,0.1311,1200.0,1200.0,1200.0,36 months,C,C2,City Of Beaumont Texas,10+ years,OWN,...,debt_consolidation,TX,5.47,0.0,1985-01-01,64.0,0.0,0.0,31.0,f
4,0.1357,10800.0,10800.0,10692.0,36 months,C,C3,State Farm Insurance,6 years,RENT,...,debt_consolidation,CT,11.63,0.0,1996-12-01,58.0,0.0,0.0,40.0,f
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338847,0.1299,10000.0,10000.0,10000.0,60 months,C,C1,Administrative Assistant,8 years,RENT,...,credit_card,FL,21.51,0.0,2003-11-01,26.0,0.0,0.0,20.0,w
338848,0.1629,13150.0,13150.0,13150.0,36 months,D,D2,Helper,1 year,OWN,...,debt_consolidation,TX,29.76,0.0,2007-10-01,38.0,0.0,0.0,21.0,f
338849,0.1099,20000.0,20000.0,20000.0,60 months,B,B3,Facility Administrator,1 year,MORTGAGE,...,credit_card,TX,24.13,0.0,2004-10-01,63.0,0.0,0.0,48.0,w
338850,0.1757,18475.0,18475.0,18475.0,60 months,D,D4,Senior Creative Designer/Ad Sales,10+ years,OWN,...,debt_consolidation,TX,31.43,0.0,1994-03-01,0.0,0.0,0.0,31.0,f


In [103]:
clean_loans.shape

(287001, 23)

In [104]:
clean_loans.isnull().sum()

Interest Rate             0
Requested                 0
Funded                    0
Investor Funded           0
Number of Payments        0
Loan Grade                0
Loan Subgrade             0
Job                   17067
Years Employed        12449
Home                  44128
Annual Income         43974
Income Verified           0
Loan Date                 0
Loan Cat                  0
State                     0
Ratio                     0
Late Payments             0
Credit Line Date          0
Months Del                0
Months PR                 0
Derog Recs                0
Credit Lines              0
Status                    0
dtype: int64

Separate data into features x and label y.

In [126]:
train_x = clean_loans.iloc[:,1:]
train_y = clean_loans.iloc[:,0]

In [128]:
train_x.shape

(287001, 22)

In [129]:
train_y.shape

(287001,)

In [130]:
train_x.head()

Unnamed: 0,Requested,Funded,Investor Funded,Number of Payments,Loan Grade,Loan Subgrade,Job,Years Employed,Home,Annual Income,...,Loan Cat,State,Ratio,Late Payments,Credit Line Date,Months Del,Months PR,Derog Recs,Credit Lines,Status
0,25000.0,25000.0,19080.0,36 months,B,B4,,< 1 year,RENT,85000.0,...,debt_consolidation,CA,19.48,0.0,1994-02-01,0.0,0.0,0.0,42.0,f
1,7000.0,7000.0,673.0,36 months,B,B5,Cnn,< 1 year,RENT,65000.0,...,credit_card,NY,14.29,0.0,2000-10-01,0.0,0.0,0.0,7.0,f
2,25000.0,25000.0,24725.0,36 months,D,D3,Web Programmer,1 year,RENT,70000.0,...,debt_consolidation,NY,10.5,0.0,2000-06-01,41.0,0.0,0.0,17.0,f
3,1200.0,1200.0,1200.0,36 months,C,C2,City Of Beaumont Texas,10+ years,OWN,54000.0,...,debt_consolidation,TX,5.47,0.0,1985-01-01,64.0,0.0,0.0,31.0,f
4,10800.0,10800.0,10692.0,36 months,C,C3,State Farm Insurance,6 years,RENT,32000.0,...,debt_consolidation,CT,11.63,0.0,1996-12-01,58.0,0.0,0.0,40.0,f


In [131]:
train_y.head()

0    0.1189
1    0.1071
2    0.1699
3    0.1311
4    0.1357
Name: Interest Rate, dtype: float64

Divide features x into numeric data and categorical data, excluding datetime data from both.

In [132]:
numerical_data=train_x.select_dtypes(include=['float'])

In [133]:
categorical_data=train_x.select_dtypes(exclude=['float', 'datetime'])

In [134]:
categorical_data.columns

Index(['Number of Payments', 'Loan Grade', 'Loan Subgrade', 'Job',
       'Years Employed', 'Home', 'Income Verified', 'Loan Cat', 'State',
       'Status'],
      dtype='object')

Drop Job column because it caused problems when encoding categorical data.  Job column had too many values.

In [135]:
categorical_data.drop('Job', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [136]:
numerical_data.head()

Unnamed: 0,Requested,Funded,Investor Funded,Annual Income,Ratio,Late Payments,Months Del,Months PR,Derog Recs,Credit Lines
0,25000.0,25000.0,19080.0,85000.0,19.48,0.0,0.0,0.0,0.0,42.0
1,7000.0,7000.0,673.0,65000.0,14.29,0.0,0.0,0.0,0.0,7.0
2,25000.0,25000.0,24725.0,70000.0,10.5,0.0,41.0,0.0,0.0,17.0
3,1200.0,1200.0,1200.0,54000.0,5.47,0.0,64.0,0.0,0.0,31.0
4,10800.0,10800.0,10692.0,32000.0,11.63,0.0,58.0,0.0,0.0,40.0


In [137]:
categorical_data.head()

Unnamed: 0,Number of Payments,Loan Grade,Loan Subgrade,Years Employed,Home,Income Verified,Loan Cat,State,Status
0,36 months,B,B4,< 1 year,RENT,verified - income,debt_consolidation,CA,f
1,36 months,B,B5,< 1 year,RENT,not verified,credit_card,NY,f
2,36 months,D,D3,1 year,RENT,verified - income,debt_consolidation,NY,f
3,36 months,C,C2,10+ years,OWN,not verified,debt_consolidation,TX,f
4,36 months,C,C3,6 years,RENT,not verified,debt_consolidation,CT,f


Attempted to replace null values in categorical data using SimpleImputer, but the kernel would hang.

In [23]:
#Replace null values in both categorical data and numerical data
#categorical_data = replace_null_values(categorical_data, 'most_frequent')

In [138]:
categorical_data.isnull().sum()

Number of Payments        0
Loan Grade                0
Loan Subgrade             0
Years Employed        12449
Home                  44128
Income Verified           0
Loan Cat                  0
State                     0
Status                    0
dtype: int64

In [139]:
for column in categorical_data.columns:
    print(max(categorical_data[column].value_counts().index), max(categorical_data[column].value_counts()))

 60 months 209864
G 86080
G5 20342
< 1 year 91838
RENT 123495
verified - income source 107446
wedding 167771
WY 44605
w 196876


#### Replaced null in categorical data with the most frequent value from each column.

In [140]:
for column in categorical_data.columns:
    categorical_data[column].fillna(max(categorical_data[column].value_counts().index), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [141]:
categorical_data.isnull().sum()

Number of Payments    0
Loan Grade            0
Loan Subgrade         0
Years Employed        0
Home                  0
Income Verified       0
Loan Cat              0
State                 0
Status                0
dtype: int64

In [156]:
categorical_data.shape

(287001, 127)

In [157]:
categorical_data.index

Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            338841, 338842, 338844, 338845, 338846, 338847, 338848, 338849,
            338850, 338851],
           dtype='int64', length=287001)

In [159]:
categorical_data.reset_index(inplace=True)

In [160]:
categorical_data.index

RangeIndex(start=0, stop=287001, step=1)

#### Replaced null values in numerical data with mean.

In [142]:
numerical_data = replace_null_values(numerical_data, 'mean')

In [143]:
numerical_data.isnull().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
dtype: int64

In [144]:
categorical_data=pd.get_dummies(categorical_data)

In [161]:
categorical_data.shape

(287001, 128)

In [162]:
numerical_data.shape

(287001, 10)

In [150]:
numerical_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,25000.0,25000.0,19080.0,85000.0,19.48,0.0,0.0,0.0,0.0,42.0
1,7000.0,7000.0,673.0,65000.0,14.29,0.0,0.0,0.0,0.0,7.0
2,25000.0,25000.0,24725.0,70000.0,10.50,0.0,41.0,0.0,0.0,17.0
3,1200.0,1200.0,1200.0,54000.0,5.47,0.0,64.0,0.0,0.0,31.0
4,10800.0,10800.0,10692.0,32000.0,11.63,0.0,58.0,0.0,0.0,40.0
...,...,...,...,...,...,...,...,...,...,...
286996,10000.0,10000.0,10000.0,50000.0,21.51,0.0,26.0,0.0,0.0,20.0
286997,13150.0,13150.0,13150.0,30000.0,29.76,0.0,38.0,0.0,0.0,21.0
286998,20000.0,20000.0,20000.0,99000.0,24.13,0.0,63.0,0.0,0.0,48.0
286999,18475.0,18475.0,18475.0,42000.0,31.43,0.0,0.0,0.0,0.0,31.0


#### Unite categorical and numerical data after nulls are replaced, and categorical data is replace with dummies.

In [163]:
train_x = pd.concat([categorical_data, numerical_data], axis=1)

In [164]:
train_x.shape

(287001, 138)

In [165]:
train_y.shape

(287001,)

#### Separate data into test and train and scale data.

In [166]:
x_train, x_test, y_train, y_test = split_and_scale(train_x, train_y, .10)

### Models

#### Build, apply, and score models.

#### Linear Regression

In [167]:
lm=LinearRegression()
lm.fit(x_train, y_train)
y_pred=lm.predict(x_test)
print('Linear Regression Mean Squared Error ', mean_squared_error(y_test, y_pred))
print('Linear Regression R2 ', r2_score(y_test, y_pred))

Linear Regression Mean Squared Error  5.767558334622245e-05
Linear Regression R2  0.970024822438911


#### Ridge Regression

In [168]:
parameters= [{'alpha': [0.001,0.1,1, 10, 100, 1000, 10000, 100000, 100000]}]
r=Ridge()
grid = GridSearchCV(r, parameters,cv=4)
grid.fit(x_train, y_train)
best=grid.best_estimator_
best

Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='auto', tol=0.001)

In [169]:
y_pred=best.predict(x_test)

In [170]:
print('Ridge Mean Squared Error ', mean_squared_error(y_test, y_pred))
print('Ridge R2 ', r2_score(y_test, y_pred))

Ridge Mean Squared Error  5.7665609273589145e-05
Ridge R2  0.970030006167984


The model with the higher R2 is a better fit for the data, and model with a lower MSE is a better fit for the data.  The two models are very similar.

### Feature Selection

#### Use feature selection to see if a subset of the features gives better scores.

In [171]:
train_x.shape

(287001, 138)

In [172]:
train_y.shape

(287001,)

In [173]:
train_y.head()

0    0.1189
1    0.1071
2    0.1699
3    0.1311
4    0.1357
Name: Interest Rate, dtype: float64

In [174]:
type(np.array(train_y))

numpy.ndarray

In [175]:
k=90
kbest= SelectKBest(f_regression, k=k)
select_train_x=kbest.fit_transform(train_x, train_y)

In [176]:
select_train_x.shape

(287001, 90)

In [177]:
x_train, x_test, y_train, y_test = split_and_scale(select_train_x, train_y, .20)

#### Linear Regression with Feature Selection

In [178]:
lm=LinearRegression()
lm.fit(x_train, y_train)
y_pred=lm.predict(x_test)
print('features k ', k)
print('Mean Squared Error ', mean_squared_error(y_test, y_pred))
print('R2 ', r2_score(y_test, y_pred))

features k  90
Mean Squared Error  5.801122718598759e-05
R2  0.969517071949132


Testing with different numbers of features, the highest r2 value was attained around 90 features, but the r2 value still didn't get above .82 for Linear Regression.

#### Ridge Regression with Feature Selection

In [179]:
parameters= [{'alpha': [0.001,0.1,1, 10, 100, 1000, 10000, 100000, 100000]}]
r=Ridge()
grid = GridSearchCV(r, parameters,cv=4)
grid.fit(x_train, y_train)
best=grid.best_estimator_
y_pred=best.predict(x_test)
print('features k ', k)
print('Mean Squared Error ', mean_squared_error(y_test, y_pred))
print('R2 ', r2_score(y_test, y_pred))

features k  90
Mean Squared Error  5.800199960601452e-05
R2  0.969521920728757


Number of features of 120 yields the highest r2 value for Ridge regression, but again, the r2 still didn't get above .82.

In [31]:
from sklearn.model_selection import cross_val_score
scores=cross_val_score(model, x_train, y_train, cv=10, scoring='neg_mean_squared_error')
np.sqrt(-scores)

#### Lasso Regression with Feature Selection

In [64]:
parameters= [{'alpha': [0.001,0.1,1, 10, 100, 1000, 10000, 100000, 100000]}]
l=Lasso()
grid = GridSearchCV(l, parameters,cv=4)
grid.fit(x_train, y_train)
best=grid.best_estimator_
y_pred=best.predict(x_test)
print('features k ', k)
print('Mean Squared Error ', mean_squared_error(y_test, y_pred))
print('R2 ', r2_score(y_test, y_pred))

features k  90
Mean Squared Error  0.00036981754948868935
R2  0.8080061437857151


#### ElasticNet with Feature Selection

In [66]:
parameters= [{'alpha': [0.001,0.1,1, 10, 100, 1000, 10000, 100000, 100000]}]
e=ElasticNet()
grid = GridSearchCV(e, parameters, cv=10)
grid.fit(x_train, y_train)
best=grid.best_estimator_
y_pred=best.predict(x_test)
print('features k ', k)
print('Mean Squared Error ', mean_squared_error(y_test, y_pred))
print('R2 ', r2_score(y_test, y_pred))

features k  90
Mean Squared Error  0.0003478535486274111
R2  0.8194089374851516


In [39]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor

#### DecisionTreeRegressor with Feature Selection

In [67]:
dt=DecisionTreeRegressor()
dt.fit(x_train, y_train)
y_pred=dt.predict(x_test)
print('features k ', k)
print('Mean Squared Error ', mean_squared_error(y_test, y_pred))
print('R2 ', r2_score(y_test, y_pred))

features k  90
Mean Squared Error  0.0004928254388673179
R2  0.7441455750830246


#### KNeighborsRegressor with Feature Selection.  Commented out because it kept hanging the kernel.

In [None]:
"""
kn=KNeighborsRegressor
kn.fit(x_train, y_train)
y_pred=kn.predict(x_test)
print('features k ', k)
print('Mean Squared Error ', mean_squared_error(y_test, y_pred))
print('R2 ', r2_score(y_test, y_pred))
"""

#### GradientBoosting Regressor with Feature Selection.

In [68]:
gb=GradientBoostingRegressor()
gb.fit(x_train, y_train)
y_pred=gb.predict(x_test)
print('features k ', k)
print('Mean Squared Error ', mean_squared_error(y_test, y_pred))
print('R2 ', r2_score(y_test, y_pred))

features k  90
Mean Squared Error  0.00029210698499280096
R2  0.8483502295836521
