In [397]:
# Import necessary packages
#
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import statsmodels.api as sm
from matplotlib import style
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
style.use('fivethirtyeight')
#
import seaborn as sns
#
import sklearn.metrics as skm
#
from pandas import DataFrame, Series
from datetime import datetime
from matplotlib.colors import ListedColormap
from sklearn import datasets
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage, leaves_list
#
# Pandas settings - initially as per lecture notes - modify as needed
#
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
#
%matplotlib inline  
#
import warnings
warnings.filterwarnings('ignore')

In [118]:
# Import raw data from .csv files
#
# Skip the first line of each file as it contains unnecessary text instead of the column/attribute names
# Otherwise, this first line would prevent the data to be parsed into pandas data frame properly
#
# Data 2007 to 2015
d01 = DataFrame(pd.read_csv('LoanStats3a.csv', skiprows=1, low_memory=False))
d02 = DataFrame(pd.read_csv('LoanStats3b.csv', skiprows=1, low_memory=False))
d03 = DataFrame(pd.read_csv('LoanStats3c.csv', skiprows=1, low_memory=False))
d04 = DataFrame(pd.read_csv('LoanStats3d.csv', skiprows=1, low_memory=False))
# Data 2016 
d05 = DataFrame(pd.read_csv('LoanStats_2016Q1.csv', skiprows=1, low_memory=False))
d06 = DataFrame(pd.read_csv('LoanStats_2016Q2.csv', skiprows=1, low_memory=False))
d07 = DataFrame(pd.read_csv('LoanStats_2016Q3.csv', skiprows=1, low_memory=False))
d08 = DataFrame(pd.read_csv('LoanStats_2016Q4.csv', skiprows=1, low_memory=False))
# Data 2017
d09 = DataFrame(pd.read_csv('LoanStats_2017Q1.csv', skiprows=1, low_memory=False))
d10 = DataFrame(pd.read_csv('LoanStats_2017Q2.csv', skiprows=1, low_memory=False))

In [172]:
# Combine individual data frames into a single data frame
#
loans_raw = pd.concat([d01, d02, d03, d04, d05, d06, d07, d08, d09, d10])
loans_raw['year'] = pd.to_datetime(loans_raw['settlement_date']).dt.year

In [316]:
loans = loans_raw[['id', 'grade', 'int_rate', 'year', 'annual_inc', 'dti', 'home_ownership']]

In [317]:
# Loan id column is all null so let us update it with integers and set this column as the index column
#
loans.loc[loans['id'].isnull(), 'id'] = loans.loc[loans['id'].isnull()].index

In [318]:
loans = loans.dropna()
loans.set_index('id').head()

Unnamed: 0_level_0,grade,int_rate,year,annual_inc,dti,home_ownership
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
105,D,16.77%,2015.0,38000.0,23.18,RENT
188,B,12.69%,2013.0,58500.0,20.64,RENT
255,B,12.42%,2014.0,39000.0,18.31,RENT
362,E,19.42%,2015.0,60000.0,20.04,RENT
430,C,13.49%,2014.0,75000.0,25.71,MORTGAGE


In [319]:
# Change objects to float type
#
loans['int_rate'] = loans['int_rate'].str.rstrip('%').astype('float')
loans['annual_inc'] = loans['annual_inc'].astype('float')
loans['dti'] = loans['dti'].astype('float')

In [332]:
# Convert categorical data to numerical
# 
# Grade
number = LabelEncoder()
loans['number_grade'] = number.fit_transform(loans['grade'].astype('str'))

# Home ownership
#
loans['number_home_ownership'] = number.fit_transform(loans['home_ownership'].astype('str'))

In [333]:
loans.shape

(14775, 9)

In [334]:
loans.groupby(loans.year)['id'].count()

year
2009.0        1
2010.0        1
2011.0        5
2012.0       26
2013.0       90
2014.0      335
2015.0     1217
2016.0     1743
2017.0    10470
2018.0      887
Name: id, dtype: int64

In [335]:
# Training set1: year 2009 - 2015
#
tr_15prev = loans[loans['year'].between(2009.0, 2015.0)]

# Training set2: year 2016
#
tr_16 = loans[loans['year'] == 2016.0]

In [336]:
tr_15prev.head()

Unnamed: 0,id,grade,int_rate,year,annual_inc,dti,home_ownership,number_grade,number_home_ownership
105,105,D,16.77,2015.0,38000.0,23.18,RENT,3,5
188,188,B,12.69,2013.0,58500.0,20.64,RENT,1,5
255,255,B,12.42,2014.0,39000.0,18.31,RENT,1,5
362,362,E,19.42,2015.0,60000.0,20.04,RENT,4,5
430,430,C,13.49,2014.0,75000.0,25.71,MORTGAGE,2,1


In [345]:
tr_16.head()

Unnamed: 0,id,grade,int_rate,year,annual_inc,dti,home_ownership,number_grade,number_home_ownership
876,876,B,12.69,2016.0,34000.0,13.41,RENT,1,5
6469,6469,E,18.64,2016.0,115000.0,11.45,MORTGAGE,4,1
14822,14822,B,10.59,2016.0,37000.0,15.7,MORTGAGE,1,1
17564,17564,E,17.14,2016.0,48012.0,13.82,MORTGAGE,4,1
25069,25069,C,14.72,2016.0,45500.0,21.52,OWN,2,4


In [361]:
# Training set1 independent variables (x)
#
x_15prev = tr_15prev[['int_rate', 'annual_inc', 'dti', 'number_home_ownership']]
x1 = sm.add_constant(x_15prev)

# Training set1 dependent variables (x)
y_15prev = tr_15prev['number_grade']

# Model1
#
model_15prev = LinearRegression()
model_15prev.fit(x_15prev, y_15prev)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [372]:
# Training set2 independent variables (x)
#
x_16 = tr_16[['int_rate', 'annual_inc', 'dti', 'number_home_ownership']]
x1 = sm.add_constant(x_16)

# Training set2 dependent variables (x)
y_16 = tr_16['number_grade']

# Model2
#
model_16 = LinearRegression()
model_16.fit(x_16, y_16)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [365]:
# Test data
#
test = loans[loans['year'] == 2017.0]
test_x = test[['int_rate', 'annual_inc', 'dti', 'number_home_ownership']]

test_y = test['number_grade']

In [374]:
model_15prev.score(test_x, test_y)

0.87667311560192684

In [373]:
model_16.score(test_x, test_y)

0.91138247900125047

In [370]:
result = sm.OLS(test_y, test_x).fit()
result.summary()

0,1,2,3
Dep. Variable:,number_grade,R-squared:,0.957
Model:,OLS,Adj. R-squared:,0.957
Method:,Least Squares,F-statistic:,57990.0
Date:,"Fri, 02 Feb 2018",Prob (F-statistic):,0.0
Time:,04:25:43,Log-Likelihood:,-9074.3
No. Observations:,10470,AIC:,18160.0
Df Residuals:,10466,BIC:,18190.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
int_rate,0.2039,0.001,256.227,0.000,0.202,0.205
annual_inc,-7.784e-07,4.44e-08,-17.535,0.000,-8.65e-07,-6.91e-07
dti,-0.0122,0.000,-28.302,0.000,-0.013,-0.011
number_home_ownership,-0.0885,0.003,-32.527,0.000,-0.094,-0.083

0,1,2,3
Omnibus:,3363.044,Durbin-Watson:,1.839
Prob(Omnibus):,0.0,Jarque-Bera (JB):,72226.913
Skew:,1.024,Prob(JB):,0.0
Kurtosis:,15.703,Cond. No.,70100.0


In [396]:
test['number_prediction'] = round(result.predict(test_x), 0)

x_encoded = number.fit_transform(test['number_prediction'])
test['prediction'] = reverse.inverse_transform(test['number_prediction'].all)

['A', 'B', 'C', 'D', 'E', 'F', 'G']

ValueError: y contains new labels: [ <bound method Series.all of 2335      3.0
2352      3.0
3231      2.0
7855      4.0
11798     3.0
13737     2.0
17372     3.0
24100     3.0
25351     3.0
41622     2.0
144       3.0
480       4.0
834       3.0
913       4.0
1047      2.0
1229      4.0
1806      4.0
2172      4.0
2745      3.0
2763      2.0
2830      3.0
3339      4.0
3359      1.0
3482      4.0
3691      2.0
3694      3.0
3775      3.0
3894      3.0
4182      4.0
4536      3.0
4655      3.0
4890      4.0
5134      3.0
5384      2.0
5413      3.0
5631      3.0
5819      4.0
6363      2.0
6489      3.0
6575      3.0
6735      3.0
6743      3.0
6929      3.0
6943      3.0
7135      4.0
7509      3.0
7528      4.0
7768      2.0
7786      4.0
7787      3.0
         ... 
86899     2.0
87175     2.0
88351     3.0
89142     1.0
89691     2.0
90015     5.0
90037     3.0
90509     3.0
91097     2.0
92062     3.0
92764     4.0
93276     5.0
93352     5.0
94012     5.0
95026     4.0
95298     3.0
95693     3.0
95875     2.0
95887     3.0
1513      3.0
8822      3.0
25737     2.0
31291     2.0
42382     1.0
42716     2.0
43177     6.0
44275     1.0
53942     2.0
54451     4.0
57359     6.0
61990     2.0
69411     2.0
70281     4.0
70659     2.0
77258     2.0
79430     5.0
81164     6.0
82254     4.0
83264     2.0
86348     2.0
87459     2.0
91452     4.0
95944     3.0
96948     3.0
99066     3.0
99095     3.0
101380    2.0
103494    2.0
104602    4.0
105418    3.0
Name: number_prediction, Length: 10470, dtype: float64>]

In [391]:
test.head()

Unnamed: 0,id,grade,int_rate,year,annual_inc,dti,home_ownership,number_grade,number_home_ownership,prediction,number_prediction
2335,2335,D,16.77,2017.0,110000.0,5.5,RENT,3,5,3.0,3.0
2352,2352,C,14.65,2017.0,43000.0,11.25,MORTGAGE,2,1,3.0,3.0
3231,3231,B,11.71,2017.0,115000.0,12.27,MORTGAGE,1,1,2.0,2.0
7855,7855,F,20.25,2017.0,150000.0,4.69,RENT,5,5,4.0,4.0
11798,11798,D,17.49,2017.0,127000.0,12.84,MORTGAGE,3,1,3.0,3.0
