In [1]:
#Multivaiate.py
# Load the full Lending Club statistics data from:
#https://github.com/Thinkful-Ed/curric-data-001-data-sets/blob/master/loans/loansData.csv
#Use income (annual_inc) to model interest rates (int_rate)
#Add home ownership (home_ownership) to the model
#Does that affect the significance of the coefficients in the original model?
#Try to add the interaction of home ownership and incomes as a term. How does this impact the new model?
#helpful examples from
#http://nbviewer.jupyter.org/urls/s3.amazonaws.com/datarobotblog/notebooks/multiple_regression_in_python.ipynb#appendix

In [2]:
#all imports
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [3]:
# Load the Lending Club loans CSV and discard every row that contains a missing value.
LOANS_CSV_URL = 'https://raw.githubusercontent.com/Thinkful-Ed/curric-data-001-data-sets/master/loans/loansData.csv'
loansData = pd.read_csv(LOANS_CSV_URL)
# dropna works on loansData itself, so the frame used by every later cell has no NaNs.
loansData.dropna(inplace=True)
print(loansData.head())

       Amount.Requested  Amount.Funded.By.Investors Interest.Rate Loan.Length  \
81174             20000                     20000.0         8.90%   36 months   
99592             19200                     19200.0        12.12%   36 months   
80059             35000                     35000.0        21.98%   60 months   
15825             10000                      9975.0         9.99%   36 months   
33182             12000                     12000.0        11.71%   36 months   

             Loan.Purpose Debt.To.Income.Ratio State Home.Ownership  \
81174  debt_consolidation               14.90%    SC       MORTGAGE   
99592  debt_consolidation               28.36%    TX       MORTGAGE   
80059  debt_consolidation               23.81%    CA       MORTGAGE   
15825  debt_consolidation               14.30%    KS       MORTGAGE   
33182         credit_card               18.78%    NJ           RENT   

       Monthly.Income FICO.Range  Open.CREDIT.Lines  Revolving.CREDIT.Balance  \
81174

In [4]:
# Convert monthly income to annual income, as required by the project.
# Multiplying the column directly is vectorised, so no per-element lambda is needed.
loansData['annual_inc'] = loansData['Monthly.Income'] * 12
loansData['annual_inc'].head()

81174     78500.04
99592     54999.96
80059    138000.00
15825     45999.96
33182     38340.00
Name: annual_inc, dtype: float64

In [5]:
# Reshape the data: turn the interest-rate strings (e.g. '8.90%') into float fractions.
def _percent_to_fraction(text):
    """Strip a trailing '%' from ``text`` and return the value divided by 100, rounded to 4 places."""
    return round(float(text.rstrip('%')) / 100, 4)


loansData['int_rate'] = loansData['Interest.Rate'].map(_percent_to_fraction)
loansData['int_rate'].head()

81174    0.0890
99592    0.1212
80059    0.2198
15825    0.0999
33182    0.1171
Name: int_rate, dtype: float64

In [6]:
# Independent variable for the first model: annual income.
# (The former `x1=[]` initialisation was dead code: it was overwritten immediately.)
x1 = loansData['annual_inc']
print(x1.head())

81174     78500.04
99592     54999.96
80059    138000.00
15825     45999.96
33182     38340.00
Name: annual_inc, dtype: float64


In [7]:
# Fit an ordinary least squares model of interest rate on annual income.
y = loansData['int_rate']   # dependent variable
X = sm.add_constant(x1)     # adds the 'const' column that serves as the intercept
model = sm.OLS(y, X)
f = model.fit()
print(f.summary())

                            OLS Regression Results                            
Dep. Variable:               int_rate   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.4168
Date:                Fri, 23 Dec 2016   Prob (F-statistic):              0.519
Time:                        00:46:16   Log-Likelihood:                 4388.2
No. Observations:                2498   AIC:                            -8772.
Df Residuals:                    2496   BIC:                            -8761.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          0.1299      0.001     88.807      0.0

In [8]:
# Open question: the scales of the interest rate and of annual income are probably far
# apart -- do we need to change the scales or the model (e.g. use a log transform instead)?
# Second method: pass all independent variables explicitly, including the intercept.

# Create the intercept column (a constant 1.0 on every row).
loansData['intercept'] = 1.0
i = loansData['intercept']
i.head()


81174    1.0
99592    1.0
80059    1.0
15825    1.0
33182    1.0
Name: intercept, dtype: float64

In [9]:
# Same regression as before, this time with the intercept supplied as an explicit column.
predictors = loansData[['intercept', 'annual_inc']]
model = sm.OLS(y, predictors)
result1 = model.fit()
print(result1.summary())
# Both approaches give the same result.

                            OLS Regression Results                            
Dep. Variable:               int_rate   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.4168
Date:                Fri, 23 Dec 2016   Prob (F-statistic):              0.519
Time:                        00:46:16   Log-Likelihood:                 4388.2
No. Observations:                2498   AIC:                            -8772.
Df Residuals:                    2496   BIC:                            -8761.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
intercept      0.1299      0.001     88.807      0.0

In [10]:
# Now add home ownership to the model.
# The categorical variable has to be converted to numbers first
# (options considered: dummy variables vs. pd.Categorical).

# `Categorical.labels` is deprecated in favour of `Categorical.codes` (hence the warning
# the original call produced) and has since been removed from pandas; `.codes` yields
# the same integer code per row.
loansData['ho_ord'] = pd.Categorical(loansData['Home.Ownership']).codes
loansData.ho_ord, loansData['Home.Ownership']

# An explicit category list was also tried, but did not work:
#loansData['ho_ord']=pd.Categorical(loansData['Home.Ownership'], categories=['MORTGAGE', 'RENT', 'OWN'], ordered=False)
# Only the codes 0, 2 and 3 appear in the displayed rows; code 1 presumably belongs to a
# further ownership category that is missing from the list above -- TODO confirm with
# loansData['Home.Ownership'].value_counts().




(81174     0
 99592     0
 80059     0
 15825     0
 33182     3
 62403     2
 48808     3
 22090     0
 76404     3
 15867     3
 94971     3
 36911     0
 41200     0
 83869     3
 53853     3
 21399     3
 62127     3
 23446     3
 44987     3
 17977     0
 86099     0
 99483     0
 28798     3
 24168     3
 10356     0
 46027     3
 2238      3
 65278     0
 4227      0
 50182     0
          ..
 84265     3
 80231     0
 49533     0
 102514    0
 78618     3
 86953     3
 80129     0
 85216     0
 38247     0
 91245     0
 53041     3
 63051     0
 14446     0
 68628     0
 98758     0
 13070     0
 45836     2
 52330     3
 48243     3
 63256     3
 42124     3
 78043     3
 925       0
 74047     0
 49957     0
 23735     0
 65882     2
 55610     0
 38576     3
 3116      3
 Name: ho_ord, dtype: int8, 81174     MORTGAGE
 99592     MORTGAGE
 80059     MORTGAGE
 15825     MORTGAGE
 33182         RENT
 62403          OWN
 48808         RENT
 22090     MORTGAGE
 76404         RENT


In [11]:
# Extend the model: the home-ownership code becomes a second predictor next to income.
model2_columns = ['intercept', 'annual_inc', 'ho_ord']
model2 = sm.OLS(loansData['int_rate'], loansData[model2_columns]).fit()
model2.summary()

0,1,2,3
Dep. Variable:,int_rate,R-squared:,0.007
Model:,OLS,Adj. R-squared:,0.006
Method:,Least Squares,F-statistic:,8.274
Date:,"Fri, 23 Dec 2016",Prob (F-statistic):,0.000262
Time:,00:46:16,Log-Likelihood:,4396.2
No. Observations:,2498,AIC:,-8786.0
Df Residuals:,2495,BIC:,-8769.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
intercept,0.1252,0.002,66.918,0.000,0.122 0.129
annual_inc,2.687e-08,1.8e-08,1.496,0.135,-8.36e-09 6.21e-08
ho_ord,0.0024,0.001,4.016,0.000,0.001 0.004

0,1,2,3
Omnibus:,67.032,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,53.954
Skew:,0.281,Prob(JB):,1.92e-12
Kurtosis:,2.55,Cond. No.,190000.0


In [12]:
# Interaction term: the element-wise product of the home-ownership code and annual income.
# (Open question: is multiplying the two relevant variables all that is required?)
loansData['interaction'] = loansData['ho_ord'].mul(loansData['annual_inc'])
loansData.interaction.head()



81174         0.0
99592         0.0
80059         0.0
15825         0.0
33182    115020.0
Name: interaction, dtype: float64

In [13]:
# Refit with the interaction term included alongside both original predictors.
model3_columns = ['intercept', 'annual_inc', 'ho_ord', 'interaction']
model3 = sm.OLS(loansData['int_rate'], loansData[model3_columns]).fit()
model3.summary()

0,1,2,3
Dep. Variable:,int_rate,R-squared:,0.008
Model:,OLS,Adj. R-squared:,0.007
Method:,Least Squares,F-statistic:,6.588
Date:,"Fri, 23 Dec 2016",Prob (F-statistic):,0.000197
Time:,00:46:16,Log-Likelihood:,4397.8
No. Observations:,2498,AIC:,-8788.0
Df Residuals:,2494,BIC:,-8764.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
intercept,0.1267,0.002,61.802,0.000,0.123 0.131
annual_inc,7.914e-09,2.09e-08,0.380,0.704,-3.3e-08 4.88e-08
ho_ord,0.0007,0.001,0.675,0.499,-0.001 0.003
interaction,2.561e-08,1.43e-08,1.789,0.074,-2.47e-09 5.37e-08

0,1,2,3
Omnibus:,66.657,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,53.272
Skew:,0.277,Prob(JB):,2.7e-12
Kurtosis:,2.548,Cond. No.,397000.0


In [14]:
#TODO: the output still has to be presented in the form of graphs
#working well