# DS-SF-26 | Codealong 06 | Introduction to Regression and Model Fit

## Setup

In [56]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

%matplotlib inline
plt.style.use('ggplot')

In [57]:
def read_dataset():
    """Read the codealong's Zillow CSV and return it as a DataFrame.

    The file is looked up at ../datasets/zillow-06-start.csv (relative to the
    working directory) and its 'ID' column is used as the index.
    """
    csv_path = os.path.join('..', 'datasets', 'zillow-06-start.csv')
    dataset = pd.read_csv(csv_path, index_col = 'ID')
    return dataset

df = read_dataset()

## Part A1 - Simple Linear Regression

### Transforming Variables, e.g., scaling

Activity: How to scale SalePrice from $ to $M, Size and LotSize from sqft to "1,000 sqft"?

In [58]:
def scale_variables(df):
    """Rescale the price and size columns of `df` in place.

    After the call:
    - SalePrice has been divided by 10**6 (sale price in $M)
    - Size has been divided by 10**3 (size in 1,000 sqft)
    - LotSize has been divided by 10**3 (lot size in 1,000 sqft)

    The frame is modified in place and nothing is returned.  The function is
    not idempotent: calling it twice on the same frame divides the columns
    twice, so reload the dataset before re-running it.
    """
    # Bracket indexing is used for the assignments: pandas only treats
    # `df.name = ...` as a column assignment when the column already exists,
    # and otherwise sets a plain Python attribute without creating a column.
    df['SalePrice'] = df['SalePrice'] / (10**6)
    df['Size'] = df['Size'] / (10**3)
    df['LotSize'] = df['LotSize'] / (10**3)
    
scale_variables(df)

### `SalePrice` as a function of `Size`

In [59]:
model = smf.ols(formula = 'SalePrice ~ Size', data = df).fit()

model.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.236
Model:,OLS,Adj. R-squared:,0.235
Method:,Least Squares,F-statistic:,297.4
Date:,"Tue, 24 May 2016",Prob (F-statistic):,2.67e-58
Time:,19:56:29,Log-Likelihood:,-1687.9
No. Observations:,967,AIC:,3380.0
Df Residuals:,965,BIC:,3390.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,0.1551,0.084,1.842,0.066,-0.010 0.320
Size,0.7497,0.043,17.246,0.000,0.664 0.835

0,1,2,3
Omnibus:,1842.865,Durbin-Watson:,1.704
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3398350.943
Skew:,13.502,Prob(JB):,0.0
Kurtosis:,292.162,Cond. No.,4.4


*SalePrice = 0.1551 + 0.7497 x Size*

### Model's parameters

In [60]:
model.params

Intercept    0.155052
Size         0.749728
dtype: float64

In [61]:
type(model.params)

pandas.core.series.Series

In [62]:
model.params.Intercept  #calling intercept by itself

0.15505177276001381

In [63]:
model.params['Intercept']

0.15505177276001381

### t-values

In [64]:
model.tvalues

Intercept     1.842394
Size         17.245775
dtype: float64

### p-values

In [65]:
model.pvalues

Intercept    6.572416e-02
Size         2.667697e-58
dtype: float64

### Confidence Intervals

In [66]:
model.conf_int(cols = [0, 1])

Unnamed: 0,0,1
Intercept,-0.010102,0.320205
Size,0.664415,0.835041


0 = lower | 1 = upper

In [67]:
type(model.conf_int(cols = [0, 1]))

pandas.core.frame.DataFrame

## Part A2 - Simple Linear Regression

### `SalePrice` as a function of `Size` without `Intercept`

In [68]:
model = smf.ols(formula = 'SalePrice ~ 0 + Size', data = df).fit()


# The '0 +' at the start of the formula's right-hand side removes the intercept
model.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.565
Model:,OLS,Adj. R-squared:,0.565
Method:,Least Squares,F-statistic:,1255.0
Date:,"Tue, 24 May 2016",Prob (F-statistic):,7.83e-177
Time:,19:56:29,Log-Likelihood:,-1689.6
No. Observations:,967,AIC:,3381.0
Df Residuals:,966,BIC:,3386.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Size,0.8176,0.023,35.426,0.000,0.772 0.863

0,1,2,3
Omnibus:,1830.896,Durbin-Watson:,1.722
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3370566.094
Skew:,13.3,Prob(JB):,0.0
Kurtosis:,291.005,Cond. No.,1.0


### Drop outliers

Activity: How to drop outliers?

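Finding outliers: a sale price is an outlier if it falls below the lower fence or above the upper fence, where IQR = Q3 - Q1.


Lower fence: Q1 - 1.5 x IQR

Upper fence: Q3 + 1.5 x IQR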

In [69]:
def drop_outliers(df):
    """Drop, in place, the rows of `df` whose SalePrice is an outlier.

    A sale price is treated as an outlier when it lies below Q1 - 1.5 * IQR or
    above Q3 + 1.5 * IQR, where Q1 and Q3 are the first and third quartiles of
    SalePrice and IQR = Q3 - Q1.  The row counts before and after, and the
    three quartiles, are printed along the way.

    The frame is modified in place and nothing is returned.
    """
    # Every message is formatted into a single string so that print(...)
    # produces the same output whether it is parsed as a Python 2 print
    # statement or as a Python 3 function call.
    print('Dropping outliers')
    print('- n (before) = {}'.format(len(df)))

    Q1 = df.SalePrice.quantile(.25)
    Q2 = df.SalePrice.quantile(.5)
    Q3 = df.SalePrice.quantile(.75)
    IQR = Q3 - Q1

    print('- Q1         = {} ($M)'.format(Q1))
    print('- Q2/Median  = {} ($M)'.format(Q2))
    print('- Q3         = {} ($M)'.format(Q3))

    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR
    is_outlier = (df.SalePrice < lower_fence) | (df.SalePrice > upper_fence)

    # Callers rely on the in-place drop: they keep using their own `df` afterwards.
    df.drop(df[is_outlier].index, inplace = True)

    print('- n (after)  = {}'.format(len(df)))

In [70]:
# First and third quartiles of SalePrice
Q1 = df.SalePrice.quantile(.25)
Q3 = df.SalePrice.quantile(.75)

# Interquartile range
IQR = Q3 - Q1

# Single-argument print(...) gives the same output on Python 2 and Python 3
print(Q1)
print(Q3)

0.8
1.5325


*Q1 = 0.8 $M and Q3 = 1.53 $M: a quarter of the houses sold for less than 0.8 $M and a quarter sold for more than 1.53 $M*

In [72]:
drop_outliers(df)

Dropping outliers
- n (before) = 933
- Q1         = 0.788 ($M)
- Q2/Median  = 1.05 ($M)
- Q3         = 1.43 ($M)
- n (after)  = 915


### `SalePrice` as a function of `Size` (again)

In [73]:
model = smf.ols(formula = 'SalePrice ~ Size', data = df).fit()

model.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.185
Model:,OLS,Adj. R-squared:,0.184
Method:,Least Squares,F-statistic:,199.8
Date:,"Tue, 24 May 2016",Prob (F-statistic):,4.72e-41
Time:,19:57:09,Log-Likelihood:,-494.81
No. Observations:,885,AIC:,993.6
Df Residuals:,883,BIC:,1003.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,0.7249,0.030,23.870,0.000,0.665 0.785
Size,0.2522,0.018,14.137,0.000,0.217 0.287

0,1,2,3
Omnibus:,19.979,Durbin-Watson:,1.641
Prob(Omnibus):,0.0,Jarque-Bera (JB):,37.077
Skew:,-0.107,Prob(JB):,8.89e-09
Kurtosis:,3.98,Cond. No.,4.67


*The intercept went up, from 0.1551 to 0.7249: with SalePrice in $M, the model now predicts about $725,000 for a house of size 0*

slope decreased 

The intercept coefficient is now statistically significant (p-value 0.000, versus 0.066 before dropping the outliers)

Removing outliers: the intercept is higher but the slope is lower, because the outliers are no longer pulling the slope up

## Part B - How to check modeling assumptions?

### `.plot_regress_exog()`

In [None]:
figure = plt.figure(figsize = (12, 8))
figure = sm.graphics.plot_regress_exog(model, 'Size', fig = figure)

## Part C1 - How to check normality assumption?

### Histogram (e.g., residuals)

Activity: How to get histograms of residuals?

In [None]:
model = smf.ols(formula = 'SalePrice ~ Size', data = df).fit()

# Histogram of the fitted model's residuals, to eyeball whether they look
# normally distributed
model.resid.hist(bins = 50, figsize = (8, 8))

Is it normal?

### q-q plot (e.g., residuals) against a normal distribution

In [None]:
figure, ax = plt.subplots(figsize = (8, 8))
figure = sm.qqplot(model.resid, line = 's', ax = ax)

plt.show()

## Part C2 - How to check normality assumption?

### q-q plot of two normal distributions

`.qqplot()` with `line = 's'`

In [None]:
# 100 draws from N(0, 1)
normal_array = np.random.normal(0, 1, size = 100)

figure, ax = plt.subplots(figsize = (8, 8))
# line = 's' (standardized reference line), as announced by the caption above
# this cell; the 45-degree variant is shown in the next cell.
figure = sm.qqplot(normal_array, line = 's', ax = ax)
plt.show()

`.qqplot()` with `line = '45'`; N(0, 1) vs. N(0, 1) 

In [None]:
# 100 draws from N(0, 1), matching the "N(0, 1) vs. N(0, 1)" caption above
# this cell (the N(10, 1) sample is used in the cell that follows).
normal_array = np.random.normal(0, 1, size = 100)

figure, ax = plt.subplots(figsize = (8, 8))
figure = sm.qqplot(normal_array, line = '45', ax = ax)
plt.show()

`.qqplot()` with `line = 's'`; N(10, 1) vs. N(0, 1) 

In [None]:
normal_array = np.random.normal(10, 1, size = 100)

figure, ax = plt.subplots(figsize = (8, 8))
figure = sm.qqplot(normal_array, line = 's', ax = ax)
plt.show()

`.qqplot()` with `line = '45'`; N(0, 10) vs. N(0, 1) 

In [None]:
normal_array = np.random.normal(0, 10, size = 100)

figure, ax = plt.subplots(figsize = (8, 8))
figure = sm.qqplot(normal_array, line = '45', ax = ax)
plt.show()

## Part D - Inference and Fit

In [None]:
df = read_dataset() # reload the dataset to get our outliers back...

scale_variables(df) # scaling variables (function defined above)

### Effect of outliers on regression modeling - `SalePrice` as a function of `Size`

In [None]:
sns.lmplot('Size', 'SalePrice', df, size = 8)

### `SalePrice` as a function of `Size` after dropping the "worst" outlier

In [None]:
# The "worst" outlier is taken here to be the single highest SalePrice.
# NOTE(review): confirm this is the intended definition of "worst".
# .drop() returns a new frame, so df itself keeps all of its rows.
subset_df = df.drop(df.SalePrice.idxmax())

sns.lmplot('Size', 'SalePrice', subset_df, size = 8)

## Part E - R<sup>2</sup>

### `SalePrice` as a function of `Size`

In [None]:
model = smf.ols(formula = 'SalePrice ~ 0 + Size', data = df).fit()

model.summary()

### R<sup>2</sup>

In [None]:
model.rsquared

In [None]:
# Each message is formatted into a single string so that print(...) gives the
# same output on Python 2 (print statement) and Python 3 (print function).
print('With outliers:')
print('- SalePrice ~     Size; R^2 = {}'.format(smf.ols(formula = 'SalePrice ~ Size', data = df).fit().rsquared))
print('- SalePrice ~ 0 + Size; R^2 = {}'.format(smf.ols(formula = 'SalePrice ~ 0 + Size', data = df).fit().rsquared))

# print('') emits the blank line on both Python versions; a bare `print` is a
# no-op expression on Python 3.
print('')
drop_outliers(df) # dropping outliers (function defined above); modifies df in place
print('')

print('Without outliers:')
print('- SalePrice ~     Size; R^2 = {}'.format(smf.ols(formula = 'SalePrice ~ Size', data = df).fit().rsquared))
print('- SalePrice ~ 0 + Size; R^2 = {}'.format(smf.ols(formula = 'SalePrice ~ 0 + Size', data = df).fit().rsquared))

## Part F - Multiple Linear Regression

In [None]:
df = read_dataset() # reload the dataset to get our outliers back...

scale_variables(df) # scaling variables (function defined above)

In [None]:
model = smf.ols(formula = 'SalePrice ~ Size + BedCount', data = df).fit()

model.summary()

## Part G - Multicollinearity

Yet another way to transform variables: `.apply()`

### Transforming Variables (cont.)

In [None]:
# Add transformed copies of Size and LotSize as new columns:
# base-10 logarithm, square root and square.
df[ ['SizeLog', 'LotSizeLog'] ] = df[ ['Size', 'LotSize'] ].apply(np.log10)
df[ ['SizeSqrt', 'LotSizeSqrt'] ] = df[ ['Size', 'LotSize'] ].apply(np.sqrt)
df[ ['SizeSquare', 'LotSizeSquare'] ] = df[ ['Size', 'LotSize'] ].apply(np.square)

In [None]:
df

### Multicollinearity

Multicollinearity between Size, log10(Size), sqrt(Size), and Size^2

In [None]:
df[ ['Size', 'SizeLog', 'SizeSqrt', 'SizeSquare' ] ].corr()

In [None]:
model = smf.ols(formula = 'SalePrice ~ Size + SizeLog + SizeSqrt + SizeSquare', data = df).fit()

model.summary()

## Part H - Adjusted R<sup>2</sup>

In [None]:
# Baseline ("original") model: five regressors, no intercept
formula = 'SalePrice ~ 0 + IsAStudio + BedCount + BathCount + Size + LotSize'

model = smf.ols(formula = formula, data = df).fit()

# Single formatted string: same output on Python 2 and Python 3
print('R^2 = {} (original model)'.format(model.rsquared))

In [None]:
# Seed the generator so the random regressors (and therefore the fit) are the
# same on every run of the notebook.
np.random.seed(0)

# 100 columns of uniform random noise, X0..X99, aligned on df's index and
# built in a single call instead of one column insertion per loop iteration.
x_columns = ['X{}'.format(i) for i in range(100)]
x_df = pd.DataFrame(np.random.random((len(df), len(x_columns))), index = df.index, columns = x_columns)

# Same regressors as the original model (SalePrice ~ 0 + IsAStudio + BedCount +
# BathCount + Size + LotSize) plus the noise columns, so that any change in
# R^2 between the two fits comes from the noise columns alone.
formula = 'SalePrice ~ 0 + IsAStudio + BedCount + BathCount + Size + LotSize + '
formula += ' + '.join(x_columns)

x_df = x_df.join(df)

x_model = smf.ols(formula = formula, data = x_df).fit()

In [None]:
# TODO