In [57]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import string

from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE

from sklearn.metrics import r2_score

In [2]:
df = pd.read_csv('master_df.csv')
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,zipcode,lat,long,sqft_living15,sqft_lot15,basement,Renovated,year,month,age_when_sold
0,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0,0,3,...,98125,47.721,-122.319,1690,7639,1,1,2014,12,63
1,2015-02-25,180000.0,2,1.0,770,10000,1.0,0,0,3,...,98028,47.7379,-122.233,2720,8062,0,0,2015,2,82
2,2014-12-09,604000.0,4,3.0,1960,5000,1.0,0,0,5,...,98136,47.5208,-122.393,1360,5000,1,0,2014,12,49
3,2015-02-18,510000.0,3,2.0,1680,8080,1.0,0,0,3,...,98074,47.6168,-122.045,1800,7503,0,0,2015,2,28
4,2014-05-12,1230000.0,4,4.5,5420,101930,1.0,0,0,3,...,98053,47.6561,-122.005,4760,101930,1,0,2014,5,13


In [10]:
df.drop('date', axis = 1, inplace = True)

In [55]:
sk_ols = LinearRegression(fit_intercept=False, normalize=False, n_jobs=-1)

In [20]:
rfe = RFE(sk_ols, n_features_to_select=10, step=1, verbose=2)

In [34]:
X_all = df.drop(columns=['price', 'lat', 'long', 'month', 'yr_renovated', 'yr_built'])
rfe.fit(X_all, df['price'])

Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.


RFE(estimator=LinearRegression(copy_X=True, fit_intercept=False, n_jobs=-1,
                               normalize=False),
    n_features_to_select=10, step=1, verbose=2)

In [35]:
new_cols = X_all.columns[rfe.support_]

In [36]:
X_new = df[new_cols]
X_new.head()

Unnamed: 0,bedrooms,bathrooms,floors,view,condition,grade,basement,Renovated,year,age_when_sold
0,3,2.25,2.0,0,3,7,1,1,2014,63
1,2,1.0,1.0,0,3,6,0,0,2015,82
2,4,3.0,1.0,0,5,7,1,0,2014,49
3,3,2.0,1.0,0,3,8,0,0,2015,28
4,4,4.5,1.0,0,3,11,1,0,2014,13


In [37]:
def make_ols_sklearn(X, y, test_size=0.20, fit_intercept=False, standardize=False):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    if standardize:
        ss = StandardScaler()
        ss.fit(X_train)
        X_train = ss.transform(X_train)
        X_test = ss.transform(X_test)
    ols = LinearRegression(fit_intercept=fit_intercept, normalize=False)
    ols.fit(X_train, y_train)
    train_score = ols.score(X_train, y_train)
    test_score = ols.score(X_test, y_test)
    print(f"train score = {train_score}")
    print(f"test score = {test_score}")
    return ols

In [47]:
make_ols_sklearn(X_new, df['price'])

train score = 0.602859451078902
test score = 0.5971230084774994


LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)

In [50]:
cv_scores = cross_val_score(sk_ols, X_new, df['price'], scoring='r2', cv=10, n_jobs=-1) 

In [51]:
cv_scores

array([0.60696267, 0.60766635, 0.57935736, 0.58893222, 0.59043037,
       0.6210322 , 0.61039207, 0.61839829, 0.60185292, 0.55601637])

In [53]:
target ='price'

In [52]:
train, test = train_test_split(df)

In [58]:
predictors = '+'.join(new_cols)
formula = target + '~' + predictors
model = ols(formula=formula, data=train).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.602
Model:,OLS,Adj. R-squared:,0.602
Method:,Least Squares,F-statistic:,2160.0
Date:,"Tue, 02 Jun 2020",Prob (F-statistic):,0.0
Time:,12:36:33,Log-Likelihood:,-194820.0
No. Observations:,14290,AIC:,389700.0
Df Residuals:,14279,BIC:,389700.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-4.527e+07,7.29e+06,-6.212,0.000,-5.96e+07,-3.1e+07
bedrooms,2485.6238,2259.219,1.100,0.271,-1942.739,6913.986
bathrooms,9.32e+04,3786.335,24.615,0.000,8.58e+04,1.01e+05
floors,3.366e+04,4148.710,8.113,0.000,2.55e+04,4.18e+04
view,4.997e+04,2585.976,19.324,0.000,4.49e+04,5.5e+04
condition,2.389e+04,2879.791,8.295,0.000,1.82e+04,2.95e+04
grade,1.798e+05,2040.273,88.132,0.000,1.76e+05,1.84e+05
basement,2.92e+04,3906.733,7.475,0.000,2.15e+04,3.69e+04
Renovated,3.952e+04,9945.089,3.974,0.000,2e+04,5.9e+04

0,1,2,3
Omnibus:,7276.644,Durbin-Watson:,1.989
Prob(Omnibus):,0.0,Jarque-Bera (JB):,105252.664
Skew:,2.1,Prob(JB):,0.0
Kurtosis:,15.615,Cond. No.,8700000.0
