In [1]:
import warnings

import numpy as np
import pandas
import statsmodels.api as sm
from sklearn import cross_validation
from sklearn import grid_search
from sklearn import model_selection
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import Lasso, LinearRegression
def float_formatter(x):
    """Render a number as fixed-point text with two decimal places."""
    return "%.2f" % x


# Route every floating-point array element through float_formatter when
# numpy arrays are printed.
np.set_printoptions(formatter={"float_kind": float_formatter})

  from pandas.core import datetools


In [2]:
# Read the housing table from disk.
dataset = pandas.read_csv("kc-house-data.csv", encoding="ISO-8859-1")

# Columns copied into the design matrix unchanged, in this order.
numeric_columns = [
    "sqft_above", "sqft_basement", "sqft_lot", "sqft_living", "floors",
    "bedrooms", "yr_built", "lat", "long", "bathrooms",
]

# Regression target.
Y = dataset["price"].values

# One indicator column per distinct value of each categorical field.
zipcodes = pandas.get_dummies(dataset["zipcode"]).values
condition = pandas.get_dummies(dataset["condition"]).values
grade = pandas.get_dummies(dataset["grade"]).values

# Design matrix: the numeric columns followed by the zipcode, condition and
# grade indicator columns, joined side by side.
X = np.concatenate(
    (dataset[numeric_columns].values, zipcodes, condition, grade),
    axis=1,
)

In [3]:
# Ordinary least squares of price on the design matrix using statsmodels.
# X is passed as built; no constant column is appended in this cell.
model = sm.OLS(endog=dataset["price"], exog=X)
results = model.fit()

# Print the full regression report (fit statistics and per-column estimates).
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.792
Model:                            OLS   Adj. R-squared:                  0.791
Method:                 Least Squares   F-statistic:                     881.0
Date:                Sun, 02 Jul 2017   Prob (F-statistic):               0.00
Time:                        00:45:09   Log-Likelihood:            -2.9064e+05
No. Observations:               21613   AIC:                         5.815e+05
Df Residuals:                   21519   BIC:                         5.822e+05
Df Model:                          93                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1            68.1286      1.963     34.700      0.0

In [4]:
# Linear regression baseline with scikit-learn.
clf = LinearRegression()
clf.fit(X, dataset["price"].values)

# 3-fold cross-validation via sklearn.model_selection (the old
# sklearn.cross_validation module is deprecated and absent from current
# scikit-learn releases).
# NOTE: with no `scoring` argument the value reported for a regressor is its
# default score (R^2), so the "Accuracy" label below is really R^2.
scores = model_selection.cross_val_score(clf, X, dataset["price"].values, cv=3)
print("Linear Regression Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Coefficients of the model fitted on the full data set, and how many there are.
print(clf.coef_)
print("LinearRegression # coeffs :" + str(clf.coef_.shape[0]))

Linear Regression Accuracy: 0.79 (+/- 0.03)
[68.13 42.55 0.17 110.68 -18185.11 -22886.19 -906.86 90841.06 -312489.77
 31747.52 -193770.15 -171856.43 -213184.45 546040.15 89639.07 71092.33
 55938.85 117820.40 -55195.42 -85991.11 -10536.03 -42920.22 -85389.46
 -251428.66 41937.85 10855.21 -96081.92 74505.39 -169196.86 -165453.20
 -204241.85 157735.64 -5243.37 -89553.20 994823.05 329252.48 -145158.15
 48849.79 49602.08 43562.04 -148771.16 -91370.95 -142131.37 -94404.92
 11831.69 -108001.05 -44356.97 19984.65 33219.32 -81731.64 -187783.10
 213934.78 74158.75 232175.27 -122934.35 74867.54 -123285.25 239054.32
 356630.14 85409.74 45683.18 40161.94 -57167.42 225825.84 99065.47
 -29342.70 -60330.27 -101174.34 22256.52 48722.38 -114510.48 -161489.48
 -94039.27 -123042.72 -178282.51 -29248.94 -139790.37 -181526.12 -171630.22
 116910.26 -64443.25 -19227.36 -213.86 21958.91 61925.55 -270319.87
 -240355.51 -303312.36 -335196.59 -345627.10 -334030.22 -296285.97
 -206750.02 -68366.24 155197.21 618198

In [None]:
# Lasso regression with the default penalty strength; the iteration cap is
# raised far above the default so the solver is not cut off early.
clf = Lasso(max_iter=100000000)
clf.fit(X, dataset["price"].values)

# 3-fold cross-validation via sklearn.model_selection (replacement for the
# deprecated sklearn.cross_validation module).
# NOTE: the reported value is the regressor's default score (R^2), despite
# the "Accuracy" label.
scores = model_selection.cross_val_score(clf, X, dataset["price"].values, cv=3)
print("Lasso Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print(clf.coef_)

# Count the coefficients Lasso kept, i.e. every non-zero one. Testing
# `coef_ > 0` would silently drop the negative coefficients from the count.
print("Lasso # coeffs :" + str(np.count_nonzero(clf.coef_)))


In [None]:
# Extra-trees regressor; the seed is fixed so repeated runs of this cell give
# the same trees and therefore the same scores.
clf = ExtraTreesRegressor(random_state=0)

# Candidate tree depths 1..14 (np.arange excludes the upper bound).
parameters = {'max_depth': np.arange(1, 15)}

# Grid search via sklearn.model_selection (replacement for the deprecated
# sklearn.grid_search module). cv is pinned to 3 so the search uses the same
# fold count as the cross-validation below.
clfgrid = model_selection.GridSearchCV(clf, parameters, cv=3)
clfgrid.fit(X, dataset["price"].values)

# Score the estimator selected by the grid search. Scoring the untuned `clf`
# here would ignore the search result entirely.
# NOTE: the reported value is the regressor's default score (R^2), despite
# the "Accuracy" label.
scores = model_selection.cross_val_score(
    clfgrid.best_estimator_, X, dataset["price"].values, cv=3
)
print("Extratrees Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))