In [29]:
#Import the libraries
import pandas as pd
# setting pandas display to avoid scientific notation in my dataframes
pd.options.display.float_format = '{:.2f}'.format
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as scs
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import linear_model
from sklearn import metrics
from statsmodels.formula.api import ols

In [10]:
# Read kc_house_data.csv, restricting the frame to the target (price)
# and the eight candidate predictor columns.
model_columns = [
    "price",
    "bedrooms",
    "bathrooms",
    "sqft_living",
    "sqft_lot",
    "floors",
    "waterfront",
    "grade",
    "yr_built",
]
housings_data = pd.read_csv("kc_house_data.csv", usecols=model_columns)
# Preview the first five rows
housings_data.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,grade,yr_built
0,221900.0,3,1.0,1180,5650,1.0,,7 Average,1955
1,538000.0,3,2.25,2570,7242,2.0,NO,7 Average,1951
2,180000.0,2,1.0,770,10000,1.0,NO,6 Low Average,1933
3,604000.0,4,3.0,1960,5000,1.0,NO,7 Average,1965
4,510000.0,3,2.0,1680,8080,1.0,NO,8 Good,1987


In [11]:
# Check dtypes and non-null counts per column to spot missing values and non-numeric columns.
housings_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   price        21597 non-null  float64
 1   bedrooms     21597 non-null  int64  
 2   bathrooms    21597 non-null  float64
 3   sqft_living  21597 non-null  int64  
 4   sqft_lot     21597 non-null  int64  
 5   floors       21597 non-null  float64
 6   waterfront   19221 non-null  object 
 7   grade        21597 non-null  object 
 8   yr_built     21597 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 1.5+ MB


In [12]:
# Inspect the floors column (stored as float64).
housings_data['floors']

0       1.00
1       2.00
2       1.00
3       1.00
4       1.00
        ... 
21592   3.00
21593   2.00
21594   2.00
21595   2.00
21596   2.00
Name: floors, Length: 21597, dtype: float64

In [15]:
# Inspect the grade column.
# NOTE(review): the saved output shows int64 values, so this cell was last executed
# after the grade conversion cell that follows it; on a fresh top-to-bottom run it
# would show the raw strings (e.g. "7 Average") instead.
housings_data['grade']

0        7
1        7
2        6
3        7
4        8
        ..
21592    8
21593    8
21594    7
21595    8
21596    7
Name: grade, Length: 21597, dtype: int64

In [14]:
# Keep only the leading numeric code of each grade label ("7 Average" -> 7).
# Casting to str first makes the cell safe to re-run: once the column is already
# integer, the `.str` accessor would otherwise raise AttributeError.
housings_data['grade'] = (
    housings_data['grade']
    .astype(str)
    .str.split(n=1, expand=True)[0]
    .astype(int)
)


In [22]:
# Inspect the waterfront column.
# NOTE(review): the saved output (object dtype, no missing value in row 0) indicates this
# cell was last run after the fillna cell but before the YES/NO -> 1/0 mapping cell.
housings_data['waterfront']

0        NO
1        NO
2        NO
3        NO
4        NO
         ..
21592    NO
21593    NO
21594    NO
21595    NO
21596    NO
Name: waterfront, Length: 21597, dtype: object

In [19]:
# Fill missing waterfront entries with "NO".
# Assign the result back instead of calling fillna(inplace=True) on a column selection:
# that form is chained assignment, which emits a FutureWarning on pandas 2.x and does
# not update the parent frame once Copy-on-Write is enabled.
housings_data['waterfront'] = housings_data['waterfront'].fillna("NO")

In [23]:
# Encode waterfront as a binary indicator: 'YES' -> 1, 'NO' -> 0.
# The identity entries (1 -> 1, 0 -> 0) keep the cell idempotent: without them a second
# run would map the already-encoded integers to NaN and wipe out the column.
housings_data['waterfront'] = housings_data['waterfront'].map({'YES': 1, 'NO': 0, 1: 1, 0: 0})

In [26]:
# Verify the encoding: count how many rows fall into each waterfront class.
housings_data['waterfront'].value_counts()

0    21451
1      146
Name: waterfront, dtype: int64

In [28]:
# Re-check dtypes and non-null counts after cleaning the grade and waterfront columns.
housings_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   price        21597 non-null  float64
 1   bedrooms     21597 non-null  int64  
 2   bathrooms    21597 non-null  float64
 3   sqft_living  21597 non-null  int64  
 4   sqft_lot     21597 non-null  int64  
 5   floors       21597 non-null  float64
 6   waterfront   21597 non-null  int64  
 7   grade        21597 non-null  int64  
 8   yr_built     21597 non-null  int64  
dtypes: float64(3), int64(6)
memory usage: 1.5 MB


In [32]:
# Separate the response variable (price) from the explanatory variables.
price_target = housings_data['price']
price_preds = housings_data.drop(columns='price')
price_preds.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,grade,yr_built
0,3,1.0,1180,5650,1.0,0,7,1955
1,3,2.25,2570,7242,2.0,0,7,1951
2,2,1.0,770,10000,1.0,0,6,1933
3,4,3.0,1960,5000,1.0,0,7,1965
4,3,2.0,1680,8080,1.0,0,8,1987


In [33]:
# sm.OLS does not fit an intercept by itself, so add an explicit constant column
# (appears as `const` in the design matrix).
predictors = sm.add_constant(price_preds)
predictors.head()

Unnamed: 0,const,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,grade,yr_built
0,1.0,3,1.0,1180,5650,1.0,0,7,1955
1,1.0,3,2.25,2570,7242,2.0,0,7,1951
2,1.0,2,1.0,770,10000,1.0,0,6,1933
3,1.0,4,3.0,1960,5000,1.0,0,7,1965
4,1.0,3,2.0,1680,8080,1.0,0,8,1987


In [34]:
# Fit an ordinary least squares regression of price on the unscaled predictors plus intercept.
model = sm.OLS(price_target, predictors).fit()

In [35]:
# Show the regression table (coefficients, standard errors, fit statistics and diagnostics).
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.645
Model:,OLS,Adj. R-squared:,0.645
Method:,Least Squares,F-statistic:,4896.0
Date:,"Fri, 22 Jul 2022",Prob (F-statistic):,0.0
Time:,14:25:56,Log-Likelihood:,-296220.0
No. Observations:,21597,AIC:,592500.0
Df Residuals:,21588,BIC:,592500.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,7.042e+06,1.22e+05,57.876,0.000,6.8e+06,7.28e+06
bedrooms,-4.18e+04,2051.003,-20.380,0.000,-4.58e+04,-3.78e+04
bathrooms,5.235e+04,3453.722,15.159,0.000,4.56e+04,5.91e+04
sqft_living,177.3169,3.308,53.605,0.000,170.833,183.801
sqft_lot,-0.2435,0.037,-6.610,0.000,-0.316,-0.171
floors,1.752e+04,3434.171,5.101,0.000,1.08e+04,2.42e+04
waterfront,7.56e+05,1.84e+04,41.120,0.000,7.2e+05,7.92e+05
grade,1.299e+05,2158.840,60.163,0.000,1.26e+05,1.34e+05
yr_built,-3989.1562,64.121,-62.213,0.000,-4114.839,-3863.473

0,1,2,3
Omnibus:,15764.498,Durbin-Watson:,1.976
Prob(Omnibus):,0.0,Jarque-Bera (JB):,986533.129
Skew:,2.917,Prob(JB):,0.0
Kurtosis:,35.593,Cond. No.,3600000.0


In [37]:
# Standardize every predictor column to zero mean and unit variance (z-score).
# Use the DataFrame's own column-wise reductions rather than np.mean / np.std on the
# frame: NumPy forwards axis=None, which pandas >= 2.0 treats as "reduce over all
# values" for mean (a single scalar) and deprecates for std, so the NumPy form no
# longer scales column by column. ddof=0 reproduces np.std's population std.
price_preds_scaled = (price_preds - price_preds.mean()) / price_preds.std(ddof=0)

In [38]:
# Sanity-check the scaling: every column should have mean ~0 and std ~1.
price_preds_scaled.describe()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,grade,yr_built
count,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0
mean,-0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-2.56,-2.1,-1.86,-0.35,-0.92,-0.08,-3.97,-2.42
25%,-0.4,-0.48,-0.71,-0.24,-0.92,-0.08,-0.56,-0.68
50%,-0.4,0.17,-0.19,-0.18,0.01,-0.08,-0.56,0.14
75%,0.68,0.5,0.51,-0.11,0.94,-0.08,0.29,0.89
max,31.98,7.65,12.48,39.51,3.72,12.12,4.55,1.5


In [39]:
# Refit the same OLS model on the standardized predictors so coefficient magnitudes
# are expressed per one standard deviation of each predictor.
# NOTE(review): this rebinds `predictors` and `model`, replacing the unscaled design
# matrix and fit created in the earlier cells.
predictors = sm.add_constant(price_preds_scaled)
model = sm.OLS(price_target, predictors).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.645
Model:,OLS,Adj. R-squared:,0.645
Method:,Least Squares,F-statistic:,4896.0
Date:,"Fri, 22 Jul 2022",Prob (F-statistic):,0.0
Time:,14:38:04,Log-Likelihood:,-296220.0
No. Observations:,21597,AIC:,592500.0
Df Residuals:,21588,BIC:,592500.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.403e+05,1490.349,362.530,0.000,5.37e+05,5.43e+05
bedrooms,-3.872e+04,1899.798,-20.380,0.000,-4.24e+04,-3.5e+04
bathrooms,4.026e+04,2655.797,15.159,0.000,3.51e+04,4.55e+04
sqft_living,1.628e+05,3036.868,53.605,0.000,1.57e+05,1.69e+05
sqft_lot,-1.008e+04,1525.304,-6.610,0.000,-1.31e+04,-7092.161
floors,9453.5428,1853.320,5.101,0.000,5820.898,1.31e+04
waterfront,6.195e+04,1506.533,41.120,0.000,5.9e+04,6.49e+04
grade,1.524e+05,2532.692,60.163,0.000,1.47e+05,1.57e+05
yr_built,-1.172e+05,1883.537,-62.213,0.000,-1.21e+05,-1.13e+05

0,1,2,3
Omnibus:,15764.498,Durbin-Watson:,1.976
Prob(Omnibus):,0.0,Jarque-Bera (JB):,986533.129
Skew:,2.917,Prob(JB):,0.0
Kurtosis:,35.593,Cond. No.,4.72
