In [133]:
import numpy as np
import math
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from statsmodels.formula.api import ols
import statsmodels.api as sm
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error

In [134]:
# Load data
df = pd.read_csv("https://raw.githubusercontent.com/delinai/schulich_ds1/main/Datasets/kc_house_data.csv")

In [135]:
# date update
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['renovation'] = df['yr_renovated'].apply(lambda x: 1 if x > 0 else 0)
df['house_age'] = df['year'] - df['yr_built']

In [136]:
# Drop unrelated features'id', 'date', 'zipcode', 'year', 'yr_renovated'
df = df.drop(['id','date', 'year', 'yr_built', 'yr_renovated'], axis=1)

In [137]:
df['price'].head()

0    221900.0
1    538000.0
2    180000.0
3    604000.0
4    510000.0
Name: price, dtype: float64

In [138]:
#df['price'] = np.log(df['price'])

In [139]:
df['price'].head()

0    221900.0
1    538000.0
2    180000.0
3    604000.0
4    510000.0
Name: price, dtype: float64

In [140]:
# Identify features
features = [i for i in df.columns if i not in ['price']]
unique_feature_num = df[features].nunique().sort_values()

numerical_features = []
categorical_features = []
for i in range(df[features].shape[1]):
    if unique_feature_num[i] <= 5:
        categorical_features.append(unique_feature_num.index[i])
    else:
        numerical_features.append(unique_feature_num.index[i])

In [141]:
# Check empty elements
nvc = pd.DataFrame(df.isnull().sum().sort_values(), columns=['Total Null Values'])
nvc['Percentage'] = round(nvc['Total Null Values']/df.shape[0],3)*100

# Apply one-hot encoding for categorical features
ecc = nvc[nvc['Percentage']!=0].index.values
fcc = [i for i in categorical_features if i not in ecc]
oh = True
dm = True
for i in fcc:
    if df[i].nunique()==2:
        if oh==True: print("One-Hot Encoding on features:")
        print(i);oh=False
        df[i]=pd.get_dummies(df[i], drop_first=True, prefix=str(i))
    if (df[i].nunique()>2 and df[i].nunique()<17):
        if dm==True: print("\nDummy Encoding on features:")
        print(i);dm=False
        df = pd.concat([df.drop([i], axis=1), pd.DataFrame(pd.get_dummies(df[i], drop_first=True, prefix=str(i)))],axis=1)


One-Hot Encoding on features:
renovation
waterfront

Dummy Encoding on features:
condition
view


In [142]:
# Check updated categorical features
new_features = [i for i in df.columns if i not in ['price']]
unique_feature_num = df[new_features].nunique().sort_values()
updated_num_features = []
updated_cat_features = []
for i in range(df[new_features].shape[1]):
    if unique_feature_num[i] <= 5:
        updated_cat_features.append(unique_feature_num.index[i])
    else:
        updated_num_features.append(unique_feature_num.index[i])

print("updated_num_features: {}".format(updated_num_features))
print("upated_cat_features: {}".format(updated_cat_features))


updated_num_features: ['floors', 'grade', 'bedrooms', 'bathrooms', 'house_age', 'sqft_basement', 'long', 'sqft_living15', 'sqft_above', 'sqft_living', 'lat', 'sqft_lot15', 'sqft_lot']
upated_cat_features: ['view_4', 'view_2', 'view_1', 'condition_5', 'waterfront', 'condition_4', 'condition_3', 'condition_2', 'renovation', 'view_3']


In [143]:
# remove outliers based on selected features
df1 = df.copy()
selected_features = ['price', 'sqft_living15', 'sqft_above', 'sqft_living', 'sqft_lot', 'sqft_lot15']
for i in selected_features:
    Q1 = df1[i].quantile(0.25)
    Q3 = df1[i].quantile(0.75)
    IQR = Q3 - Q1
    df1 = df1[df1[i] <= (Q3+(2.5*IQR))]
    df1 = df1[df1[i] >= (Q1-(2.5*IQR))]
    df1 = df1.reset_index(drop=True)

In [144]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18819 entries, 0 to 18818
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          18819 non-null  float64
 1   bedrooms       18819 non-null  int64  
 2   bathrooms      18819 non-null  float64
 3   sqft_living    18819 non-null  int64  
 4   sqft_lot       18819 non-null  int64  
 5   floors         18819 non-null  float64
 6   waterfront     18819 non-null  bool   
 7   grade          18819 non-null  int64  
 8   sqft_above     18819 non-null  int64  
 9   sqft_basement  18819 non-null  int64  
 10  lat            18819 non-null  float64
 11  long           18819 non-null  float64
 12  sqft_living15  18819 non-null  int64  
 13  sqft_lot15     18819 non-null  int64  
 14  renovation     18819 non-null  bool   
 15  house_age      18819 non-null  int64  
 16  condition_2    18819 non-null  bool   
 17  condition_3    18819 non-null  bool   
 18  condit

In [145]:
selected = list(df1.columns.values)[1:]
selected = ' + '.join(selected)
selected = "price ~ " + selected
selected

'price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors + waterfront + grade + sqft_above + sqft_basement + lat + long + sqft_living15 + sqft_lot15 + renovation + house_age + condition_2 + condition_3 + condition_4 + condition_5 + view_1 + view_2 + view_3 + view_4'

In [146]:
# results_1 = ols("price ~ waterfront + view + condition + grade", data=df).fit()
results_1 = ols("price ~ bathrooms + house_age + sqft_basement + long + sqft_living15 + sqft_above + sqft_living + lat + sqft_lot15 + sqft_lot", data=df1).fit()
print(results_1.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.618
Model:                            OLS   Adj. R-squared:                  0.618
Method:                 Least Squares   F-statistic:                     3382.
Date:                Tue, 11 Jul 2023   Prob (F-statistic):               0.00
Time:                        08:41:30   Log-Likelihood:            -2.5051e+05
No. Observations:               18819   AIC:                         5.010e+05
Df Residuals:                   18809   BIC:                         5.011e+05
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept     -4.277e+07   1.21e+06    -35.249

In [147]:
# Data split
TARGET = ['price']
X_train, X_test, y_train, y_test = train_test_split(df1[updated_num_features + updated_cat_features]
                                                    ,df1[TARGET],
                                                    test_size = 0.3, random_state=1234)


In [148]:
cat_transformer = Pipeline(steps = [('onehot', OneHotEncoder(handle_unknown='ignore'))])
num_transformer = Pipeline(steps = [('scaler', StandardScaler())])

preprocessor = ColumnTransformer(transformers = [('cat',cat_transformer, updated_cat_features),
                                                 ('num', num_transformer, updated_num_features)])

In [149]:
regressors = ['LinearRegression', 'Lasso', 'ElasticNet', 'RandomForest']
for reg in regressors:
    if reg == 'LinearRegression':
        regressor = LinearRegression()
    elif reg == 'Lasso':
        regressor = Lasso(alpha=1.0)
    elif reg == 'ElasticNet':
        regressor = ElasticNet(alpha=1.0, l1_ratio=0.8)
    elif reg == 'RandomForest':
        regressor = RandomForestRegressor(n_estimators=10)

    reg_model = Pipeline(steps = [
        ('preprocessor', preprocessor),
        ('regressor', regressor)
    ])

    reg_model.fit(X_train, y_train)
    pred = reg_model.predict(X_test)
    msqrt = np.sqrt(mean_squared_error(pred,y_test))
    print("regressor: {}".format(reg))
    print(msqrt)

regressor: LinearRegression
128479.62233349563


  model = cd_fast.enet_coordinate_descent(


regressor: Lasso
128514.34803906898
regressor: ElasticNet
134085.02642817041


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


regressor: RandomForest
89616.50758990529
