In [1]:
#import dependencies for plotting
import os
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import hvplot.pandas

# import dependencies for model
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
#Load csv file

housing = ("Resources/Modified_housing.csv")

housing_df = pd.read_csv(housing)
housing_df

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,yr_built,yr_renovated,zipcode,lat,long,renovation_category
0,7229300521,231300.0,2,1.0,1180,5650,1.0,0,0,1955,0,98178,47.5112,-122.257,Never Renovated
1,6414100192,538000.0,3,2.0,2570,7242,2.0,0,0,1951,1991,98125,47.7210,-122.319,Renovated before 2000
2,5631500400,180000.0,2,1.0,770,10000,1.0,0,0,1933,0,98028,47.7379,-122.233,Never Renovated
3,2487200875,604000.0,4,3.0,1960,5000,1.0,0,0,1965,0,98136,47.5208,-122.393,Never Renovated
4,1954400510,510000.0,3,2.0,1680,8080,1.0,0,0,1987,0,98074,47.6168,-122.045,Never Renovated
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21605,263000018,360000.0,3,2.0,1530,1131,3.0,0,0,2009,0,98103,47.6993,-122.346,Never Renovated
21606,6600060120,400000.0,4,2.0,2310,5813,2.0,0,0,2014,0,98146,47.5107,-122.362,Never Renovated
21607,1523300141,402101.0,2,1.0,1020,1350,2.0,0,0,2009,0,98144,47.5944,-122.299,Never Renovated
21608,291310100,400000.0,3,2.0,1600,2388,2.0,0,0,2004,0,98027,47.5345,-122.069,Never Renovated


In [3]:
# Drop the original 'yr_renovated' column and ID
housing_df_cleaned = housing_df.drop(columns=['yr_renovated', 'id'])
housing_df_cleaned.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,yr_built,zipcode,lat,long,renovation_category
0,231300.0,2,1.0,1180,5650,1.0,0,0,1955,98178,47.5112,-122.257,Never Renovated
1,538000.0,3,2.0,2570,7242,2.0,0,0,1951,98125,47.721,-122.319,Renovated before 2000
2,180000.0,2,1.0,770,10000,1.0,0,0,1933,98028,47.7379,-122.233,Never Renovated
3,604000.0,4,3.0,1960,5000,1.0,0,0,1965,98136,47.5208,-122.393,Never Renovated
4,510000.0,3,2.0,1680,8080,1.0,0,0,1987,98074,47.6168,-122.045,Never Renovated


In [4]:
# Generate summary statistics
housing_df_cleaned.describe()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,yr_built,zipcode,lat,long
count,21610.0,21610.0,21610.0,21610.0,21610.0,21610.0,21610.0,21610.0,21610.0,21610.0,21610.0,21610.0
mean,540085.3,3.370801,2.058723,2079.881212,15108.29,1.534891,0.007543,0.234197,1971.003609,98077.945673,47.560049,-122.21391
std,367152.2,0.930153,0.755576,918.500299,41423.23,0.554753,0.086523,0.766136,29.372639,53.505373,0.138572,0.140833
min,75000.0,0.0,0.0,290.0,520.0,1.0,0.0,0.0,1900.0,98001.0,47.1559,-122.519
25%,321612.5,3.0,2.0,1425.5,5040.0,1.0,0.0,0.0,1951.0,98033.0,47.470925,-122.328
50%,450000.0,3.0,2.0,1910.0,7619.0,2.0,0.0,0.0,1975.0,98065.0,47.5718,-122.231
75%,645000.0,4.0,2.0,2550.0,10688.75,2.0,0.0,0.0,1997.0,98118.0,47.678,-122.125
max,7700000.0,33.0,8.0,13540.0,1651359.0,4.0,1.0,4.0,2015.0,98199.0,47.7776,-121.315


In [5]:
# Plot your data to see what's in your DataFrame
housing_df_cleaned.hvplot.line(
    width=800,
    height=400,
    rot=90
)

In [6]:
# Encode the categorical column into dummy/indicator variables
dummies = pd.get_dummies(housing_df_cleaned, columns=['zipcode','renovation_category'], dtype=int)
dummies

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,yr_built,lat,...,zipcode_98168,zipcode_98177,zipcode_98178,zipcode_98188,zipcode_98198,zipcode_98199,renovation_category_Never Renovated,renovation_category_Renovated 2000-2010,renovation_category_Renovated 2010-2015,renovation_category_Renovated before 2000
0,231300.0,2,1.0,1180,5650,1.0,0,0,1955,47.5112,...,0,0,1,0,0,0,1,0,0,0
1,538000.0,3,2.0,2570,7242,2.0,0,0,1951,47.7210,...,0,0,0,0,0,0,0,0,0,1
2,180000.0,2,1.0,770,10000,1.0,0,0,1933,47.7379,...,0,0,0,0,0,0,1,0,0,0
3,604000.0,4,3.0,1960,5000,1.0,0,0,1965,47.5208,...,0,0,0,0,0,0,1,0,0,0
4,510000.0,3,2.0,1680,8080,1.0,0,0,1987,47.6168,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21605,360000.0,3,2.0,1530,1131,3.0,0,0,2009,47.6993,...,0,0,0,0,0,0,1,0,0,0
21606,400000.0,4,2.0,2310,5813,2.0,0,0,2014,47.5107,...,0,0,0,0,0,0,1,0,0,0
21607,402101.0,2,1.0,1020,1350,2.0,0,0,2009,47.5944,...,0,0,0,0,0,0,1,0,0,0
21608,400000.0,3,2.0,1600,2388,2.0,0,0,2004,47.5345,...,0,0,0,0,0,0,1,0,0,0


In [7]:
# Selecting numerical features for standardization
numerical_features = ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors','lat', 'long']

# Standardizing the numerical features
scaler = StandardScaler()
dummies[numerical_features] = scaler.fit_transform(dummies[numerical_features])


In [8]:
kmeans = KMeans(n_clusters=3, random_state=42)  # Adjust number of clusters as needed
dummies['cluster'] = kmeans.fit_predict(dummies)

# Display the DataFrame with cluster assignments
display(dummies)


  super()._check_params_vs_input(X, default_n_init=10)


Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,yr_built,lat,...,zipcode_98177,zipcode_98178,zipcode_98188,zipcode_98198,zipcode_98199,renovation_category_Never Renovated,renovation_category_Renovated 2000-2010,renovation_category_Renovated 2010-2015,renovation_category_Renovated before 2000,cluster
0,-0.841048,-1.473771,-1.401245,-0.979751,-0.228338,-0.964219,0,0,1955,-0.352525,...,0,1,0,0,0,1,0,0,0,1
1,-0.005680,-0.398654,-0.077721,0.533620,-0.189905,0.838426,0,0,1951,1.161525,...,0,0,0,0,0,0,0,0,1,1
2,-0.980775,-1.473771,-1.401245,-1.426142,-0.123322,-0.964219,0,0,1933,1.283487,...,0,0,0,0,0,1,0,0,0,0
3,0.174086,0.676463,1.245803,-0.130521,-0.244030,-0.964219,0,0,1965,-0.283245,...,0,0,0,0,0,1,0,0,0,1
4,-0.081944,-0.398654,-0.077721,-0.435373,-0.169674,-0.964219,0,0,1987,0.409552,...,0,0,0,0,0,1,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21605,-0.490504,-0.398654,-0.077721,-0.598687,-0.337434,2.641071,0,0,2009,1.004924,...,0,0,0,0,0,1,0,0,0,2
21606,-0.381555,0.676463,-0.077721,0.250543,-0.224403,0.838426,0,0,2014,-0.356133,...,0,0,0,0,0,1,0,0,0,2
21607,-0.375832,-1.473771,-1.401245,-1.153953,-0.332147,0.838426,0,0,2009,0.247899,...,0,0,0,0,0,1,0,0,0,2
21608,-0.381555,-0.398654,-0.077721,-0.522474,-0.307088,0.838426,0,0,2004,-0.184377,...,0,0,0,0,0,1,0,0,0,2


In [10]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(dummies.drop(columns=['price']), dummies['price'], test_size=0.2, random_state=42)

# Initialize and train linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate model performance
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.22742620019080684


In [11]:
import numpy as np

# Compute the variance of the actual house prices
variance = np.var(y_test)

# Print the variance
print("Variance of House Prices:", variance)

# Calculate the ratio of MSE to variance
mse_to_variance_ratio = mse / variance

# Print the ratio
print("MSE to Variance Ratio:", mse_to_variance_ratio)

Variance of House Prices: 1.0503516242185489
MSE to Variance Ratio: 0.21652387157492106
