In [1]:
# Load Libraries and Data
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

# Read the housing CSV straight from the course repository.
housing = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing.csv')
housing.head()

# Prepare the model inputs
# Zip codes are labels rather than quantities, so store them as strings.
housing['zipcode'] = housing['zipcode'].astype(str)

# Engineered feature: living area divided by the number of floors.
housing['sqft_per_floor'] = housing['sqft_living'].div(housing['floors'])

# Column order is kept fixed: it determines the column order of X.
features = [
    "condition", "grade", "view",
    "lat", "long",
    "sqft_living", "sqft_lot", "sqft_lot15",
    "sqft_per_floor",
]

# One indicator column per zip code, appended to the selected features;
# rows with any missing value are discarded.
zip_dummies = pd.get_dummies(housing['zipcode'], prefix='zip')
X = housing[features].join(zip_dummies).dropna()

# Keep the target aligned with the rows that survived dropna().
y = housing.loc[X.index, 'price']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Random forest settings: 300 trees, no depth cap, fixed seed, all CPU cores.
rf_params = dict(
    n_estimators=300,
    random_state=42,
    max_depth=None,
    n_jobs=-1,
)
rf = RandomForestRegressor(**rf_params)

# Left as the cell's final expression so the fitted estimator is displayed.
rf.fit(X_train, y_train)



1. RMSE = typical size of a prediction error, in dollars (root mean squared error, so large misses count more than small ones)


2. R² = the share of price variation the model explains (1.0 would be a perfect fit)
An R² of 0.65–0.80 is very solid for housing data.
Together, RMSE and R² give quantifiable evidence of the model's reliability.

In [None]:
# Cecil's Question
# Score the fitted forest on the held-out rows
y_pred = rf.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)  # square root puts the error back in the target's units
r2 = r2_score(y_test, y_pred)

# Final expression: the notebook displays the (rmse, r2) pair.
rmse, r2


In [None]:
# William's Question
# Feature Importance
# Label the importances with the columns the model was actually fitted on
# (X_train), not the global X: later cells rebind X to other feature sets,
# which would make this cell fail with a length mismatch when re-run.
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
importances = importances.sort_values(ascending=False)

# The ten largest importances, in descending order.
top_importances = importances.head(10)

# Plotting Feature Importance
fig, ax = plt.subplots(figsize=(10, 6))
top_importances.plot(kind='barh', ax=ax)
ax.set_title("Top 10 Most Important Features for Predicting Price")
ax.invert_yaxis()  # put the most important feature at the top
plt.show()


In [None]:
# Model Reliability Visualization per Cecil
# Scatter of held-out actual prices (x) against the model's predictions (y).
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.3, ax=ax)
ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.set_title("Predicted vs Actual Housing Prices")
plt.show()


Points hugging the diagonal (predicted = actual) = good predictions

Scatter away from the diagonal = prediction error
This plot visually supports the reliability claims made from RMSE and R².

Model shows: home features, location

Could potentially join external data: median price per neighborhood, school neighborhood ratings, walkability ratings

I built a Random Forest regression model using housing features such as square footage, condition, grade, view, and geographic location to predict home prices. I engineered an additional feature representing square footage per floor to capture how living space is distributed across levels. The model was evaluated using RMSE and R² to provide quantifiable evidence of prediction reliability. Feature importance scores were used to identify which aspects of the property most strongly influence price, addressing stakeholder questions about key drivers of value. Geographic variables were included to support future integration of external neighborhood-level factors.

Next: experimenting with ways to turn zipcode, latitude, and longitude into more useful location features.

In [None]:
# Using "Homes in an area are similar": summarize sale prices per zipcode
zip_stats = (
    housing.groupby('zipcode')['price']
    .agg(zip_price_mean='mean', zip_price_median='median', zip_count='count')
    .reset_index()
)

# Drop any copies of these columns left by a previous run of this cell, so that
# re-running it does not produce suffixed duplicates (zip_price_mean_x / _y).
zip_stat_cols = [col for col in zip_stats.columns if col != 'zipcode']
housing = (
    housing.drop(columns=zip_stat_cols, errors='ignore')
    .merge(zip_stats, on='zipcode', how='left')
)

# NOTE(review): these statistics are computed from every row's price, including
# the row's own price and rows that may later be held out for testing. If they
# are used as inputs to a model that predicts price, compute them from the
# training split only to avoid target leakage.


zip_price_mean - average sale price of the homes in that zipcode

zip_count - number of homes (rows with a recorded price) in that zipcode

In [None]:
# categorize into natural geographical areas
from sklearn.cluster import KMeans

# Group homes into 20 geographic areas by clustering their coordinates.
coords = housing[['lat', 'long']]

# n_init is set explicitly: scikit-learn 1.4 changed its default from 10 to
# 'auto' (a single run with the default k-means++ init), so leaving it unset
# makes the cluster assignments depend on the installed version.
kmeans = KMeans(n_clusters=20, n_init=10, random_state=42)
housing['geo_cluster'] = kmeans.fit_predict(coords)

# Feature matrix using the cluster label in place of raw lat/long.
# NOTE(review): geo_cluster is an arbitrary integer label, not an ordered
# quantity; this rebinding of X also replaces the X built for the first model.
X = housing[[
    "sqft_living", "sqft_lot", "condition", "grade", "view",
    "sqft_lot15", "sqft_per_floor", "geo_cluster"
]]


In [None]:
# Creating distance variable
import numpy as np

# Approx Seattle city center
seattle_lat, seattle_long = 47.6062, -122.3321

# Great-circle (haversine) distance, in kilometres. A plain Euclidean distance on
# raw degrees overstates east-west separation: at this latitude one degree of
# longitude covers only about cos(47.6 deg) ~ 0.67 of the ground distance of one
# degree of latitude. NOTE: the column is now in km rather than degrees.
EARTH_RADIUS_KM = 6371.0

home_lat = np.radians(housing['lat'])
home_long = np.radians(housing['long'])
center_lat, center_long = np.radians(seattle_lat), np.radians(seattle_long)

# Haversine term: squared half-chord length between each home and the center.
half_chord_sq = (
    np.sin((home_lat - center_lat) / 2) ** 2
    + np.cos(center_lat) * np.cos(home_lat) * np.sin((home_long - center_long) / 2) ** 2
)

housing['dist_to_seattle'] = 2 * EARTH_RADIUS_KM * np.arcsin(np.sqrt(half_chord_sq))


In [None]:
# Each zipcode becomes its own indicator (one-hot) column
housing_encoded = pd.get_dummies(housing, columns=['zipcode'], prefix='zip')

# The indicator columns are exactly the ones get_dummies added. Selecting by the
# 'zip_' prefix instead would also pick up zip_price_mean / zip_price_median /
# zip_count when those are present in housing, and they are derived from the
# target (price).
zip_dummy_cols = [col for col in housing_encoded.columns if col not in housing.columns]

X = housing_encoded[[
    "sqft_living", "sqft_lot", "condition", "grade", "view",
    "sqft_lot15", "sqft_per_floor"
] + zip_dummy_cols]


In [None]:
# Living area per floor.
housing['sqft_per_floor'] = housing['sqft_living'] / housing['floors']

from sklearn.cluster import KMeans
# Cluster homes into 20 geographic areas from their coordinates. n_init is set
# explicitly: scikit-learn 1.4 changed its default from 10 to 'auto' (a single
# run with the default k-means++ init), so leaving it unset makes the cluster
# assignments depend on the installed version.
kmeans = KMeans(n_clusters=20, n_init=10, random_state=42)
housing['geo_cluster'] = kmeans.fit_predict(housing[['lat', 'long']])

# Average sale price per zipcode.
zip_stats = housing.groupby('zipcode')['price'].mean().reset_index()
zip_stats.columns = ['zipcode', 'zip_price_mean']

# housing may already carry a zip_price_mean column (from an earlier zipcode
# statistics merge or a previous run of this cell). Merging again would rename
# the two copies to zip_price_mean_x / zip_price_mean_y, and the
# "zip_price_mean" feature listed below would no longer exist. Drop any old
# copy first so the merge is repeatable.
housing = (
    housing.drop(columns=['zip_price_mean'], errors='ignore')
    .merge(zip_stats, on='zipcode', how='left')
)

# NOTE(review): zip_price_mean is computed from every row's price, including the
# row's own. Before using it to predict price, compute it from the training
# split only to avoid target leakage.

# Columns intended as model inputs for the location-aware feature set.
features = [
    "sqft_living",
    "sqft_lot",
    "sqft_lot15",
    "condition",
    "grade",
    "view",
    "sqft_per_floor",
    "geo_cluster",
    "zip_price_mean"
]


Latitude, longitude, and zipcode do not correlate well with price when treated as raw numerical values. To capture neighborhood-level pricing effects, I transformed location features into meaningful geographic groupings. Homes were clustered into geographic regions using latitude and longitude, creating a neighborhood identifier. Additionally, zipcode-level price averages were computed to capture typical market values by area. These transformations allowed the model to learn area-based pricing patterns and significantly improved predictive performance.