In [6]:
import pandas as pd
import plotly.express as px
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import numpy as np

# Load zillow.csv (path is relative to the working directory). Later cells read
# columns such as price, homeType, city, livingArea, latitude and longitude.
df = pd.read_csv('zillow.csv')

#### Questions of interest
- What regions in Utah County are generally more expensive to live in?
- Are town houses a popular option in Utah County?
- How does the house type affect its price?
- What does the price distribution look like for all housing?
- Can we predict a house price based on some of its features?

#### What regions in Utah County are generally more expensive to live in?

In [7]:
# Map every listing: marker size follows the price, marker color the home type.
# update_layout returns the figure itself, so the basemap style is set in the
# same expression.
fig = px.scatter_mapbox(
    data_frame=df,
    lat='latitude',
    lon='longitude',
    color='homeType',
    size='price',
).update_layout(mapbox_style="carto-positron")
fig.show()

The Alpine area, the Saratoga Springs lake houses, and the Woodland Hills area look expensive.

What does this look like broken down into cities?

In [8]:
# Per-city median of every numeric column, ordered from the highest to the
# lowest median price. Note: despite the "avg" in the name, this is a median.
df_avg_price_per_city = (
    df.groupby('city')
    .median(numeric_only=True)
    .reset_index()
    .sort_values(by='price', ascending=False)
)
fig = px.bar(data_frame=df_avg_price_per_city, x='city', y='price')
fig.show()

Draper, Alpine, and Woodland Hills all look like outliers here.

#### Are Town Houses a Popular Option in Utah County?

In [9]:
# Count the listings per home type.
# reset_index(name='Count') names the counts column explicitly. The counts
# column is called 0 on pandas < 2.0 but 'count' on pandas >= 2.0, so renaming
# only the 0 column leaves no 'Count' column on newer versions and the pie
# chart below cannot find its values.
df_homeType = df.value_counts('homeType').reset_index(name='Count')

# Share of each home type among all listings.
fig = px.pie(
    df_homeType,
    names='homeType',
    values='Count'
)

fig.show()

We have no way of telling whether they are growing or declining in popularity, but town houses make up nearly one-fifth of the homes sold today.

#### How Does the House Type Affect Its Price?

In [10]:
# Price against living area for the three listed home types, with one LOWESS
# trendline per type.
main_home_types = ['SINGLE_FAMILY', 'TOWNHOUSE', 'CONDO']
fig = px.scatter(
    data_frame=df.loc[df['homeType'].isin(main_home_types)],
    x='livingArea',
    y='price',
    color='homeType',
    trendline='lowess',
)
# Limit the visible window to 0-4000 living area and 0-1,000,000 price.
fig.update_xaxes(range=[0, 4000])
fig.update_yaxes(range=[0, 1000000])
# Thicken only the traces drawn as lines (the trendlines), not the markers.
fig.update_traces(
    selector={'type': 'scatter', 'mode': 'lines'},
    line={'width': 3},
)
fig.show()

Looking at areas with sufficient data points, single-family homes seem to be the most expensive option, with town houses being the next most expensive, followed by condos.

#### What Does the Price Distribution Look Like for All Housing?

In [11]:
# Histogram of prices, leaving out listings priced at 2,000,000 or more.
below_two_million = df['price'] < 2000000
fig = px.histogram(data_frame=df.loc[below_two_million], x='price')
fig.show()

It looks like the median house price in Utah County is a little over half a million.

#### Can we predict a house price based on some of its features?

In [12]:
# Prepare the data frame for a Boosted Tree Model
# One-hot encode the two categorical columns, leaving out one reference level
# each (SINGLE_FAMILY for the home type, Saratoga Springs for the city).
home_type_dummies = pd.get_dummies(df['homeType']).drop(columns=['SINGLE_FAMILY'])
city_dummies = pd.get_dummies(df['city']).drop(columns=['Saratoga Springs'])

# Columns removed before modelling: the raw categoricals (now encoded) and the
# other fields that are not used as features.
unused_columns = [
    'city',
    'homeType',
    'priceReduction',
    'zipcode',
    'latitude',
    'longitude',
    '30_year_mortgage',
    'daysOnZillow',
]
df_prepared = pd.concat([df, home_type_dummies, city_dummies], axis=1).drop(columns=unused_columns)

# Keep only rows with a known living area and a price below 1,500,000.
has_living_area = df_prepared['livingArea'].notna()
under_price_cap = df_prepared['price'] < 1500000
df_prepared = df_prepared[has_living_area & under_price_cap]

# Split the data and tune hyperparameters
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
features = df_prepared.drop(columns=['price'])
target = df_prepared['price']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

xgb_model = xgb.XGBRegressor()

# Candidate values for each hyperparameter (3 x 3 x 3 = 27 combinations).
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[50, 100, 200],
)

def MAPE(y_true, y_pred):
    """Return a weighted mean absolute percentage error.

    Each absolute percentage error ``|y_true - y_pred| / |y_true|`` is
    multiplied by ``1 + exp(-|y_true|)`` before averaging. The weight tends
    to 1 as ``|y_true|`` grows, so for large targets the result is close to
    the plain MAPE.

    Parameters:
        y_true: array-like of observed values. A value of 0 is divided by,
            so the result is not finite in that case.
        y_pred: array-like of predicted values, same length as ``y_true``.

    Returns:
        The mean of the weighted absolute percentage errors (a fraction, not
        multiplied by 100).
    """
    emphasis = 1 + np.exp(-np.abs(y_true))
    relative_miss = np.abs((y_true - y_pred) / y_true)
    return np.mean(relative_miss * emphasis)

# Wrap MAPE as a scorer. It is an error, so greater_is_better=False tells
# scikit-learn that lower values are better when ranking candidates.
mape_score = make_scorer(MAPE, greater_is_better=False)

# 5-fold cross-validated search over every combination in param_grid.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring=mape_score,
    cv=5,
)
grid_result = grid_search.fit(X_train, y_train)

# Set best hyperparameters to the model
best_model = grid_result.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Actual and predicted prices side by side, both cast to integers.
predictions = pd.DataFrame({'Actual': y_test, 'Prediction': y_pred})
predictions = predictions.astype(int)

In [15]:
# Draw the y = x reference line (prediction plotted against itself), then
# overlay the actual prices as markers; a point on the line means the
# prediction equals the actual price.
fig = (
    px.line(data_frame=predictions, x='Prediction', y='Prediction')
    .add_scatter(x=predictions['Prediction'], y=predictions['Actual'], mode='markers')
    .update_layout(yaxis_title_text='Actual', showlegend=False)
)
fig.show()

It looks like our model captures a lot of variation in house prices. Perhaps this could be improved further, but this isn't a bad start.