In [None]:
import pandas as pd
import seaborn as sns
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.metrics import mean_absolute_error, mean_squared_error
import datetime as dt

from statsmodels.tsa.statespace.sarimax import SARIMAX
import pmdarima as pm

In [None]:
def _load_medians(csv_path):
    """Read a medians CSV and index it by `Date` as `datetime.date` objects."""
    frame = pd.read_csv(csv_path)
    # Parse the ISO-formatted strings, then keep only the calendar-date part.
    frame.loc[:, "Date"] = pd.to_datetime(frame["Date"], format='%Y-%m-%d').dt.date
    return frame.set_index('Date')


# Keep only the columns that the rest of the notebook uses.
neighborhoods_df = _load_medians('../data/neighborhood_medians.csv')[
    ["areaName", "Median Price", "Borough", "areaType"]
]
nyc_df = _load_medians('../data/nyc_medians.csv')[["Median Price"]]

In [None]:
# Preview the first rows of the date-indexed neighborhood table.
neighborhoods_df.head()

In [None]:
# Bounds of the COVID window; `between` treats both ends as inclusive.
covid_start, covid_end = dt.date(2020, 3, 1), dt.date(2021, 11, 1)

# Add a 0/1 `during_covid` indicator to both the neighborhood and city tables.
for _frame in (neighborhoods_df, nyc_df):
    _in_window = _frame.index.to_series().between(covid_start, covid_end)
    _frame["during_covid"] = _in_window.astype(int)

In [None]:
def split_data(df, horizon=12, exog_col="during_covid"):
    """Split a time-ordered frame into train/test parts plus matching exogenous frames.

    The last `horizon` rows form the test set; everything before them is the
    training set. The rows are assumed to already be in chronological order.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split; must contain the column named by `exog_col`.
    horizon : int, default 12
        Number of trailing rows held out for testing.
    exog_col : str, default "during_covid"
        Name of the exogenous indicator column. The notebook creates this
        column as "during_covid" (the previous hard-coded "is_covid" never
        existed and raised KeyError).

    Returns
    -------
    tuple
        (train_data, test_data, train_exog, test_exog), where the exog frames
        are single-column DataFrames aligned with the train/test rows.

    Raises
    ------
    ValueError
        If `horizon` is smaller than 1.
    KeyError
        If `exog_col` is not a column of `df`.
    """
    if horizon < 1:
        # iloc[:-0] would silently return an empty training set.
        raise ValueError(f"horizon must be >= 1, got {horizon}")
    if exog_col not in df.columns:
        raise KeyError(f"exogenous column {exog_col!r} not found in DataFrame")

    train_data = df.iloc[:-horizon]
    test_data = df.iloc[-horizon:]

    exog = df[[exog_col]]
    train_exog = exog.iloc[:-horizon]
    test_exog = exog.iloc[-horizon:]
    return train_data, test_data, train_exog, test_exog

Next, the data is prepared for modeling: it is first restricted to January 2018 onward and then split into train and test sets.

In [None]:
# First month kept for modeling.
start_2018 = dt.date(2018, 1, 1)

# .copy() gives each subset its own data, so later cells can reset the index
# or add columns without pandas emitting SettingWithCopyWarning.
neighborhoods2018 = neighborhoods_df[neighborhoods_df.index >= start_2018].copy()
nyc2018 = nyc_df[nyc_df.index >= start_2018].copy()

print("Neighborhoods 2018 shape:", neighborhoods2018.shape)
print("NYC 2018 shape:", nyc2018.shape)

neighborhoods2018.head(25)

In [None]:

# Borough-level series: the median of the neighborhood medians per date.
boroughs2018 = (
    neighborhoods2018
    .groupby(['Date', 'Borough'])['Median Price']
    .median()
    .reset_index()
    .set_index("Date")
)

# Same 0/1 COVID-window indicator as on the other tables.
_borough_dates = boroughs2018.index.to_series()
boroughs2018["during_covid"] = _borough_dates.between(covid_start, covid_end).astype(int)
boroughs2018.head()

In [None]:
# Median price over time, one line per borough.
sns.lineplot(data=boroughs2018, x='Date', y='Median Price', hue='Borough')

In [None]:
# Neighborhood table: use a clearer column name, then drop rows without a borough.
neighborhoods2018 = neighborhoods2018.rename(columns={'areaName': 'Neighborhood'})
_has_borough = neighborhoods2018["Borough"].notna()
neighborhoods2018 = neighborhoods2018.loc[_has_borough]

# Borough table: refresh the COVID indicator and drop rows without a borough.
_in_covid = boroughs2018.index.to_series().between(covid_start, covid_end)
boroughs2018["during_covid"] = _in_covid.astype(int)
boroughs2018 = boroughs2018.loc[boroughs2018["Borough"].notna()]

In [None]:
# Distinct series identifiers at each level, in order of first appearance.
neighborhood_names = neighborhoods2018['Neighborhood'].unique()
borough_names = boroughs2018['Borough'].unique()

In [None]:
def _flatten_dates(frame):
    """Return `frame` with `Date` as an ordinary column.

    A frame that already has a `Date` column is returned unchanged, so
    re-running this cell does not add a stray `index` column the way a
    repeated `reset_index(inplace=True)` would.
    """
    return frame if "Date" in frame.columns else frame.reset_index()


# Give every level a `unique_id`: a fixed label for the city total and the
# borough / neighborhood name for the lower levels. New frames are built
# instead of mutating slices in place.
nyc2018 = _flatten_dates(nyc2018).assign(unique_id="NYC_Total")
boroughs2018 = _flatten_dates(boroughs2018).assign(unique_id=lambda d: d["Borough"])
neighborhoods2018 = _flatten_dates(neighborhoods2018).assign(
    unique_id=lambda d: d["Neighborhood"]
)

cols = ['unique_id', 'Date', 'Median Price', 'during_covid']

# Stack all three levels into one long table with short column names.
combined_df = pd.concat(
    [nyc2018[cols], boroughs2018[cols], neighborhoods2018[cols]],
    ignore_index=True,
).rename(columns={'Median Price': 'y', 'Date': 'ds', 'during_covid': 'covid'})

In [None]:
# Preview the stacked long-format table.
combined_df.head()

In [None]:
# Series identifiers that belong to each level of the hierarchy.
hierarchy_levels = {
    "city": "NYC_Total",
    "Borough": borough_names,
    "Neighborhood": neighborhood_names,
}

# Backward-compatible alias for the original, misspelled variable name.
hieratchy_levels = hierarchy_levels

In [None]:
# Lookup table: neighborhood `unique_id` -> borough.
_id_borough_pairs = neighborhoods2018[['unique_id', 'Borough']].drop_duplicates()
neighborhood_to_borough = dict(
    zip(_id_borough_pairs['unique_id'], _id_borough_pairs['Borough'])
)
neighborhood_to_borough

In [None]:
# Attach each row's borough via the neighborhood lookup. Ids that are not in
# the lookup are labelled 'Unknown'.
# NOTE(review): the lookup is keyed by neighborhood ids only, so borough-level
# and city-level rows presumably end up as 'Unknown' — confirm this is intended.
combined_df['Borough'] = (
    combined_df['unique_id']
    .map(neighborhood_to_borough)
    .fillna('Unknown')
    .astype('str')
)


In [None]:
# Bottom (neighborhood) level of the hierarchy, using `ds` / `y` column names.
_bottom_cols = ['Date', 'Neighborhood', 'Median Price', 'Borough', 'during_covid']
df_bottom = neighborhoods2018[_bottom_cols].rename(
    columns={'Date': 'ds', 'Median Price': 'y'}
)

In [None]:
from hierarchicalforecast.utils import aggregate

# Levels to build: one series per borough, then one per borough/neighborhood pair.
# NOTE(review): aggregate() presumably combines the bottom-level `y` values by
# summation — confirm that summing median prices is the intended borough series.
hierarchy_spec = [['Borough'], ['Borough', 'Neighborhood']]
df_hier, S_df, tags = aggregate(df_bottom, hierarchy_spec)


In [None]:
from statsforecast import StatsForecast
from statsforecast.models import AutoARIMA
from hierarchicalforecast.core import HierarchicalReconciliation
from hierarchicalforecast.methods import MinTrace

# 1. Fit AutoARIMA base forecasts for every series in the hierarchy.
#    No exogenous regressors are passed, so this is a seasonal ARIMA rather
#    than a SARIMAX model.
#    The observations are stamped on the first day of each month, so the
#    month-start alias 'MS' is used; 'ME' would stamp forecasts at month ends
#    and misalign them with the history.
sf = StatsForecast(models=[AutoARIMA(season_length=12)], freq='MS')
base_forecasts = sf.forecast(df=df_hier, h=12)

# 2. Reconcile the base forecasts so the levels are coherent with each other.
reconciler = HierarchicalReconciliation(reconcilers=[MinTrace(method='ols')])
reconciled_forecasts = reconciler.reconcile(Y_hat_df=base_forecasts, S=S_df, tags=tags)

In [None]:
# Show the last 20 rows of the reconciled forecasts.
reconciled_forecasts.tail(20)

In [None]:
# Series id (borough/neighborhood) and the reconciled-forecast column to extract.
borough_id = 'Manhattan/Chinatown'
col = 'AutoARIMA/MinTrace_method-ols'

# Keep only this series' rows, expose the forecast as `y_hat`, and order by date.
_is_selected = reconciled_forecasts['unique_id'] == borough_id
chinatown_fcst = reconciled_forecasts.loc[_is_selected, ['ds', col]]
chinatown_fcst = chinatown_fcst.rename(columns={col: 'y_hat'}).sort_values('ds')

In [None]:
# Show the first 10 rows of the Chinatown forecast.
chinatown_fcst.head(10)

In [None]:
# Line plot of the reconciled Chinatown forecast against its forecast dates.
sns.lineplot(data=chinatown_fcst, x='ds', y='y_hat')

In [None]:
import matplotlib.pyplot as plt

def plot_forecast(forecasts, neighborhood):
    """Plot a neighborhood's observed median prices together with its forecast.

    Parameters
    ----------
    forecasts : pandas.DataFrame
        Forecast rows with a `ds` (date) column and a `y_hat` (forecast) column.
    neighborhood : str
        Value of the `Neighborhood` column in the notebook-level
        `neighborhoods2018` frame whose history should be drawn.

    Returns
    -------
    None
        The figure is displayed with `plt.show()`.
    """
    # A boolean mask is used instead of DataFrame.query with an f-string, so
    # names containing quotes or other special characters cannot break the
    # query expression.
    history = neighborhoods2018.loc[neighborhoods2018['Neighborhood'] == neighborhood]
    history = history.rename(columns={'Date': 'ds', 'Median Price': 'y'})

    fig, ax = plt.subplots(figsize=(12, 6))

    ax.plot(history['ds'], history["y"], label='Training Data', color='blue', alpha=0.7)
    ax.plot(forecasts['ds'], forecasts["y_hat"], label='Forecast', color='red', linestyle='dashed')

    ax.set_title("Backtest Performance")
    ax.set_xlabel("Date")
    ax.set_ylabel("Median Price")
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.show()

In [None]:
# Overlay the Chinatown forecast on that neighborhood's observed prices.
plot_forecast(chinatown_fcst, "Chinatown")

In [None]:
def forecast(id):
    """Return the reconciled forecast for one series, ordered by date.

    Reads the notebook-level `reconciled_forecasts` frame, keeps the rows whose
    `unique_id` equals `id`, and returns a two-column frame: `ds` and `y_hat`
    (the 'AutoARIMA/MinTrace_method-ols' column renamed).
    """
    reconciled_col = 'AutoARIMA/MinTrace_method-ols'

    matches_id = reconciled_forecasts['unique_id'] == id
    selected = reconciled_forecasts.loc[matches_id, ['ds', reconciled_col]]
    selected = selected.rename(columns={reconciled_col: 'y_hat'})
    return selected.sort_values('ds')


In [None]:
# Extract the reconciled Woodside (Queens) forecast and plot it against the
# neighborhood's observed prices.
woodside = forecast("Queens/Woodside")
plot_forecast(woodside, "Woodside")

Now that I have the hierarchical ARIMA model (AutoARIMA base forecasts reconciled with MinTrace), I will backtest it and then consider an alternative model such as Prophet.

In [None]:
# Backtest window (inclusive): January 2018 through December 2024.
_window_start, _window_end = dt.date(2018, 1, 1), dt.date(2024, 12, 1)

# Bottom-level table with `ds` / `y` column names, restricted to the window.
df_bottom = neighborhoods2018[
    ['Date', 'Neighborhood', 'Median Price', 'Borough', 'during_covid']
].rename(columns={'Date': 'ds', 'Median Price': 'y'})
df_bottom = df_bottom.loc[df_bottom['ds'].between(_window_start, _window_end)]

Unnamed: 0,ds,Neighborhood,y,Borough,during_covid
0,2018-01-01,All Downtown,3650.0,Manhattan,0
1,2018-02-01,All Downtown,3750.0,Manhattan,0
2,2018-03-01,All Downtown,3735.0,Manhattan,0
3,2018-04-01,All Downtown,3750.0,Manhattan,0
4,2018-05-01,All Downtown,3837.0,Manhattan,0


In [35]:
# Rebuild the hierarchy from the date-restricted bottom-level data.
df_hier, S_df, tags = aggregate(df_bottom, [['Borough'], ['Borough', 'Neighborhood']])

# Base forecasts for the 12 months after the training window.
# The observations are stamped on the first day of each month, so the
# month-start alias 'MS' is used; 'ME' would stamp forecasts at month ends
# and misalign them with the held-out data.
sf = StatsForecast(models=[AutoARIMA(season_length=12)], freq='MS')
base_forecasts = sf.forecast(df=df_hier, h=12)

# Reconcile the base forecasts so the levels are coherent with each other.
reconciler = HierarchicalReconciliation(reconcilers=[MinTrace(method='ols')])
reconciled_forecasts = reconciler.reconcile(Y_hat_df=base_forecasts, S=S_df, tags=tags)

  reconciled_forecasts = reconciler.reconcile(Y_hat_df=base_forecasts, S=S_df, tags=tags)
