# Business ARIMA
This notebook uses only the business data for 2016-2023. We fit an ARIMA model to that data to forecast values for 2024 and beyond.

In [696]:
# import necessary libraries
import pandas as pd
import statsmodels.api as sm
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

In [697]:
# load the preprocessed business data: one curated CSV per year, 2016 through 2023
(
    osm_data_2016,
    osm_data_2017,
    osm_data_2018,
    osm_data_2019,
    osm_data_2020,
    osm_data_2021,
    osm_data_2022,
    osm_data_2023,
) = (
    pd.read_csv(f"../../data/curated/osm_data_{data_year}.csv")
    for data_year in range(2016, 2024)
)

In [698]:
# tag every yearly dataset with the year it describes
yearly_frames = (
    osm_data_2016,
    osm_data_2017,
    osm_data_2018,
    osm_data_2019,
    osm_data_2020,
    osm_data_2021,
    osm_data_2022,
    osm_data_2023,
)
for year_label, frame in enumerate(yearly_frames, start=2016):
    frame["year"] = year_label

In [699]:
# gather the yearly frames, oldest first, ready to be stacked into one table
dfs = [
    osm_data_2016,
    osm_data_2017,
    osm_data_2018,
    osm_data_2019,
    osm_data_2020,
    osm_data_2021,
    osm_data_2022,
    osm_data_2023,
]

In [700]:
# stack the yearly frames row-wise into one long table, then preview it
merged_df = pd.concat(dfs, axis="index")
print(merged_df.shape)
merged_df.head()

(11966, 11)


Unnamed: 0,SAL_CODE,commercial,education,food_establishments,healthcare,industrial,public_transport,recreation,residential,shopping,year
0,20002,7,4,49,1,3,0,18,6,15,2016
1,20003,1,5,3,0,0,0,11,0,1,2016
2,20007,0,0,0,0,0,0,0,1,0,2016
3,20010,0,0,0,0,1,0,0,0,0,2016
4,20013,0,1,4,0,0,0,6,0,2,2016


In [701]:
# spot-check one SAL code across all years, in chronological order
merged_df.loc[merged_df["SAL_CODE"].eq(20002)].sort_values(by="year")

Unnamed: 0,SAL_CODE,commercial,education,food_establishments,healthcare,industrial,public_transport,recreation,residential,shopping,year
0,20002,7,4,49,1,3,0,18,6,15,2016
0,20002,10,4,48,1,4,0,20,6,16,2017
0,20002,10,4,52,1,4,0,19,6,18,2018
0,20002,10,4,52,1,4,0,19,6,19,2019
0,20002,10,5,55,1,4,0,22,7,21,2020
0,20002,9,5,55,1,4,0,24,8,21,2021
0,20002,13,5,60,2,4,0,22,8,24,2022
0,20002,19,4,62,2,5,0,26,14,23,2023


In [702]:
# load the cleaned historical rent table; its SAL codes are the ones the business data is ARIMA-forecast for
rent_path = "../../data/curated/historical_rent_cleaned.csv"
historical_rent_df = pd.read_csv(rent_path)
print(historical_rent_df.shape)
historical_rent_df.head()

(567, 51)


Unnamed: 0,SAL_CODE,2000_average_weekly_rent,2000_average_quarterly_count,2001_average_weekly_rent,2001_average_quarterly_count,2002_average_weekly_rent,2002_average_quarterly_count,2003_average_weekly_rent,2003_average_quarterly_count,2004_average_weekly_rent,...,2020_average_weekly_rent,2020_average_quarterly_count,2021_average_weekly_rent,2021_average_quarterly_count,2022_average_weekly_rent,2022_average_quarterly_count,2023_average_weekly_rent,2023_average_quarterly_count,2024_average_weekly_rent,2024_average_quarterly_count
0,20111,137.0,979.5,141.25,857.0,151.25,896.0,161.25,948.25,171.25,...,317.5,1743.75,331.25,1632.25,355.0,1546.5,371.25,1473.25,380.0,1345.0
1,20198,190.0,505.75,207.75,502.5,216.25,529.0,221.25,525.75,226.25,...,466.25,720.0,457.0,643.25,432.5,872.25,495.0,764.75,550.0,729.0
2,21193,200.0,608.25,207.5,765.5,210.0,1059.5,210.0,1334.0,217.5,...,400.0,2389.25,400.0,2591.5,411.25,3047.5,437.5,3557.0,470.0,3777.0
3,21640,320.0,2278.75,320.0,2752.5,320.0,3382.75,305.0,3972.75,300.0,...,483.75,10206.25,366.25,16559.25,426.25,14627.75,587.5,13547.25,640.0,13582.0
4,21938,142.5,443.5,151.25,434.75,161.25,429.5,171.25,473.0,178.75,...,390.75,434.5,407.5,428.5,432.5,365.25,465.0,312.5,475.0,356.0


In [703]:
# set up the inputs for the ARIMA: the working frame, the SAL codes to forecast,
# and the business-type columns (every column except the SAL code and the year)
df = merged_df
sal_codes = list(pd.unique(historical_rent_df["SAL_CODE"]))
business_types = df.columns.drop(["SAL_CODE", "year"]).tolist()
print(len(sal_codes))

567


In [704]:
# cast every business-count column to float64 so the ARIMA can be performed
float_columns = [
    "commercial", "education", "food_establishments",
    "healthcare", "industrial", "public_transport",
    "recreation", "residential", "shopping",
]
df = df.astype(dict.fromkeys(float_columns, "float64"))

# sanity check: the cast must not have turned any value into 'null' or 'NaN'
missing_per_column = df.isna().sum()
print(missing_per_column)
df.loc[df.isna().any(axis="columns")]

SAL_CODE               0
commercial             0
education              0
food_establishments    0
healthcare             0
industrial             0
public_transport       0
recreation             0
residential            0
shopping               0
year                   0
dtype: int64


Unnamed: 0,SAL_CODE,commercial,education,food_establishments,healthcare,industrial,public_transport,recreation,residential,shopping,year


In [705]:
# display the dtype of every column to verify the result of the float conversion
df.dtypes

SAL_CODE                 int64
commercial             float64
education              float64
food_establishments    float64
healthcare             float64
industrial             float64
public_transport       float64
recreation             float64
residential            float64
shopping               float64
year                     int64
dtype: object

In [706]:
# ARIMA the business data
#
# For every SAL code in `sal_codes` and every column in `business_types`, forecast
# the business count for each year in FORECAST_YEARS and append the forecasts to
# `df` as new rows (one row per SAL code per forecast year).

FIRST_FORECAST_YEAR = 2024
FORECAST_YEARS = list(range(FIRST_FORECAST_YEAR, FIRST_FORECAST_YEAR + 5))  # 2024..2028
LAST_OBSERVED_YEAR = FIRST_FORECAST_YEAR - 1
ARIMA_ORDER = (1, 1, 1)


def _forecast_counts(series, years):
    """Forecast one business-count series for each year in `years`.

    Parameters
    ----------
    series : pandas.Series
        Observed counts for a single SAL code and business type, ordered by
        year, with NaN values already removed.
    years : list of int
        The years to forecast, in ascending order.

    Returns
    -------
    list
        One forecast value per entry of `years`, chosen by how much history
        is available:
        * more than 2 observations: ARIMA forecast, rounded to 3 decimals;
        * exactly 2: linear extrapolation using the difference between them;
        * exactly 1: the single observed value held constant;
        * none: 0 (the area is assumed to have no such businesses).
    """
    n_obs = len(series)

    if n_obs > 2:
        fitted = sm.tsa.ARIMA(series, order=ARIMA_ORDER).fit()
        return [round(float(value), 3) for value in fitted.forecast(steps=len(years))]

    if n_obs == 2:
        # Assume the count keeps changing by the same difference each year.
        # The parentheses matter: the step is multiplied by the number of
        # years elapsed since the last observed year, not by the year itself.
        last = series.iloc[1]
        step = last - series.iloc[0]
        return [last + step * (year - LAST_OBSERVED_YEAR) for year in years]

    if n_obs == 1:
        # a single data point: assume the business count stays constant
        return [series.iloc[0]] * len(years)

    # no business data available: assume there are no businesses here,
    # as the osm dataset was very complete anyway
    return [0] * len(years)


# Only rows before the first forecast year count as history, so re-running this
# cell never feeds earlier forecasts back into the model or duplicates them.
observed = df[df["year"] < FIRST_FORECAST_YEAR]

forecast_records = []
for iteration, sal in enumerate(sal_codes, start=1):
    # all observed business data of this SAL code, in chronological order
    history = observed[observed["SAL_CODE"] == sal].sort_values("year")
    records = [{"year": year, "SAL_CODE": sal} for year in FORECAST_YEARS]

    for col in business_types:
        # Clean each column on its own copy so that a missing value in one
        # business type does not shorten the history of the others; the fresh
        # 0..n-1 index gives the model a plain, gap-free positional index.
        series = (
            pd.to_numeric(history[col], errors="coerce")
            .dropna()
            .reset_index(drop=True)
        )
        for record, value in zip(records, _forecast_counts(series, FORECAST_YEARS)):
            record[col] = value

    forecast_records.extend(records)
    print(f"SAL code iteration {iteration} complete")

# append all forecasts in one go (growing the frame row by row is quadratic)
df = pd.concat([observed, pd.DataFrame(forecast_records)], ignore_index=True)


SAL code iteration 1 complete
SAL code iteration 2 complete
SAL code iteration 3 complete
SAL code iteration 4 complete
SAL code iteration 5 complete
SAL code iteration 6 complete
SAL code iteration 7 complete
SAL code iteration 8 complete
SAL code iteration 9 complete
SAL code iteration 10 complete
SAL code iteration 11 complete
SAL code iteration 12 complete
SAL code iteration 13 complete
SAL code iteration 14 complete
SAL code iteration 15 complete
SAL code iteration 16 complete
SAL code iteration 17 complete
SAL code iteration 18 complete
SAL code iteration 19 complete
SAL code iteration 20 complete
SAL code iteration 21 complete
SAL code iteration 22 complete
SAL code iteration 23 complete
SAL code iteration 24 complete
SAL code iteration 25 complete
SAL code iteration 26 complete
SAL code iteration 27 complete
SAL code iteration 28 complete
SAL code iteration 29 complete
SAL code iteration 30 complete
SAL code iteration 31 complete
SAL code iteration 32 complete
SAL code iteratio

In [707]:
# spot-check one SAL code: observed years followed by the appended forecast rows
df.loc[df["SAL_CODE"].eq(20002)].sort_values(by="year")

Unnamed: 0,SAL_CODE,commercial,education,food_establishments,healthcare,industrial,public_transport,recreation,residential,shopping,year
0,20002,7.0,4.0,49.0,1.0,3.0,0.0,18.0,6.0,15.0,2016
1360,20002,10.0,4.0,48.0,1.0,4.0,0.0,20.0,6.0,16.0,2017
2753,20002,10.0,4.0,52.0,1.0,4.0,0.0,19.0,6.0,18.0,2018
4205,20002,10.0,4.0,52.0,1.0,4.0,0.0,19.0,6.0,19.0,2019
5700,20002,10.0,5.0,55.0,1.0,4.0,0.0,22.0,7.0,21.0,2020
7217,20002,9.0,5.0,55.0,1.0,4.0,0.0,24.0,8.0,21.0,2021
8767,20002,13.0,5.0,60.0,2.0,4.0,0.0,22.0,8.0,24.0,2022
10333,20002,19.0,4.0,62.0,2.0,5.0,0.0,26.0,14.0,23.0,2023
11991,20002,21.35,4.0,63.46,2.0,5.0,0.0,24.657,15.34,23.916,2024
11992,20002,21.76,4.0,64.921,2.0,5.0,0.0,25.077,16.44,24.832,2025


In [708]:
# keep only the rows engineered for 2024; the year column is then redundant
osm_data_2024 = df.loc[df["year"] == 2024].drop(columns="year")
# counts can't be below 0, so every negative value is replaced with 0
osm_data_2024 = osm_data_2024.mask(osm_data_2024 < 0, 0)
print(osm_data_2024.shape)
osm_data_2024.head(n=3)

(567, 10)


Unnamed: 0,SAL_CODE,commercial,education,food_establishments,healthcare,industrial,public_transport,recreation,residential,shopping
11966,20111,66.0,8.0,85.749,16.0,6.0,3.0,26.0,179.0,27.897
11971,20198,0.0,1.949,0.0,3.0,0.0,0.0,4.0,0.0,0.0
11976,21193,3.0,6.0,6.164,2.0,0.0,0.0,46.003,12.0,2.0


In [709]:
# write the forecast (artificial) 2024 business data to the curated data folder
# NOTE(review): no `index=` argument is passed, so the row index is presumably
# written as an extra unnamed first column — confirm downstream readers expect it
osm_data_2024.to_csv("../../data/curated/osm_data_2024.csv")

In [710]:
# repeat the filter-and-export steps of the last two cells for 2025 through 2028
for year in range(2025, 2029):
    output_df = df.loc[df["year"] == year].drop(columns="year")
    # counts can't be below 0, so every non-positive value is set to 0
    output_df = output_df.mask(output_df <= 0, 0)
    print(output_df.shape)
    output_df.to_csv(f"../../data/curated/osm_data_{year}.csv")

(567, 10)
(567, 10)
(567, 10)
(567, 10)
