# Crime ARIMA
This notebook uses only the crime data, which covers 2015-2024. We fit an ARIMA model to each suburb's yearly crime counts and use it to forecast the counts for 2025 to 2028.

In [324]:
# import necessary libraries
import pandas as pd
import statsmodels.api as sm
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

In [325]:
# Load the curated crime counts: one row per suburb, one column per year and offence division
crime_df = pd.read_csv(filepath_or_buffer="../../data/curated/crime_suburb_data.csv")
crime_df

Unnamed: 0,suburb_name,SAL_CODE_2021,2015_A Crimes against the person,2015_B Property and deception offences,2015_C Drug offences,2015_D Public order and security offences,2015_E Justice procedures offences,2015_F Other offences,2015_total,2016_A Crimes against the person,...,2023_F Other offences,2023_total,2024_A Crimes against the person,2024_B Property and deception offences,2024_C Drug offences,2024_D Public order and security offences,2024_E Justice procedures offences,2024_F Other offences,2024_total,all_crimes_2015-2024
0,Abbeyard,20001,0,0,0,1,0,1,2,0,...,0,0,0,0,0,0,0,0,0,7
1,Abbotsford,20002,10,18,3,7,4,0,42,13,...,1,49,11,16,4,8,6,1,46,479
2,Aberfeldie,20003,6,11,2,0,1,0,20,8,...,0,22,7,11,0,2,3,0,23,238
3,Aberfeldy,20004,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,2
4,Acheron,20005,1,1,0,0,0,0,2,0,...,0,1,1,0,0,0,1,0,2,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2842,Yundool,22940,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,5
2843,Yuroke,22941,1,7,0,0,2,0,10,1,...,0,7,2,4,0,1,1,0,8,78
2844,Yuulong,22942,0,2,1,0,0,0,3,0,...,0,2,0,0,0,0,0,0,0,9
2845,Zeerust,22943,1,2,0,0,0,0,3,5,...,0,6,0,1,0,0,1,0,2,46


In [326]:
# Load the cleaned historical rent table; its SAL codes define which suburbs
# the crime data is forecast (ARIMA) for
historical_rent_df = pd.read_csv(filepath_or_buffer="../../data/curated/historical_rent_cleaned.csv")
print(historical_rent_df.shape)
historical_rent_df.head()

(567, 51)


Unnamed: 0,SAL_CODE,2000_average_weekly_rent,2000_average_quarterly_count,2001_average_weekly_rent,2001_average_quarterly_count,2002_average_weekly_rent,2002_average_quarterly_count,2003_average_weekly_rent,2003_average_quarterly_count,2004_average_weekly_rent,...,2020_average_weekly_rent,2020_average_quarterly_count,2021_average_weekly_rent,2021_average_quarterly_count,2022_average_weekly_rent,2022_average_quarterly_count,2023_average_weekly_rent,2023_average_quarterly_count,2024_average_weekly_rent,2024_average_quarterly_count
0,20111,137.0,979.5,141.25,857.0,151.25,896.0,161.25,948.25,171.25,...,317.5,1743.75,331.25,1632.25,355.0,1546.5,371.25,1473.25,380.0,1345.0
1,20198,190.0,505.75,207.75,502.5,216.25,529.0,221.25,525.75,226.25,...,466.25,720.0,457.0,643.25,432.5,872.25,495.0,764.75,550.0,729.0
2,21193,200.0,608.25,207.5,765.5,210.0,1059.5,210.0,1334.0,217.5,...,400.0,2389.25,400.0,2591.5,411.25,3047.5,437.5,3557.0,470.0,3777.0
3,21640,320.0,2278.75,320.0,2752.5,320.0,3382.75,305.0,3972.75,300.0,...,483.75,10206.25,366.25,16559.25,426.25,14627.75,587.5,13547.25,640.0,13582.0
4,21938,142.5,443.5,151.25,434.75,161.25,429.5,171.25,473.0,178.75,...,390.75,434.5,407.5,428.5,432.5,365.25,465.0,312.5,475.0,356.0


In [327]:
# Names shared by the ARIMA cells below
df = crime_df  # alias of the crime table, not a copy
# SAL codes of the suburbs under study (those present in the rent data)
sal_codes = [*historical_rent_df["SAL_CODE"].unique()]
# Offence divisions; each one is a column suffix of the form "<year>_<division>"
crime_types = [
    "A Crimes against the person",
    "B Property and deception offences",
    "C Drug offences",
    "D Public order and security offences",
    "E Justice procedures offences",
    "F Other offences",
]
print(len(sal_codes))

567


In [328]:
# Inspect the dtype of every column in df
df.dtypes

suburb_name                                  object
SAL_CODE_2021                                 int64
2015_A Crimes against the person              int64
2015_B Property and deception offences        int64
2015_C Drug offences                          int64
                                              ...  
2024_D Public order and security offences     int64
2024_E Justice procedures offences            int64
2024_F Other offences                         int64
2024_total                                    int64
all_crimes_2015-2024                          int64
Length: 73, dtype: object

In [329]:
# Spot check: the yearly "A Crimes against the person" counts of the suburb at row position 2
df.iloc[2].filter(like='A Crimes against the person', axis=0)

2015_A Crimes against the person    6
2016_A Crimes against the person    8
2017_A Crimes against the person    8
2018_A Crimes against the person    7
2019_A Crimes against the person    5
2020_A Crimes against the person    8
2021_A Crimes against the person    8
2022_A Crimes against the person    5
2023_A Crimes against the person    7
2024_A Crimes against the person    7
Name: 2, dtype: object

In [330]:
# Count the missing values per column; every count should be 0 before the ARIMA is fitted
df.isnull().sum()

suburb_name                                  0
SAL_CODE_2021                                0
2015_A Crimes against the person             0
2015_B Property and deception offences       0
2015_C Drug offences                         0
                                            ..
2024_D Public order and security offences    0
2024_E Justice procedures offences           0
2024_F Other offences                        0
2024_total                                   0
all_crimes_2015-2024                         0
Length: 73, dtype: int64

In [331]:
# ARIMA the crime data
#
# For every studied suburb (SAL code present in `sal_codes`) and every offence
# division in `crime_types`, forecast the yearly counts for FORECAST_YEARS.
# Each suburb yields one record holding its SAL code and the forecast columns
# ("<year>_<division>"). All records are appended to the crime table in a
# single concat, so `df` ends up as: original rows + one forecast row per suburb.
#
# The history is always read from `crime_df` (never from the growing `df`), so
# re-running this cell gives the same result instead of re-forecasting the rows
# appended by a previous run.

FORECAST_YEARS = range(2025, 2029)
ARIMA_ORDER = (1, 1, 1)


def _label_year(label):
    """Return the leading year of a column label such as "2024_C Drug offences"."""
    return int(str(label).split("_", 1)[0])


def _forecast_crime_counts(history, years=FORECAST_YEARS, order=ARIMA_ORDER):
    """Forecast one yearly crime-count series.

    Parameters
    ----------
    history : pd.Series
        Numeric yearly counts in chronological order, indexed by
        "<year>_<division>" column labels, with missing years already dropped.
    years : sequence of int
        The consecutive years to forecast, directly following the history.
    order : tuple
        (p, d, q) order handed to the ARIMA model.

    Returns
    -------
    list of float
        One (unrounded) forecast per entry of `years`.

    Fallbacks when there is too little data for an ARIMA fit:
    * 2 observations - straight line through both points, extrapolated per year;
    * 1 observation  - the single value is held constant;
    * 0 observations - the count is assumed to be 0.
    """
    n_steps = len(years)

    if len(history) > 2:
        fitted = sm.tsa.ARIMA(history, order=order).fit()
        return [float(value) for value in fitted.forecast(steps=n_steps).to_numpy()]

    if len(history) == 2:
        first_year, last_year = (_label_year(label) for label in history.index)
        latest = history.iloc[-1]
        # yearly change between the two observations (they may be several years apart)
        slope = (latest - history.iloc[0]) / (last_year - first_year)
        return [float(latest + slope * (year - last_year)) for year in years]

    if len(history) == 1:
        return [float(history.iloc[0])] * n_steps

    return [0.0] * n_steps


studied_sal_codes = set(sal_codes)  # O(1) membership test inside the loop
forecast_rows = []
iteration = 0

# loop through all crime data
for _, suburb_row in crime_df.iterrows():
    sal = suburb_row["SAL_CODE_2021"]
    # only perform ARIMA on suburbs that we are studying
    if sal not in studied_sal_codes:
        continue

    # code_results is where we store the forecast data of this suburb
    code_results = {"SAL_CODE_2021": sal}
    iteration += 1

    # ARIMA over all crime division data
    for col in crime_types:
        # the suburb's yearly counts for this division, non-numeric/missing years removed
        history = pd.to_numeric(suburb_row.filter(like=col), errors="coerce").dropna()

        for year, value in zip(FORECAST_YEARS, _forecast_crime_counts(history)):
            code_results[f"{year}_{col}"] = round(value, 3)

    forecast_rows.append(code_results)
    print(f"SAL code iteration {iteration} complete")

# append all forecast rows to the crime table at once (avoids a quadratic concat-in-loop)
df = pd.concat([crime_df, pd.DataFrame(forecast_rows)], ignore_index=True)

SAL code iteration 1 complete
SAL code iteration 2 complete
SAL code iteration 3 complete
SAL code iteration 4 complete
SAL code iteration 5 complete
SAL code iteration 6 complete
SAL code iteration 7 complete
SAL code iteration 8 complete
SAL code iteration 9 complete
SAL code iteration 10 complete
SAL code iteration 11 complete
SAL code iteration 12 complete
SAL code iteration 13 complete
SAL code iteration 14 complete
SAL code iteration 15 complete
SAL code iteration 16 complete
SAL code iteration 17 complete
SAL code iteration 18 complete
SAL code iteration 19 complete
SAL code iteration 20 complete
SAL code iteration 21 complete
SAL code iteration 22 complete
SAL code iteration 23 complete
SAL code iteration 24 complete
SAL code iteration 25 complete
SAL code iteration 26 complete
SAL code iteration 27 complete
SAL code iteration 28 complete
SAL code iteration 29 complete
SAL code iteration 30 complete
SAL code iteration 31 complete
SAL code iteration 32 complete
SAL code iteratio

In [332]:
# Inspect df after the ARIMA cell: the forecast rows are appended after the original suburb rows
df

Unnamed: 0,suburb_name,SAL_CODE_2021,2015_A Crimes against the person,2015_B Property and deception offences,2015_C Drug offences,2015_D Public order and security offences,2015_E Justice procedures offences,2015_F Other offences,2015_total,2016_A Crimes against the person,...,2027_D Public order and security offences,2028_D Public order and security offences,2025_E Justice procedures offences,2026_E Justice procedures offences,2027_E Justice procedures offences,2028_E Justice procedures offences,2025_F Other offences,2026_F Other offences,2027_F Other offences,2028_F Other offences
0,Abbeyard,20001,0.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0,...,,,,,,,,,,
1,Abbotsford,20002,10.0,18.0,3.0,7.0,4.0,0.0,42.0,13.0,...,,,,,,,,,,
2,Aberfeldie,20003,6.0,11.0,2.0,0.0,1.0,0.0,20.0,8.0,...,,,,,,,,,,
3,Aberfeldy,20004,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,,,,,,,,,,
4,Acheron,20005,1.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3409,,22916,,,,,,,,,...,1.837,1.830,1.104,1.120,1.123,1.123,-0.000,-0.000,-0.000,-0.000
3410,,22917,,,,,,,,,...,7.084,7.156,5.666,5.465,5.344,5.271,1.159,1.239,1.278,1.298
3411,,22925,,,,,,,,,...,0.000,0.000,1.050,0.000,1.050,0.000,0.245,0.199,0.208,0.206
3412,,22930,,,,,,,,,...,0.399,0.399,0.258,0.288,0.292,0.292,-0.000,-0.000,-0.000,-0.000


In [333]:
# Collapse the original rows and the appended forecast rows into one row per SAL code.
# NaNs are replaced by 0 first, so summing within a SAL code simply merges the
# historical columns of one row with the forecast columns of the other.
# Suburbs/towns irrelevant to the study are filtered out before grouping.
df = (
    df.fillna(0)
    .drop(columns="suburb_name")
    .loc[lambda frame: frame["SAL_CODE_2021"].isin(sal_codes)]
)
grouped_df = df.groupby("SAL_CODE_2021").sum()
grouped_df

Unnamed: 0_level_0,2015_A Crimes against the person,2015_B Property and deception offences,2015_C Drug offences,2015_D Public order and security offences,2015_E Justice procedures offences,2015_F Other offences,2015_total,2016_A Crimes against the person,2016_B Property and deception offences,2016_C Drug offences,...,2027_D Public order and security offences,2028_D Public order and security offences,2025_E Justice procedures offences,2026_E Justice procedures offences,2027_E Justice procedures offences,2028_E Justice procedures offences,2025_F Other offences,2026_F Other offences,2027_F Other offences,2028_F Other offences
SAL_CODE_2021,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20002,10.0,18.0,3.0,7.0,4.0,0.0,42.0,13.0,16.0,3.0,...,8.092,8.029,5.575,5.613,5.610,5.610,0.873,0.878,0.878,0.878
20003,6.0,11.0,2.0,0.0,1.0,0.0,20.0,8.0,11.0,2.0,...,1.921,1.921,2.446,2.320,2.292,2.285,0.103,0.100,0.100,0.100
20011,0.0,4.0,0.0,0.0,0.0,0.0,4.0,2.0,5.0,0.0,...,3.504,3.344,4.977,4.975,4.975,4.975,0.103,0.100,0.100,0.100
20015,11.0,16.0,5.0,8.0,2.0,1.0,43.0,15.0,16.0,5.0,...,5.915,5.851,5.955,7.000,5.955,7.000,0.587,0.599,0.600,0.600
20017,11.0,16.0,5.0,4.0,2.0,0.0,38.0,10.0,16.0,3.0,...,3.715,3.736,4.912,4.954,4.934,4.944,0.546,0.504,0.507,0.507
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22916,3.0,10.0,1.0,2.0,2.0,0.0,18.0,4.0,10.0,1.0,...,1.837,1.830,1.104,1.120,1.123,1.123,0.000,0.000,0.000,0.000
22917,13.0,16.0,3.0,5.0,4.0,2.0,43.0,12.0,15.0,3.0,...,7.084,7.156,5.666,5.465,5.344,5.271,1.159,1.239,1.278,1.298
22925,1.0,7.0,1.0,2.0,2.0,0.0,13.0,3.0,6.0,2.0,...,0.000,0.000,1.050,0.000,1.050,0.000,0.245,0.199,0.208,0.206
22930,0.0,4.0,0.0,0.0,0.0,0.0,4.0,2.0,3.0,0.0,...,0.399,0.399,0.258,0.288,0.292,0.292,0.000,0.000,0.000,0.000


In [334]:
# Write the combined historical + forecast table to disk
output_df = grouped_df  # same object as grouped_df, so the flooring below modifies it too
# counts can't be below 0: set every non-positive value to 0
output_df[output_df.le(0)] = 0
print(output_df.shape)
output_df.to_csv("../../data/curated/crime_2015-2028.csv")

(567, 95)
