In [132]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model, metrics
from sklearn.metrics import accuracy_score

### 1: Predict future crime data based on suburb

In [133]:
crime = pd.read_csv("../data/curated/crime_data.csv")
crime = crime.rename({'Offence Count': 'offence_count'}, axis=1)
crime

Unnamed: 0,Year,SA2_codes,Postcode,offence_count
0,2013,206011106,3057,156
1,2013,206011107,3052,552
2,2013,206011107,3055,229
3,2013,206011109,3044,514
4,2013,206011495,3056,732
...,...,...,...,...
1795,2022,214011370,3201,675
1796,2022,214011371,3199,2994
1797,2022,214011372,3200,437
1798,2022,214011374,3910,344


In [134]:
# convert SA2 codes to string
crime["SA2_codes"] = crime["SA2_codes"].astype(str)

In [135]:
# create linear regression object
reg = linear_model.LinearRegression()

In [136]:
# predict based on year and suburb (I'll only take 1 suburb as an example, but later,
# we need to predict all the suburbs, so we need to run the model x times, where x is the number of suburbs)
df2 = crime.groupby(["Year", "SA2_codes"]).sum().reset_index()
SA2_code = df2["SA2_codes"][0]
df2 = df2.loc[df2["SA2_codes"] == SA2_code]
# remove 2022
df2 = df2.loc[df2["Year"] != 2022]
print(df2)

year = df2[["Year"]]
count = df2[["offence_count"]]

# modelling
reg.fit(year, count)
future_pred = reg.predict([[2023], [2024], [2025]])
print(future_pred)

      Year  SA2_codes  Postcode  offence_count
0     2013  206011106      3057            156
169   2014  206011106      3057            181
338   2015  206011106      3057            232
507   2016  206011106      3057            181
676   2017  206011106      3057            186
845   2018  206011106      3057            232
1014  2019  206011106      3057            197
1183  2020  206011106      3057            275
1352  2021  206011106      3057            279
[[288.72222222]
 [301.30555556]
 [313.88888889]]




In [137]:
# let's implement this to all suburbs
df2 = crime.groupby(["Year", "SA2_codes"]).sum().reset_index()

# future data (2023 only, need to add 2024 and 2025)
SA2_total = len(df2.loc[df2["Year"] == 2013]["Year"].tolist())
SA2_codes = df2.loc[df2["Year"] == 2013]["SA2_codes"]
future_df = pd.DataFrame({"Year": [2023] * SA2_total, "SA2_codes": SA2_codes})
# now for 2024 and 2025
future_df_2024 = pd.DataFrame({"Year": [2024] * SA2_total, "SA2_codes": SA2_codes})
future_df_2025 = pd.DataFrame({"Year": [2025] * SA2_total, "SA2_codes": SA2_codes})
future_df = pd.concat([future_df, future_df_2024, future_df_2025], ignore_index=True)
future_df["pred_crime_count"] = np.nan
future_df

Unnamed: 0,Year,SA2_codes,pred_crime_count
0,2023,206011106,
1,2023,206011107,
2,2023,206011109,
3,2023,206011495,
4,2023,206021110,
...,...,...,...
502,2025,214011370,
503,2025,214011371,
504,2025,214011372,
505,2025,214011374,


In [138]:
# modelling
for i in range(SA2_total):
    df3 = df2.copy()
    SA2_code = df2.at[i, "SA2_codes"]
    df3 = df3.loc[df3["SA2_codes"] == SA2_code]
    df3 = df3.loc[df3["Year"] != 2022]

    year = df3[["Year"]]
    count = df3[["offence_count"]]

    reg.fit(year, count)
    future_pred = reg.predict([[2023], [2024], [2025]])

    future_df.at[i, "pred_crime_count"] = future_pred[0]                # 2023
    future_df.at[SA2_total+i, "pred_crime_count"] = future_pred[1]      # 2024
    future_df.at[(2*SA2_total)+i, "pred_crime_count"] = future_pred[2]  # 2025
future_df



Unnamed: 0,Year,SA2_codes,pred_crime_count
0,2023,206011106,288.722222
1,2023,206011107,1214.622222
2,2023,206011109,680.855556
3,2023,206011495,1160.411111
4,2023,206021110,497.811111
...,...,...,...
502,2025,214011370,1143.866667
503,2025,214011371,3341.711111
504,2025,214011372,594.844444
505,2025,214011374,438.955556


### 2: Income

In [139]:
income_2021 = pd.read_csv("../data/raw/income_and_pop.csv")
income_2016 = pd.read_csv("../data/raw/income_and_pop_2016.csv")
income_2011 = pd.read_csv("../data/raw/income_and_pop_2011.csv")

income_2011

Unnamed: 0,region_id,Median_Tot_prsnl_inc_weekly,Median_rent_weekly,Median_Tot_fam_inc_weekly,Median_Tot_hhd_inc_weekly,total_population
0,201011001,609,250,1618,1405,6690
1,201011002,575,230,1573,1120,10792
2,201011003,534,210,1333,1011,19036
3,201011004,464,200,1062,857,20180
4,201011005,531,250,1623,1409,5537
...,...,...,...,...,...,...
395,217021429,541,120,1273,1050,5580
396,217021430,521,180,1272,1028,7577
397,217021431,475,195,1039,841,3075
398,217021432,536,220,1289,1047,16456


In [140]:
# the SA2 codes seem to be inconsistent between 2021, 2016, and 2011...
print(len(income_2021))
print(len(income_2016))
print(len(income_2011))
# but there are no duplicates, which means that some suburbs are missing in 2016, and even more are missing in 2011
print(income_2021[income_2021.duplicated(subset=["SA2_CODE_2021"], keep=False)])
print(income_2016[income_2016.duplicated(subset=["SA2_MAINCODE_2016"], keep=False)])
print(income_2011[income_2011.duplicated(subset=["region_id"], keep=False)])

471
430
400
Empty DataFrame
Columns: [SA2_CODE_2021, Median_tot_prsnl_inc_weekly, Median_rent_weekly, Median_tot_fam_inc_weekly, Median_tot_hhd_inc_weekly, total_population, geometry, SA2_NAME21]
Index: []
Empty DataFrame
Columns: [SA2_MAINCODE_2016, Median_tot_prsnl_inc_weekly, Median_rent_weekly, Median_tot_fam_inc_weekly, Median_tot_hhd_inc_weekly, total_population]
Index: []
Empty DataFrame
Columns: [region_id, Median_Tot_prsnl_inc_weekly, Median_rent_weekly, Median_Tot_fam_inc_weekly, Median_Tot_hhd_inc_weekly, total_population]
Index: []


In [141]:
# some cleaning
income_2021 = income_2021.rename(columns={"SA2_CODE_2021": "SA2_code"})[["SA2_code", "Median_tot_prsnl_inc_weekly", "total_population"]]
income_2016 = income_2016.rename(columns={"SA2_MAINCODE_2016": "SA2_code"})[["SA2_code", "Median_tot_prsnl_inc_weekly", "total_population"]]
income_2011 = income_2011.rename(columns={"region_id": "SA2_code"})[["SA2_code", "Median_Tot_prsnl_inc_weekly", "total_population"]]

income_2021 = income_2021.rename(columns={"Median_tot_prsnl_inc_weekly": "income_2021", "total_population": "pop_2021"})
income_2016 = income_2016.rename(columns={"Median_tot_prsnl_inc_weekly": "income_2016", "total_population": "pop_2016"})
income_2011 = income_2011.rename(columns={"Median_Tot_prsnl_inc_weekly": "income_2011", "total_population": "pop_2011"})

In [142]:
# do a left join
_temp = income_2021.merge(income_2016, how="left", on="SA2_code")
final = _temp.merge(income_2011, how="left", on="SA2_code")
final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 471 entries, 0 to 470
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   SA2_code     471 non-null    int64  
 1   income_2021  471 non-null    int64  
 2   pop_2021     471 non-null    int64  
 3   income_2016  363 non-null    float64
 4   pop_2016     363 non-null    float64
 5   income_2011  310 non-null    float64
 6   pop_2011     310 non-null    float64
dtypes: float64(4), int64(3)
memory usage: 29.4 KB


In [143]:
# looks like we can only predict 310 instances with all 2011, 2016, and 2021, 
# and only another 53 instances with 2016 and 2021 only,
# with the remaining instances being useless.
# I personally think this is not a very good forecast given that we literally only have 3 data values max,
# so this result might not be the most accurate. But anyway, we'll see.
# 
# alright, let's do the forecasting


In [144]:
# convert SA2 code to str
final["SA2_code"] = final["SA2_code"].astype(str)

# all 2011, 2016, and 2021 exist
df1 = final.dropna(subset=["income_2011"]).reset_index().drop(["index"], axis=1)
# only 2016 and 2021 exist
df2 = final.dropna(subset=["income_2016"])
df2 = pd.concat([df1,df2]).drop_duplicates(keep=False).reset_index().drop(["index", "income_2011", "pop_2011"], axis=1)

df1.head()

Unnamed: 0,SA2_code,income_2021,pop_2021,income_2016,pop_2016,income_2011,pop_2011
0,201011001,865,13320,702.0,9485.0,609.0,6690.0
1,201011002,842,11092,670.0,10922.0,575.0,10792.0
2,201011005,805,6041,638.0,6125.0,531.0,5537.0
3,201011006,775,8898,595.0,6085.0,515.0,4781.0
4,201011007,802,3457,646.0,3276.0,567.0,3065.0


In [145]:
# predict income
year_list = []
SA2_code_list = []
pred_income = []
pred_pop = []

for i in range(df1.shape[0]):
    year = [[2011], [2016], [2021]]
    income = [[df1.loc[i, "income_2011"]], [df1.loc[i, "income_2016"]], [df1.loc[i, "income_2021"]]]
    pop = [[df1.loc[i, "pop_2011"]], [df1.loc[i, "pop_2016"]], [df1.loc[i, "pop_2021"]]]

    reg.fit(year, income)
    future_pred_i = reg.predict([[2023], [2024], [2025]])
    reg.fit(year, pop)
    future_pred_p = reg.predict([[2023], [2024], [2025]])

    # append to list (to which evantually going to be joined into a dataframe)
    year_list.extend([2023, 2024, 2025])
    SA2_code_list.extend([df1.loc[i, "SA2_code"], df1.loc[i, "SA2_code"], df1.loc[i, "SA2_code"]])
    # a bit tricky here, future_pred is 2-dimensional which needs to be flattened to a 1d list of length 3
    future_pred_i = list(np.concatenate(future_pred_i).flat)
    future_pred_p = list(np.concatenate(future_pred_p).flat)
    pred_income.extend(future_pred_i)
    pred_pop.extend(future_pred_p)

In [146]:
# again, but for 2016 and 2021 only
for i in range(df2.shape[0]):
    year = [[2016], [2021]]
    income = [[df2.loc[i, "income_2016"]], [df2.loc[i, "income_2021"]]]
    pop = [[df2.loc[i, "pop_2016"]], [df2.loc[i, "pop_2021"]]]

    reg.fit(year, income)
    future_pred_i = reg.predict([[2023], [2024], [2025]])
    reg.fit(year, pop)
    future_pred_p = reg.predict([[2023], [2024], [2025]])

    # append to list (to which evantually going to be joined into a dataframe)
    year_list.extend([2023, 2024, 2025])
    SA2_code_list.extend([df2.loc[i, "SA2_code"], df2.loc[i, "SA2_code"], df2.loc[i, "SA2_code"]])
    # a bit tricky here, future_pred is 2-dimensional which needs to be flattened to a 1d list of length 3
    future_pred_i = list(np.concatenate(future_pred_i).flat)
    future_pred_p = list(np.concatenate(future_pred_p).flat)
    pred_income.extend(future_pred_i)
    pred_pop.extend(future_pred_p)

future_income_pop = pd.DataFrame({"Year": year_list, "SA2_codes": SA2_code_list, "pred_income": pred_income, "pred_pop": pred_pop})
future_income_pop

Unnamed: 0,Year,SA2_codes,pred_income,pred_pop
0,2023,201011001,904.533333,14472.666667
1,2024,201011001,930.133333,15135.666667
2,2025,201011001,955.733333,15798.666667
3,2023,201011002,882.566667,11145.333333
4,2024,201011002,909.266667,11175.333333
...,...,...,...,...
1084,2024,217041479,841.000000,19965.800000
1085,2025,217041479,868.000000,20154.400000
1086,2023,217041480,821.400000,12169.800000
1087,2024,217041480,851.600000,12179.200000


In [150]:
# join this result with the crime forecast result
fdf = future_df.merge(future_income_pop, how="outer", on=["Year", "SA2_codes"]) \
    .sort_values(by=["Year", "SA2_codes"]) \
    .reset_index().drop(["index"], axis=1)
fdf

Unnamed: 0,index,Year,SA2_codes,pred_crime_count,pred_income,pred_pop
0,507,2023,201011001,,904.533333,14472.666667
1,510,2023,201011002,,882.566667,11145.333333
2,513,2023,201011005,,849.800000,6253.800000
3,516,2023,201011006,,810.333333,9469.900000
4,519,2023,201011007,,836.166667,3540.400000
...,...,...,...,...,...,...
1210,1202,2025,217031476,,782.600000,4358.200000
1211,1205,2025,217041477,,893.800000,6439.400000
1212,1208,2025,217041478,,870.800000,9291.000000
1213,1211,2025,217041479,,868.000000,20154.400000


In [149]:
# might need to round some values (like population) to the nearest integer, because 
# right now some values don't really make sense if they are not integers
fdf.to_csv("../data/curated/forecast_data.csv")