In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model, metrics
from sklearn.metrics import accuracy_score

### 1: Predict future crime data based on suburb

In [11]:
crime = pd.read_csv("../data/curated/crime_data.csv")
crime = crime.rename({'Offence Count': 'offence_count'}, axis=1)
crime

Unnamed: 0,Year,SA2_codes,Postcode,offence_count
0,2013,206011106,3057,156
1,2013,206011107,3052,552
2,2013,206011107,3055,229
3,2013,206011109,3044,514
4,2013,206011495,3056,732
...,...,...,...,...
1795,2022,214011370,3201,675
1796,2022,214011371,3199,2994
1797,2022,214011372,3200,437
1798,2022,214011374,3910,344


In [12]:
# convert SA2 codes to string
crime["SA2_codes"] = crime["SA2_codes"].astype(str)

In [13]:
# create linear regression object
reg = linear_model.LinearRegression()

In [14]:
# predict based on year and suburb (I'll only take 1 suburb as an example, but later,
# we need to predict all the suburbs, so we need to run the model x times, where x is the number of suburbs)
df2 = crime.groupby(["Year", "SA2_codes"]).sum().reset_index()
SA2_code = df2["SA2_codes"][0]
df2 = df2.loc[df2["SA2_codes"] == SA2_code]
# remove 2022
df2 = df2.loc[df2["Year"] != 2022]
print(df2)

year = df2[["Year"]]
count = df2[["offence_count"]]

# modelling
reg.fit(year, count)
future_pred = reg.predict([[2023], [2024], [2025]])
print(future_pred)

      Year  SA2_codes  Postcode  offence_count
0     2013  206011106      3057            156
169   2014  206011106      3057            181
338   2015  206011106      3057            232
507   2016  206011106      3057            181
676   2017  206011106      3057            186
845   2018  206011106      3057            232
1014  2019  206011106      3057            197
1183  2020  206011106      3057            275
1352  2021  206011106      3057            279
[[288.72222222]
 [301.30555556]
 [313.88888889]]




In [15]:
# let's implement this to all suburbs
df2 = crime.groupby(["Year", "SA2_codes"]).sum().reset_index()

# future data (2023 only, need to add 2024 and 2025)
SA2_total = len(df2.loc[df2["Year"] == 2013]["Year"].tolist())
SA2_codes = df2.loc[df2["Year"] == 2013]["SA2_codes"]
future_df = pd.DataFrame({"Year": [2023] * SA2_total, "SA2_codes": SA2_codes})
# now for 2024 and 2025
future_df_2024 = pd.DataFrame({"Year": [2024] * SA2_total, "SA2_codes": SA2_codes})
future_df_2025 = pd.DataFrame({"Year": [2025] * SA2_total, "SA2_codes": SA2_codes})
future_df = pd.concat([future_df, future_df_2024, future_df_2025], ignore_index=True)
future_df["pred_crime_count"] = np.nan
future_df

Unnamed: 0,Year,SA2_codes,pred_crime_count
0,2023,206011106,
1,2023,206011107,
2,2023,206011109,
3,2023,206011495,
4,2023,206021110,
...,...,...,...
502,2025,214011370,
503,2025,214011371,
504,2025,214011372,
505,2025,214011374,


In [16]:
# modelling
for i in range(SA2_total):
    df3 = df2.copy()
    SA2_code = df2.at[i, "SA2_codes"]
    df3 = df3.loc[df3["SA2_codes"] == SA2_code]
    df3 = df3.loc[df3["Year"] != 2022]

    year = df3[["Year"]]
    count = df3[["offence_count"]]

    reg.fit(year, count)
    future_pred = reg.predict([[2023], [2024], [2025]])

    future_df.at[i, "pred_crime_count"] = future_pred[0]                # 2023
    future_df.at[SA2_total+i, "pred_crime_count"] = future_pred[1]      # 2024
    future_df.at[(2*SA2_total)+i, "pred_crime_count"] = future_pred[2]  # 2025
future_df



Unnamed: 0,Year,SA2_codes,pred_crime_count
0,2023,206011106,288.722222
1,2023,206011107,1214.622222
2,2023,206011109,680.855556
3,2023,206011495,1160.411111
4,2023,206021110,497.811111
...,...,...,...
502,2025,214011370,1143.866667
503,2025,214011371,3341.711111
504,2025,214011372,594.844444
505,2025,214011374,438.955556


### 2: Income

In [45]:
income_2021 = pd.read_csv("../data/raw/income_and_pop.csv")
income_2016 = pd.read_csv("../data/raw/income_and_pop_2016.csv")
income_2011 = pd.read_csv("../data/raw/income_and_pop_2011.csv")

income_2011

Unnamed: 0,region_id,Median_Tot_prsnl_inc_weekly,Median_rent_weekly,Median_Tot_fam_inc_weekly,Median_Tot_hhd_inc_weekly,total_population
0,201011001,609,250,1618,1405,6690
1,201011002,575,230,1573,1120,10792
2,201011003,534,210,1333,1011,19036
3,201011004,464,200,1062,857,20180
4,201011005,531,250,1623,1409,5537
...,...,...,...,...,...,...
395,217021429,541,120,1273,1050,5580
396,217021430,521,180,1272,1028,7577
397,217021431,475,195,1039,841,3075
398,217021432,536,220,1289,1047,16456


In [40]:
# the SA2 codes seem to be inconsistent between 2021, 2016, and 2011...
print(len(income_2021))
print(len(income_2016))
print(len(income_2011))
# but there are no duplicates, which means that some suburbs are missing in 2016, and even more are missing in 2011
print(income_2021[income_2021.duplicated(subset=["SA2_CODE_2021"], keep=False)])
print(income_2016[income_2016.duplicated(subset=["SA2_MAINCODE_2016"], keep=False)])
print(income_2011[income_2011.duplicated(subset=["region_id"], keep=False)])

471
430
400
Empty DataFrame
Columns: [SA2_CODE_2021, Median_tot_prsnl_inc_weekly, Median_rent_weekly, Median_tot_fam_inc_weekly, Median_tot_hhd_inc_weekly, total_population, geometry, SA2_NAME21]
Index: []
Empty DataFrame
Columns: [SA2_MAINCODE_2016, Median_tot_prsnl_inc_weekly, Median_rent_weekly, Median_tot_fam_inc_weekly, Median_tot_hhd_inc_weekly, total_population]
Index: []
Empty DataFrame
Columns: [region_id, Median_Tot_prsnl_inc_weekly, Median_rent_weekly, Median_Tot_fam_inc_weekly, Median_Tot_hhd_inc_weekly, total_population]
Index: []


In [50]:
# some cleaning
income_2021 = income_2021.rename(columns={"SA2_CODE_2021": "SA2_code"})[["SA2_code", "Median_tot_prsnl_inc_weekly", "total_population"]]
income_2016 = income_2016.rename(columns={"SA2_MAINCODE_2016": "SA2_code"})[["SA2_code", "Median_tot_prsnl_inc_weekly", "total_population"]]
income_2011 = income_2011.rename(columns={"region_id": "SA2_code"})[["SA2_code", "Median_Tot_prsnl_inc_weekly", "total_population"]]

income_2021 = income_2021.rename(columns={"Median_tot_prsnl_inc_weekly": "income_2021", "total_population": "pop_2021"})
income_2016 = income_2016.rename(columns={"Median_tot_prsnl_inc_weekly": "income_2016", "total_population": "pop_2016"})
income_2011 = income_2011.rename(columns={"Median_Tot_prsnl_inc_weekly": "income_2011", "total_population": "pop_2011"})

In [55]:
# do a left join
_temp = income_2021.merge(income_2016, how="left", on="SA2_code")
final = _temp.merge(income_2011, how="left", on="SA2_code")
final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 471 entries, 0 to 470
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   SA2_code     471 non-null    int64  
 1   income_2021  471 non-null    int64  
 2   pop_2021     471 non-null    int64  
 3   income_2016  363 non-null    float64
 4   pop_2016     363 non-null    float64
 5   income_2011  310 non-null    float64
 6   pop_2011     310 non-null    float64
dtypes: float64(4), int64(3)
memory usage: 29.4 KB


In [None]:
# looks like we can only predict 310 instances with all 2011, 2016, and 2021, 
# and only another 53 instances with 2016 and 2021 only,
# with the remaining instances being useless.
# 
# alright, let's do the forecasting
# TODO: modelling
 