In [24]:
import sys
import os
import pandas as pd
import geopandas as gpd

sys.path.append(os.path.abspath('../../scripts/2. modules'))
import processing

# Inputs
# All paths are relative (presumably to this notebook's directory — confirm).
# Crime records CSV, loaded with pd.read_csv.
CRIME_FILE_PATH = '../../data/1. landing/crime.csv'
# Postal-area boundary shapefile, loaded with gpd.read_file.
POSTAL_FILE_PATH = '../../data/1. landing/shapefile/postal/POA_2021_AUST_GDA2020.shp'

# Outputs
# Directory and file name handed to processing.to_csv for the final table.
CRIME_FILE_PATH_OUT = '../../data/2. raw'
CRIME_FILE_NAME = 'crime.csv'

In [18]:
# Load the three inputs: the region geometries from the project helper, the
# crime records (CSV) and the postal-area boundaries (shapefile).
# NOTE(review): the meaning of the argument 2 is defined by
# processing.get_regions_df and is not visible here — confirm.
regions_df = processing.get_regions_df(2)
crime_df = pd.read_csv(CRIME_FILE_PATH)
postal_df = gpd.read_file(POSTAL_FILE_PATH)

# Fix the column names for crime and postal. The return value is not used, so
# fix_col_names presumably renames the columns in place — TODO confirm.
processing.fix_col_names(crime_df)
processing.fix_col_names(postal_df)

# Parse the postal-area name into a numeric postcode with one vectorised call
# (instead of calling pd.to_numeric once per element through Series.apply).
# Names that are not numeric become NaN because of errors='coerce'.
postal_df['postcode'] = pd.to_numeric(postal_df['poa_name21'], errors='coerce')

# Slim frame holding only the join key and the boundary geometry.
postal_df_filtered = gpd.GeoDataFrame(
    postal_df[['postcode', 'geometry']],
    geometry='geometry'
)

Join the postcode boundaries to the crime records

In [19]:
# Attach a postcode boundary to every crime record (inner join on 'postcode').
# Only the slim postal_df_filtered frame (postcode + geometry) is merged, so
# the remaining postal attribute columns are not copied onto every crime row.
joined = pd.merge(crime_df, postal_df_filtered, on='postcode')

# Collapse to one row per postcode that appears in the crime data, keeping the
# first geometry seen for that postcode.
joined_gb = joined.groupby(['postcode']).agg({
    'geometry': 'first'
}).reset_index()

# Re-wrap as a GeoDataFrame so it can take part in a spatial join.
joined_gb = gpd.GeoDataFrame(
    joined_gb,
    geometry='geometry'
)

# Keep a second copy of the postcode polygon under a regular column name; the
# overlap calculation after the spatial join reads it as 'geometry_right'.
joined_gb['geometry_right'] = joined_gb['geometry']
joined_gb

Unnamed: 0,postcode,geometry,geometry_right
0,3000,"POLYGON ((144.96139 -37.8205, 144.96063 -37.82...","POLYGON ((144.96139 -37.8205, 144.96063 -37.82..."
1,3002,"POLYGON ((144.98978 -37.81906, 144.98973 -37.8...","POLYGON ((144.98978 -37.81906, 144.98973 -37.8..."
2,3003,"POLYGON ((144.95268 -37.8128, 144.95191 -37.81...","POLYGON ((144.95268 -37.8128, 144.95191 -37.81..."
3,3004,"POLYGON ((144.98001 -37.84422, 144.98031 -37.8...","POLYGON ((144.98001 -37.84422, 144.98031 -37.8..."
4,3006,"POLYGON ((144.96139 -37.8205, 144.96303 -37.82...","POLYGON ((144.96139 -37.8205, 144.96303 -37.82..."
...,...,...,...
688,3990,"POLYGON ((145.52915 -38.42196, 145.52883 -38.4...","POLYGON ((145.52915 -38.42196, 145.52883 -38.4..."
689,3991,"POLYGON ((145.51584 -38.46758, 145.51573 -38.4...","POLYGON ((145.51584 -38.46758, 145.51573 -38.4..."
690,3992,"POLYGON ((145.55759 -38.47959, 145.56055 -38.4...","POLYGON ((145.55759 -38.47959, 145.56055 -38.4..."
691,3995,"MULTIPOLYGON (((145.57364 -38.57838, 145.57356...","MULTIPOLYGON (((145.57364 -38.57838, 145.57356..."


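Now perform area-weighted aggregation: each postcode's crime counts are apportioned to the suburbs it overlaps, in proportion to the overlapping area.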

In [20]:
def _overlap_area(row):
    """Return the area shared by a region and its matched postcode polygon.

    Rows without a matched postcode (missing 'geometry_right') contribute 0.
    """
    postcode_geom = row['geometry_right']
    if pd.isna(postcode_geom):
        return 0
    return row.geometry.intersection(postcode_geom).area


# Work on a copy of the regions; no CRS is assigned here.
regions_df_w_crs = regions_df.copy()

# Pair every region with each postcode polygon it intersects. The left join
# keeps regions that intersect no postcode at all.
intersecting_zones_df = gpd.sjoin(
    regions_df_w_crs,
    joined_gb,
    how='left',
    predicate='intersects',
)

# Overlapping area of each (region, postcode) pair, with any missing value
# replaced by 0.
pair_areas = intersecting_zones_df.apply(_overlap_area, axis=1)
intersecting_zones_df['overlap_area'] = pair_areas
intersecting_zones_df['overlap_area'] = intersecting_zones_df['overlap_area'].fillna(0)
intersecting_zones_df = intersecting_zones_df.reset_index()

# Total overlap for each suburb. Despite the column name, the grouping key is
# 'suburbs', not the postcode.
suburb_totals = intersecting_zones_df.groupby('suburbs')['overlap_area'].transform('sum')
intersecting_zones_df['total_overlap_area_per_postcode'] = suburb_totals

# Share of the suburb's total overlap that comes from this postcode.
intersecting_zones_df['normalized_overlap'] = (
    intersecting_zones_df['overlap_area']
    / intersecting_zones_df['total_overlap_area_per_postcode']
)

# Bring the crime records back in, one copy per (region, postcode) pair.
joined_all = pd.merge(intersecting_zones_df, joined, on='postcode')


Now one-hot encode the offence subdivision column

In [23]:
group_keys = ['suburbs', 'year', 'year_ending']

# Weight each offence count by the postcode's share of the suburb overlap.
# The weighting is done on a copy of the needed columns, never on joined_all
# itself, so re-running this cell cannot apply the weights a second time.
weighted_df = joined_all[
    group_keys + ['offence_subdivision', 'offence_count', 'normalized_overlap']
].copy()
weighted_df['normalized_overlap'] = pd.to_numeric(weighted_df['normalized_overlap'], errors='coerce')
weighted_df['offence_count'] = (
    pd.to_numeric(weighted_df['offence_count'], errors='coerce')
    * weighted_df['normalized_overlap']
)

# Offence subdivisions in order of first appearance. Missing labels are
# dropped: get_dummies creates no indicator column for them, and a NaN would
# break the string concatenation in the loop below.
cols = list(weighted_df['offence_subdivision'].dropna().unique())
df_encoded = pd.get_dummies(weighted_df, columns=['offence_subdivision'], drop_first=False)

# One column per subdivision: the weighted count on rows of that subdivision,
# zero on all other rows.
for col in cols:
    df_encoded[col] = df_encoded['offence_subdivision_' + col] * df_encoded['offence_count']

# Sum only the per-subdivision columns within each suburb / year group.
# (Selecting [cols] explicitly replaces the earlier .sum(cols), which passed
# the list positionally as the numeric_only flag.)
df_encoded = df_encoded.groupby(group_keys)[cols].sum().reset_index()

# save just the required columns, plus a row total across all subdivisions
final_df = df_encoded[group_keys + cols].copy()
final_df['total_crimes'] = final_df[cols].sum(axis=1)
# Return value unused; presumably renames final_df's columns in place — TODO confirm.
processing.fix_col_names(final_df)

processing.to_csv(final_df, CRIME_FILE_PATH_OUT, CRIME_FILE_NAME)