Loads several dataframes (crime, schools, distances, land cover and PTV) and merges them into a single dataset.

In [25]:
import pandas as pd
import geopandas as gpd
import sys
import os

sys.path.append(os.path.abspath('../../scripts/2. modules'))
import processing

# Input CSVs read from the raw layer
CRIME_PATH = '../../data/2. raw/crime.csv'
DISTANCES_PATH = '../../data/2. raw/distances.csv'
LAND_COVER_PATH = '../../data/2. raw/land_cover.csv'
PTV_PATH = '../../data/2. raw/ptv.csv'

# Input CSV read from the curated layer
SCHOOLS_PATH = '../../data/3. curated/school/school_by_region.csv'

# Destination directory and file name for the merged dataset
OUTPUT_PATH = '../../data/3. curated/merged'
OUTPUT_FILENAME = 'crime_schools_distances_land.csv'

Load each dataframe from the raw or curated layer and prepare it for joining by dropping the columns that are not needed.

In [26]:
# Regions table from the project's `processing` module.
# NOTE(review): the meaning of the argument 2 is defined in
# processing.get_regions_df and is not visible here — confirm. regions_df is
# not referenced again in the visible part of this notebook.
regions_df = processing.get_regions_df(2)

# Raw crime records, loaded as-is from CRIME_PATH
crime_df = pd.read_csv(CRIME_PATH)

def generate_quarters(year, quarter):
    """Return the four (year, quarter) pairs ending at the given quarter.

    For a quarter in 1..4 the result lists four consecutive quarters, oldest
    first, the last one being ``(year, quarter)`` itself. Quarters that fall
    before quarter 1 of ``year`` are assigned to ``year - 1``.
    """
    window = []
    for steps_back in (3, 2, 1, 0):
        # Step back within the 1..4 cycle
        q = (quarter - steps_back - 1) % 4 + 1
        # A quarter number above the end quarter means we wrapped into the
        # previous year
        y = year - 1 if q > quarter else year
        window.append((y, q))
    return window

# Spread every crime record over the four quarters ending at quarter 2 of its
# 'year' value, so it can be joined on (suburbs, year, quarter).
# NOTE(review): quarter 2 presumably reflects a year-ending-June reporting
# period (cf. the dropped 'year_ending' column) — confirm.
end_quarter = 2

# With year 0, generate_quarters returns (year_offset, quarter) pairs relative
# to the record's own year, oldest first
quarter_window = generate_quarters(0, end_quarter)
year_offsets = [offset for offset, _ in quarter_window]
quarter_labels = [label for _, label in quarter_window]

# Repeat each row once per quarter instead of looping with iterrows(): this is
# vectorised, keeps the column dtypes, and also works for an empty crime_df.
# Rows stay grouped per original record and keep their original index label.
expanded_crime_df = crime_df.loc[crime_df.index.repeat(len(quarter_window))].copy()
expanded_crime_df['year'] = (
    expanded_crime_df['year'] + year_offsets * len(crime_df)
).to_numpy()
expanded_crime_df['quarter'] = quarter_labels * len(crime_df)

# Drop the columns that are not needed for the join
expanded_crime_df = expanded_crime_df.drop(columns=['Unnamed: 0', 'year_ending'])

# Schools-by-region and PTV tables are used exactly as loaded
schools_by_region = pd.read_csv(SCHOOLS_PATH)
ptv_df = pd.read_csv(PTV_PATH)

# Distances: load, then discard the columns that are not carried into the join
distances = pd.read_csv(DISTANCES_PATH)
distances = distances.drop(
    columns=['Unnamed: 0', 'geometry', 'regions', 'code', 'centroid', 'route_to_cbd']
)

# Land cover: load, then discard the unnamed leading column
land_cover = pd.read_csv(LAND_COVER_PATH)
land_cover = land_cover.drop(columns=['Unnamed: 0'])

Now left-join every dataset onto a base table with one row per suburb, year and quarter.

In [27]:
# Base table: one row per (suburb, year, quarter) for every suburb present in
# the land cover data, years 2000-2027 and quarters 1-4
suburbs = land_cover['suburbs'].unique()
df_data = [
    [suburb, year, quarter]
    for suburb in suburbs
    for year in range(2000, 2028)
    for quarter in range(1, 5)
]
df_left_join = pd.DataFrame(df_data, columns=['suburbs', 'year', 'quarter'])

# Left-join each dataset in turn so every base row is kept; time-varying
# tables join on suburb/year/quarter, the others on suburb only
merged = (
    df_left_join
    .merge(expanded_crime_df, on=['suburbs', 'year', 'quarter'], how='left')
    .merge(schools_by_region, on=['suburbs'], how='left')
    .merge(distances, on=['suburbs'], how='left')
    .merge(ptv_df, on=['suburbs'], how='left')
    .merge(land_cover, on=['suburbs', 'year', 'quarter'], how='left')
)

# Write the merged table out through the project's CSV helper
processing.to_csv(merged, OUTPUT_PATH, OUTPUT_FILENAME)