# Preprocessing Income and Population Data
- This notebook preprocesses the income and population data.
- The preprocessed datasets are exported to the `../data/raw` directory (path relative to this notebook).

Import relevant libraries.

In [1]:
# Put the parent directory on the import path so `scripts.utils` can be imported below.
import sys, os
sys.path.append(os.path.abspath('../'))
from scripts.utils import create_dir, get_runtime
# Capture the start time now; the final cell passes it to `get_runtime`.
import time 
start_time = time.time()

import pandas as pd
import numpy as np
import geopandas as gpd

## Income

### Load 2012-2016 and 2017-2021 Income datasets

In [2]:
# Both income workbooks are read the same way: sheet "Table 1.4", with the
# row at zero-based index 6 used as the column header.
income_read_opts = dict(sheet_name="Table 1.4", header=6)

income_1_df_unclean = pd.read_excel(
    "../data/landing/income/income-2012-2016.xls", **income_read_opts
)
income_2_df_unclean = pd.read_excel(
    "../data/landing/income/income-2017-2021.xlsx", **income_read_opts
)

### Clean 2012-2016 Income

In [None]:
# Source column -> output column. Each financial-year column is relabelled with
# its end year (e.g. '2011-12.3' -> '2012'). The '.3' suffix is the one pandas
# appends to the fourth occurrence of a repeated header label; presumably this
# is the median-income block (it is melted into `median_income` later) — TODO confirm.
income_1_columns = {
    'SA2': 'sa2_code',
    'SA2 NAME': 'sa2_name',
    '2011-12.3': '2012',
    '2012-13.3': '2013',
    '2013-14.3': '2014',
    '2014-15.3': '2015',
    '2015-16.3': '2016',
}
income_1_years = ['2012', '2013', '2014', '2015', '2016']

# Select and rename by name (rather than by position) and make the SA2 code a string.
income_1_df = (
    income_1_df_unclean[list(income_1_columns)]
    .rename(columns=income_1_columns)
    .assign(sa2_code=lambda df: df['sa2_code'].astype(str))
)

# Filter for Victoria SA2s (9-digit codes starting with '2')
income_1_df = income_1_df.loc[
    (income_1_df['sa2_code'].str.len() == 9)
    & income_1_df['sa2_code'].str.startswith('2')
].copy()

# Convert the income columns to numbers explicitly. Non-numeric placeholders
# such as 'np' become NaN here, so the result no longer depends on the implicit
# object-downcasting of `DataFrame.replace` (deprecated in recent pandas), and
# the SA2 name column is left untouched.
income_1_df[income_1_years] = income_1_df[income_1_years].apply(
    pd.to_numeric, errors='coerce'
)

# Drop rows with any missing value (including the former 'np' placeholders).
income_1_df = income_1_df.dropna()
income_1_df

### Clean 2017-2021 Income

In [None]:
# Source column -> output column. Each financial-year column is relabelled with
# its end year (e.g. '2016-17.3' -> '2017'). The '.3' suffix is the one pandas
# appends to the fourth occurrence of a repeated header label; presumably this
# is the median-income block (it is melted into `median_income` later) — TODO confirm.
income_2_columns = {
    'SA2': 'sa2_code',
    'SA2 NAME': 'sa2_name',
    '2016-17.3': '2017',
    '2017-18.3': '2018',
    '2018-19.3': '2019',
    '2019-20.3': '2020',
    '2020-21.3': '2021',
}
income_2_years = ['2017', '2018', '2019', '2020', '2021']

# Select and rename by name (rather than by position) and make the SA2 code a string.
income_2_df = (
    income_2_df_unclean[list(income_2_columns)]
    .rename(columns=income_2_columns)
    .assign(sa2_code=lambda df: df['sa2_code'].astype(str))
)

# Filter for Victoria SA2s (9-digit codes starting with '2')
income_2_df = income_2_df.loc[
    (income_2_df['sa2_code'].str.len() == 9)
    & income_2_df['sa2_code'].str.startswith('2')
].copy()

# Convert the income columns to numbers explicitly. Non-numeric placeholders
# such as 'np' become NaN here, so the result no longer depends on the implicit
# object-downcasting of `DataFrame.replace` (deprecated in recent pandas), and
# the SA2 name column is left untouched.
income_2_df[income_2_years] = income_2_df[income_2_years].apply(
    pd.to_numeric, errors='coerce'
)

# Drop rows with any missing value (including the former 'np' placeholders),
# then order by SA2 name. Assignment is used instead of `inplace=True` so the
# cell never mutates a filtered view.
income_2_df = income_2_df.dropna().sort_values(by='sa2_name')
income_2_df

### Add SA2 geometries

Use the 2016 SA2 Digital Boundaries shapefile to attach geometries to `income_1_df` (2012–2016 income).

In [None]:
# Load the 2016 SA2 boundaries, keeping only the join key (renamed to match the
# income table) and the geometry column.
sa2_2016 = (
    gpd.read_file('../data/landing/sa2/sa2-16-shp/')
    [['SA2_MAIN16', 'geometry']]
    .rename(columns={'SA2_MAIN16': 'sa2_code'})
)

# Attach each SA2's geometry, then reshape from one column per year to one row
# per (SA2, year) with the value in `median_income` and an integer `year`.
income_1_df = (
    income_1_df
    .merge(sa2_2016, on='sa2_code')
    .melt(
        id_vars=['sa2_code', 'sa2_name', 'geometry'],
        value_vars=['2012', '2013', '2014', '2015', '2016'],
        var_name='year',
        value_name='median_income',
    )
    .astype({'year': int})
)
income_1_df

Use the 2021 SA2 Digital Boundaries shapefile to attach geometries to `income_2_df` (2017–2021 income).

In [None]:
# Load the 2021 SA2 boundaries, keeping only the join key (renamed to match the
# income table) and the geometry column.
sa2_2021 = (
    gpd.read_file('../data/landing/sa2/sa2-21-shp/')
    [['SA2_CODE21', 'geometry']]
    .rename(columns={'SA2_CODE21': 'sa2_code'})
)

# Attach each SA2's geometry, then reshape from one column per year to one row
# per (SA2, year) with the value in `median_income` and an integer `year`.
income_2_df = (
    income_2_df
    .merge(sa2_2021, on='sa2_code')
    .melt(
        id_vars=['sa2_code', 'sa2_name', 'geometry'],
        value_vars=['2017', '2018', '2019', '2020', '2021'],
        var_name='year',
        value_name='median_income',
    )
    .astype({'year': int})
)
income_2_df

### Push Income datasets to `raw` layer

In [None]:
# Make sure the output folder exists, then write each income table to its own
# CSV without the index column.
create_dir('../data/raw/income')
income_exports = {
    '../data/raw/income/income-12-16.csv': income_1_df,
    '../data/raw/income/income-17-21.csv': income_2_df,
}
for csv_path, income_frame in income_exports.items():
    income_frame.to_csv(csv_path, index=False)

## Population

### Load Population Dataset

In [8]:
# Read sheet "Table 1"; the rows at zero-based indices 5 and 6 together form a
# two-level column header.
population_df = pd.read_excel(
    "../data/landing/population/population.xlsx",
    sheet_name="Table 1",
    header=[5, 6],
)

### Clean

In [None]:
# Flatten the two-level header: when the upper label is an auto-generated
# "Unnamed..." placeholder use the lower label, otherwise keep the upper one.
flat_labels = []
for header_pair in population_df.columns.values:
    upper, lower = header_pair[0], header_pair[1]
    flat_labels.append(lower if str(upper).startswith('Unnamed') else upper)
population_df.columns = flat_labels

# Drop the coarser geography columns and incomplete rows, normalise the SA2
# identifier to a string, and keep only Victoria SA2s (9-digit codes starting
# with '2').
coarser_geography_cols = [
    "GCCSA code",
    "GCCSA name",
    "SA3 code",
    "SA3 name",
    "SA4 code",
    "SA4 name",
]
population_df = (
    population_df
    .drop(columns=coarser_geography_cols)
    .dropna()
    .rename(columns={"SA2 code": "sa2_code", "SA2 name": "sa2_name"})
    .assign(sa2_code=lambda frame: frame["sa2_code"].astype(int).astype(str))
    .loc[
        lambda frame: (frame['sa2_code'].str.len() == 9)
        & (frame['sa2_code'].str.startswith('2'))
    ]
)

# Every column after the first two is treated as a year column; reshape to one
# row per (SA2, year) with the value in `population` and an integer `year`.
year_cols = population_df.columns[2:]
population_df = pd.melt(
    population_df,
    id_vars=['sa2_code', 'sa2_name'],
    value_vars=year_cols,
    var_name='year',
    value_name='population',
).astype({'year': int})
population_df

### Add SA2 Geometry

In [None]:
# Attach the 2021 SA2 geometry to every population row via the shared SA2 code.
population_df = population_df.merge(sa2_2021, on='sa2_code')
population_df

### Push Population Dataset to `raw` layer

In [None]:
# Make sure the output folder exists, then write the population table to CSV
# without the index column.
create_dir('../data/raw/population')
population_df.to_csv(
    '../data/raw/population/population-01-23.csv',
    index=False,
)

In [12]:
# NOTE(review): disabled scratch code. If enabled it would read the PTV
# `shapes.txt` and `stops.txt` files and build a GeoDataFrame of stop points
# from their longitude/latitude. Nothing else in this notebook refers to these
# names, so consider deleting this cell or moving it to its own notebook.
# shape_1 = pd.read_csv("../data/landing/ptv/1/1/shapes.txt")
# stops_1 = pd.read_csv("../data/landing/ptv/1/1/stops.txt")
# stops_1 = stops_1.loc[:,["stop_name","stop_lat","stop_lon"]]
# gdf_stops_1 = gpd.GeoDataFrame(
#     stops_1,
#     geometry=gpd.points_from_xy(stops_1['stop_lon'], stops_1['stop_lat'])
# )

Get notebook runtime.

In [None]:
# Report the runtime computed from the start time captured in the first cell.
notebook_runtime = get_runtime(start_time)
print(notebook_runtime)