In [1]:
# import required libraries
import os
import numpy as np
import spicy as sp
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from statsmodels.formula.api import ols, glm
from glmnet import ElasticNet

# define required location and make sure it exists up front:
# DataFrame.to_csv does not create missing parent directories, so every
# save below would fail on a fresh checkout without this folder
raw_data = '../data/raw/'
os.makedirs(raw_data, exist_ok=True)




In [2]:
# read total population dataset (sheet 'Table 1' of the workbook);
# header=7 is the zero-based row that holds the column labels
population_url = 'https://www.abs.gov.au/statistics/people/population/regional-population/2021/32180DS0003_2001-21.xlsx'
tot_popu = pd.read_excel(
    population_url,
    sheet_name='Table 1',
    header=7,
)

In [3]:
# columns that are not required: the two NA columns, the codes and names
# of the regions above SA2 level, and the ERP change figures
unwanted_columns = [
    'Unnamed: 31', 'Unnamed: 34',
    'S/T code', 'S/T name', 'GCCSA code', 'GCCSA name',
    'SA4 code', 'SA4 name', 'SA3 code', 'SA3 name',
    'no..21', '%',
]
# discard them, then drop every row that still contains a missing value
tot_popu = tot_popu.drop(columns=unwanted_columns).dropna(axis=0, how='any')

In [4]:
# rename the columns positionally: the SA2 identifiers, one column per
# year from 2001 to 2021, then the area and population density columns
year_labels = [str(year) for year in range(2001, 2022)]
tot_popu.columns = ['SA2_code', 'SA2_name', *year_labels, 'Area', 'Population_density_2021']

In [5]:
# save the population table into the raw data folder
population_csv = raw_data + 'population.csv'
tot_popu.to_csv(population_csv)

In [6]:
# read income dataset (sheet 'Table 1.4' of the workbook);
# header=6 is the zero-based row that holds the column labels
income_url = 'https://www.abs.gov.au/statistics/labour/earnings-and-working-conditions/personal-income-australia/2014-15-2018-19/6524055002_DO001.xlsx'
income = pd.read_excel(
    income_url,
    sheet_name='Table 1.4',
    header=6,
)

In [7]:
# discard the column that was read in as 'Unnamed: 27'
income = income.drop(columns=['Unnamed: 27'])

In [8]:
# drop every row that contains at least one missing value
income = income.dropna(axis=0, how='any')

In [9]:
# rename the columns positionally: the SA2 identifiers, then for each
# measure one column per period, in the order 2014_15 ... 2018_19
periods = ['2014_15', '2015_16', '2016_17', '2017_18', '2018_19']
measures = ['earners', 'age', 'sum', 'median', 'mean']
income.columns = ['SA2_code', 'SA2_name'] + [
    f'{period}_{measure}' for measure in measures for period in periods
]

In [10]:
# save the income table into the raw data folder
income_csv = raw_data + 'income.csv'
income.to_csv(income_csv)

In [11]:
# read population forecast dataset
# a browser-like User-Agent header is sent with the request to work around
# an HTTP 403 response; idea from
# https://stackoverflow.com/questions/62278538/pd-read-csv-produces-httperror-http-error-403-forbidden
url = 'https://www.gen-agedcaredata.gov.au/www_aihwgen/media/Population-Projections-2019/Victoria.csv'
storage_options = {'User-Agent': 'Mozilla/5.0'}
forecast = pd.read_csv(
    filepath_or_buffer=url,
    storage_options=storage_options,
)

In [12]:
# save the forecast table into the raw data folder
forecast_csv = raw_data + 'forecast.csv'
forecast.to_csv(forecast_csv)

In [13]:
# read school location dataset; the file is decoded as cp1252
school_location_url = 'https://www.education.vic.gov.au/Documents/about/research/datavic/dv331_schoollocations2022.csv'
school_location = pd.read_csv(
    school_location_url,
    encoding='cp1252',
)

In [14]:
# save the school location table into the raw data folder
location_csv = raw_data + 'location.csv'
school_location.to_csv(location_csv)

In [15]:
# read facilities dataset (sheet 'wholeIFMD' of the all-facilities workbook)
facilities_url = 'https://discover.data.vic.gov.au/dataset/e6db797e-3801-4cfa-bf02-82350d0f722d/resource/bfff5fff-9c74-4671-8396-43f793613b70/download/srv_ifmd_all-facilities.xlsx'
fitness = pd.read_excel(
    facilities_url,
    sheet_name='wholeIFMD',
)

In [16]:
# keep only the facility identifier and its coordinates
required = [
    'Facility ID',
    'Latitude',
    'Longitude',
]
fitness = fitness.loc[:, required]

In [17]:
# save the facility table into the raw data folder
facility_csv = raw_data + 'facility.csv'
fitness.to_csv(facility_csv)

In [18]:
# read health dataset (sheet 'Table 33.4' of the workbook);
# header=11 is the zero-based row that holds the column labels
health_url = 'https://www.abs.gov.au/statistics/health/health-conditions-and-risks/chronic-conditions/2017-18/4364055001do033_20172018.xlsx'
health = pd.read_excel(
    health_url,
    sheet_name='Table 33.4',
    header=11,
)

In [19]:
# find the proportion of people with three or more chronic conditions:
# keep the SA2 identifiers and the proportion column, drop rows with
# missing values, then give the three columns short names
health_columns = {
    'SA2 Code': 'SA2_code',
    'SA2 Label': 'SA2_name',
    'Proportion \n(%)': 'Proportion',
}
health = health.loc[:, list(health_columns)].dropna()
health.columns = list(health_columns.values())

In [20]:
# save the health table into the raw data folder
health_csv = raw_data + 'health.csv'
health.to_csv(health_csv)