In [15]:
import pandas as pd
import numpy as np
import csv
import json

### Boundary

In [16]:
import geopandas as gpd

In [17]:
sf = gpd.read_file("../data/curated/sa2/SA2_2021_AUST_GDA94.shp")
gdf = gpd.GeoDataFrame(sf)
gdf = gdf[gdf['STE_NAME21']=='Victoria']

gdf = gdf[['SA2_NAME21','SA2_CODE21', 'geometry']]
gdf.to_file('../data/curated/boundary.shp')

### Postcode

In [17]:
# we extract postcode in Victoria only
tmp_lst = []
with open('../data/raw/postcode.csv','r') as f:
    reader = csv.reader(f)
    for row in reader:
        tmp_lst.append(row)
df = pd.DataFrame(tmp_lst[1:], columns=tmp_lst[0])

df = df[df['state']=='VIC']

# we select some important columns for this dataset, such as SA2 code, SA2 name, latitude, longitude and so on
data = df[['id','postcode','locality','state','long','lat','SA2_MAINCODE_2016','SA2_NAME_2016']] 


data.to_csv('../data/curated/postcode_vic.csv')

### Property

In [25]:
import re

In [60]:
# for property json file, we want to convert it into data frame and save as csv
f = open('../data/raw/example.json')
data = json.load(f)
df = pd.DataFrame.from_dict(data, orient='index')
print(len(df))

# drop the instances where coordinate information is not provided

for index, row in df.iterrows():
    if re.search(r'\[(-\d+.\d+)',str(row['coordinates'])): 
        continue
    else:
        df = df.drop(index)


print(len(df))

df.to_csv('../data/curated/property.csv')

15226
15222


### Unemployment

In [19]:
# we want information in Victoria only and only analyse unemployment rate
tmp_lst = []
with open('../data/raw/unemployment.csv','r') as f:
    reader = csv.reader(f)
    for row in reader:
        tmp_lst.append(row)
df = pd.DataFrame(tmp_lst[2:], columns=tmp_lst[1])

df = df[df['Data Item'] == 'Smoothed unemployment rate (%)']

vic_code = gdf['SA2_CODE21']

new = df['SA2 Code (2016 ASGS)'].isin(vic_code)
data = df[new]

data.dropna(subset = ["Dec-10","Mar-22"], inplace=True)

data.replace('-', 0, inplace=True)

# the data records statistics from four months in each year from 2011 to 2021, and we want to combine those by year
# we intend to take average of four months in each year
def combine_column(year):
    rate_list = []
    for index, row in data.iterrows():
        
        total = float(row[f'Mar-{year}']) + float(row[f'Jun-{year}']) + float(row[f'Sep-{year}']) + float(row[f'Dec-{year}'])
        rate_list.append(round(total/4,2))

    return rate_list


combine_list = []
year_list = list(range(11, 22))
for year in year_list:
    rate_list = combine_column(year)
    combine_list.append(rate_list)

index = 0
for year in year_list:
    data[f'20{year}_unemploy_rate'] = combine_list[index]
    index+=1

data = data[['Data Item','Statistical Area Level 2 (SA2) (2016 ASGS)','SA2 Code (2016 ASGS)','2011_unemploy_rate','2012_unemploy_rate','2013_unemploy_rate','2014_unemploy_rate',\
    '2015_unemploy_rate','2016_unemploy_rate','2017_unemploy_rate','2018_unemploy_rate','2019_unemploy_rate','2020_unemploy_rate','2021_unemploy_rate']]

data.to_csv('../data/curated/unemployment.csv')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'20{year}_unemploy_rate'] = combine_list[index]


### School

In [59]:
import math

In [71]:
tmp_lst = []
with open('../data/raw/school.csv', mode='r', encoding = "ISO-8859-1") as f:
    reader = csv.reader(f)
    header = next(reader)
    for row in reader:
        tmp_lst.append(row)
df = pd.DataFrame(tmp_lst, columns=header)
print(len(df))

df = df[['Education_Sector','School_Name','School_Type','Address_Town','Address_State','Address_Postcode','X','Y']]

for index, row in df.iterrows():
    if row['X'] == '' or row['Y'] == '':
        df = df.drop(index)

print(len(df))
df.to_csv('../data/curated/school.csv')

2299
2298
