In [1]:
import pandas as pd
import geopandas as gpd
import warnings
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

# 1. school location processing

In [2]:
school_dt_2020 = pd.read_csv('../data/raw/external/school/school_location_2020.csv')
school_dt_2021 = pd.read_csv('../data/raw/external/school/school_location_2021.csv', encoding='cp1252')
school_dt_2022 = pd.read_csv('../data/raw/external/school/school_location_2022.csv', encoding='cp1252')
school_dt_2023 = pd.read_csv('../data/raw/external/school/school_location_2023.csv', encoding='cp1252')

In [3]:
#select needed features
names = ['2020','2021','2022','2023']
columns = ['School_Name','School_Type','Address_Line_1','Address_Town','Address_Postcode','Address_State','LGA_ID','LGA_Name','X','Y']
school_data = {}

for name in names:
    school_data[name] = locals()[f'school_dt_{name}'][columns]

#remove whitespace & check null rows
nun_rows = {}
for name in names:
    school_data[name].replace("", float("NaN"), inplace=True)
    nun_rows[name] = school_data[name].isnull().sum().sort_values(ascending = False)

In [4]:
nun_rows['2020']

School_Name         0
School_Type         0
Address_Line_1      0
Address_Town        0
Address_Postcode    0
Address_State       0
LGA_ID              0
LGA_Name            0
X                   0
Y                   0
dtype: int64

In [5]:
nun_rows['2021']

School_Name         0
School_Type         0
Address_Line_1      0
Address_Town        0
Address_Postcode    0
Address_State       0
LGA_ID              0
LGA_Name            0
X                   0
Y                   0
dtype: int64

In [6]:
nun_rows['2022']

Y                   1
X                   1
School_Type         0
School_Name         0
Address_Line_1      0
Address_Town        0
Address_State       0
Address_Postcode    0
LGA_Name            0
LGA_ID              0
dtype: int64

In [7]:
nun_rows['2023']

Y                   1
X                   1
School_Type         0
School_Name         0
Address_Line_1      0
Address_Town        0
Address_State       0
Address_Postcode    0
LGA_Name            0
LGA_ID              0
dtype: int64

In [8]:
#save cleaned data
for name in names:
    school_data[name].to_csv(f'../data/curated/school_location_cleaned_{name}.csv')


# 2. park location data processing

In [9]:
park_dt = pd.read_csv('../data/raw/external/park/park_location.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/external/park/park_location.csv'

In [14]:
#select needed features
park_dt = park_dt[['geo_point_2d','longitude','latitude','postcode','area','name','postcode','suburb','projection']]


#remove whitespace & check null rows
park_dt.replace("", float("NaN"), inplace=True)
null_rows_park = park_dt.isnull().sum().sort_values
print(null_rows_park)

#save data
park_dt.to_csv('../data/curated/park_location_cleaned.csv')

<bound method Series.sort_values of geo_point_2d    0
longitude       0
latitude        0
postcode        0
area            0
name            0
postcode        0
suburb          0
projection      0
dtype: int64>


# 3. Train location data process

In [15]:
metropolitan_train_dt = pd.read_csv('../data/raw/external/train_station/metropolitan_train_location_2023.csv')
regional_train_dt = pd.read_csv('../data/raw/external/train_station/regional_train_location_2023.csv')

In [16]:
#select needed features
types = ['metropolitan','regional']
columns_train = ['Stop_ID','Stop_name','Stop_lat','Stop_long']
train_data = {}

for type in types:
    train_data[type] = locals()[f'{type}_train_dt'][columns_train]

#remove whitespace & check null rows
null_train = {}
for type in types:
    train_data[type].replace("", float("NaN"), inplace=True)
    null_train[type] = train_data[type].isnull().sum().sort_values(ascending = False)

In [17]:
null_train['metropolitan']

Stop_ID      0
Stop_name    0
Stop_lat     0
Stop_long    0
dtype: int64

In [18]:
null_train['regional']

Stop_ID      0
Stop_name    0
Stop_lat     0
Stop_long    0
dtype: int64

In [19]:
#save cleaned data
for type in types:
    train_data[type].to_csv(f'../data/curated/{type}_train_location_2023_cleaned.csv')

# 4. Population data processing

In [21]:
population = pd.read_excel('../data/raw/external/population/population_2001-23.xlsx',sheet_name='Table 1', header=5)

# Rename the column name
population.rename(columns={population.columns[1]: 'GCCSA name',population.columns[6]: 'SA2 code', population.columns[7]: 'SA2 name'}, inplace=True)

# Drop no need columns
population.drop(columns=['Unnamed: 0','Unnamed: 2','Unnamed: 3','Unnamed: 4','Unnamed: 5'],inplace=True)

# Skip first row
population = population.iloc[1:,:]

# Keep rows in victoria only
population_vic = population.loc[population['GCCSA name'].isin(['Rest of Vic.','Greater Melbourne'])]

population_vic.to_csv("../data/curated/population_cleaned.csv",index=False)

population_vic

Unnamed: 0,GCCSA name,SA2 code,SA2 name,2001,2002,2003,2004,2005,2006,2007,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
643,Rest of Vic.,201011001,Alfredton,5756,6092,6293,6480,6648,6761,7034,...,10338,11039,11852,12649,13537,14434,15507,16841,18002,18997
644,Rest of Vic.,201011002,Ballarat,11497,11708,12015,12189,12269,12356,12408,...,12327,12300,12301,12266,12244,12320,12196,12071,11938,11809
645,Rest of Vic.,201011005,Buninyong,5320,5399,5557,5620,5857,6037,6131,...,7082,7191,7311,7409,7418,7458,7377,7229,7247,7323
646,Rest of Vic.,201011006,Delacombe,4154,4225,4371,4465,4704,5041,5206,...,6583,6846,7195,7622,8183,8890,9755,10648,11798,12869
647,Rest of Vic.,201011007,Smythes Creek,3317,3378,3411,3473,3508,3542,3594,...,3945,3966,3990,4004,4042,4112,4152,4211,4223,4268
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1160,Rest of Vic.,217031476,Otway,3452,3479,3511,3511,3492,3459,3489,...,3519,3538,3556,3635,3710,3802,3911,3979,3974,3983
1161,Rest of Vic.,217041477,Moyne - East,6718,6704,6676,6643,6638,6652,6606,...,6734,6716,6709,6717,6746,6798,6883,6990,7046,7132
1162,Rest of Vic.,217041478,Moyne - West,8317,8387,8450,8487,8517,8601,8694,...,9383,9467,9603,9686,9783,9845,9859,9967,10098,10148
1163,Rest of Vic.,217041479,Warrnambool - North,17053,17449,17726,17937,18172,18528,18877,...,20930,21217,21442,21688,21954,22184,22416,22470,22586,22762
