In [2]:
import pandas as pd
import geopandas as gpd
import warnings
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

# 1. school location processing

In [3]:
school_dt_2020 = pd.read_csv('../data/raw/external/school/school_location_2020.csv')
school_dt_2021 = pd.read_csv('../data/raw/external/school/school_location_2021.csv', encoding='cp1252')
school_dt_2022 = pd.read_csv('../data/raw/external/school/school_location_2022.csv', encoding='cp1252')
school_dt_2023 = pd.read_csv('../data/raw/external/school/school_location_2023.csv', encoding='cp1252')

In [38]:
#select needed features
names = ['2020','2021','2022','2023']
columns = ['School_Name','School_Type','Address_Line_1','Address_Town','Address_Postcode','Address_State','LGA_ID','LGA_Name','X','Y']
school_data = {}

for name in names:
    school_data[name] = locals()[f'school_dt_{name}'][columns]

#remove whitespace & check null rows
nun_rows = {}
for name in names:
    school_data[name].replace("", float("NaN"), inplace=True)
    nun_rows[name] = school_data[name].isnull().sum().sort_values(ascending = False)

In [39]:
nun_rows['2020']

School_Name         0
School_Type         0
Address_Line_1      0
Address_Town        0
Address_Postcode    0
Address_State       0
LGA_ID              0
LGA_Name            0
X                   0
Y                   0
dtype: int64

In [40]:
nun_rows['2021']

School_Name         0
School_Type         0
Address_Line_1      0
Address_Town        0
Address_Postcode    0
Address_State       0
LGA_ID              0
LGA_Name            0
X                   0
Y                   0
dtype: int64

In [41]:
nun_rows['2022']

X                   1
Y                   1
School_Name         0
School_Type         0
Address_Line_1      0
Address_Town        0
Address_Postcode    0
Address_State       0
LGA_ID              0
LGA_Name            0
dtype: int64

In [42]:
nun_rows['2023']

X                   1
Y                   1
School_Name         0
School_Type         0
Address_Line_1      0
Address_Town        0
Address_Postcode    0
Address_State       0
LGA_ID              0
LGA_Name            0
dtype: int64

In [43]:
#save cleaned data
for name in names:
    school_data[name].to_csv(f'../data/raw/external/school/school_location_{name}.csv')


# 2. park location data processing

In [44]:
park_dt = pd.read_csv('../data/raw/external/park/park_location.csv')

In [45]:
#select needed features
park_dt = park_dt[['geo_point_2d','longitude','latitude','postcode','area','name','postcode','suburb','projection']]


#remove whitespace & check null rows
park_dt.replace("", float("NaN"), inplace=True)
null_rows_park = park_dt.isnull().sum().sort_values
print(null_rows_park)

#save data
park_dt.to_csv('../data/raw/external/park/park_location.csv')

<bound method Series.sort_values of geo_point_2d    0
longitude       0
latitude        0
postcode        0
area            0
name            0
postcode        0
suburb          0
projection      0
dtype: int64>


# 3. Train location data process

In [3]:
metropolitan_train_dt = pd.read_csv('../data/raw/external/train_station/metropolitan_train_location_2023.csv')
regional_train_dt = pd.read_csv('../data/raw/external/train_station/regional_train_location_2023.csv')

In [4]:
#select needed features
types = ['metropolitan','regional']
columns_train = ['Stop_ID','Stop_name','Stop_lat','Stop_long']
train_data = {}

for type in types:
    train_data[type] = locals()[f'{type}_train_dt'][columns_train]

#remove whitespace & check null rows
null_train = {}
for type in types:
    train_data[type].replace("", float("NaN"), inplace=True)
    null_train[type] = train_data[type].isnull().sum().sort_values(ascending = False)

In [5]:
null_train['metropolitan']

Stop_ID      0
Stop_name    0
Stop_lat     0
Stop_long    0
dtype: int64

In [6]:
null_train['regional']

Stop_ID      0
Stop_name    0
Stop_lat     0
Stop_long    0
dtype: int64

In [7]:
#save cleaned data
for type in types:
    train_data[type].to_csv(f'../data/raw/external/train_station/{type}_train_location_2023.csv')