In [3]:
import pandas as pd
import geopandas as gpd
import warnings
# Silence pandas' SettingWithCopyWarning for the rest of this kernel session.
# NOTE(review): this is a global filter, so genuine chained-assignment
# mistakes in any later cell will also go unreported.
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

# 1. School location data processing

In [2]:
# Load the raw school location tables, one per year.
# The 2020 file is read with pandas' default encoding; 2021-2023 are read as cp1252.
school_dt_2020 = pd.read_csv(
    '../data/raw/external/school/school_location_2020.csv'
)
school_dt_2021, school_dt_2022, school_dt_2023 = (
    pd.read_csv(csv_path, encoding='cp1252')
    for csv_path in (
        '../data/raw/external/school/school_location_2021.csv',
        '../data/raw/external/school/school_location_2022.csv',
        '../data/raw/external/school/school_location_2023.csv',
    )
)

In [3]:
# Select the needed features from each year's school table.
names = ['2020','2021','2022','2023']
columns = ['School_Name','School_Type','Address_Line_1','Address_Town','Address_Postcode','Address_State','LGA_ID','LGA_Name','X','Y']

# Explicit year -> raw table mapping. This replaces resolving variable names
# through locals(), which only works while the cell runs at module scope and
# hides the dependency on the four loaded frames.
raw_school_tables = {
    '2020': school_dt_2020,
    '2021': school_dt_2021,
    '2022': school_dt_2022,
    '2023': school_dt_2023,
}

# Keep only the needed columns and convert empty strings to NaN so they are
# counted as missing. replace() is called without inplace=True: each entry is
# a fresh frame, so nothing is written in place on a subset of a raw table.
# Note: only exact empty strings are converted; whitespace-only values are kept.
school_data = {
    name: raw_school_tables[name][columns].replace("", float("NaN"))
    for name in names
}

# Per-column null counts for each year, most-missing column first.
# (The name `nun_rows` is kept because later cells index it by year.)
nun_rows = {
    name: school_data[name].isnull().sum().sort_values(ascending = False)
    for name in names
}

In [4]:
# Show the per-column null counts for the 2020 school table.
nun_rows['2020']

School_Name         0
School_Type         0
Address_Line_1      0
Address_Town        0
Address_Postcode    0
Address_State       0
LGA_ID              0
LGA_Name            0
X                   0
Y                   0
dtype: int64

In [5]:
# Show the per-column null counts for the 2021 school table.
nun_rows['2021']

School_Name         0
School_Type         0
Address_Line_1      0
Address_Town        0
Address_Postcode    0
Address_State       0
LGA_ID              0
LGA_Name            0
X                   0
Y                   0
dtype: int64

In [6]:
# Show the per-column null counts for the 2022 school table.
# The recorded output reports one missing value in each of X and Y.
nun_rows['2022']

Y                   1
X                   1
School_Type         0
School_Name         0
Address_Line_1      0
Address_Town        0
Address_State       0
Address_Postcode    0
LGA_Name            0
LGA_ID              0
dtype: int64

In [7]:
# Show the per-column null counts for the 2023 school table.
# The recorded output reports one missing value in each of X and Y.
nun_rows['2023']

Y                   1
X                   1
School_Type         0
School_Name         0
Address_Line_1      0
Address_Town        0
Address_State       0
Address_Postcode    0
LGA_Name            0
LGA_ID              0
dtype: int64

In [8]:
# Write each year's school table to the curated data folder, one CSV per year.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra leading column - confirm downstream readers expect it.
# NOTE(review): the recorded null counts for 2022 and 2023 show a missing X/Y
# value; those rows are written out unchanged.
for name in names:
    cleaned_school = school_data[name]
    cleaned_school.to_csv(f'../data/curated/school_location_cleaned_{name}.csv')


# 2. Park location data processing

In [4]:
#park_dt = pd.read_csv('../data/raw/external/park/park_location.csv')

In [5]:
# #select needed features
# park_dt = park_dt[['geo_point_2d','longitude','latitude','postcode','area','name','postcode','suburb','projection']]


# #remove whitespace & check null rows
# park_dt.replace("", float("NaN"), inplace=True)
# null_rows_park = park_dt.isnull().sum().sort_values(ascending=False)
# print(null_rows_park)

# #save data
# park_dt.to_csv('../data/curated/park_location_cleaned.csv')

# 3. Train location data processing

In [6]:
# Load the raw 2023 train station tables (metropolitan first, then regional).
metropolitan_train_dt, regional_train_dt = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/raw/external/train_station/metropolitan_train_location_2023.csv',
        '../data/raw/external/train_station/regional_train_location_2023.csv',
    )
)

In [7]:
# Select the needed features from each train station table.
types = ['metropolitan','regional']
columns_train = ['Stop_ID','Stop_name','Stop_lat','Stop_long']

# Explicit network -> raw table mapping. This replaces resolving variable names
# through locals(), which only works while the cell runs at module scope.
raw_train_tables = {
    'metropolitan': metropolitan_train_dt,
    'regional': regional_train_dt,
}

# Keep only the needed columns and convert empty strings to NaN so they are
# counted as missing. replace() is called without inplace=True: each entry is
# a fresh frame, so nothing is written in place on a subset of a raw table.
# The comprehension variable is `network` rather than `type`, so the builtin
# type() is no longer shadowed for the rest of the notebook.
train_data = {
    network: raw_train_tables[network][columns_train].replace("", float("NaN"))
    for network in types
}

# Per-column null counts for each network, most-missing column first.
null_train = {
    network: train_data[network].isnull().sum().sort_values(ascending = False)
    for network in types
}

In [8]:
# Rename the coordinate columns of both station tables to 'latitude' / 'longitude'.
# The tuple of input frames is built before either name is rebound, so each
# table is renamed from its current state.
metropolitan_train_dt, regional_train_dt = (
    stations.rename(columns={'Stop_lat': 'latitude', 'Stop_long': 'longitude'})
    for stations in (metropolitan_train_dt, regional_train_dt)
)




In [9]:
# Show the per-column null counts for the metropolitan station table.
null_train['metropolitan']

Stop_ID      0
Stop_name    0
Stop_lat     0
Stop_long    0
dtype: int64

In [10]:
# Show the per-column null counts for the regional station table.
null_train['regional']

Stop_ID      0
Stop_name    0
Stop_lat     0
Stop_long    0
dtype: int64

In [20]:
# Combine the metropolitan and regional station tables into one file.

# Keep the same four columns from each table (metropolitan first).
metropolitan_selected, regional_selected = (
    stations[['Stop_ID', 'Stop_name', 'latitude', 'longitude']]
    for stations in (metropolitan_train_dt, regional_train_dt)
)

# Stack the two selections and renumber the rows from zero.
train_df = pd.concat(
    (metropolitan_selected, regional_selected),
    ignore_index=True,
)

# Persist the combined table without the row index, then report where it went.
train_df.to_csv("../data/curated/train_station_cleaned.csv", index=False)
print("train data saved to ../data/curated/train_station_cleaned.csv")


train data saved to ../data/curated/train_station_cleaned.csv


# 4. Population data processing

In [18]:
# population = pd.read_excel('../data/raw/external/population/population_2001-23.xlsx',sheet_name='Table 1', header=5)

# # Rename the column name
# population.rename(columns={population.columns[1]: 'GCCSA name',population.columns[6]: 'SA2 code', population.columns[7]: 'SA2 name'}, inplace=True)

# # Drop no need columns
# population.drop(columns=['Unnamed: 0','Unnamed: 2','Unnamed: 3','Unnamed: 4','Unnamed: 5'],inplace=True)

# # Skip first row
# population = population.iloc[1:,:]

# # Keep rows in victoria only
# population_vic = population.loc[population['GCCSA name'].isin(['Rest of Vic.','Greater Melbourne'])]

# population_vic.to_csv("../data/curated/population_cleaned.csv",index=False)

# population_vic

FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/external/population/population_2001-23.xlsx'