#### Rent data:
- ../data/curated/rent_basic.csv

#### PTV data
- ../data/raw/PTV/public_trans.csv
- SA2_CODE21: int, id of the district in SA2 format
- metrobus_count: int, count of PTV_METRO_BUS_STOP
- metrotrain_count: int, count of PTV_METRO_TRAIN_STATION
- metrotram_count: int, count of PTV_METRO_TRAM_STOP
- regbus_count: int, count of PTV_REGIONAL_BUS_STOP
- regcoach_count: int, count of PTV_REGIONAL_COACH_STOP
- regtrain_count: int, count of PTV_REGIONAL_TRAIN_STATION
- skybus_count: int, count of PTV_SKYBUS_STOP


#### FOI:
- ../data/raw/FOI/foi_count_by_sa2.csv
- SA2 (int), recr_count (int), comm_count (int)

## PTV data was pulled manually
To request PTV.data manually:
- https://data.gov.au/dataset/ds-vic-f8155dba-4f9c-43ee-ad83-f149fc3f1e9e/details?q=tram%20station
- sign up (free) and add data to order
- proceed to order configuration
- select the GDA2020 geographical coordinate system and the ESRI shapefile format, with the "select all area available" option checked
- file will be sent to the given email
- download and rename the file as PTV.zip, move it to ../data/raw/


## Point of interest data was pulled manually

To request FOI.data manually:
-  https://datashare.maps.vic.gov.au/search?md=019d7631-1234-5112-9f21-8f7346647b61
- sign up (free) and add data to order
- proceed to order configuration
- select the GDA2020 geographical coordinate system and the ESRI shapefile format, with the "select all area available" option checked
- file will be sent to the given email
- download and rename the file as FOI.zip, move it to ../data/raw/

In [14]:
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import re
from itertools import compress
import pandas as pd
import geopandas as gpd
import zipfile


# Build (or reuse) the notebook's Spark session, applying each SQL option in turn.
_spark_builder = SparkSession.builder.appName("Assignment_2")
for _option, _value in (
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.parquet.cacheMetadata", "true"),
    ("spark.sql.session.timeZone", "Etc/UTC"),
):
    _spark_builder = _spark_builder.config(_option, _value)
spark = _spark_builder.getOrCreate()

# Root folders for the raw inputs and the curated outputs.
INPUT_DIR = "../data/raw/"
OUTPUT_DIR = "../data/curated/"

# NOTE(review): presumably HTTP request headers; not referenced by any cell shown here.
headers = {"accept": "text/csv"}

In [15]:
def gpd_station_merge(poly_gdf, file_path, by_id_name="SA2_CODE21",
                      station_id_name="STOP_ID", method=None):
    """
    Spatially join the features of a shapefile onto polygons and aggregate
    them per polygon.

    Parameters
    ----------
    poly_gdf : geopandas.GeoDataFrame
        Polygons to aggregate over. ``by_id_name`` must be one of its columns
        or the name of its index.
    file_path : str
        Path of the shapefile to read (e.g. a layer of stops or stations).
    by_id_name : str
        Polygon identifier used as the group-by key.
    station_id_name : str
        Identifier column of the shapefile that is kept for aggregation.
    method : dict, optional
        Mapping ``{column: aggregation}`` handed to ``DataFrameGroupBy.agg``.
        Defaults to ``{station_id_name: "count"}``, i.e. the number of joined
        features per polygon.

    Returns
    -------
    pandas.DataFrame
        One row per ``by_id_name`` value holding the aggregated columns.
        Because the join is a left join, polygons without any matching
        feature are kept; with the default "count" they get 0.
    """
    # Build the default per call so it follows station_id_name and no dict
    # object is shared between calls.
    if method is None:
        method = {station_id_name: "count"}

    # read the feature layer and keep only its id and geometry
    station_gdf = gpd.read_file(file_path)
    station_gdf = station_gdf[[station_id_name, "geometry"]]

    # A spatial join is only meaningful when both layers share a CRS;
    # reproject the features when both CRSs are known and they differ.
    if (poly_gdf.crs is not None and station_gdf.crs is not None
            and station_gdf.crs != poly_gdf.crs):
        station_gdf = station_gdf.to_crs(poly_gdf.crs)

    # left join keeps every polygon, then aggregate the joined features
    join_gdf = gpd.sjoin(poly_gdf, station_gdf, how="left")
    join_gdf = join_gdf.groupby(by_id_name).agg(method)

    return join_gdf

## PTV data

In [16]:
# Folder (inside the extracted archive) that contains the PTV shapefiles.
output_dir = "../data/raw/PTV/ll_gda2020/esrishape/whole_of_dataset/victoria/PUBLIC_TRANSPORT/"

# Extract the manually downloaded PTV order next to the archive.
ptv_zip_path = "../data/raw/PTV.zip"
ptv_extract_dir = "../data/raw/PTV/"
with zipfile.ZipFile(ptv_zip_path, "r") as ptv_archive:
    ptv_archive.extractall(ptv_extract_dir)

In [17]:
### read shape file and make geometry readable
boundary_gdf = gpd.read_file("../data/raw/ABS/digitalBoundary/\
SA2_2021_AUST_GDA2020.shp")
boundary_gdf = boundary_gdf.loc[boundary_gdf["STE_NAME21"] == "Victoria"]

# digital boundary feature selection
boundary_gdf = boundary_gdf.reset_index()[["SA2_CODE21", "geometry"]].set_index("SA2_CODE21")
print(boundary_gdf.shape)
print(boundary_gdf.head(1))


(524, 1)
                                                     geometry
SA2_CODE21                                                   
201011001   POLYGON ((143.78282 -37.56666, 143.75558 -37.5...


In [18]:
# Boundaries ordered by SA2 code; the count columns are appended to this frame.
mix_gdf = boundary_gdf.sort_index()

# One shapefile per PTV layer ...
file_paths = [
    f"{output_dir}PTV_METRO_BUS_STOP.shp",
    f"{output_dir}PTV_METRO_TRAIN_STATION.shp",
    f"{output_dir}PTV_METRO_TRAM_STOP.shp",
    f"{output_dir}PTV_REGIONAL_BUS_STOP.shp",
    f"{output_dir}PTV_REGIONAL_COACH_STOP.shp",
    f"{output_dir}PTV_REGIONAL_TRAIN_STATION.shp",
    f"{output_dir}PTV_SKYBUS_STOP.shp"
]

# ... and, in the same order, the name of the count column each layer produces.
col_names = ["metrobus_count", "metrotrain_count", "metrotram_count",
    "regbus_count", "regcoach_count", "regtrain_count", "skybus_count"]

# Collect the per-layer counts first and concatenate once. pd.concat(axis=1)
# aligns rows on the SA2 index, so the row order of each count frame is
# irrelevant and no per-layer sorting is needed.
count_frames = []
for col_name, file_path in zip(col_names, file_paths):
    print(col_name, file_path)
    cur_gdf = gpd_station_merge(boundary_gdf, file_path).rename(columns={"STOP_ID": col_name})
    count_frames.append(cur_gdf)

mix_gdf = pd.concat([mix_gdf, *count_frames], axis=1)

# Each count frame comes from a left join on boundary_gdf, so no SA2 may be added or lost.
assert mix_gdf.shape[0] == boundary_gdf.shape[0], "row count changed while merging PTV counts"

print(mix_gdf.shape)
print(mix_gdf.head())
mix_gdf.to_csv("../data/raw/PTV/public_trans.csv")

metrobus_count ../data/raw/PTV/ll_gda2020/esrishape/whole_of_dataset/victoria/PUBLIC_TRANSPORT/PTV_METRO_BUS_STOP.shp
metrotrain_count ../data/raw/PTV/ll_gda2020/esrishape/whole_of_dataset/victoria/PUBLIC_TRANSPORT/PTV_METRO_TRAIN_STATION.shp
metrotram_count ../data/raw/PTV/ll_gda2020/esrishape/whole_of_dataset/victoria/PUBLIC_TRANSPORT/PTV_METRO_TRAM_STOP.shp
regbus_count ../data/raw/PTV/ll_gda2020/esrishape/whole_of_dataset/victoria/PUBLIC_TRANSPORT/PTV_REGIONAL_BUS_STOP.shp
regcoach_count ../data/raw/PTV/ll_gda2020/esrishape/whole_of_dataset/victoria/PUBLIC_TRANSPORT/PTV_REGIONAL_COACH_STOP.shp
regtrain_count ../data/raw/PTV/ll_gda2020/esrishape/whole_of_dataset/victoria/PUBLIC_TRANSPORT/PTV_REGIONAL_TRAIN_STATION.shp
skybus_count ../data/raw/PTV/ll_gda2020/esrishape/whole_of_dataset/victoria/PUBLIC_TRANSPORT/PTV_SKYBUS_STOP.shp
(524, 8)
                                                     geometry  metrobus_count  \
SA2_CODE21                                                        

## Match SA2 to Rental data

In [19]:
# read local files
rent_df = pd.read_csv(f"../data/raw/rent/rent_raw.csv").reset_index()


rent_gdf = gpd.GeoDataFrame(rent_df, geometry=gpd\
        .points_from_xy(rent_df["Longitude"], rent_df["Latitude"]))
boundary_gdf = gpd.read_file(f"../data/raw/ABS/digitalBoundary/SA2_2021_AUST_GDA2020.shp")
rent_gdf["geometry"] = rent_gdf["geometry"].set_crs("epsg:7844")

# feature selction on boundary gdf
boundary_gdf = boundary_gdf.loc[boundary_gdf["STE_NAME21"] == "Victoria"]
boundary_gdf = boundary_gdf[["SA2_CODE21", "geometry"]]
boundary_gdf["SA2_CODE21"] = boundary_gdf["SA2_CODE21"].astype("int64")

# assgin sa2 to rent gdf
join_gdf = gpd.sjoin(rent_gdf, boundary_gdf, how="right")
join_gdf = join_gdf.dropna()
join_gdf["index_left"] = join_gdf["index_left"].astype("int")
rent_gdf = rent_gdf.loc[join_gdf["index_left"]]
join_gdf = join_gdf.reset_index()
rent_gdf["SA2"] = join_gdf[["SA2_CODE21"]]
rent_gdf = rent_gdf.dropna()
rent_gdf["SA2"] = rent_gdf["SA2"].astype("int64")
rent_df = rent_df.loc[rent_gdf["index"]]
rent_df["SA2"] = rent_gdf["SA2"]

rent_df["rent_id"] = range(rent_df.shape[0])
rent_df = rent_df.set_index("rent_id").drop(["index"], axis=1)

rent_df = rent_df.dropna().drop_duplicates()
print(rent_df.head())
print(rent_df.shape)
rent_df.to_csv(f"../data/curated/rent_basic.csv", header=True)


          rent  bedroom  baths  parking  \
rent_id                                   
0        490.0        4      2        2   
1        420.0        4      2        2   
2        520.0        4      2        2   
3        440.0        4      2        2   
4        440.0        4      2        2   

                                                       url   Latitude  \
rent_id                                                                 
0        https://www.domain.com.au/9-kilkenny-drive-alf... -37.563073   
1        https://www.domain.com.au/164-shortridge-drive... -37.547241   
2        https://www.domain.com.au/37-mullingar-drive-a... -37.566319   
3        https://www.domain.com.au/66-lugano-avenue-alf... -37.563453   
4        https://www.domain.com.au/57-dyson-drive-alfre... -37.550549   

          Longitude                     geometry        SA2  
rent_id                                                      
0        143.793875  POINT (143.79387 -37.56307)  206021110  


In [20]:
# Shape of the distinct (SA2, Latitude, Longitude) combinations among the listings.
unique_locations = rent_df[["SA2", "Latitude", "Longitude"]].drop_duplicates()
print(unique_locations.shape)

(3748, 3)


## Point of interest

To request FOI.data manually:
-  https://datashare.maps.vic.gov.au/search?md=019d7631-1234-5112-9f21-8f7346647b61
- sign up (free) and add data to order
- proceed to order configuration
- select the GDA2020 geographical coordinate system and the ESRI shapefile format, with the "select all area available" option checked
- file will be sent to the given email
- download and rename the file as FOI.zip, move it to ../data/raw/

### FOI:
- ../data/raw/FOI/foi_count_by_sa2.csv
- SA2 (int), recr_count (int), comm_count (int)

In [21]:
import pandas as pd
import geopandas as gpd
import zipfile

In [22]:
# Extract the manually downloaded FOI order (it contains FOI_POINT.shp) next to the archive.
foi_zip_path = "../data/raw/FOI.zip"
foi_extract_dir = "../data/raw/FOI/"
with zipfile.ZipFile(foi_zip_path, "r") as foi_archive:
    foi_archive.extractall(foi_extract_dir)

In [23]:
foi_shp_path = "../data/raw/FOI/ll_gda2020/esrishape/whole_of_dataset/victoria/VMFEAT/FOI_POINT.shp"
sa2_shp_path = "../data/raw/ABS/digitalBoundary/SA2_2021_AUST_GDA2020.shp"

foi_gdf = gpd.read_file(foi_shp_path)
boundary_gdf = gpd.read_file(sa2_shp_path)

## feature selection for foi:
## keep rows whose PARENTFTID is missing, then drop incomplete and duplicated rows
foi_gdf = foi_gdf[["UFI", "PARENTFTID", "FTYPE", "NAME_LABEL", "geometry"]]
has_no_parent = foi_gdf["PARENTFTID"].isna()
foi_gdf = foi_gdf.loc[has_no_parent].drop(["PARENTFTID"], axis=1)
foi_gdf = foi_gdf.dropna().drop_duplicates()
foi_gdf = foi_gdf.astype({"UFI": int}).set_index("UFI")

## feature selection for boundary:
## Victorian polygons only, indexed by the integer SA2 code
in_victoria = boundary_gdf["STE_NAME21"] == "Victoria"
boundary_gdf = boundary_gdf.loc[in_victoria][["SA2_CODE21", "geometry"]]
boundary_gdf = boundary_gdf.rename(columns={"SA2_CODE21": "SA2"})
boundary_gdf = boundary_gdf.astype({"SA2": "int64"}).set_index("SA2")

print(foi_gdf.dtypes)
print(foi_gdf["FTYPE"].unique())
print(foi_gdf.shape)
print(foi_gdf.head())

FTYPE           object
NAME_LABEL      object
geometry      geometry
dtype: object
['control point' 'sign' 'recreational resource' 'admin facility'
 'landmark' 'community venue' 'communication service' 'health facility'
 'education centre' 'place of worship' 'place' 'community space'
 'dumping ground' 'sport facility' 'storage facility' 'excavation site'
 'emergency facility' 'cultural centre' 'commercial facility'
 'pipeline facility' 'care facility' 'power facility' 'cableway'
 'hospital' 'defence site']
(21834, 3)
                          FTYPE  NAME_LABEL                          geometry
UFI                                                                          
32592012          control point   7022-2301  MULTIPOINT (140.84663 -37.94721)
64946096                   sign      HVP002  MULTIPOINT (145.99438 -37.82302)
64946097                   sign      HVP017  MULTIPOINT (146.53764 -38.10037)
59765887  recreational resource  Lloyds Hut  MULTIPOINT (146.50104 -37.75399)
32590719 

In [24]:
def _count_ftype_per_sa2(joined_gdf, sa2_gdf, ftype, count_col):
    """
    Count the joined features of one FTYPE per SA2.

    joined_gdf: result of the spatial join (indexed by SA2, with FTYPE and index_left columns)
    sa2_gdf: boundary frame indexed by SA2; every one of its rows is kept in the result
    ftype: FTYPE value to count
    count_col: name of the resulting integer count column (0 where nothing matched)
    """
    selected = joined_gdf.loc[joined_gdf["FTYPE"] == ftype]
    per_sa2 = selected.groupby(["SA2", "FTYPE"])["index_left"].count().reset_index()
    per_sa2 = per_sa2.set_index("SA2")
    per_sa2 = (
        per_sa2.join(sa2_gdf, how="right")
        .drop(["FTYPE"], axis=1)
        .rename(columns={"index_left": count_col})
    )
    per_sa2[count_col] = per_sa2[count_col].fillna(0).astype({count_col: int}).sort_index()
    return per_sa2


# attach every feature of interest to the SA2 polygon(s) it falls in, keeping all polygons
join_gdf = gpd.sjoin(foi_gdf, boundary_gdf, how="right")

## number of recreational resources / commercial facilities in every SA2 district
recr_gdf = _count_ftype_per_sa2(join_gdf, boundary_gdf, "recreational resource", "recr_count")
comm_gdf = _count_ftype_per_sa2(join_gdf, boundary_gdf, "commercial facility", "comm_count")

## combine both counts into one table (geometry left out) and save it
foi_df = pd.DataFrame(recr_gdf.drop(columns = "geometry"))
foi_df["comm_count"] = comm_gdf["comm_count"]
foi_df.to_csv("../data/raw/FOI/foi_count_by_sa2.csv")

print(foi_df.shape)
print(foi_df.dtypes)
print(foi_df.head())

(524, 2)
recr_count    int64
comm_count    int64
dtype: object
           recr_count  comm_count
SA2                              
201011001           0           0
201011002           0           0
201011005           0           0
201011006           0           0
201011007           0           0


By Junhua Liu for study use only