In [1]:
import pandas as pd
import requests
import zipfile
from pyspark.sql import SparkSession
import os
import geopandas as gpd
import folium
from folium.plugins import HeatMap
from pyspark.sql import SparkSession
from shapely.geometry import Point


In [2]:

output_relative_dir = '../../data/landing/PTV/'
output_absolute_dir = '../../data/raw/PTV/'


if not os.path.exists(output_relative_dir):
    os.makedirs(output_relative_dir)
    print(f"Directory {output_relative_dir} created.")
else:
    print(f"Directory {output_relative_dir} already exists, skipping creation.")

if not os.path.exists(output_absolute_dir):
    os.makedirs(output_absolute_dir)
    print(f"Directory {output_absolute_dir} created.")
else:
    print(f"Directory {output_absolute_dir} already exists, skipping creation.")

Directory ../../data/landing/PTV/ already exists, skipping creation.
Directory ../../data/raw/PTV/ already exists, skipping creation.


In [3]:
url = "https://data.ptv.vic.gov.au/downloads/gtfs.zip"
download_path = "../../data/landing/PTV/gtfs.zip"
extract_to_path = "../../data/landing/PTV/"

os.makedirs(extract_to_path, exist_ok=True)

if not os.path.exists(download_path):
    print("Downloading file...")
    response = requests.get(url)
    with open(download_path, 'wb') as file:
        file.write(response.content)

    print("Extracting file...")
    with zipfile.ZipFile(download_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to_path)

    print("File downloaded and extracted successfully.")
else:
    print("Zip file already exists, skipping download and extraction.")

Zip file already exists, skipping download and extraction.


In [4]:
# Create SparkSession
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()
)

# Define base directory paths
ptv_dir = '../../data/landing/PTV/'
output_dir = '../../data/raw/PTV/un_preprocess/'

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Mapping of folders to corresponding output parquet filenames
folders_to_process = {
    "1": "Regional Train",
    "2": "Metropolitan Train",
    "3": "Metropolitan Tram",
    "4": "Metropolitan Bus",
    "5": "Regional Coach",
    "6": "Regional Bus"
}

# Loop through each folder to process
for folder, parquet_name in folders_to_process.items():
    folder_path = os.path.join(ptv_dir, folder)
    
    # Find zip files in the folder
    zip_files = [f for f in os.listdir(folder_path) if f.endswith('.zip')]
    
    if zip_files:
        zip_file_path = os.path.join(folder_path, zip_files[0])
        
        # Extract stops.txt from the zip file
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            if 'stops.txt' in zip_ref.namelist():
                zip_ref.extract('stops.txt', folder_path)
                print(f"Extracted stops.txt from {zip_file_path}")
            else:
                print(f"stops.txt not found in {zip_file_path}")
        
        # Read the extracted stops.txt file
        stops_txt_path = os.path.join(folder_path, 'stops.txt')
        if os.path.exists(stops_txt_path):
            # Use PySpark to read the stops.txt file
            df = spark.read.csv(stops_txt_path, header=True, inferSchema=True)
            
            # Define the path for the output parquet file
            parquet_file_path = os.path.join(output_dir, f'{parquet_name}.parquet')
            
            # Convert the DataFrame to parquet and save it
            df.write.parquet(parquet_file_path, mode='overwrite')
            print(f"Converted {stops_txt_path} to {parquet_file_path}")
        else:
            print(f"stops.txt not found in {folder_path}")
    else:
        print(f"No zip file found in {folder_path}")

24/09/12 18:47:34 WARN Utils: Your hostname, yoga resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/09/12 18:47:34 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/12 18:47:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Extracted stops.txt from ../../data/landing/PTV/1/google_transit.zip
Converted ../../data/landing/PTV/1/stops.txt to ../../data/raw/PTV/un_preprocess/Regional Train.parquet
Extracted stops.txt from ../../data/landing/PTV/2/google_transit.zip
Converted ../../data/landing/PTV/2/stops.txt to ../../data/raw/PTV/un_preprocess/Metropolitan Train.parquet
Extracted stops.txt from ../../data/landing/PTV/3/google_transit.zip
Converted ../../data/landing/PTV/3/stops.txt to ../../data/raw/PTV/un_preprocess/Metropolitan Tram.parquet
Extracted stops.txt from ../../data/landing/PTV/4/google_transit.zip
Converted ../../data/landing/PTV/4/stops.txt to ../../data/raw/PTV/un_preprocess/Metropolitan Bus.parquet
Extracted stops.txt from ../../data/landing/PTV/5/google_transit.zip
Converted ../../data/landing/PTV/5/stops.txt to ../../data/raw/PTV/un_preprocess/Regional Coach.parquet
Extracted stops.txt from ../../data/landing/PTV/6/google_transit.zip
Converted ../../data/landing/PTV/6/stops.txt to ../../dat

In [5]:
def check_parquet_features(directory):
    columns_set = set()
    parquet_files = []

    # Traverse the directory to find all .parquet files
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".parquet"):
                file_path = os.path.join(root, file)
                parquet_files.append(file_path)

    # Iterate over all .parquet files to get column names
    for file_path in parquet_files:
        df = spark.read.parquet(file_path)
        columns = set(df.columns)
        
        # Set the columns of the first file as the baseline
        if not columns_set:
            columns_set = columns
        else:
            # check if the columns of the current file are inconsistent with the baseline
            if columns != columns_set:
                spark.stop()
                return False, (columns_set, columns)

    spark.stop()
    return True, list(columns_set)

# Specify the directory containing .parquet files (relative path)
directory = '../../data/raw/PTV/Un_preprocess/'

# Test if the features of the files are consistent
features_consistent, columns_info = check_parquet_features(directory)

if features_consistent:
    print("All files have consistent features:")
    print(columns_info)
else:
    print("Files have different features:")
    print("Base columns:", columns_info[0])
    print("Different columns:", columns_info[1])

All files have consistent features:
[]


In [6]:
spark.stop()