In [1]:
import os

domain_data_dir = '../../data/landing/other_data'
os.makedirs(domain_data_dir, exist_ok=True)  # Create the folder if it doesn't exist

In [2]:
import requests
import zipfile
import io
import os
import pandas as pd

url = 'https://data.ptv.vic.gov.au/downloads/gtfs.zip'
response = requests.get(url)

with zipfile.ZipFile(io.BytesIO(response.content)) as the_zip:
    the_zip.extractall('../../data/landing/other_data/stops_datavic')

extracted_folder_path = '../../data/landing/other_data/stops_datavic'
for root, dirs, files in os.walk(extracted_folder_path):
    for file in files:
        if file.endswith('.zip'):
            file_path = os.path.join(root, file)
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                zip_ref.extractall(root)

stops_files = []
for root, dirs, files in os.walk(extracted_folder_path):
    for file in files:
        if file == 'stops.txt':
            file_path = os.path.join(root, file)
            df = pd.read_csv(file_path)
            stops_files.append(df)

combined_stops_df = pd.concat(stops_files, ignore_index=True)

stops_data_dir = '../../data/raw/stops_data'
os.makedirs(stops_data_dir, exist_ok=True)  # Create the folder if it doesn't exist

output_csv_path = os.path.join(stops_data_dir, 'stops_datavic.csv')
combined_stops_df.to_csv(output_csv_path, index=False)


In [3]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import re

def process_stops_csv(input_csv, output_csv, regions_geojson):

    df = pd.read_csv(input_csv)
    
    before_count = len(df)
    df = df.drop_duplicates(subset=['stop_id'])
    after_count = len(df)
    deleted_count = before_count - after_count
    
    print(f"Number of rows deleted due to duplicate stop_id: {deleted_count}")
    
    geometry = [Point(xy) for xy in zip(df['stop_lon'], df['stop_lat'])]
    stops_gdf = gpd.GeoDataFrame(df, geometry=geometry)
    
    regions_gdf = gpd.read_file(regions_geojson)
    
    stops_in_regions = gpd.sjoin(stops_gdf, regions_gdf, how='left', predicate='within')
    
    stops_in_regions['region'] = stops_in_regions['SA2_NAME21']  
    
    stops_in_regions = stops_in_regions.drop(columns=['index_right', 'geometry'])
    
    def determine_stop_type(stop_name):
        if 'railway station' in stop_name.lower():
            return 'train station'
        elif re.match(r'^\d', stop_name):
            return 'tram station'
        else:
            return 'bus station'
    
    stops_in_regions['stop_type'] = stops_in_regions['stop_name'].apply(determine_stop_type)
    
    station_counts = stops_in_regions['stop_type'].value_counts()
    
    print(f"Train station count: {station_counts.get('train station', 0)}")
    print(f"Tram station count: {station_counts.get('tram station', 0)}")
    print(f"Bus station count: {station_counts.get('bus station', 0)}")
    
    stops_in_regions.to_csv(output_csv, index=False)
    
    return deleted_count, station_counts

input_csv = '../../data/raw/stops_data/stops_datavic.csv'
output_csv = '../../data/raw/stops_data/stops_datavic_mapped.csv'
regions_geojson = '../../data/landing/region_data/sa2_dataset/sa2_unzip'

deleted_rows, station_counts = process_stops_csv(input_csv, output_csv, regions_geojson)


Number of rows deleted due to duplicate stop_id: 507


Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:7844

  stops_in_regions = gpd.sjoin(stops_gdf, regions_gdf, how='left', predicate='within')


Train station count: 559
Tram station count: 2020
Bus station count: 25237
