In [1]:
import pandas as pd
import requests
import zipfile
from pyspark.sql import SparkSession
import os
import geopandas as gpd
import folium
from folium.plugins import HeatMap
from pyspark.sql import SparkSession
from shapely.geometry import Point


In [2]:

output_relative_dir = '../../data/landing/PTV/'
output_absolute_dir = '../../data/raw/PTV/'

# Make sure the landing and raw PTV directories exist, reporting for each one
# whether it had to be created or was already present.
for target_dir in (output_relative_dir, output_absolute_dir):
    if os.path.exists(target_dir):
        print(f"Directory {target_dir} already exists, skipping creation.")
        continue
    os.makedirs(target_dir)
    print(f"Directory {target_dir} created.")

Directory ../../data/landing/PTV/ created.
Directory ../../data/raw/PTV/ already exists, skipping creation.


In [3]:
url = "https://data.ptv.vic.gov.au/downloads/gtfs.zip"
download_path = "../../data/landing/PTV/gtfs.zip"
extract_to_path = "../../data/landing/PTV/"

os.makedirs(extract_to_path, exist_ok=True)

# The presence of `download_path` is used as the "already done" marker, so the
# file must only ever appear at that path once it has been fully downloaded.
if not os.path.exists(download_path):
    print("Downloading file...")
    partial_path = download_path + ".part"
    try:
        # Stream the archive to disk instead of buffering the whole body in
        # memory; the timeout is (connect, read) in seconds.
        with requests.get(url, stream=True, timeout=(10, 120)) as response:
            # Fail loudly on HTTP errors rather than saving an error page as a zip.
            response.raise_for_status()
            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)
        # Only a complete download is promoted to the final name.
        os.replace(partial_path, download_path)
    finally:
        # Clean up a half-written file left behind by a failed download.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print("Extracting file...")
    try:
        with zipfile.ZipFile(download_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to_path)
    except zipfile.BadZipFile:
        # Remove the corrupt archive so the next run downloads it again
        # instead of skipping because the file already exists.
        os.remove(download_path)
        raise

    print("File downloaded and extracted successfully.")
else:
    print("Zip file already exists, skipping download and extraction.")

Downloading file...
Extracting file...
File downloaded and extracted successfully.


In [4]:
spark = SparkSession.builder \
    .appName("Convert stops.txt to Parquet") \
    .getOrCreate()

# Base directories: where the GTFS feed was unpacked, and where the
# per-mode Parquet outputs are written.
ptv_dir = '../../data/landing/PTV/'
output_dir = '../../data/raw/PTV/Un_preprocess_PTV/'

# Make sure the output directory exists.
os.makedirs(output_dir, exist_ok=True)

# Mapping from feed sub-folder name to the output Parquet file name.
folders_to_process = {
    "1": "1 - Regional Train",
    "2": "2 - Metropolitan Train",
    "3": "3 - Metropolitan Tram",
    "4": "4 - Metropolitan Bus",
    "5": "5 - Regional Coach",
    "6": "6 - Regional Bus"
}

try:
    # Process each feed folder in turn; a problem with one folder is reported
    # and skipped so the remaining folders are still converted.
    for folder, parquet_name in folders_to_process.items():
        folder_path = os.path.join(ptv_dir, folder)

        # A missing folder would make os.listdir raise; report and move on.
        if not os.path.isdir(folder_path):
            print(f"Folder {folder_path} does not exist, skipping.")
            continue

        # Locate the zip archive(s); sort so the chosen archive does not
        # depend on the arbitrary order returned by os.listdir.
        zip_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.zip'))
        if not zip_files:
            print(f"No zip file found in {folder_path}")
            continue

        zip_file_path = os.path.join(folder_path, zip_files[0])

        # Extract only stops.txt from the archive.
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            if 'stops.txt' not in zip_ref.namelist():
                # Skip entirely: any stops.txt already on disk did not come
                # from this archive and must not be converted as if it had.
                print(f"stops.txt not found in {zip_file_path}")
                continue
            zip_ref.extract('stops.txt', folder_path)
            print(f"Extracted stops.txt from {zip_file_path}")

        # Read the freshly extracted stops.txt with PySpark (first row is the
        # header; column types are inferred).
        stops_txt_path = os.path.join(folder_path, 'stops.txt')
        df = spark.read.csv(stops_txt_path, header=True, inferSchema=True)

        # Write it out as Parquet, replacing any previous output.
        parquet_file_path = os.path.join(output_dir, f'{parquet_name}.parquet')
        df.write.parquet(parquet_file_path, mode='overwrite')
        print(f"Converted {stops_txt_path} to {parquet_file_path}")
finally:
    # Always shut the SparkSession down, even if a folder failed part-way.
    spark.stop()

your 131072x1 screen size is bogus. expect trouble
24/09/01 09:47:27 WARN Utils: Your hostname, LAPTOP-1H9MAQ2V resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/09/01 09:47:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/01 09:47:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Extracted stops.txt from ../../data/landing/PTV/1/google_transit.zip


                                                                                

Converted ../../data/landing/PTV/1/stops.txt to ../../data/raw/PTV/Un_preprocess_PTV/1 - Regional Train.parquet
Extracted stops.txt from ../../data/landing/PTV/2/google_transit.zip
Converted ../../data/landing/PTV/2/stops.txt to ../../data/raw/PTV/Un_preprocess_PTV/2 - Metropolitan Train.parquet
Extracted stops.txt from ../../data/landing/PTV/3/google_transit.zip
Converted ../../data/landing/PTV/3/stops.txt to ../../data/raw/PTV/Un_preprocess_PTV/3 - Metropolitan Tram.parquet
Extracted stops.txt from ../../data/landing/PTV/4/google_transit.zip
Converted ../../data/landing/PTV/4/stops.txt to ../../data/raw/PTV/Un_preprocess_PTV/4 - Metropolitan Bus.parquet
Extracted stops.txt from ../../data/landing/PTV/5/google_transit.zip
Converted ../../data/landing/PTV/5/stops.txt to ../../data/raw/PTV/Un_preprocess_PTV/5 - Regional Coach.parquet
Extracted stops.txt from ../../data/landing/PTV/6/google_transit.zip
Converted ../../data/landing/PTV/6/stops.txt to ../../data/raw/PTV/Un_preprocess_PTV/6

In [5]:
# 1. Obtain a SparkSession
spark = SparkSession.builder.appName("Geometric Point Map with Folium").getOrCreate()

# 2. Load the Parquet data for one transport mode
parquet_path = '../../data/raw/PTV/Un_preprocess_PTV/1 - Regional Train.parquet'
stops_df = spark.read.parquet(parquet_path)

# 3. Keep only the latitude/longitude columns and bring them into pandas
stops_pd = stops_df.select("stop_lat", "stop_lon").toPandas()

# 4. Build point geometries (x = longitude, y = latitude) and wrap the frame
#    in a GeoDataFrame
# 5. ... tagged with the EPSG:4326 coordinate reference system
geometry = list(map(Point, zip(stops_pd['stop_lon'], stops_pd['stop_lat'])))
gdf = gpd.GeoDataFrame(stops_pd, geometry=geometry).set_crs(epsg=4326)

# 6. Create the Folium map, centred on the mean stop position
map_center = [gdf['stop_lat'].mean(), gdf['stop_lon'].mean()]
m = folium.Map(location=map_center, zoom_start=12)

# 7. Draw every stop as a fixed-size CircleMarker
marker_style = dict(
    radius=3,           # marker radius in pixels
    color='blue',       # outline colour
    fill=True,
    fill_color='blue',  # fill colour
    fill_opacity=0.7,
)
for lat, lon in zip(gdf['stop_lat'], gdf['stop_lon']):
    folium.CircleMarker(location=[lat, lon], **marker_style).add_to(m)

# 8. Optionally save the map to an HTML file (disabled):
# map_path = ''
# m.save(map_path)
# print(f"Map has been generated and saved to {map_path}. Open it in a browser to view.")

# Display the map inline as the cell output.
m

In [6]:
from pyspark.sql import SparkSession
import os

def check_parquet_features(directory):
    """Check whether every ``.parquet`` file under ``directory`` has the same columns.

    The directory tree is walked recursively; every file whose name ends in
    ``.parquet`` is read with Spark and its set of column names is compared
    against the first file read (the baseline). Files are visited in sorted
    order so the baseline is deterministic.

    Args:
        directory: Root directory to search for ``.parquet`` files.

    Returns:
        ``(True, columns)`` when all files agree, where ``columns`` is the
        sorted list of column names (empty if no files were found); or
        ``(False, (baseline_columns, differing_columns))`` as two sets for
        the first file whose columns differ from the baseline.

    Note:
        The SparkSession obtained via ``getOrCreate()`` is stopped before this
        function returns or raises, so callers holding a session must obtain
        a new one afterwards.
    """
    spark = SparkSession.builder.appName("Check Parquet Features").getOrCreate()

    try:
        # Collect every .parquet file below `directory` in a stable order.
        parquet_files = []
        for root, dirs, files in os.walk(directory):
            dirs.sort()  # in-place sort fixes the order os.walk descends in
            for file in sorted(files):
                if file.endswith(".parquet"):
                    parquet_files.append(os.path.join(root, file))

        # None means "no baseline yet"; an empty set is a valid baseline
        # (a file with no columns) and must not be mistaken for "unset".
        baseline_columns = None
        for file_path in parquet_files:
            columns = set(spark.read.parquet(file_path).columns)

            if baseline_columns is None:
                baseline_columns = columns
            elif columns != baseline_columns:
                return False, (baseline_columns, columns)

        # Sorted so the reported column order is reproducible between runs.
        return True, sorted(baseline_columns or ())
    finally:
        # Release the session on every exit path, including read failures.
        spark.stop()

# Directory holding the .parquet outputs (relative path)
directory = '../../data/raw/PTV/Un_preprocess_PTV/'

# Run the consistency check and report the outcome
features_consistent, columns_info = check_parquet_features(directory)

if not features_consistent:
    # On a mismatch the second element is a (baseline, differing) pair.
    base_columns, different_columns = columns_info
    print("Files have different features:")
    print("Base columns:", base_columns)
    print("Different columns:", different_columns)
else:
    print("All files have consistent features:")
    print(columns_info)

24/09/01 09:47:40 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


All files have consistent features:
['stop_name', 'stop_lat', 'stop_id', 'stop_lon']
