In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
import os 
import geopandas as gpd

# 00 Setup
def findfiles(filefolderpath, filetype='.csv', recursive=True):
    """
    Collect the paths of files under a folder whose names end with a given suffix.

    Args:
        filefolderpath (str): Folder to search.
        filetype (str, optional): Suffix a file name must end with. Defaults to '.csv'.
        recursive (bool, optional): When True (default) every sub-folder is
            searched as well; when False only files directly inside
            ``filefolderpath`` are considered.

    Returns:
        list: Paths (folder joined with file name) of every matching file.
    """
    if not recursive:
        # Top level only: pair each entry name with its joined path, then keep
        # regular files whose name carries the requested suffix.
        entries = [
            (name, os.path.join(filefolderpath, name))
            for name in os.listdir(filefolderpath)
        ]
        return [
            path
            for name, path in entries
            if os.path.isfile(path) and name.endswith(filetype)
        ]

    # Recursive: os.walk yields the file names of every folder in the tree.
    return [
        os.path.join(dirpath, name)
        for dirpath, _dirnames, names in os.walk(filefolderpath)
        for name in names
        if name.endswith(filetype)
    ]

# 01 讀取TDX資料
def read_bus_stop_of_route_xml(xml_path: str) -> pd.DataFrame:
    """
    Read a TDX bus stop-sequence XML file (BusStopOfRoute) into a pandas DataFrame.

    Each row is one <Stop>, carrying the route / sub-route / operator fields of
    the <BusStopOfRoute> element that contains it.

    Args:
        xml_path (str): Path of the XML file to parse.

    Returns:
        pd.DataFrame: One row per stop. The full set of columns is always
        present, even when the file contains no stops. Values are the element
        text (None when an element is missing). StopSequence, PositionLon and
        PositionLat are converted to numeric when every value in the column
        parses; otherwise that column is left as text.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        OSError: If the file cannot be opened.
    """
    root = ET.parse(xml_path).getroot()

    # Take the namespace from the root tag ("{uri}Name") instead of hard-coding
    # it; fall back to the known schema URI when the root carries none.
    if root.tag.startswith("{"):
        uri = root.tag[1:].partition("}")[0]
    else:
        uri = "https://ptx.transportdata.tw/standard/schema/"
    ns = {"ns": uri}

    def gettext(elem, path):
        """Return the text of the first element matching ``path``, or None."""
        if elem is None:
            return None
        child = elem.find(path, ns)
        return child.text if child is not None else None

    # Output column -> path relative to <BusStopOfRoute>.
    # Element.find returns the first match, so when several <Operator>
    # elements exist only the first one is captured.
    route_paths = {
        "RouteUID":          "ns:RouteUID",
        "RouteID":           "ns:RouteID",
        "RouteName_Zh":      "ns:RouteName/ns:Zh_tw",
        "RouteName_En":      "ns:RouteName/ns:En",
        "SubRouteUID":       "ns:SubRouteUID",
        "SubRouteID":        "ns:SubRouteID",
        "SubRouteName_Zh":   "ns:SubRouteName/ns:Zh_tw",
        "SubRouteName_En":   "ns:SubRouteName/ns:En",
        "Direction":         "ns:Direction",
        "City":              "ns:City",
        "CityCode":          "ns:CityCode",
        "OperatorID":        "ns:Operators/ns:Operator/ns:OperatorID",
        "OperatorName_Zh":   "ns:Operators/ns:Operator/ns:OperatorName/ns:Zh_tw",
        "OperatorNo":        "ns:Operators/ns:Operator/ns:OperatorNo",
    }

    # Output column -> path relative to <Stop>.
    stop_paths = {
        "StopUID":          "ns:StopUID",
        "StopID":           "ns:StopID",
        "StopName_Zh":      "ns:StopName/ns:Zh_tw",
        "StopName_En":      "ns:StopName/ns:En",
        "StopBoarding":     "ns:StopBoarding",
        "StopSequence":     "ns:StopSequence",
        "PositionLon":      "ns:StopPosition/ns:PositionLon",
        "PositionLat":      "ns:StopPosition/ns:PositionLat",
        "GeoHash":          "ns:StopPosition/ns:GeoHash",
        "StationID":        "ns:StationID",
        "StationGroupID":   "ns:StationGroupID",
        "LocationCityCode": "ns:LocationCityCode",
    }

    rows = []

    # Each <BusStopOfRoute> is one route + direction.
    for bsr in root.findall("ns:BusStopOfRoute", ns):
        base = {col: gettext(bsr, path) for col, path in route_paths.items()}

        # One output row per <Stop> underneath it.
        for stop in bsr.findall("ns:Stops/ns:Stop", ns):
            row = dict(base)
            row.update({col: gettext(stop, path) for col, path in stop_paths.items()})
            rows.append(row)

    # Passing the columns explicitly keeps the schema stable when no stop was
    # found, so the frame still round-trips through CSV with a header.
    df = pd.DataFrame(rows, columns=[*route_paths, *stop_paths])

    # Convert numeric columns. `errors="ignore"` is deprecated in pandas, so
    # the same "leave the column untouched if it cannot be parsed" behaviour
    # is spelled out by catching the conversion error.
    for col in ("StopSequence", "PositionLon", "PositionLat"):
        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            pass

    return df

def read_bus_shape_of_route_xml(xml_path: str) -> pd.DataFrame:
    """
    Read a TDX bus route-shape XML file (BusShape) into a pandas DataFrame.

    Args:
        xml_path (str): Path of the XML file to parse.

    Returns:
        pd.DataFrame: One row per <BusShape> element, with the columns
        Geometry, EncodedPolyline, RouteUID, RouteID, RouteName_Zh,
        RouteName_En, SubRouteUID, SubRouteID, SubRouteName_Zh,
        SubRouteName_En, Direction, UpdateTime and VersionID. All values are
        the element text; missing elements give None. The columns are present
        even when the file holds no <BusShape>.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        OSError: If the file cannot be opened.
    """
    root = ET.parse(xml_path).getroot()

    # Take the namespace from the root tag ("{uri}Name") so the reader keeps
    # working when the schema URI differs; fall back to the known URI when the
    # root carries no namespace.
    if root.tag.startswith("{"):
        uri = root.tag[1:].partition("}")[0]
    else:
        uri = "https://ptx.transportdata.tw/standard/schema/"
    ns = {"ns": uri}

    def nested_text(elem, path):
        """Return the text of the first element matching ``path``, or None (single lookup)."""
        child = elem.find(path, ns)
        return child.text if child is not None else None

    columns = [
        "Geometry", "EncodedPolyline",
        "RouteUID", "RouteID", "RouteName_Zh", "RouteName_En",
        "SubRouteUID", "SubRouteID", "SubRouteName_Zh", "SubRouteName_En",
        "Direction", "UpdateTime", "VersionID",
    ]

    records = []

    # Each <BusShape> is one record.
    # Direct children keep using findtext and the localized names keep using
    # find(...).text, exactly as before, so the output values are unchanged
    # (findtext gives "" for an empty element, .text gives None).
    for bus in root.findall("ns:BusShape", ns):
        records.append({
            "Geometry":        bus.findtext("ns:Geometry", namespaces=ns),
            "EncodedPolyline": bus.findtext("ns:EncodedPolyline", namespaces=ns),
            "RouteUID":        bus.findtext("ns:RouteUID", namespaces=ns),
            "RouteID":         bus.findtext("ns:RouteID", namespaces=ns),
            "RouteName_Zh":    nested_text(bus, "ns:RouteName/ns:Zh_tw"),
            "RouteName_En":    nested_text(bus, "ns:RouteName/ns:En"),
            "SubRouteUID":     bus.findtext("ns:SubRouteUID", namespaces=ns),
            "SubRouteID":      bus.findtext("ns:SubRouteID", namespaces=ns),
            "SubRouteName_Zh": nested_text(bus, "ns:SubRouteName/ns:Zh_tw"),
            "SubRouteName_En": nested_text(bus, "ns:SubRouteName/ns:En"),
            "Direction":       bus.findtext("ns:Direction", namespaces=ns),
            "UpdateTime":      bus.findtext("ns:UpdateTime", namespaces=ns),
            "VersionID":       bus.findtext("ns:VersionID", namespaces=ns),
        })

    # Explicit columns keep the schema stable when no <BusShape> was found.
    return pd.DataFrame(records, columns=columns)

In [None]:
# 01-01 Read the stop-sequence XML files

# Convert every stop-sequence XML in the folder to a CSV saved next to it.
busstopseq_folder = os.path.join(os.getcwd(), '..', "00_TDX資料下載", "01公車站序資料")
xml_files = findfiles(busstopseq_folder, filetype='.xml', recursive=False)

for xmlfile in xml_files:
    df = read_bus_stop_of_route_xml(xmlfile)
    # Swap only the trailing extension: str.replace('.xml', '.csv') would also
    # rewrite any ".xml" that happens to appear earlier in the path.
    csv_path = os.path.splitext(xmlfile)[0] + '.csv'
    df.to_csv(csv_path, index=False, encoding='utf-8-sig')

# Merge all CSVs found directly in the folder into one frame.
# NOTE(review): this picks up every .csv in the folder, not only the ones
# written by the loop above — confirm the folder holds nothing else.
csv_files = findfiles(busstopseq_folder, filetype='.csv', recursive=False)
all_dfs = [pd.read_csv(f) for f in csv_files]
df_seq = pd.concat(all_dfs, ignore_index=True)

In [None]:
# Resolve the stop-sequence folder relative to the working directory and keep
# the first XML found there as a sample file to inspect.
busstopseq_folder = os.path.join(os.getcwd(), '..', "00_TDX資料下載", "01公車站序資料")
xml_files = findfiles(
    filefolderpath=busstopseq_folder,
    filetype='.xml',
    recursive=False,
)
xmlfile = xml_files[0]

In [None]:
# Show the currently selected XML path as the cell output.
xmlfile

In [None]:
# 02 Load the route shapefile and the stop-sequence CSV, the inputs for
# checking whether every route has matching stop-sequence data.
# NOTE(review): both paths are absolute paths on a local D: drive.
route_shp_path = r"D:\B-Project\2025\6800\Technical\12票證資料\其他分析資料\市區公車路線資料\新北市公車路線\B1F0230003V089.shp"
stop_seq_csv_path = r'D:\B-Project\2025\6800\Technical\12票證資料\TicketAnalysis\00_TDX資料下載\01公車站序資料\公車站序資料_新北市_2025-11-21.csv'

route_gdf = gpd.read_file(route_shp_path)
df_seq = pd.read_csv(stop_seq_csv_path)


In [None]:
# 01-02 Read the route XML data
# Resolve the route-data folder relative to the working directory and keep the
# first XML found there as the file to read.
busroute_folder = os.path.join(os.getcwd(), '..', "00_TDX資料下載", "02公車路線資料")
xml_files = findfiles(
    filefolderpath=busroute_folder,
    filetype='.xml',
    recursive=False,
)
xmlfile = xml_files[0]

In [None]:
# Show the currently selected XML path as the cell output.
xmlfile

In [None]:
# `xmlfile` is an XML path (selected with filetype='.xml'), so it must go
# through the BusShape XML reader rather than pd.read_csv, which would try to
# parse the markup as comma-separated text. Only the first rows are shown.
read_bus_shape_of_route_xml(xmlfile).head()