In [60]:
import xml.etree.ElementTree as ET
import pandas as pd
import geopandas as gpd
import os 
from shapely import wkt # for WKT 轉幾何物件
from shapely.geometry import LineString, Point, MultiLineString
from shapely.ops import substring, linemerge

# 00 Setup
def create_folder(folder_name):
    """建立資料夾"""
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    return os.path.abspath(folder_name)

def findfiles(filefolderpath, filetype='.csv', recursive=True):
    """
    尋找指定路徑下指定類型的檔案，並返回檔案路徑列表。

    Args:
        filefolderpath (str): 指定的檔案路徑。
        filetype (str, optional): 要尋找的檔案類型，預設為 '.csv'。
        recursive (bool, optional): 是否檢索所有子資料夾，預設為 True；反之為False，僅查找當前資料夾的所有file。

    Returns:
        list: 包含所有符合條件的檔案路徑的列表。
    """
    filelist = []

    if recursive:
        # 遍歷資料夾及其子資料夾
        for root, _, files in os.walk(filefolderpath):
            for file in files:
                if file.endswith(filetype):
                    file_path = os.path.join(root, file)
                    filelist.append(file_path)
    else:
        # 僅檢索當前資料夾
        for file in os.listdir(filefolderpath):
            file_path = os.path.join(filefolderpath, file)
            if os.path.isfile(file_path) and file.endswith(filetype):
                filelist.append(file_path)

    return filelist

def read_combined_dataframe(file_list, filepath = True):
    dataframes = []
    
    for file in file_list:
        try:
            if file.endswith('.csv'):
                df = pd.read_csv(file)
            elif file.endswith('.shp'):
                df = gpd.read_file(file)
            elif file.endswith(('.xls', '.xlsx')):
                df = pd.read_excel(file)
            else:
                print(f"Unsupported file format: {file}")
                continue
            if filepath:
                df['FilePath'] = file  # 添加來源檔案路徑欄位
            dataframes.append(df)
        except Exception as e:
            print(f"Error reading {file}: {e}")

    # 合併所有 DataFrame
    combined_df = pd.concat(dataframes, ignore_index=True)
    return combined_df

def dataframe_to_point(df, lon_col, lat_col, crs="EPSG:4326", target_crs="EPSG:3826"):
    '''
    Parameters:
    df (dataframe) : 含經緯度座標欄位的dataframe
    lon_col (str) : 緯度欄位
    Lat_col (str) : 經度欄位
    crs (str) : 目前經緯度座標的座標系統，常用的為4326(WGS84)、3826(TWD97)
    target_crs：目標轉換的座標系統
    '''

    # from shapely.geometry import Point
    # import pandas as pd
    # import geopandas as gpd
    # Create Point geometries from the longitude and latitude columns
    geometry = [Point(xy) for xy in zip(df[lon_col], df[lat_col])]
    # Create a GeoDataFrame with the original CRS
    gdf = gpd.GeoDataFrame(df, geometry=geometry, crs=crs)
    # Convert the GeoDataFrame to the target CRS
    gdf = gdf.to_crs(epsg=target_crs.split(":")[1])
    return gdf

# 01 讀取TDX資料
def read_bus_stop_of_route_xml(xml_path: str) -> pd.DataFrame:
    """
    讀取 TDX 公車站序 XML（BusStopOfRoute），回傳整理好的 pandas DataFrame。
    
    每一列 = 一個站牌（Stop），同時附上路線 / 營運業者資訊。
    """
    tree = ET.parse(xml_path)
    root = tree.getroot()

    # 自動從 root 解析出 namespace（避免寫死）
    if root.tag.startswith("{"):
        uri = root.tag.split("}")[0].strip("{")
    else:
        uri = "https://ptx.transportdata.tw/standard/schema/"
    ns = {"ns": uri}

    def gettext(elem, path):
        """安全取 text，找不到就回 None"""
        if elem is None:
            return None
        child = elem.find(path, ns)
        return child.text if child is not None else None

    rows = []

    # 每一個 <BusStopOfRoute> 代表一條路線 + 方向
    for bsr in root.findall("ns:BusStopOfRoute", ns):

        # 路線共同欄位
        base = {
            "RouteUID":          gettext(bsr, "ns:RouteUID"),
            "RouteID":           gettext(bsr, "ns:RouteID"),
            "RouteName_Zh":      gettext(bsr, "ns:RouteName/ns:Zh_tw"),
            "RouteName_En":      gettext(bsr, "ns:RouteName/ns:En"),
            "SubRouteUID":       gettext(bsr, "ns:SubRouteUID"),
            "SubRouteID":        gettext(bsr, "ns:SubRouteID"),
            "SubRouteName_Zh":   gettext(bsr, "ns:SubRouteName/ns:Zh_tw"),
            "SubRouteName_En":   gettext(bsr, "ns:SubRouteName/ns:En"),
            "Direction":         gettext(bsr, "ns:Direction"),
            "City":              gettext(bsr, "ns:City"),
            "CityCode":          gettext(bsr, "ns:CityCode"),
            "OperatorID":        gettext(bsr, "ns:Operators/ns:Operator/ns:OperatorID"),
            "OperatorName_Zh":   gettext(bsr, "ns:Operators/ns:Operator/ns:OperatorName/ns:Zh_tw"),
            "OperatorNo":        gettext(bsr, "ns:Operators/ns:Operator/ns:OperatorNo"),
        }

        # 底下所有 <Stop>
        for stop in bsr.findall("ns:Stops/ns:Stop", ns):
            row = base.copy()
            row.update({
                "StopUID":          gettext(stop, "ns:StopUID"),
                "StopID":           gettext(stop, "ns:StopID"),
                "StopName_Zh":      gettext(stop, "ns:StopName/ns:Zh_tw"),
                "StopName_En":      gettext(stop, "ns:StopName/ns:En"),
                "StopBoarding":     gettext(stop, "ns:StopBoarding"),
                "StopSequence":     gettext(stop, "ns:StopSequence"),
                "PositionLon":      gettext(stop, "ns:StopPosition/ns:PositionLon"),
                "PositionLat":      gettext(stop, "ns:StopPosition/ns:PositionLat"),
                "GeoHash":          gettext(stop, "ns:StopPosition/ns:GeoHash"),
                "StationID":        gettext(stop, "ns:StationID"),
                "StationGroupID":   gettext(stop, "ns:StationGroupID"),
                "LocationCityCode": gettext(stop, "ns:LocationCityCode"),
            })
            rows.append(row)

    df = pd.DataFrame(rows)

    # 可選：把數值欄位轉型（如果你需要的話）
    for col in ["StopSequence"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="ignore")
    for col in ["PositionLon", "PositionLat"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="ignore")

    return df

def read_bus_shape_of_route_xml(xml_path: str) -> pd.DataFrame:
    """
    讀取 TDX 公車路線 XML（BusShape），回傳整理好的 pandas DataFrame。
    """

    # 解析 XML
    tree = ET.parse(xml_path)
    root = tree.getroot()

    # 宣告 XML namespace（必須！）
    ns = {'ns': "https://ptx.transportdata.tw/standard/schema/"}

    records = []

    # 每一個 <BusShape> 就是一筆資料
    for bus in root.findall('ns:BusShape', ns):
        record = {
            "Geometry": bus.findtext('ns:Geometry', namespaces=ns),
            "EncodedPolyline": bus.findtext('ns:EncodedPolyline', namespaces=ns),
            "RouteUID": bus.findtext('ns:RouteUID', namespaces=ns),
            "RouteID": bus.findtext('ns:RouteID', namespaces=ns),
            "RouteName_Zh": bus.find('ns:RouteName/ns:Zh_tw', ns).text if bus.find('ns:RouteName/ns:Zh_tw', ns) is not None else None,
            "RouteName_En": bus.find('ns:RouteName/ns:En', ns).text if bus.find('ns:RouteName/ns:En', ns) is not None else None,
            "SubRouteUID": bus.findtext('ns:SubRouteUID', namespaces=ns),
            "SubRouteID": bus.findtext('ns:SubRouteID', namespaces=ns),
            "SubRouteName_Zh": bus.find('ns:SubRouteName/ns:Zh_tw', ns).text if bus.find('ns:SubRouteName/ns:Zh_tw', ns) is not None else None,
            "SubRouteName_En": bus.find('ns:SubRouteName/ns:En', ns).text if bus.find('ns:SubRouteName/ns:En', ns) is not None else None,
            "Direction": bus.findtext('ns:Direction', namespaces=ns),
            "UpdateTime": bus.findtext('ns:UpdateTime', namespaces=ns),
            "VersionID": bus.findtext('ns:VersionID', namespaces=ns),
        }
        records.append(record)

    # 轉成 DataFrame
    df = pd.DataFrame(records)

    return df

# 02 檢查路線
def compare_column_values(df_a, df_b, column, name_a='df_a', name_b='df_b'):
    # 轉 set 做比較
    set_a = set(df_a[column])
    set_b = set(df_b[column])

    only_in_a = set_a - set_b
    only_in_b = set_b - set_a
    in_both = set_a & set_b

    # 組成輸出用文字
    text = []
    text.append(f"只在 {name_a} 出現的 {column}：{len(only_in_a)}")
    text.append(f"只在 {name_b} 出現的 {column}：{len(only_in_b)}")
    text.append(f"兩邊都有的 {column}：{len(in_both)}")

    output_text = "\n".join(text)

    return output_text, only_in_a, only_in_b, in_both

# 03 拆分路線

def snap_points_to_line(
    stops_gdf, routes_gdf,
    route_id_col, route_direction_col,
    seq_id_col, seq_direction_col,
    seq_lat_col, seq_lng_col,
    route_geom_col="geometry"
    ):
    """
    不處理 CRS、不檢查任何欄位、不做 eps 修正。
    只負責把點投影到路線上並回寫 __m__。
    """

    snapped_points = []
    measures = []

    for _, stop in stops_gdf.iterrows():

        # 找對應路線
        matching = routes_gdf[
            (routes_gdf[route_id_col] == stop[seq_id_col]) &
            (routes_gdf[route_direction_col] == stop[seq_direction_col])
        ]

        if matching.empty:
            snapped_points.append(stop.geometry)
            measures.append(None)
            continue

        geom = matching.iloc[0][route_geom_col]

        # MultiLineString 盡量合併成單線
        try:
            line = linemerge(geom)
        except Exception:
            line = geom

        # 投影與插值
        m = line.project(stop.geometry)
        snapped = line.interpolate(m)

        snapped_points.append(snapped)
        measures.append(float(m))

    out = stops_gdf.copy()
    out["geometry"] = snapped_points
    out[seq_lat_col] = out.geometry.y
    out[seq_lng_col] = out.geometry.x
    out["__m__"] = measures

    return out

def split_routes(
    busroute_select, 
    seq_select,
    route_id_col='RouteName',
    route_direction_col='Direction',
    seq_id_col='RouteName',
    seq_direction_col='Direction',
    seq_seq_col='Seq',
    route_geom_col='geometry',
    eps=1e-6 ):
    """
    依站序把路線切成多段，只輸出 LineString 段落。
    要求 seq_select 已經由 snap_points_to_line 產生 __m__ 欄位；若沒有，我們會用 geometry 計算。
    """
    output = []

    # CRSs 對齊
    if hasattr(busroute_select, "crs") and hasattr(seq_select, "crs"):
        if busroute_select.crs != seq_select.crs:
            seq_select = seq_select.to_crs(busroute_select.crs)

    for _, route in busroute_select.iterrows():
        rid = route[route_id_col]
        direc = route[route_direction_col]
        geom = route[route_geom_col]

        # 先把路線合併成單條（盡量）
        try:
            line = linemerge(geom)
        except Exception:
            line = geom

        # 取對應站點（依站序排序）
        stops = seq_select[
            (seq_select[seq_id_col] == rid) &
            (seq_select[seq_direction_col] == direc)
        ].sort_values(seq_seq_col).copy()

        if stops.empty:
            continue

        # 若沒有 __m__ 就現算
        if "__m__" not in stops.columns or stops["__m__"].isna().any():
            stops["__m__"] = stops.geometry.apply(lambda p: line.project(p))

        # 夾界在 [0, line.length]
        L = line.length
        stops["__m__"] = stops["__m__"].clip(lower=0.0, upper=L)

        # 去除「同一 m 值」的重複點（避免零長度段）
        # 若同一 m 有多筆，保留站序最小的那一筆
        stops = stops.sort_values([ "__m__", seq_seq_col ])
        stops = stops.drop_duplicates(subset="__m__", keep="first")

        # 回到站序順序（你要依站序切段）
        stops = stops.sort_values(seq_seq_col)

        m_vals = stops["__m__"].to_numpy()
        seq_vals = stops[seq_seq_col].to_numpy()

        for i in range(len(m_vals) - 1):
            m0 = float(m_vals[i])
            m1 = float(m_vals[i+1])

            # 修正順序：substring 需要 start <= end
            start_m = min(m0, m1)
            end_m   = max(m0, m1)

            # 過濾太短或同點（避免回傳 Point）
            if end_m - start_m <= eps:
                continue

            # 切段
            seg = substring(line, start_m, end_m, normalized=False)

            # 只保留 LineString（或非零長度的 MultiLineString）
            if isinstance(seg, LineString):
                if seg.length > eps:
                    output.append({
                        'ID': rid,
                        'Direction': direc,
                        'StartSeq': seq_vals[i],
                        'EndSeq': seq_vals[i+1],
                        'geometry': seg
                    })
            elif isinstance(seg, MultiLineString):
                # 可能因為 line 還是多段，挑長度>0的子段各自輸出
                for part in seg.geoms:
                    if part.length > eps:
                        output.append({
                            'ID': rid,
                            'Direction': direc,
                            'StartSeq': seq_vals[i],
                            'EndSeq': seq_vals[i+1],
                            'geometry': part
                        })
            else:
                # Point / 空幾何都丟掉
                continue

    return gpd.GeoDataFrame(output, geometry="geometry", crs=getattr(busroute_select, "crs", None))

def inspect_route_geometries(gdf):
    """
    檢查 GeoDataFrame 的 geometry 型別，並回傳文字報告（string）。
    不做 print，只組成 text 回傳。
    """

    lines = []  # 用來存文字行

    lines.append("幾何型別分佈：")
    geom_counts = gdf.geom_type.value_counts(dropna=False)
    lines.append(str(geom_counts))

    # 找出非 LineString / MultiLineString，或空幾何、NaN
    mask_bad = (
        ~gdf.geom_type.isin(["LineString", "MultiLineString"])
        | gdf.geometry.isna()
        | gdf.is_empty
    )
    bad = gdf[mask_bad]

    lines.append(f"\n疑似有問題的筆數：{len(bad)}")

    bad_geom_types = bad.geom_type.value_counts(dropna=False).rename("bad_geom_types")
    lines.append(str(bad_geom_types))

    # 前幾筆索引
    bad_indices = list(bad.index[:10])
    lines.append(f"\n前 10 筆問題索引：{bad_indices}")

    # 將所有行組成字串
    text_report = "\n".join(lines)
    return text_report


# ===== 步驟 =====
# 01-01 讀取站序xml
def read_seq(busstopseq_folder = os.path.join(os.getcwd(), '..', "00_TDX資料下載", "01公車站序資料")):
    # 讀取所有站序 xml，轉存為 csv
    xml_files = findfiles(busstopseq_folder, filetype='.xml', recursive=False)
    for xmlfile in xml_files:
        df = read_bus_stop_of_route_xml(xmlfile)
        df.to_csv(xmlfile.replace('.xml', '.csv'), index=False, encoding='utf-8-sig')

    # 整併所有的csv
    df_seq = read_combined_dataframe(findfiles(busstopseq_folder, filetype='.csv', recursive=False))
    df_seq = df_seq.drop_duplicates(subset=['RouteUID', 'SubRouteID', 'Direction', 'StopSequence'])

    return df_seq

# 01-02 讀取路線xml資料
def read_busroute(busroute_folder = os.path.join(os.getcwd(), '..', "00_TDX資料下載", "02公車路線資料")):
    xml_files = findfiles(busroute_folder, filetype='.xml', recursive=False)
    for xmlfile in xml_files:
        df = read_bus_shape_of_route_xml(xmlfile)
        df.to_csv(xmlfile.replace('.xml', '.csv'), index=False, encoding='utf-8-sig')

    # 整併所有的csv
    df_route = read_combined_dataframe(findfiles(busroute_folder, filetype='.csv', recursive=False))
    df_route = df_route.drop_duplicates(subset=['RouteUID', 'SubRouteID', 'Direction'])

    return df_route

# 02 檢查路線
def check_routes_and_save(df_route, df_seq, output_path="route_check_output.txt"):
    # 呼叫 compare_column_values
    text_routeUID, only_in_route_routeUID, only_in_seq_routeUID, in_both_routeUID = compare_column_values(
        df_route, df_seq, 'RouteUID', name_a='df_route', name_b='df_seq'
    )
    text_subrouteUID, only_in_route_subrouteUID, only_in_seq_subrouteUID, in_both_subrouteUID = compare_column_values(
        df_route, df_seq, 'SubRouteUID', name_a='df_route', name_b='df_seq'
    )

    # 組合所有輸出文字
    output_text = (
        text_routeUID + "\n"
        + "-----------\n"
        + text_subrouteUID
    )

    # 寫入 txt 檔
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(output_text)

    print(f"已輸出檢查結果到：{output_path}")
    return in_both_subrouteUID

# 03 拆分路線

# 03-01 將站序和路線轉為 GeoDataFrame
def get_gdfroute_gdfseq(df_route, df_seq):
    '''03-01 將站序和路線轉為 GeoDataFrame'''
    df_route = df_route.copy()
    df_seq = df_seq.copy()
    
    # df_route["Geometry"] = df_route["Geometry"].apply(wkt.loads) # 將 Geometry 欄位的 WKT 轉為 shapely geometry 
    # gdf_route = gpd.GeoDataFrame(df_route, geometry="Geometry", crs="EPSG:4326") # 建立 GeoDataFrame 
    # gdf_route.rename(columns={'Geometry':'geometry'}, inplace=True)

    df_route["geometry"] = df_route["Geometry"].apply(wkt.loads)
    gdf_route = gpd.GeoDataFrame(
        df_route,
        geometry="geometry",
        crs="EPSG:4326"
    )    

    gdf_seq = dataframe_to_point(df_seq, lon_col='PositionLon', lat_col='PositionLat', crs="EPSG:4326", target_crs="EPSG:4326")

    return gdf_route, gdf_seq

# 03-02 拆分路線
def get_bySubRouteUID(gdf_route, gdf_seq, in_both_subrouteUID):

    # 先處理SubRouteUID 一致的路線
    gdf_route_selectbySubRouteUID = gdf_route[gdf_route['SubRouteUID'].isin(in_both_subrouteUID)].reset_index(drop=True)
    gdf_seq_selectbySubRouteUID = gdf_seq[gdf_seq['SubRouteUID'].isin(in_both_subrouteUID)].reset_index(drop=True)

    
    # 將公車站序點位投影到路線上
    gdf_snapstop_select = snap_points_to_line(gdf_seq_selectbySubRouteUID, gdf_route_selectbySubRouteUID,
                                            route_id_col='SubRouteUID',
                                            route_direction_col='Direction',
                                            seq_id_col='SubRouteUID',
                                            seq_direction_col='Direction',
                                            seq_lat_col='PositionLat',
                                            seq_lng_col='PositionLon')
    # 依站序拆分路線
    gdf_routesegment_select = split_routes(gdf_route_selectbySubRouteUID, gdf_snapstop_select,
                                        route_id_col='SubRouteUID',
                                        route_direction_col='Direction',
                                        seq_id_col='SubRouteUID',
                                        seq_direction_col='Direction',
                                        seq_seq_col='StopSequence')

    gdf_routesegment_select.rename(columns = {'ID':'SubRouteUID'}, inplace=True)
    gdf_routesegment_select = pd.merge(gdf_routesegment_select,
                                        gdf_seq_selectbySubRouteUID[['RouteUID', 'RouteName_Zh' ,'SubRouteUID', 'SubRouteName_Zh']].drop_duplicates(subset=['RouteUID', 'SubRouteUID']),
                                        on='SubRouteUID', how='left')
    gdf_routesegment_select = pd.merge(gdf_routesegment_select,
                                        gdf_seq_selectbySubRouteUID[['RouteUID', 'SubRouteUID', 'Direction', 'StopSequence', 'StopUID', 'StopName_Zh']].rename(columns={'StopUID':'OStopUID', 'StopName_Zh':'OStopName', 'StopSequence':'StartSeq'}).drop_duplicates(subset=['RouteUID', 'SubRouteUID', 'Direction', 'StartSeq']),
                                        on=['RouteUID', 'SubRouteUID', 'Direction', 'StartSeq'], how='left')
    
    gdf_routesegment_select = pd.merge(gdf_routesegment_select,
                                        gdf_seq_selectbySubRouteUID[['RouteUID', 'SubRouteUID', 'Direction', 'StopSequence', 'StopUID', 'StopName_Zh']].rename(columns={'StopUID':'DStopUID', 'StopName_Zh':'DStopName', 'StopSequence':'EndSeq'}).drop_duplicates(subset=['RouteUID', 'SubRouteUID', 'Direction', 'EndSeq']),
                                        on=['RouteUID', 'SubRouteUID', 'Direction', 'EndSeq'], how='left')
    
    gdf_routesegment_select = gdf_routesegment_select.rename(columns = {'RouteName_Zh':'RouteName', 'SubRouteName_Zh':'SubRouteName'})

    return gdf_routesegment_select

def get_byRouteUID(gdf_route, gdf_seq, in_both_subrouteUID):

    gdf_route_others = gdf_route[gdf_route['SubRouteUID'].isin(in_both_subrouteUID)==False]
    gdf_seq_others = gdf_seq[gdf_seq['SubRouteUID'].isin(in_both_subrouteUID) == False]

    gdf_snapstop_others = snap_points_to_line(gdf_seq_others, gdf_route_others,
                                            route_id_col='RouteUID',
                                            route_direction_col='Direction',
                                            seq_id_col='RouteUID',
                                            seq_direction_col='Direction',
                                            seq_lat_col='PositionLat',
                                            seq_lng_col='PositionLon')

    gdf_routesegment_others = split_routes(gdf_route_others, gdf_snapstop_others,
                                        route_id_col='RouteUID',
                                        route_direction_col='Direction',
                                        seq_id_col='RouteUID',
                                        seq_direction_col='Direction',
                                        seq_seq_col='StopSequence')

    gdf_routesegment_others.rename(columns = {'ID':'RouteUID'}, inplace=True)


    gdf_routesegment_others = gdf_routesegment_others.merge(gdf_route_others[['RouteUID', 'RouteName_Zh']].drop_duplicates(), on='RouteUID', how='left')
    gdf_routesegment_others = pd.merge(gdf_routesegment_others , 
                                    gdf_seq_others[['RouteUID', 'Direction', 'StopSequence', 'StopUID', 'StopName_Zh']].drop_duplicates().rename(columns = {'StopUID':'OstopUID', 'StopName_Zh':'OStopName',  'StopSequence':'StartSeq'}), 
                                    on = ['RouteUID', 'Direction', 'StartSeq'], how = 'left')
    gdf_routesegment_others = pd.merge(gdf_routesegment_others , 
                                    gdf_seq_others[['RouteUID', 'Direction', 'StopSequence', 'StopUID', 'StopName_Zh']].drop_duplicates().rename(columns = {'StopUID':'DstopUID', 'StopName_Zh':'DStopName',  'StopSequence':'EndSeq'}), 
                                    on = ['RouteUID', 'Direction', 'EndSeq'], how = 'left')
    
    gdf_routesegment_others = gdf_routesegment_others.rename(columns = {'RouteName_Zh':'RouteName'})

    return gdf_routesegment_others

def get_splitroute(gdf_route, gdf_seq, in_both_subrouteUID, 
                   routesegment_folder = create_folder(os.path.join(os.getcwd(), '..', "03_處理後資料", "01_公車路線依站序拆分")), 
                   enable_separate_output = True):
    '''黏貼路網成步驟'''

    # 處理SubRouteUID 一致的路線
    gdf_routesegment_select = get_bySubRouteUID(gdf_route, gdf_seq, in_both_subrouteUID)
    if enable_separate_output:
        gdf_routesegment_select.to_file(os.path.join(routesegment_folder, 'BySubRouteUID.shp'), index=False)

    # 處理只有RouteUID 一致的路線
    gdf_routesegment_others = get_byRouteUID(gdf_route, gdf_seq, in_both_subrouteUID)
    if enable_separate_output:
        gdf_routesegment_others.to_file(os.path.join(routesegment_folder, 'ByRouteUID.shp'), index=False)

    # 最後合併
    gdf_routesegment = pd.concat([gdf_routesegment_select, gdf_routesegment_others], ignore_index=True)
    gdf_routesegment = gdf_routesegment.reindex(columns = ['RouteUID', 'RouteName', 'SubRouteUID', 'SubRouteName', 'Direction', 
                                                           'StartSeq', 'OStopName', 'OStopUID', 
                                                           'EndSeq', 'DStopName', 'DStopUID', 
                                                           'geometry'])
    

    print(inspect_route_geometries(gdf_routesegment))

    # 輸出
    routesegment_filepath = os.path.join(routesegment_folder, '市區公車拆分.shp')
    gdf_routesegment.to_file(routesegment_filepath, 
                             index=False)
    
    return gdf_routesegment

def main():
    df_seq = read_seq(busstopseq_folder = os.path.join(os.getcwd(), '..', "00_TDX資料下載", "01公車站序資料"))
    df_route = read_busroute(busroute_folder = os.path.join(os.getcwd(), '..', "00_TDX資料下載", "02公車路線資料"))

    in_both_subrouteUID = check_routes_and_save(df_route = df_route, 
                                                df_seq = df_seq, 
                                                output_path= os.path.abspath(os.path.join(os.getcwd(), '..', '02_初步分析', '票證及路線數據檢查.txt')))
    
    gdf_route, gdf_seq = get_gdfroute_gdfseq(df_route, df_seq)

    gdf_routesegment = get_splitroute(gdf_route = gdf_route, 
                                  gdf_seq = gdf_seq, 
                                  in_both_subrouteUID = in_both_subrouteUID)
    
# if __name__ == "__main__":
#     main()

In [73]:
df_seq = read_seq(busstopseq_folder = os.path.join(os.getcwd(), '..', "00_TDX資料下載", "01公車站序資料"))
df_route = read_busroute(busroute_folder = os.path.join(os.getcwd(), '..', "00_TDX資料下載", "02公車路線資料"))


in_both_subrouteUID = check_routes_and_save(df_route = df_route, 
                                            df_seq = df_seq, 
                                            output_path= os.path.abspath(os.path.join(os.getcwd(), '..', '02_初步分析', '票證及路線數據檢查.txt')))

gdf_route, gdf_seq = get_gdfroute_gdfseq(df_route, df_seq)


  df[col] = pd.to_numeric(df[col], errors="ignore")
  df[col] = pd.to_numeric(df[col], errors="ignore")


已輸出檢查結果到：d:\B-Project\2025\6800\Technical\12票證資料\TicketAnalysis\02_初步分析\票證及路線數據檢查.txt


In [96]:
def snap_points_to_line(stops_gdf, 
                        routes_gdf, 
                        route_id_col, 
                        route_direction_col, 
                        seq_id_col, 
                        seq_direction_col, 
                        seq_lat_col, 
                        seq_lng_col):
    """
    將公車站點 (stops_gdf) 投影到公車路線 (routes_gdf) 上，並動態帶入欄位名稱。
    Parameters:
        stops_gdf (GeoDataFrame): 包含公車站點的 GeoDataFrame。
        routes_gdf (GeoDataFrame): 包含公車路線的 GeoDataFrame。
        route_id_col (str): 路線名稱欄位名稱。
        route_direction_col (str): 路線方向欄位名稱。
        seq_routename_col (str): 站點路線名稱欄位名稱。
        seq_direction_col (str): 站點方向欄位名稱。
        seq_lat_col (str): 站點緯度欄位名稱。
        seq_lng_col (str): 站點經度欄位名稱。
    Returns:
        GeoDataFrame: 更新後的公車站點 GeoDataFrame，其中 geometry 已投影到路線。
    """
    snapped_points = []

    for _, stop in stops_gdf.iterrows():
        # 找到與站點路線名稱和方向相符的路線
        matching_route = routes_gdf[(routes_gdf[route_id_col] == stop[seq_id_col]) & 
                                    (routes_gdf[route_direction_col] == stop[seq_direction_col])]

        if not matching_route.empty:
            # 取出該路線的 geometry
            line = matching_route.iloc[0].geometry
            # 計算站點投影到該路線的最近點
            snapped_point = line.interpolate(line.project(stop.geometry))
            snapped_points.append(snapped_point)
        else:
            # 如果沒有匹配的路線，保持原點
            snapped_points.append(stop.geometry)

    # 更新站點的 geometry
    stops_gdf = stops_gdf.copy()
    stops_gdf['geometry'] = snapped_points
    stops_gdf[seq_lat_col] = stops_gdf.geometry.y
    stops_gdf[seq_lng_col] = stops_gdf.geometry.x
    return stops_gdf

def split_routes(busroute_select, 
                 seq_select,
                 route_id_col='RouteName',
                 route_direction_col='Direction',
                 seq_id_col='RouteName',
                 seq_direction_col='Direction',
                 seq_seq_col='Seq',
                 seq_lat_col='Lat',
                 seq_lng_col='Lon'):
    """
    將公車路線 (busroute_select) 依據提供的站序 (seq_select) 上，分為數段的shp。
    Parameters:
        busroute_select (GeoDataFrame): 包含公車路線名稱的 GeoDataFrame。
        seq_select (DataFrame): 包含公車路線站序的 DataFrame。
        seq_routename_col (str): 路線名稱欄位名稱。
        seq_direction_col (str): 路線方向欄位名稱。
        seq_seq_col (str): 站點方向欄位名稱。
        seq_lat_col (str): 站點緯度欄位名稱。
        seq_lng_col (str): 站點經度欄位名稱。
    Returns:
        GeoDataFrame: 更新後的公車站點 GeoDataFrame，其中 geometry 已投影到路線。
    """

    output = []

    for _, route in busroute_select.iterrows():
        route_id = route[route_id_col]
        direction = route[route_direction_col]
        geometry = route['geometry']

        # 過濾對應路線與方向的站點
        stops = seq_select[(seq_select[seq_id_col] == route_id) & 
                           (seq_select[seq_direction_col] == direction)].sort_values(seq_seq_col)

        # 確保站點順序對應於路線
        stop_coords = [(row[seq_lng_col], row[seq_lat_col]) for _, row in stops.iterrows()]

        for i in range(len(stop_coords) - 1):
            start_point = Point(stop_coords[i])
            end_point = Point(stop_coords[i + 1])

            # 找到站點在路線中的比例位置
            start_distance = geometry.project(start_point)
            end_distance = geometry.project(end_point)

            # 提取路線幾何分段
            segment = substring(geometry, start_distance, end_distance)

            output.append({
                'RouteName': route_id,
                'Direction': direction,
                'StartSeq': stops.iloc[i][seq_seq_col],
                'EndSeq': stops.iloc[i + 1][seq_seq_col],
                'geometry': segment
            })

    return gpd.GeoDataFrame(output)

In [112]:


# 先處理SubRouteUID 一致的路線
outputfolder = r"D:\B-Project\2025\6800\Technical\12票證資料\TicketAnalysis\00_TDX資料下載\Trial"

route_id_col='SubRouteUID'
route_direction_col='Direction'
seq_id_col='SubRouteUID'
seq_direction_col='Direction'
seq_seq_col = 'StopSequence'
seq_lat_col='PositionLat'
seq_lng_col='PositionLon'

busroute = gdf_route[gdf_route['SubRouteUID'].isin(in_both_subrouteUID)].reset_index(drop=True)
seq = gdf_seq[gdf_seq['SubRouteUID'].isin(in_both_subrouteUID)].reset_index(drop=True)

gdf_segment_routes = []
gdf_snappoints = []

original_gdfseq = []
original_gdfroute = []

for route in list(in_both_subrouteUID)[:10]:
    for direction in [0,1]:
            # 取得對應的方向的route
            print("\n\n=====Start=====")
            print(f"route {route}" )
            print(f"direction {direction}")
            print("======")
            busroute_select = busroute[ (busroute[route_id_col] == route) & (busroute[route_direction_col] == direction)][[route_id_col,route_direction_col,'geometry' ]].reset_index(drop = True)
            # 也要有對應的seq
            seq_select = seq[ (seq[seq_id_col] == route) & (seq[seq_direction_col] == direction) ].sort_values(seq_seq_col).reset_index(drop = True)
            seq_select['geometry'] = seq_select.apply(lambda row: Point(row[seq_lng_col], row[seq_lat_col]), axis=1)
            # 將seq_select從 Pandas DataFrame 轉換為 GeoDataFrame
            seq_select = gpd.GeoDataFrame(seq_select, geometry='geometry').drop_duplicates(subset=[seq_seq_col]).reset_index(drop = True)
            seq_select = seq_select.set_crs(epsg=4326, inplace=True)
            if len(seq_select) > 0: 
                print(f"seq_select_{route}_{direction}:", end="　")
                print(len(seq_select))

                # 01_將公車站序點位投影到路線上
                gdf_snapstop_select = snap_points_to_line(seq_select, 
                                                        busroute_select,
                                                        route_id_col='SubRouteUID',
                                                        route_direction_col='Direction',
                                                        seq_id_col='SubRouteUID',
                                                        seq_direction_col='Direction',
                                                        seq_lat_col='PositionLat',
                                                        seq_lng_col='PositionLon')
                if len (gdf_snapstop_select) > 0 :
                    gdf_snappoints.append(gdf_snapstop_select)

                # print(f"gdf_snapstop_select_{route}_{direction}:", end="　")
                # print(len(gdf_snapstop_select))

                # 02_將路線進行拆分
                # print(busroute_select.head())
                gdf_routesegment_select = split_routes(busroute_select, 
                                                    seq_select,
                                                    route_id_col='SubRouteUID',
                                                    route_direction_col='Direction',
                                                    seq_id_col='SubRouteUID',
                                                    seq_direction_col='Direction',
                                                    seq_seq_col='StopSequence', 
                                                    seq_lat_col='PositionLat',
                                                    seq_lng_col='PositionLon')

                # print(f"gdf_routesegment_select_{route}_{direction}:", end="　")
                # print(len(gdf_routesegment_select))
                gdf_routesegment_select = gdf_routesegment_select.set_crs(epsg=4326, inplace=True)
                gdf_segment_routes.append(gdf_routesegment_select)

print(len(gdf_snappoints))
if len(gdf_snappoints) > 0 : 
    gdf_snappoints = pd.concat(gdf_snappoints)
    gdf_snappoints.to_file(os.path.join(outputfolder, 'SnappedSequence.shp'))
    print("gdf_snappoints 輸出成功")

if len(original_gdfseq) > 0 : 
    original_gdfseq = pd.concat(original_gdfseq)
    original_gdfseq.to_file(os.path.join(outputfolder, 'original_gdfseq.shp'))
    print("original_gdfseq 輸出成功")

if len(original_gdfroute) > 0 : 
    original_gdfroute = pd.concat(original_gdfroute)
    original_gdfroute.to_file(os.path.join(outputfolder, 'original_gdfroute.shp'))
    print("original_gdfroute 輸出成功")

if len(gdf_segment_routes) > 0 : 
    gdf_segment_routes = pd.concat(gdf_segment_routes)
    gdf_segment_routes.to_file(os.path.join(outputfolder, 'gdf_segment_routes.shp'))
    print("gdf_segment_routes 輸出成功")



=====Start=====
route NWT158039
direction 0
seq_select_NWT158039_0:　87


=====Start=====
route NWT158039
direction 1


=====Start=====
route THB1915B2
direction 0


=====Start=====
route THB1915B2
direction 1
seq_select_THB1915B2_1:　5


=====Start=====
route TAO11892
direction 0
seq_select_TAO11892_0:　16


=====Start=====
route TAO11892
direction 1
seq_select_TAO11892_1:　16


=====Start=====
route THB680101
direction 0
seq_select_THB680101_0:　66


=====Start=====
route THB680101
direction 1


=====Start=====
route THB112501
direction 0
seq_select_THB112501_0:　13


=====Start=====
route THB112501
direction 1


=====Start=====
route THB691802
direction 0


=====Start=====
route THB691802
direction 1
seq_select_THB691802_1:　73


=====Start=====
route TAO5098
direction 0
seq_select_TAO5098_0:　37


=====Start=====
route TAO5098
direction 1
seq_select_TAO5098_1:　37


=====Start=====
route TAO5016
direction 0
seq_select_TAO5016_0:　66


=====Start=====
route TAO5016
direction 1
seq_select_TA

  gdf_snappoints.to_file(os.path.join(outputfolder, 'SnappedSequence.shp'))
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


In [108]:
gdf_snappoints

[]

In [67]:
gdf_snappoints = pd.concat(gdf_snappoints)
gdf_snappoints.to_file(os.path.join(outputfolder, 'SnappedSequence.shp'))

original_gdfseq = pd.concat(original_gdfseq)
original_gdfseq.to_file(os.path.join(outputfolder, 'original_gdfseq.shp'))

original_gdfroute = pd.concat(original_gdfroute)
original_gdfroute.to_file(os.path.join(outputfolder, 'original_gdfroute.shp'))

  gdf_snappoints.to_file(os.path.join(outputfolder, 'SnappedSequence.shp'))
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  original_gdfseq.to_file(os.path.join(outputfolder, 'original_gdfseq.shp'))
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  original_gdfroute.to_file(os.path.join(outputfolder, 'original_gdfroute.shp'))
  ogr_write(


In [None]:
original_gdfseq.crs

In [None]:
gdf_routesegment_select = split_routes(gdf_route_selectbySubRouteUID, gdf_snapstop_select,
                                       route_id_col='SubRouteUID',
                                       route_direction_col='Direction',
                                       seq_id_col='SubRouteUID',
                                       seq_direction_col='Direction',
                                       seq_seq_col='StopSequence', 
                                       seq_lat_col='PositionLat',
                                       seq_lng_col='PositionLon').set_crs(epsg=4326, inplace=True)

gdf_routesegment_select = gdf_routesegment_select.to_crs(epsg=3826) # 轉換為TWD97的才有辦法進行計算

In [None]:
gdf_seq.columns