In [60]:
import xml.etree.ElementTree as ET
import pandas as pd
import geopandas as gpd
import os 
from shapely import wkt # for WKT 轉幾何物件
from shapely.geometry import LineString, Point, MultiLineString
from shapely.ops import substring, linemerge

# ===== 自己新增所使用的套件 =====
from TDXdataframe import read_bus_stop_of_route_xml, read_bus_shape_of_route_xml, read_businfo_xml
from basicprocess import create_folder, findfiles, read_combined_dataframe

# 00 Setup
def dataframe_to_point(df, lon_col, lat_col, crs="EPSG:4326", target_crs="EPSG:3826"):
    """
    Build a point GeoDataFrame from two coordinate columns and reproject it.

    Parameters:
    df (DataFrame) : table holding one coordinate pair per row
    lon_col (str) : name of the longitude (x) column
    lat_col (str) : name of the latitude (y) column
    crs (str) : CRS the input coordinates are expressed in,
        e.g. "EPSG:4326" (WGS84) or "EPSG:3826" (TWD97)
    target_crs (str) : CRS the returned GeoDataFrame is converted to

    Returns:
    GeoDataFrame : the columns of ``df`` plus a point ``geometry`` column
        expressed in ``target_crs``
    """
    # shapely points are (x, y), so longitude is zipped before latitude.
    geometry = [Point(xy) for xy in zip(df[lon_col], df[lat_col])]

    # Tag the points with the CRS they were recorded in.
    gdf = gpd.GeoDataFrame(df, geometry=geometry, crs=crs)

    # Pass the CRS through unchanged rather than slicing out the EPSG number:
    # "EPSG:xxxx" behaves as before, and CRS strings without a ":" no longer
    # fail on the split.
    return gdf.to_crs(target_crs)

# 01 讀取TDX資料
# 讀取 TDXdataframe

# 02 檢查路線
def compare_column_values(df_a, df_b, column, name_a='df_a', name_b='df_b'):
    """
    Compare the distinct values of one column across two DataFrames.

    Parameters:
    df_a, df_b (DataFrame) : tables that both contain ``column``
    column (str) : column whose values are compared
    name_a, name_b (str) : labels used for the two tables in the summary text

    Returns:
    tuple : (summary text, values only in df_a, values only in df_b,
        values present in both); the last three items are sets
    """
    values_a, values_b = set(df_a[column]), set(df_b[column])

    only_in_a = values_a.difference(values_b)
    only_in_b = values_b.difference(values_a)
    in_both = values_a.intersection(values_b)

    # One summary line per category, in a fixed order.
    output_text = "\n".join([
        f"只在 {name_a} 出現的 {column}：{len(only_in_a)}",
        f"只在 {name_b} 出現的 {column}：{len(only_in_b)}",
        f"兩邊都有的 {column}：{len(in_both)}",
    ])

    return output_text, only_in_a, only_in_b, in_both

# 03 拆分路線

def snap_points_to_line(
    stops_gdf, routes_gdf,
    route_id_col, route_direction_col,
    seq_id_col, seq_direction_col,
    seq_lat_col, seq_lng_col,
    route_geom_col="geometry"
    ):
    """
    Snap every stop onto its route line and record the distance along it.

    No CRS handling, no column validation and no epsilon correction is done
    here; the function only projects points onto lines and writes ``__m__``.

    Parameters:
    stops_gdf (GeoDataFrame) : stops with a point ``geometry`` column
    routes_gdf (GeoDataFrame) : routes, one line geometry per row
    route_id_col, route_direction_col (str) : key columns of ``routes_gdf``
    seq_id_col, seq_direction_col (str) : matching key columns of ``stops_gdf``
    seq_lat_col, seq_lng_col (str) : stop columns overwritten with the
        snapped y / x coordinates
    route_geom_col (str) : column of ``routes_gdf`` holding the line geometry

    Returns:
    GeoDataFrame : copy of ``stops_gdf`` with snapped geometry, refreshed
        coordinate columns and a ``__m__`` column (distance along the route;
        NaN when the stop has no matching route, in which case its geometry
        is left untouched)
    """
    # Index the routes once instead of filtering the whole table for every
    # stop. setdefault keeps the first row per key, i.e. the row the former
    # per-stop filter selected with iloc[0]. Missing keys are skipped
    # because NaN never compared equal to anything in that filter.
    route_geoms = {}
    for rid, direc, geom in zip(routes_gdf[route_id_col],
                                routes_gdf[route_direction_col],
                                routes_gdf[route_geom_col]):
        if pd.isna(rid) or pd.isna(direc):
            continue
        route_geoms.setdefault((rid, direc), geom)

    merged_lines = {}  # (id, direction) -> merged line, filled on first use
    snapped_points = []
    measures = []

    for sid, sdir, point in zip(stops_gdf[seq_id_col],
                                stops_gdf[seq_direction_col],
                                stops_gdf["geometry"]):
        key = (sid, sdir)
        if pd.isna(sid) or pd.isna(sdir) or key not in route_geoms:
            # No route for this stop: keep the original point. NaN (not None)
            # keeps __m__ a float column even when nothing matches.
            snapped_points.append(point)
            measures.append(float("nan"))
            continue

        line = merged_lines.get(key)
        if line is None:
            geom = route_geoms[key]
            # Merge multi-part routes into a single line where possible;
            # best effort, fall back to the raw geometry if merging fails.
            try:
                line = linemerge(geom)
            except Exception:
                line = geom
            merged_lines[key] = line

        # Distance along the line of the closest position, then the point
        # on the line at that distance.
        m = line.project(point)
        snapped_points.append(line.interpolate(m))
        measures.append(float(m))

    out = stops_gdf.copy()
    out["geometry"] = snapped_points
    out[seq_lat_col] = out.geometry.y
    out[seq_lng_col] = out.geometry.x
    out["__m__"] = measures

    return out

def split_routes(
    busroute_select, 
    seq_select,
    route_id_col='RouteName',
    route_direction_col='Direction',
    seq_id_col='RouteName',
    seq_direction_col='Direction',
    seq_seq_col='Seq',
    route_geom_col='geometry',
    eps=1e-6 ):
    """
    Cut every route into one LineString per pair of consecutive stops.

    ``seq_select`` is expected to carry the ``__m__`` column written by
    snap_points_to_line (distance of each stop along its route). When the
    column is missing or incomplete for a route it is computed here from the
    stop geometry.

    Parameters:
    busroute_select (GeoDataFrame) : routes, one line geometry per row
    seq_select (GeoDataFrame) : stops with point geometry
    route_id_col, route_direction_col (str) : key columns of the routes
    seq_id_col, seq_direction_col (str) : matching key columns of the stops
    seq_seq_col (str) : stop-order column
    route_geom_col (str) : route column holding the line geometry
    eps (float) : segments not longer than this (in CRS units) are dropped

    Returns:
    GeoDataFrame : columns ``ID``, ``Direction``, ``StartSeq``, ``EndSeq``,
        ``geometry`` in the CRS of ``busroute_select``. Empty (but with
        those columns) when no segment could be produced.
    """
    import warnings

    segment_columns = ['ID', 'Direction', 'StartSeq', 'EndSeq', 'geometry']
    output = []

    # Align the stops with the route CRS. Only possible when both sides
    # declare a CRS; otherwise the inputs are used as given.
    route_crs = getattr(busroute_select, "crs", None)
    stop_crs = getattr(seq_select, "crs", None)
    if route_crs is not None and stop_crs is not None and route_crs != stop_crs:
        seq_select = seq_select.to_crs(route_crs)
        # Any precomputed __m__ was measured in the old CRS units and no
        # longer matches the route geometry; drop it so it is recomputed.
        if "__m__" in seq_select.columns:
            seq_select = seq_select.drop(columns="__m__")

    for _, route in busroute_select.iterrows():
        rid = route[route_id_col]
        direc = route[route_direction_col]
        geom = route[route_geom_col]

        # Missing or empty geometry cannot be split.
        if getattr(geom, "is_empty", True):
            continue

        # Merge multi-part routes into one line where possible (best effort).
        try:
            line = linemerge(geom)
        except Exception:
            line = geom

        # Stops of this route, in stop order.
        stops = seq_select[
            (seq_select[seq_id_col] == rid) &
            (seq_select[seq_direction_col] == direc)
        ].sort_values(seq_seq_col).copy()

        if stops.empty:
            continue

        # shapely.ops.substring only accepts a LineString, so a route that
        # could not be merged into one would abort the whole run. Skip it
        # and report it instead.
        if not isinstance(line, LineString):
            warnings.warn(
                f"split_routes: skipping route {rid!r} (direction {direc!r}): "
                f"geometry is a {line.geom_type}, not a single LineString",
                stacklevel=2,
            )
            continue

        # Compute the measure when it is absent or has gaps.
        if "__m__" not in stops.columns or stops["__m__"].isna().any():
            stops["__m__"] = stops.geometry.apply(lambda p: line.project(p))

        # Clamp to [0, line.length].
        stops["__m__"] = stops["__m__"].clip(lower=0.0, upper=line.length)

        # Stops sharing the same measure would yield zero-length segments:
        # keep the one with the smallest sequence number, then go back to
        # stop order because the cut follows the stop sequence.
        stops = stops.sort_values(["__m__", seq_seq_col])
        stops = stops.drop_duplicates(subset="__m__", keep="first")
        stops = stops.sort_values(seq_seq_col)

        m_vals = stops["__m__"].to_numpy()
        seq_vals = stops[seq_seq_col].to_numpy()

        for m0, m1, start_seq, end_seq in zip(m_vals, m_vals[1:], seq_vals, seq_vals[1:]):
            # substring needs start <= end, so the segment always follows
            # the direction of the line even if the measures are reversed.
            start_m, end_m = sorted((float(m0), float(m1)))

            # Too short or identical positions would come back as a Point.
            if end_m - start_m <= eps:
                continue

            seg = substring(line, start_m, end_m, normalized=False)

            # Points and empty results are discarded.
            if isinstance(seg, LineString) and seg.length > eps:
                output.append({
                    'ID': rid,
                    'Direction': direc,
                    'StartSeq': start_seq,
                    'EndSeq': end_seq,
                    'geometry': seg
                })

    # Naming the columns explicitly keeps the constructor working when no
    # segment was produced (there would be no "geometry" column otherwise).
    return gpd.GeoDataFrame(output, columns=segment_columns, geometry="geometry", crs=route_crs)

def inspect_route_geometries(gdf):
    """
    Summarise the geometry types of a GeoDataFrame as a text report.

    Nothing is printed; the report is returned as a single string listing
    the geometry type counts, the number of suspicious rows (anything that
    is not a LineString / MultiLineString, or is missing or empty), the
    type counts of those rows and the first ten offending index labels.
    """
    geom_types = gdf.geom_type
    is_line = geom_types.isin(["LineString", "MultiLineString"])

    # A row is suspicious when it is not a line, has no geometry, or is empty.
    suspicious = gdf[~is_line | gdf.geometry.isna() | gdf.is_empty]
    suspicious_types = suspicious.geom_type.value_counts(dropna=False).rename("bad_geom_types")

    report = [
        "幾何型別分佈：",
        str(geom_types.value_counts(dropna=False)),
        f"\n疑似有問題的筆數：{len(suspicious)}",
        str(suspicious_types),
        f"\n前 10 筆問題索引：{list(suspicious.index[:10])}",
    ]
    return "\n".join(report)


# ===== 步驟 =====
# 01-01 讀取站序xml
def read_seq(busstopseq_folder = os.path.join(os.getcwd(), '..', "00_TDX資料下載", "01公車站序資料"), reset_seq = True):
    """
    Convert the stop-sequence XML files of a folder to CSV and load them.

    Parameters:
    busstopseq_folder (str) : folder searched (non-recursively) for XML files
    reset_seq (bool) : when true, add a ``Seq_real`` column numbering the
        stops 1, 2, 3, ... within each (RouteUID, SubRouteUID, Direction)
        group in ascending ``StopSequence`` order

    Returns:
    DataFrame : every CSV of the folder combined, with duplicate
        (RouteUID, SubRouteID, Direction, StopSequence) rows removed
    """
    # Write a CSV next to every XML file.
    for xmlfile in findfiles(busstopseq_folder, filetype='.xml', recursive=False):
        df = read_bus_stop_of_route_xml(xmlfile)
        # Swap only the extension. A plain str.replace('.xml', '.csv') would
        # also rewrite '.xml' inside folder names, and would leave the path
        # unchanged (overwriting the XML) if the extension were spelled
        # differently, e.g. '.XML'.
        csv_path = os.path.splitext(xmlfile)[0] + '.csv'
        df.to_csv(csv_path, index=False, encoding='utf-8-sig')

    # Combine every CSV found in the folder.
    df_seq = read_combined_dataframe(findfiles(busstopseq_folder, filetype='.csv', recursive=False))
    # NOTE(review): duplicates are keyed on SubRouteID while the renumbering
    # below groups on SubRouteUID - confirm the two are interchangeable here.
    df_seq = df_seq.drop_duplicates(subset=['RouteUID', 'SubRouteID', 'Direction', 'StopSequence'])

    if reset_seq:
        group_keys = ['RouteUID', 'SubRouteUID', 'Direction']
        df_seq = (
            df_seq
            .sort_values(group_keys + ['StopSequence'])
            .assign(
                # 1-based position of the stop inside its group
                Seq_real=lambda d: d.groupby(group_keys).cumcount() + 1
            )
        )

    return df_seq

# 01-02 讀取路線xml資料
def read_busroute(busroute_folder = os.path.join(os.getcwd(), '..', "00_TDX資料下載", "02公車路線資料")):
    """
    Convert the route-shape XML files of a folder to CSV and load them.

    Parameters:
    busroute_folder (str) : folder searched (non-recursively) for XML files

    Returns:
    DataFrame : every CSV of the folder combined, with duplicate
        (RouteUID, SubRouteID, Direction) rows removed
    """
    # Write a CSV next to every XML file.
    for xmlfile in findfiles(busroute_folder, filetype='.xml', recursive=False):
        df = read_bus_shape_of_route_xml(xmlfile)
        # Swap only the extension. A plain str.replace('.xml', '.csv') would
        # also rewrite '.xml' inside folder names, and would leave the path
        # unchanged (overwriting the XML) if the extension were spelled
        # differently, e.g. '.XML'.
        csv_path = os.path.splitext(xmlfile)[0] + '.csv'
        df.to_csv(csv_path, index=False, encoding='utf-8-sig')

    # Combine every CSV found in the folder.
    df_route = read_combined_dataframe(findfiles(busroute_folder, filetype='.csv', recursive=False))
    df_route = df_route.drop_duplicates(subset=['RouteUID', 'SubRouteID', 'Direction'])

    return df_route

# 02 檢查路線
def check_routes_and_save(df_route, df_seq, output_path="route_check_output.txt"):
    """
    Compare the route and stop-sequence tables and save the summary.

    RouteUID and SubRouteUID are each compared with compare_column_values;
    the two summaries, separated by a dashed line, are written to
    ``output_path`` as UTF-8 and the path is printed.

    Returns:
    set : SubRouteUID values present in both tables
    """
    comparisons = {
        column: compare_column_values(df_route, df_seq, column, name_a='df_route', name_b='df_seq')
        for column in ('RouteUID', 'SubRouteUID')
    }
    route_text = comparisons['RouteUID'][0]
    subroute_text, _, _, in_both_subrouteUID = comparisons['SubRouteUID']

    # RouteUID summary, separator, SubRouteUID summary.
    output_text = "\n".join([route_text, "-----------", subroute_text])

    with open(output_path, "w", encoding="utf-8") as report_file:
        report_file.write(output_text)

    print(f"已輸出檢查結果到：{output_path}")
    return in_both_subrouteUID

# 03 拆分路線

# 03-01 將站序和路線轉為 GeoDataFrame
def get_gdfroute_gdfseq(df_route, df_seq):
    '''
    03-01 Turn the route and stop-sequence tables into GeoDataFrames.

    The WKT text in ``df_route["Geometry"]`` is parsed into a new
    ``geometry`` column (EPSG:4326). The stops become points built from
    ``PositionLon`` / ``PositionLat`` and stay in EPSG:4326. Both inputs are
    copied first, so the caller's tables are not modified.

    Returns:
    tuple : (gdf_route, gdf_seq)
    '''
    route_table = df_route.copy()
    stop_table = df_seq.copy()

    # Parse the WKT strings into shapely geometries.
    route_table["geometry"] = route_table["Geometry"].apply(wkt.loads)
    gdf_route = gpd.GeoDataFrame(route_table, geometry="geometry", crs="EPSG:4326")

    # Source and target CRS are the same: the stops are only turned into points.
    gdf_seq = dataframe_to_point(
        stop_table,
        lon_col='PositionLon',
        lat_col='PositionLat',
        crs="EPSG:4326",
        target_crs="EPSG:4326",
    )

    return gdf_route, gdf_seq

# 03-02 拆分路線
def get_bySubRouteUID(gdf_route, gdf_seq, in_both_subrouteUID):
    """
    Split the routes whose SubRouteUID exists in both tables.

    Stops are snapped onto the route with the same SubRouteUID and
    Direction, the route is cut at consecutive stops, and every segment is
    labelled with route / sub-route names and with the UID and name of its
    origin (``O...``) and destination (``D...``) stop.

    Returns:
    GeoDataFrame : one row per segment
    """
    routes = gdf_route[gdf_route['SubRouteUID'].isin(in_both_subrouteUID)].reset_index(drop=True)
    stops = gdf_seq[gdf_seq['SubRouteUID'].isin(in_both_subrouteUID)].reset_index(drop=True)

    # Routes and stops are paired on the same two columns in both steps.
    pairing = dict(
        route_id_col='SubRouteUID',
        route_direction_col='Direction',
        seq_id_col='SubRouteUID',
        seq_direction_col='Direction',
    )

    # Project the stops onto their route.
    snapped = snap_points_to_line(
        stops, routes,
        seq_lat_col='PositionLat',
        seq_lng_col='PositionLon',
        **pairing,
    )
    # Cut the routes at consecutive stops.
    segments = split_routes(routes, snapped, seq_seq_col='StopSequence', **pairing)
    segments = segments.rename(columns={'ID': 'SubRouteUID'})

    # Route / sub-route names, one row per (RouteUID, SubRouteUID).
    names = (
        stops[['RouteUID', 'RouteName_Zh', 'SubRouteUID', 'SubRouteName_Zh']]
        .drop_duplicates(subset=['RouteUID', 'SubRouteUID'])
    )
    segments = pd.merge(segments, names, on='SubRouteUID', how='left')

    # Attach the origin stop (matched on StartSeq), then the destination
    # stop (matched on EndSeq).
    base_keys = ['RouteUID', 'SubRouteUID', 'Direction']
    stop_columns = base_keys + ['StopSequence', 'StopUID', 'StopName_Zh']
    for seq_col, uid_col, name_col in (
        ('StartSeq', 'OStopUID', 'OStopName'),
        ('EndSeq', 'DStopUID', 'DStopName'),
    ):
        join_keys = base_keys + [seq_col]
        lookup = (
            stops[stop_columns]
            .rename(columns={'StopUID': uid_col, 'StopName_Zh': name_col, 'StopSequence': seq_col})
            .drop_duplicates(subset=join_keys)
        )
        segments = pd.merge(segments, lookup, on=join_keys, how='left')

    return segments.rename(columns={'RouteName_Zh': 'RouteName', 'SubRouteName_Zh': 'SubRouteName'})

def get_byRouteUID(gdf_route, gdf_seq, in_both_subrouteUID):
    """
    Split the routes that could not be paired by SubRouteUID.

    Every route / stop row whose SubRouteUID is not in
    ``in_both_subrouteUID`` is handled here, paired on RouteUID and
    Direction instead. Stops are snapped onto the route, the route is cut at
    consecutive stops, and every segment is labelled with the route name and
    with the UID and name of its origin (``O...``) and destination
    (``D...``) stop.

    Returns:
    GeoDataFrame : one row per segment, using the same ``OStopUID`` /
        ``DStopUID`` column names as get_bySubRouteUID
    """
    gdf_route_others = gdf_route[~gdf_route['SubRouteUID'].isin(in_both_subrouteUID)]
    gdf_seq_others = gdf_seq[~gdf_seq['SubRouteUID'].isin(in_both_subrouteUID)]

    # Project the stops onto their route.
    gdf_snapstop_others = snap_points_to_line(gdf_seq_others, gdf_route_others,
                                            route_id_col='RouteUID',
                                            route_direction_col='Direction',
                                            seq_id_col='RouteUID',
                                            seq_direction_col='Direction',
                                            seq_lat_col='PositionLat',
                                            seq_lng_col='PositionLon')

    # Cut the routes at consecutive stops.
    gdf_routesegment_others = split_routes(gdf_route_others, gdf_snapstop_others,
                                        route_id_col='RouteUID',
                                        route_direction_col='Direction',
                                        seq_id_col='RouteUID',
                                        seq_direction_col='Direction',
                                        seq_seq_col='StopSequence')

    gdf_routesegment_others = gdf_routesegment_others.rename(columns={'ID': 'RouteUID'})

    route_names = gdf_route_others[['RouteUID', 'RouteName_Zh']].drop_duplicates()
    gdf_routesegment_others = gdf_routesegment_others.merge(route_names, on='RouteUID', how='left')

    # Attach the origin stop (matched on StartSeq), then the destination
    # stop (matched on EndSeq). The UID columns are spelled OStopUID /
    # DStopUID - the names get_bySubRouteUID produces and get_splitroute
    # selects - so the stop UIDs survive the final column selection.
    stop_columns = ['RouteUID', 'Direction', 'StopSequence', 'StopUID', 'StopName_Zh']
    for seq_col, uid_col, name_col in (
        ('StartSeq', 'OStopUID', 'OStopName'),
        ('EndSeq', 'DStopUID', 'DStopName'),
    ):
        join_keys = ['RouteUID', 'Direction', seq_col]
        lookup = (
            gdf_seq_others[stop_columns]
            .rename(columns={'StopUID': uid_col, 'StopName_Zh': name_col, 'StopSequence': seq_col})
            # One row per join key, as in get_bySubRouteUID: several stops
            # sharing a sequence number within a RouteUID / Direction would
            # otherwise duplicate every segment they match.
            .drop_duplicates(subset=join_keys)
        )
        gdf_routesegment_others = pd.merge(gdf_routesegment_others, lookup, on=join_keys, how='left')

    gdf_routesegment_others = gdf_routesegment_others.rename(columns={'RouteName_Zh': 'RouteName'})

    return gdf_routesegment_others

def get_splitroute(gdf_route, gdf_seq, in_both_subrouteUID, 
                   routesegment_folder = None, 
                   enable_separate_output = True):
    """
    Run both route-splitting passes and write the combined shapefile.

    Parameters:
    gdf_route (GeoDataFrame) : routes
    gdf_seq (GeoDataFrame) : stops
    in_both_subrouteUID : SubRouteUID values present in both tables
    routesegment_folder (str | None) : output folder. ``None`` (default)
        resolves to ``../03_處理後資料/01_公車路線依站序拆分`` relative to the
        current working directory, obtained through create_folder.
    enable_separate_output (bool) : also write ``BySubRouteUID.shp`` and
        ``ByRouteUID.shp``

    Returns:
    GeoDataFrame : all segments, which are also written to
        ``市區公車拆分.shp``
    """
    # Resolve the default at call time. Calling create_folder in the
    # signature ran it as soon as the function was defined, i.e. merely
    # importing the module / running the cell touched the file system.
    if routesegment_folder is None:
        routesegment_folder = create_folder(os.path.join(os.getcwd(), '..', "03_處理後資料", "01_公車路線依站序拆分"))

    # Routes whose SubRouteUID matches in both tables.
    gdf_routesegment_select = get_bySubRouteUID(gdf_route, gdf_seq, in_both_subrouteUID)
    if enable_separate_output:
        gdf_routesegment_select.to_file(os.path.join(routesegment_folder, 'BySubRouteUID.shp'), index=False)

    # Routes that can only be matched on RouteUID.
    gdf_routesegment_others = get_byRouteUID(gdf_route, gdf_seq, in_both_subrouteUID)
    if enable_separate_output:
        gdf_routesegment_others.to_file(os.path.join(routesegment_folder, 'ByRouteUID.shp'), index=False)

    # The column selection below expects OStopUID / DStopUID. Normalise the
    # 'OstopUID' / 'DstopUID' spelling if the RouteUID pass delivers it, so
    # those values are not replaced by empty columns (rename ignores names
    # that are absent).
    gdf_routesegment_others = gdf_routesegment_others.rename(
        columns={'OstopUID': 'OStopUID', 'DstopUID': 'DStopUID'}
    )

    # Combine both passes and fix the column order.
    gdf_routesegment = pd.concat([gdf_routesegment_select, gdf_routesegment_others], ignore_index=True)
    gdf_routesegment = gdf_routesegment.reindex(columns = ['RouteUID', 'RouteName', 'SubRouteUID', 'SubRouteName', 'Direction', 
                                                           'StartSeq', 'OStopName', 'OStopUID', 
                                                           'EndSeq', 'DStopName', 'DStopUID', 
                                                           'geometry'])

    print(inspect_route_geometries(gdf_routesegment))

    # Write the combined result.
    routesegment_filepath = os.path.join(routesegment_folder, '市區公車拆分.shp')
    gdf_routesegment.to_file(routesegment_filepath, 
                             index=False)

    return gdf_routesegment

def main():
    """
    Run the whole pipeline: load the tables, write the consistency report,
    build the GeoDataFrames and split the routes by stop sequence.
    """
    tdx_folder = os.path.join(os.getcwd(), '..', "00_TDX資料下載")
    report_path = os.path.abspath(os.path.join(os.getcwd(), '..', '02_初步分析', '票證及路線數據檢查.txt'))

    df_seq = read_seq(busstopseq_folder=os.path.join(tdx_folder, "01公車站序資料"))
    df_route = read_busroute(busroute_folder=os.path.join(tdx_folder, "02公車路線資料"))

    # SubRouteUIDs found in both tables decide which splitting pass a route gets.
    in_both_subrouteUID = check_routes_and_save(
        df_route=df_route,
        df_seq=df_seq,
        output_path=report_path,
    )

    gdf_route, gdf_seq = get_gdfroute_gdfseq(df_route, df_seq)

    # The segments are written to disk by get_splitroute; nothing is returned here.
    get_splitroute(
        gdf_route=gdf_route,
        gdf_seq=gdf_seq,
        in_both_subrouteUID=in_both_subrouteUID,
    )
    
# if __name__ == "__main__":
#     main()

# 開始嘗試讀取資料

In [2]:
def read_routeinfo(inputfolder = os.path.join('..', '00_TDX資料下載', '03公車路線營運資料')):
    """
    Load every route-info XML file of a folder into one DataFrame.

    Each file returned by ``findfiles(inputfolder, 'xml')`` is parsed with
    read_businfo_xml; the results are concatenated, and ``RouteNameZh`` /
    ``SubRouteNameZh`` are renamed to ``RouteName_Zh`` / ``SubRouteName_Zh``.
    """
    frames = [read_businfo_xml(path) for path in findfiles(inputfolder, 'xml')]
    column_names = {'RouteNameZh': 'RouteName_Zh', 'SubRouteNameZh': 'SubRouteName_Zh'}
    return pd.concat(frames).rename(columns=column_names)

def compare_uid_sets(A, B, Acol, Bcol, dropna=True):
    """
    Compare the distinct values of one column of A with one column of B.

    Parameters
    ----------
    A, B : pandas.DataFrame
    Acol, Bcol : str
        Column of A / of B to compare.
    dropna : bool, default True
        Ignore missing values when building the value sets (recommended).

    Returns
    -------
    dict
        {
            'common': set of values found in both columns,
            'only_in_A': set of values found only in A,
            'only_in_B': set of values found only in B,
            'A_common_df': rows of A whose value is in 'common',
            'A_only_df': rows of A whose value is in 'only_in_A',
            'B_common_df': rows of B whose value is in 'common',
            'B_only_df': rows of B whose value is in 'only_in_B'
        }
    """
    values_a, values_b = A[Acol], B[Bcol]
    if dropna:
        values_a, values_b = values_a.dropna(), values_b.dropna()
    set_A, set_B = set(values_a), set(values_b)

    common = set_A.intersection(set_B)
    only_A = set_A.difference(set_B)
    only_B = set_B.difference(set_A)

    def _rows(frame, column, wanted):
        # Rows of ``frame`` whose ``column`` value belongs to ``wanted``.
        return frame[frame[column].isin(wanted)]

    return {
        'common': common,
        'only_in_A': only_A,
        'only_in_B': only_B,
        'A_common_df': _rows(A, Acol, common),
        'A_only_df': _rows(A, Acol, only_A),
        'B_common_df': _rows(B, Bcol, common),
        'B_only_df': _rows(B, Bcol, only_B),
    }



In [48]:
# Load the three tables used in the cells below through read_seq,
# read_busroute and read_routeinfo (the latter with its default folder).
df_seq = read_seq(busstopseq_folder = os.path.join(os.getcwd(), '..', "00_TDX資料下載", "01公車站序資料"))
df_route = read_busroute(busroute_folder = os.path.join(os.getcwd(), '..', "00_TDX資料下載", "02公車路線資料"))
df_routeinfo = read_routeinfo()

  df[col] = pd.to_numeric(df[col], errors="ignore")
  df[col] = pd.to_numeric(df[col], errors="ignore")


In [57]:
df_seq

Unnamed: 0,RouteUID,RouteID,RouteName_Zh,RouteName_En,SubRouteUID,SubRouteID,SubRouteName_Zh,SubRouteName_En,Direction,City,...,StopBoarding,StopSequence,PositionLon,PositionLat,GeoHash,StationID,StationGroupID,LocationCityCode,FilePath,Seq_new
62900,KEE0155,155,3044,3044,KEE015501,015501,3044,3044,0,Keelung,...,1,1,121.738670,25.131280,wsqwfpp2k,141049,200－049,KEE,d:\B-Project\2025\6800\Technical\12票證資料\Ticket...,1
62901,KEE0155,155,3044,3044,KEE015501,015501,3044,3044,0,Keelung,...,0,2,121.740275,25.129851,wsqwfq8xf,141125,200－050,KEE,d:\B-Project\2025\6800\Technical\12票證資料\Ticket...,2
62902,KEE0155,155,3044,3044,KEE015501,015501,3044,3044,0,Keelung,...,0,3,121.736985,25.130240,wsqwfny4m,141167,203－029,KEE,d:\B-Project\2025\6800\Technical\12票證資料\Ticket...,3
62903,KEE0155,155,3044,3044,KEE015501,015501,3044,3044,0,Keelung,...,0,4,121.739089,25.133076,wsqwfprdy,143441,203－105,KEE,d:\B-Project\2025\6800\Technical\12票證資料\Ticket...,4
62904,KEE0155,155,3044,3044,KEE015501,015501,3044,3044,0,Keelung,...,0,5,121.740028,25.134482,wsqwfr86u,141126,203－027,KEE,d:\B-Project\2025\6800\Technical\12票證資料\Ticket...,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179098,TPE810,810,敦化幹線,Dunhua Metro Bus,TPE810,810,敦化幹線,Duhua Metro Bus,1,Taipei,...,-1,41,121.553535,25.023243,wsqqq52df,2146,,TPE,d:\B-Project\2025\6800\Technical\12票證資料\Ticket...,41
179099,TPE810,810,敦化幹線,Dunhua Metro Bus,TPE810,810,敦化幹線,Duhua Metro Bus,1,Taipei,...,0,42,121.555911,25.021034,wsqqq4fq4,2085,,TPE,d:\B-Project\2025\6800\Technical\12票證資料\Ticket...,42
179100,TPE810,810,敦化幹線,Dunhua Metro Bus,TPE810,810,敦化幹線,Duhua Metro Bus,1,Taipei,...,0,43,121.557804,25.019261,wsqqq4eey,2172,,TPE,d:\B-Project\2025\6800\Technical\12票證資料\Ticket...,43
179101,TPE810,810,敦化幹線,Dunhua Metro Bus,TPE810,810,敦化幹線,Duhua Metro Bus,1,Taipei,...,0,44,121.558530,25.018250,wsqqq4kjz,2157,,TPE,d:\B-Project\2025\6800\Technical\12票證資料\Ticket...,44


In [36]:
# Keywords that exclude a sub-route when they appear in its name.
filterlist = ['寵物', '通勤']

# OR the keywords together into a single regular expression.
pattern = '|'.join(filterlist)

# Keep the rows whose SubRouteName_Zh matches none of the keywords; rows
# with a missing name count as "no match" (na=False) and are therefore kept.
df_filtered = df_seq[~df_seq['SubRouteName_Zh'].str.contains(pattern, na=False)]


In [37]:
df_filtered

Unnamed: 0,RouteUID,RouteName_Zh,SubRouteUID,SubRouteName_Zh,Direction,R_S_UID
0,THB0968,968,THB096801,0968,0,THB0968-THB096801
20,THB0968,968,THB096802,0968,1,THB0968-THB096802
40,THB0968,968,THB0968A1,0968A,0,THB0968-THB0968A1
62,THB0968,968,THB0968A2,0968A,1,THB0968-THB0968A2
84,THB1031,1031,THB103101,1031,0,THB1031-THB103101
...,...,...,...,...,...,...
178969,TPE19721,雙園巴士,TPE162411,雙園巴士(首都),0,TPE19721-TPE162411
178986,TPE19721,雙園巴士,TPE162411,雙園巴士(首都),1,TPE19721-TPE162411
179002,TPE19755,新莊-臺北車站,TPE162471,新莊-臺北車站,0,TPE19755-TPE162471
179017,TPE810,敦化幹線,TPE810,敦化幹線,0,TPE810-TPE810


In [5]:
# Column names of the two tables as plain lists.
seqcolumns = df_seq.columns.to_list()
routecolumns = df_route.columns.to_list()

In [6]:
# Reduce df_seq to the identifying columns and keep one row per distinct
# combination. This replaces df_seq, so the stop-level columns are gone
# for every cell executed afterwards.
reindex_columns = ['RouteUID',  'RouteName_Zh', 'SubRouteUID', 'SubRouteName_Zh', 'Direction']
df_seq = df_seq.reindex(columns = reindex_columns).drop_duplicates()

In [7]:
# Build a combined key "RouteUID-SubRouteUID" in all three tables. Missing
# parts become empty strings, so a row without SubRouteUID ends in "-".
df_routeinfo['R_S_UID'] = (
    df_routeinfo['RouteUID'].fillna('')
    + '-'
    + df_routeinfo['SubRouteUID'].fillna('')
)

df_seq['R_S_UID'] = (
    df_seq['RouteUID'].fillna('')
    + '-'
    + df_seq['SubRouteUID'].fillna('')
)

df_route['R_S_UID'] = (
    df_route['RouteUID'].fillna('')
    + '-'
    + df_route['SubRouteUID'].fillna('')
)

# Compare the combined key between the route table (A) and the
# stop-sequence table (B) and unpack the pieces used in the next cells.
returnlist = compare_uid_sets(A = df_route, B = df_seq, Acol = 'R_S_UID', Bcol = 'R_S_UID', dropna=True)
commonlist = returnlist['common']
onlyinroute = returnlist['A_only_df']
onlyinseq = returnlist['B_only_df']
route_common_df = returnlist['A_common_df']
seq_common_df = returnlist['B_common_df']


In [21]:
# 一、找出現在
# Step 1: summarise the stop-sequence rows that have no counterpart in the
# route table, one summary row per (RouteUID, Direction).
temp = onlyinseq.groupby(['RouteUID', 'Direction']).agg(Count=('SubRouteUID', 'count'), 
                                                        RouteName_Zh = ('RouteName_Zh', 'first'), 
                                                        SubRouteName_Zh = ('SubRouteName_Zh', 'first'),
                                                        R_S_UID=('R_S_UID', lambda x: ','.join(x.astype(str).unique())),
                                                        SubRouteUID=('SubRouteUID', lambda x: ','.join(x.astype(str).unique())) # comma-joined list of every SubRouteUID in the group
                                                        ).reset_index().reindex(columns = ['RouteUID',  'RouteName_Zh','SubRouteName_Zh', 'Direction', 'R_S_UID', 'SubRouteUID', 'Count'])

# NOTE(review): a RouteUID enters this list as soon as ONE of its directions
# has Count == 1, so its other directions pass the filter below as well - confirm.
routeuidlist_1 = list(temp[temp['Count'] == 1]['RouteUID'].unique()) # RouteUIDs with a (RouteUID, Direction) group in the unmatched df_seq rows that holds exactly one SubRouteUID row
seq_forpairing = temp[(temp['RouteUID'].isin(routeuidlist_1)) & (~temp['R_S_UID'].isin(commonlist))] # must be in the "unique combination" list above and must not be in the common set

In [58]:
df_seq[df_seq['RouteUID'] == 'TPE17967'][['RouteUID', 'RouteName_Zh', 'SubRouteUID', 'SubRouteName_Zh', 'Direction', 'StopName_Zh', 'StopSequence', 'Seq_new']]

Unnamed: 0,RouteUID,RouteName_Zh,SubRouteUID,SubRouteName_Zh,Direction,StopName_Zh,StopSequence,Seq_new
176748,TPE17967,通勤7,TPE159542,通勤7去程半,0,師大綜合大樓,1,1
176749,TPE17967,通勤7,TPE159542,通勤7去程半,0,臺大,2,2
176750,TPE17967,通勤7,TPE159542,通勤7去程半,0,臺大計資中心,3,3
176751,TPE17967,通勤7,TPE159542,通勤7去程半,0,中研院內郵局,7,4
176752,TPE17967,通勤7,TPE159543,通勤7返程半,1,中研院內郵局,1,1
176753,TPE17967,通勤7,TPE159543,通勤7返程半,1,臺大計資中心,5,2
176754,TPE17967,通勤7,TPE159543,通勤7返程半,1,捷運台電大樓站,6,3
176755,TPE17967,通勤7,TPE159543,通勤7返程半,1,師大綜合大樓,7,4
176756,TPE17967,通勤7,TPE159543,通勤7返程半,1,臺大,8,5
176757,TPE17967,通勤7,TPE159630,通勤7中研院發,0,臺大,1,1


In [54]:
df_route[df_route['SubRouteUID'].isna()][['RouteUID', 'RouteName_Zh', 'SubRouteUID', 'SubRouteName_Zh', 'Direction']]

Unnamed: 0,RouteUID,RouteName_Zh,SubRouteUID,SubRouteName_Zh,Direction
1992,NWT10116,242,,,0
1993,NWT10116,242,,,1
1994,NWT10143,棕7,,,0
1995,NWT10143,棕7,,,1
1996,NWT10148,702,,,0
...,...,...,...,...,...
4986,TPE19755,新莊-臺北車站,,,0
4987,TPE19759,通勤30,,,0
4988,TPE19759,通勤30,,,1
4989,TPE810,敦化幹線,,,0


In [47]:
df_route[df_route['RouteUID'] == 'TPE17967']

Unnamed: 0,Geometry,EncodedPolyline,RouteUID,RouteID,RouteName_Zh,RouteName_En,SubRouteUID,SubRouteID,SubRouteName_Zh,SubRouteName_En,Direction,UpdateTime,VersionID,FilePath,R_S_UID
4919,"LINESTRING (121.529955809466 25.0263708727189,...",y}vwCgiwdV|@sYNgBPKvRhA|BLnH|@tMhBh@D^DlC\LE@M...,TPE17967,17967,通勤7,CB7,,,,,0,2025-11-21T13:07:41+08:00,0,d:\B-Project\2025\6800\Technical\12票證資料\Ticket...,TPE17967-
4920,"LINESTRING (121.616288543535 25.042810303741, ...",qdzwCydheVUPOhDGHaATGGy@oDc@g@y@qBy@sAg@e@oBaA...,TPE17967,17967,通勤7,CB7,,,,,1,2025-11-21T13:07:41+08:00,0,d:\B-Project\2025\6800\Technical\12票證資料\Ticket...,TPE17967-


In [26]:
temp[temp['Count'] > 1]

Unnamed: 0,RouteUID,RouteName_Zh,SubRouteName_Zh,Direction,R_S_UID,SubRouteUID,Count
20,NWT10172,243,243,0,"NWT10172-NWT101720,NWT10172-NWT159537","NWT101720,NWT159537",2
21,NWT10172,243,243,1,"NWT10172-NWT101720,NWT10172-NWT159537","NWT101720,NWT159537",2
102,NWT16296,藍15,藍15,0,"NWT16296-NWT157291,NWT16296-NWT159525","NWT157291,NWT159525",2
103,NWT16296,藍15,藍15,1,"NWT16296-NWT157291,NWT16296-NWT159525","NWT157291,NWT159525",2
128,NWT16430,紅36,紅36,0,"NWT16430-NWT157426,NWT16430-NWT161307","NWT157426,NWT161307",2
...,...,...,...,...,...,...,...
1706,TPE17929,紅68,紅68,1,"TPE17929-TPE159413,TPE17929-TPE161201,TPE17929...","TPE159413,TPE161201,TPE161202,TPE162404",4
1711,TPE17967,通勤7,通勤7去程半,0,"TPE17967-TPE159542,TPE17967-TPE159630","TPE159542,TPE159630",2
1712,TPE17967,通勤7,通勤7返程半,1,"TPE17967-TPE159543,TPE17967-TPE159630","TPE159543,TPE159630",2
1789,TPE19721,雙園巴士,雙園巴士(大都會),0,"TPE19721-TPE162406,TPE19721-TPE162407,TPE19721...","TPE162406,TPE162407,TPE162408,TPE162409,TPE162...",6


In [None]:
# Per (RouteUID, Direction) summary of the route rows without a counterpart
# in df_seq. This value of temp is replaced a few lines further down.
temp = onlyinroute.reindex(columns = reindex_columns).groupby(['RouteUID', 'Direction']).agg(Count=('SubRouteUID', 'count'), 
                                                                                               RouteName_Zh = ('RouteName_Zh', 'first'), 
                                                                                               SubRouteName_Zh = ('SubRouteName_Zh', 'first')).reset_index().reindex(columns = ['RouteUID',  'RouteName_Zh', 'SubRouteUID', 'SubRouteName_Zh', 'Direction', 'Count'])
# For the unmatched routes whose RouteUID is in routeuidlist_1, look up the
# SubRouteUID / SubRouteName_Zh of the pairing candidate with the same
# RouteUID and Direction; the looked-up columns get the "_add" suffix.
route_add_subrouteUID = pd.merge(onlyinroute[onlyinroute['RouteUID'].isin(routeuidlist_1)], 
                                 seq_forpairing.reindex(columns = ['RouteUID', 'SubRouteUID', 'SubRouteName_Zh','Direction']).drop_duplicates(['RouteUID', 'Direction']), 
                                 on = ['RouteUID', 'Direction'], 
                                 how = 'left', 
                                 suffixes=['', '_add'])

# temp now holds the routes for which no candidate was found; the routes
# with a candidate stay in route_add_subrouteUID.
temp = route_add_subrouteUID[route_add_subrouteUID['SubRouteUID_add'].isna()].drop(columns = ['SubRouteUID_add', 'SubRouteName_Zh_add'])
route_add_subrouteUID = route_add_subrouteUID[~route_add_subrouteUID['SubRouteUID_add'].isna()]


# Routes considered usable: matched directly on R_S_UID, or paired above.
okroute = pd.concat([route_common_df, route_add_subrouteUID])



In [53]:
# Same comparison on R_S_UID, this time route table (A) against the
# route-info table (B). returnlist, commonlist, onlyinroute and
# route_common_df from the route-vs-sequence comparison are overwritten.
returnlist = compare_uid_sets(A = df_route, B = df_routeinfo, Acol = 'R_S_UID', Bcol = 'R_S_UID', dropna=True)
commonlist = returnlist['common']
onlyinroute = returnlist['A_only_df']
onlyinrouteinfo = returnlist['B_only_df']
route_common_df = returnlist['A_common_df']
routeinfo_common_df = returnlist['B_common_df']

In [56]:
onlyinroute

Unnamed: 0,Geometry,EncodedPolyline,RouteUID,RouteID,RouteName_Zh,RouteName_En,SubRouteUID,SubRouteID,SubRouteName_Zh,SubRouteName_En,Direction,UpdateTime,VersionID,FilePath,R_S_UID
938,"LINESTRING(120.684120 23.754240,120.684020 23....",_o~oCwbr_VNRX\^d@\\FLb@h@BD~@hA??j@e@??dATPDP@...,THB6723,6723,6723,6723,THB672301,672301,6723,6723,0,2025-11-27T01:26:18+08:00,0,d:\B-Project\2025\6800\Technical\12票證資料\Ticket...,THB6723-THB672301
939,"LINESTRING(120.65659 23.63272,120.65673 23.632...",owfoCuvl_VE[w@aB{AeCsAeB[s@a@uA??GSgA?c@B_ADOH...,THB6723,6723,6723,6723,THB672302,672302,6723,6723,1,2025-11-27T01:26:18+08:00,0,d:\B-Project\2025\6800\Technical\12票證資料\Ticket...,THB6723-THB672302
940,"LINESTRING(120.684110 23.754240,120.684020 23....",_o~oCubr_VNPX\^d@\\FLb@h@BD~@hA??aCrC??|AnBpAn...,THB6724,6724,6724,6724,THB672401,672401,6724,6724,0,2025-11-27T01:26:18+08:00,0,d:\B-Project\2025\6800\Technical\12票證資料\Ticket...,THB6724-THB672401
941,"LINESTRING(120.71533 23.70441,120.7154 23.7043...",qwtoCyex_VJMPSFGJIHCNCxB^z@Nd@HZD^BL?L@R@B@J@N...,THB6724,6724,6724,6724,THB672402,672402,6724,6724,1,2025-11-27T01:26:18+08:00,0,d:\B-Project\2025\6800\Technical\12票證資料\Ticket...,THB6724-THB672402
1992,"LINESTRING (121.49087853265 24.9901727687255, ...",q{owC_uodVtBw@@KsEwAi@mHWoA_@q@m@o@aAu@u@e@mCu...,NWT10116,10116,242,242,,,,,0,2025-11-21T12:47:37+08:00,0,d:\B-Project\2025\6800\Technical\12票證資料\Ticket...,NWT10116-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4986,LINESTRING (121.45272006084542 25.058925431717...,ii}wCofhdV?lAf@`@jZLHMDOGuFAmZAgCB_FQm@kFB{C?}...,TPE19755,19755,新莊-臺北車站,Xinzhuang-Taipei Main Sta.,,,,,0,2025-11-21T13:07:41+08:00,0,d:\B-Project\2025\6800\Technical\12票證資料\Ticket...,TPE19755-
4987,LINESTRING (121.54579893213516 25.094649965638...,qhdxCglzdVLYAEGGEDc@`A_@v@o@tAw@jBq@xAo@nA[d@Y...,TPE19759,19759,通勤30,CB30,,,,,0,2025-11-21T13:07:41+08:00,0,d:\B-Project\2025\6800\Technical\12票證資料\Ticket...,TPE19759-
4988,LINESTRING (121.50908523243753 25.039045840649...,amywCyfsdVPmENwDD[NCzCPhG^`CJHH?TaBjFsA`EaBlFO...,TPE19759,19759,通勤30,CB30,,,,,1,2025-11-21T13:07:41+08:00,0,d:\B-Project\2025\6800\Technical\12票證資料\Ticket...,TPE19759-
4989,"LINESTRING (121.560786999999 25.0159169999998,...",o|twC}i}dV[T[L}DnAo@XsEfESf@cBdBeCpC_RhSmG`Ha@...,TPE810,810,敦化幹線,Dunhua Metro Bus,,,,,0,2025-11-21T13:07:41+08:00,0,d:\B-Project\2025\6800\Technical\12票證資料\Ticket...,TPE810-


# 嘗試

In [None]:
# Trial run of the first pipeline steps (the same calls main() makes):
# reload both tables from disk ...
df_seq = read_seq(busstopseq_folder = os.path.join(os.getcwd(), '..', "00_TDX資料下載", "01公車站序資料"))
df_route = read_busroute(busroute_folder = os.path.join(os.getcwd(), '..', "00_TDX資料下載", "02公車路線資料"))


# ... write the consistency report and keep the SubRouteUIDs found in both tables ...
in_both_subrouteUID = check_routes_and_save(df_route = df_route, 
                                            df_seq = df_seq, 
                                            output_path= os.path.abspath(os.path.join(os.getcwd(), '..', '02_初步分析', '票證及路線數據檢查.txt')))

# ... and convert both tables to GeoDataFrames.
gdf_route, gdf_seq = get_gdfroute_gdfseq(df_route, df_seq)


In [None]:
def snap_points_to_line(stops_gdf, 
                        routes_gdf, 
                        route_id_col, 
                        route_direction_col, 
                        seq_id_col, 
                        seq_direction_col, 
                        seq_lat_col, 
                        seq_lng_col,
                        route_geom_col="geometry"):
    """
    Snap every stop onto the line of its own route and direction.

    For each row of ``stops_gdf`` the first row of ``routes_gdf`` whose
    ``route_id_col`` / ``route_direction_col`` values equal the stop's
    ``seq_id_col`` / ``seq_direction_col`` values is looked up, and the stop
    geometry is replaced by ``line.interpolate(line.project(point))``, i.e. the
    position on that line closest to the stop. Stops without a matching route,
    and stops or routes whose geometry is missing or empty, keep their
    original geometry.

    Parameters:
        stops_gdf (GeoDataFrame): Stops; the point of each stop is read from
            its ``geometry`` column.
        routes_gdf (GeoDataFrame): Route lines.
        route_id_col (str): Column of ``routes_gdf`` holding the route id.
        route_direction_col (str): Column of ``routes_gdf`` holding the direction.
        seq_id_col (str): Column of ``stops_gdf`` holding the route id of the stop.
        seq_direction_col (str): Column of ``stops_gdf`` holding the direction of the stop.
        seq_lat_col (str): Column of ``stops_gdf`` that receives the y coordinate
            of the snapped point.
        seq_lng_col (str): Column of ``stops_gdf`` that receives the x coordinate
            of the snapped point.
        route_geom_col (str): Column of ``routes_gdf`` holding the line geometry.
            Defaults to ``"geometry"``.

    Returns:
        GeoDataFrame: A copy of ``stops_gdf`` whose ``geometry`` column holds the
        snapped points and whose ``seq_lat_col`` / ``seq_lng_col`` columns are
        overwritten with the y / x of those points. The input frame is not modified.

    NOTE(review): no reprojection happens here, so both frames are assumed to
    share one coordinate system - confirm at the call site.
    """
    def _is_missing(geom):
        # None, NaN placeholders and empty geometries cannot be projected.
        return geom is None or getattr(geom, "is_empty", True)

    # Stops of the same route/direction share one line, so the route table is
    # filtered once per distinct (route id, direction) pair instead of per stop.
    line_by_key = {}

    def _line_for(route_id, direction):
        key = (route_id, direction)
        if key not in line_by_key:
            matching_route = routes_gdf[(routes_gdf[route_id_col] == route_id) &
                                        (routes_gdf[route_direction_col] == direction)]
            if matching_route.empty:
                line_by_key[key] = None
            else:
                # As before, only the first matching route row is used.
                line_by_key[key] = matching_route.iloc[0][route_geom_col]
        return line_by_key[key]

    snapped_points = []
    for _, stop in stops_gdf.iterrows():
        point = stop.geometry
        line = _line_for(stop[seq_id_col], stop[seq_direction_col])

        if _is_missing(point) or _is_missing(line):
            # Nothing to snap to (or nothing to snap): keep the stop unchanged.
            snapped_points.append(point)
        else:
            snapped_points.append(line.interpolate(line.project(point)))

    # Work on a copy so the caller's frame keeps its original geometry.
    stops_gdf = stops_gdf.copy()
    stops_gdf['geometry'] = snapped_points
    stops_gdf[seq_lat_col] = stops_gdf.geometry.y
    stops_gdf[seq_lng_col] = stops_gdf.geometry.x
    return stops_gdf

def split_routes(busroute_select, 
                 seq_select,
                 route_id_col='RouteName',
                 route_direction_col='Direction',
                 seq_id_col='RouteName',
                 seq_direction_col='Direction',
                 seq_seq_col='Seq',
                 seq_lat_col='Lat',
                 seq_lng_col='Lon'):
    """
    Cut every route line into stop-to-stop segments.

    For each row of ``busroute_select`` the stops of ``seq_select`` with the
    same route id and direction are sorted by ``seq_seq_col``; every stop is
    located on the route line with ``geometry.project`` and each pair of
    consecutive stops yields one segment cut out with ``shapely.ops.substring``.

    Parameters:
        busroute_select (GeoDataFrame): Routes; the line is read from the
            ``geometry`` column.
        seq_select (DataFrame): Stop sequence of the routes.
        route_id_col (str): Column of ``busroute_select`` holding the route id.
        route_direction_col (str): Column of ``busroute_select`` holding the direction.
        seq_id_col (str): Column of ``seq_select`` holding the route id of the stop.
        seq_direction_col (str): Column of ``seq_select`` holding the direction of the stop.
        seq_seq_col (str): Column of ``seq_select`` holding the stop order.
        seq_lat_col (str): Column of ``seq_select`` holding the stop latitude (y).
        seq_lng_col (str): Column of ``seq_select`` holding the stop longitude (x).

    Returns:
        GeoDataFrame: One row per pair of consecutive stops with the columns
        ``RouteName``, ``Direction``, ``StartSeq``, ``EndSeq`` and ``geometry``
        (the column names are fixed, whatever the input column names are).
        No CRS is assigned. Routes with fewer than two stops, or with a
        missing/empty line, contribute no rows; the columns are present even
        when the result is empty.

    NOTE(review): stop coordinates and route lines are assumed to be in the
    same coordinate system, and ``project`` picks the closest position on the
    whole line, so on routes that pass the same street twice a stop may be
    located on the wrong pass - confirm against the data. See the
    ``shapely.ops.substring`` documentation for how equal or descending
    distances are handled.
    """
    output_columns = ['RouteName', 'Direction', 'StartSeq', 'EndSeq', 'geometry']
    output = []

    for _, route in busroute_select.iterrows():
        route_id = route[route_id_col]
        direction = route[route_direction_col]
        geometry = route['geometry']

        # A missing (None/NaN) or empty line cannot be measured along.
        if geometry is None or getattr(geometry, 'is_empty', True):
            continue

        # Stops of this route and direction, in travel order.
        stops = seq_select[(seq_select[seq_id_col] == route_id) & 
                           (seq_select[seq_direction_col] == direction)].sort_values(seq_seq_col)

        # Read the columns once; taking values from the column (rather than
        # from a row slice) keeps the sequence values in the column's own dtype.
        seq_values = stops[seq_seq_col].tolist()
        # Distance of every stop along the line, computed once per stop.
        distances = [geometry.project(Point(lng, lat))
                     for lng, lat in zip(stops[seq_lng_col], stops[seq_lat_col])]

        consecutive = zip(seq_values, seq_values[1:], distances, distances[1:])
        for start_seq, end_seq, start_distance, end_distance in consecutive:
            output.append({
                'RouteName': route_id,
                'Direction': direction,
                'StartSeq': start_seq,
                'EndSeq': end_seq,
                'geometry': substring(geometry, start_distance, end_distance)
            })

    # Passing the columns keeps the schema stable when no segment was produced.
    return gpd.GeoDataFrame(output, columns=output_columns)

In [None]:


# Process the routes whose SubRouteUID is present in both the route data and the stop-sequence data first.
outputfolder = r"D:\B-Project\2025\6800\Technical\12票證資料\TicketAnalysis\00_TDX資料下載\Trial"
# The shapefile exports at the end fail when the target folder does not exist.
os.makedirs(outputfolder, exist_ok=True)

# Column names shared by the route table and the stop-sequence table.
route_id_col='SubRouteUID'
route_direction_col='Direction'
seq_id_col='SubRouteUID'
seq_direction_col='Direction'
seq_seq_col = 'StopSequence'
seq_lat_col='PositionLat'
seq_lng_col='PositionLon'

busroute = gdf_route[gdf_route[route_id_col].isin(in_both_subrouteUID)].reset_index(drop=True)
seq = gdf_seq[gdf_seq[seq_id_col].isin(in_both_subrouteUID)].reset_index(drop=True)

# Per route/direction results, concatenated and exported after the loop.
gdf_segment_routes = []
gdf_snappoints = []

original_gdfseq = []
original_gdfroute = []


def _concat_and_export(frames, filename, label):
    """Concatenate ``frames``, write them to ``filename`` in ``outputfolder`` and return the result.

    An empty list is returned unchanged and nothing is written.
    """
    if len(frames) == 0:
        return frames
    merged = pd.concat(frames)
    merged.to_file(os.path.join(outputfolder, filename))
    print(f"{label} 輸出成功")
    return merged


for route in in_both_subrouteUID:
    # NOTE(review): only directions 0 and 1 are processed; rows carrying any
    # other Direction value are ignored here - confirm this is intended.
    for direction in [0, 1]:
        print("\n\n=====Start=====")
        print(f"route {route}" )
        print(f"direction {direction}")
        print("======")

        # Route line(s) of this route and direction.
        busroute_select = busroute[(busroute[route_id_col] == route) &
                                   (busroute[route_direction_col] == direction)][
            [route_id_col, route_direction_col, 'geometry']].reset_index(drop=True)

        # Matching stop sequence, in stop order.
        seq_select = seq[(seq[seq_id_col] == route) &
                         (seq[seq_direction_col] == direction)].sort_values(seq_seq_col).reset_index(drop=True)

        # Many routes have stops for one direction only. Skip before building
        # geometries: a row-wise apply on an empty frame does not yield a
        # geometry column.
        if seq_select.empty:
            continue

        # Build the stop points from the coordinate columns and turn the
        # selection into a GeoDataFrame with one row per stop sequence number.
        seq_select['geometry'] = [Point(lng, lat)
                                  for lng, lat in zip(seq_select[seq_lng_col], seq_select[seq_lat_col])]
        seq_select = gpd.GeoDataFrame(seq_select, geometry='geometry').drop_duplicates(subset=[seq_seq_col]).reset_index(drop=True)
        seq_select = seq_select.set_crs(epsg=4326)

        print(f"seq_select_{route}_{direction}:", end="　")
        print(len(seq_select))
        original_gdfseq.append(seq_select)
        original_gdfroute.append(busroute_select)

        # 01 Snap the stops onto the route line.
        gdf_snapstop_select = snap_points_to_line(seq_select,
                                                  busroute_select,
                                                  route_id_col=route_id_col,
                                                  route_direction_col=route_direction_col,
                                                  seq_id_col=seq_id_col,
                                                  seq_direction_col=seq_direction_col,
                                                  seq_lat_col=seq_lat_col,
                                                  seq_lng_col=seq_lng_col)
        # Collected exactly once per route/direction (it used to be appended a
        # second time below, which duplicated every stop in the export).
        if len(gdf_snapstop_select) > 0:
            gdf_snappoints.append(gdf_snapstop_select)

        # 02 Split the route line into stop-to-stop segments.
        gdf_routesegment_select = split_routes(busroute_select,
                                               seq_select,
                                               route_id_col=route_id_col,
                                               route_direction_col=route_direction_col,
                                               seq_id_col=seq_id_col,
                                               seq_direction_col=seq_direction_col,
                                               seq_seq_col=seq_seq_col,
                                               seq_lat_col=seq_lat_col,
                                               seq_lng_col=seq_lng_col)

        if len(gdf_routesegment_select) > 0:
            gdf_routesegment_select = gdf_routesegment_select.set_crs(epsg=4326)
            gdf_segment_routes.append(gdf_routesegment_select)

# Export; each name ends up bound to the concatenated frame (or stays an empty list).
print(len(gdf_snappoints))
gdf_snappoints = _concat_and_export(gdf_snappoints, 'SnappedSequence.shp', 'gdf_snappoints')
original_gdfseq = _concat_and_export(original_gdfseq, 'original_gdfseq.shp', 'original_gdfseq')
original_gdfroute = _concat_and_export(original_gdfroute, 'original_gdfroute.shp', 'original_gdfroute')
gdf_segment_routes = _concat_and_export(gdf_segment_routes, 'gdf_segment_routes.shp', 'gdf_segment_routes')