In [106]:
import os
import geopandas as gpd
import pandas as pd 
from shapely.geometry import Point, LineString
from shapely import wkt # for WKT 轉幾何物件
from TDXdataframe import read_businfo_xml

In [107]:
# 00_setup_os處理函數
def create_folder(folder_name):
    """Create ``folder_name`` (including missing parents) if it does not exist.

    Args:
        folder_name (str): Relative or absolute path of the folder to create.

    Returns:
        str: Absolute path of the folder.

    Raises:
        FileExistsError: If the path already exists but is not a directory.
    """
    # exist_ok=True replaces the separate exists() check, so a folder created
    # by someone else between the check and the call no longer raises.
    os.makedirs(folder_name, exist_ok=True)
    return os.path.abspath(folder_name)

def findfiles(filefolderpath, filetype='.csv', recursive=True):
    """
    Collect the paths of files whose name ends with ``filetype``.

    Args:
        filefolderpath (str): Folder to search.
        filetype (str, optional): Suffix a file name must end with.
            Defaults to '.csv'.
        recursive (bool, optional): When True (default) every sub-folder is
            searched as well; when False only the files directly inside
            ``filefolderpath`` are considered.

    Returns:
        list: Paths of all matching files.
    """
    if recursive:
        # Walk the folder and all of its sub-folders.
        return [
            os.path.join(dirpath, name)
            for dirpath, _, names in os.walk(filefolderpath)
            for name in names
            if name.endswith(filetype)
        ]

    # Top level only: pair each entry name with its joined path so that
    # directories can be filtered out.
    entries = (
        (name, os.path.join(filefolderpath, name))
        for name in os.listdir(filefolderpath)
    )
    return [
        path
        for name, path in entries
        if os.path.isfile(path) and name.endswith(filetype)
    ]

def read_combined_dataframe(file_list, filepath = True):
    """Read every supported file in ``file_list`` and stack the rows into one frame.

    Supported suffixes are ``.csv`` (``pd.read_csv``), ``.shp``
    (``gpd.read_file``) and ``.xls`` / ``.xlsx`` (``pd.read_excel``). Files
    with any other suffix, and files that fail to load, are reported with
    ``print`` and skipped so that one bad file does not abort the batch.

    Args:
        file_list (list): Paths of the files to read.
        filepath (bool, optional): When True (default) a ``FilePath`` column
            holding the source path is added to every loaded frame.

    Returns:
        DataFrame: All loaded frames concatenated with a fresh index. An empty
        ``pd.DataFrame`` is returned when nothing could be loaded.
    """
    dataframes = []

    for file in file_list:
        try:
            if file.endswith('.csv'):
                df = pd.read_csv(file)
            elif file.endswith('.shp'):
                df = gpd.read_file(file)
            elif file.endswith(('.xls', '.xlsx')):
                df = pd.read_excel(file)
            else:
                print(f"Unsupported file format: {file}")
                continue
            if filepath:
                df['FilePath'] = file  # remember which file each row came from
            dataframes.append(df)
        except Exception as e:
            # Deliberate best-effort: report the failure and keep going.
            print(f"Error reading {file}: {e}")

    # pd.concat raises ValueError on an empty list, so handle that case here.
    if not dataframes:
        return pd.DataFrame()

    return pd.concat(dataframes, ignore_index=True)

# 01_geodataframe 圖像處理

def dataframe_to_point(df, lon_col, lat_col, crs="EPSG:4326", target_crs="EPSG:3826"):
    '''
    Turn a table with coordinate columns into a point GeoDataFrame.

    Parameters:
    df (dataframe) : DataFrame holding the coordinate columns
    lon_col (str) : name of the longitude (x) column
    lat_col (str) : name of the latitude (y) column
    crs (str) : coordinate system the coordinates are currently expressed in,
        commonly EPSG:4326 (WGS84) or EPSG:3826 (TWD97)
    target_crs : coordinate system to reproject to; handed to ``to_crs``
        unchanged

    Returns:
    GeoDataFrame : the columns of ``df`` plus a point geometry, in ``target_crs``
    '''
    # One point per row, x taken from lon_col and y from lat_col.
    geometry = [Point(xy) for xy in zip(df[lon_col], df[lat_col])]
    # Tag the points with the CRS the coordinates were recorded in.
    gdf = gpd.GeoDataFrame(df, geometry=geometry, crs=crs)
    # Pass the CRS through as-is instead of slicing the text after ":", which
    # only worked for "AUTHORITY:CODE" strings and always assumed EPSG.
    return gdf.to_crs(target_crs)

def get_line(df, x1 = 'Lon_o', x2 = 'Lon_d', y1 = 'Lat_o', y2 = 'Lat_d', target_crs = "EPSG:3826", crs = "EPSG:4326"):
    '''
    Build one straight two-point line per row, from (x1, y1) to (x2, y2).

    Parameters:
    df (dataframe) : DataFrame holding the coordinate columns; it is not modified
    x1 (str) : name of the origin longitude column
    y1 (str) : name of the origin latitude column
    x2 (str) : name of the destination longitude column
    y2 (str) : name of the destination latitude column
    target_crs : coordinate system of the returned lines
    crs : coordinate system the input coordinates are expressed in;
        defaults to WGS84 (EPSG:4326), the value that used to be hard-coded

    Returns:
    GeoDataFrame : a copy of ``df`` with a ``geometry`` column of LineStrings,
        reprojected to ``target_crs``
    '''
    # Build the lines from plain column iteration; this also copes with an
    # empty frame, where a row-wise apply does not yield a usable column.
    lines = [
        LineString([(ox, oy), (dx, dy)])
        for ox, oy, dx, dy in zip(df[x1], df[y1], df[x2], df[y2])
    ]
    # Work on a copy so the caller's frame does not gain a geometry column.
    gdf = gpd.GeoDataFrame(df.copy(), geometry=lines, crs=crs)
    # Hand the target CRS over unchanged rather than parsing it as "EPSG:<code>".
    return gdf.to_crs(target_crs)



In [108]:
def _add_category_and_number(gdf):
    """Add ``Category`` and ``Number`` columns derived from the ``TRTS5`` code.

    ``Category`` is 0 when the code contains 'CD', 1 when it contains 'SL' and
    None otherwise. ``Number`` is the first run of digits in the code, as int.
    The frame is modified in place and returned.
    """
    gdf['Category'] = gdf['TRTS5'].apply(
        lambda x: 0 if 'CD' in x else 1 if 'SL' in x else None
    )
    gdf['Number'] = (
        gdf['TRTS5']
        .str.extract(r'(\d+)')
        .astype(int)
    )
    return gdf


def get_screenline_and_surveypoint(SLfolder):
    """Load the screenline and survey-point shapefiles found in ``SLfolder``.

    Args:
        SLfolder (str): Folder containing 'TRTS5_屏柵線線型.shp' and
            'TRTS5_全部調查點位.shp'.

    Returns:
        tuple: ``(gdf_sl, gdf_sp)``.
            ``gdf_sl`` holds the screenlines with added ``Category`` and
            ``Number`` columns, sorted by those two columns.
            ``gdf_sp`` holds the survey points with ``X``/``Y`` renamed to
            ``PositionLon``/``PositionLat``, ``Name`` split on '-' into
            ``TRTS5`` and an integer ``SP_No``, the same ``Category`` and
            ``Number`` columns, sorted by Category, Number and SP_No.
    """
    gdf_sl = gpd.read_file(os.path.join(SLfolder, 'TRTS5_屏柵線線型.shp'))
    gdf_sl = _add_category_and_number(gdf_sl)
    gdf_sl = gdf_sl.sort_values(['Category', 'Number'])

    gdf_sp = gpd.read_file(os.path.join(SLfolder, 'TRTS5_全部調查點位.shp'))
    gdf_sp = gdf_sp.rename(columns = {'X':'PositionLon', 'Y':'PositionLat'})
    # "Name" has the form "<TRTS5 code>-<point number>".
    gdf_sp[['TRTS5', 'SP_No']] = gdf_sp['Name'].str.split('-', expand=True)
    gdf_sp['SP_No'] = gdf_sp['SP_No'].astype('int64')
    gdf_sp = _add_category_and_number(gdf_sp)
    gdf_sp = gdf_sp.sort_values(['Category', 'Number', 'SP_No'])

    return gdf_sl, gdf_sp

def get_gdf_sp_unique(gdf_sp):
    '''Collapse survey points that share a name into one averaged point.

    The same survey point name appears once per survey source, so the
    coordinates are averaged per (Category, TRTS5, SP_No) while the first
    ``Name`` and ``Surveyname`` of each group are kept. The averaged
    coordinates are read as EPSG:4326 and returned as points in EPSG:3826.
    '''
    group_keys = ['Category', 'TRTS5', 'SP_No']
    aggregations = {
        'Name': 'first',
        'PositionLon': 'mean',
        'PositionLat': 'mean',
        'Surveyname': 'first',
    }

    ordered = gdf_sp.sort_values(group_keys)
    df_sp_unique = ordered.groupby(group_keys).agg(aggregations).reset_index()

    return dataframe_to_point(
        df=df_sp_unique,
        lon_col='PositionLon',
        lat_col='PositionLat',
        crs="EPSG:4326",
        target_crs="EPSG:3826",
    )

def get_df_seq(seqfolder):
    '''Read the bus stop-sequence CSV files and return them as one DataFrame.

    seqfolder (str): folder containing the stop-sequence CSV files

    The result is a plain DataFrame (no geometry) restricted to the route,
    sub-route, direction, stop and coordinate columns, with
    ``SubRouteName_Zh`` renamed to ``SRouteName`` and duplicate stops removed.
    Rows without a ``StopSequence`` are dropped when they make up less than
    5% of the data; otherwise they are kept and a warning is printed so they
    can be inspected.
    '''
    df_seq = read_combined_dataframe(file_list=findfiles(seqfolder))
    df_seq = df_seq.reindex(columns = ['RouteUID', 'SubRouteUID', 'SubRouteName_Zh','Direction','StopUID', 'StopSequence', 'PositionLon', 'PositionLat']).rename(columns = {'SubRouteName_Zh':'SRouteName'})
    df_seq = df_seq.drop_duplicates(subset= ['RouteUID', 'SubRouteUID','Direction', 'StopSequence', 'PositionLon', 'PositionLat'])

    # Nothing to check on an empty table; also avoids dividing by zero below.
    if df_seq.empty:
        return df_seq

    missing = df_seq['StopSequence'].isna()
    missing_ratio = missing.sum() / len(df_seq)
    if missing_ratio < 0.05:
        df_seq = df_seq[~missing]  # some stops come without a sequence number
    else:
        # The share is computed from row counts; it used to divide the
        # filtered DataFrame itself by the row count.
        percentage = round(missing_ratio * 100, 2)
        print(f"===== 需要檢查沒有站序的站點有哪些，共有{percentage}%的資料沒有站序 =====")

    return df_seq

def get_gdfroutepair(df_seq):
    """Build a straight segment between each stop and the next stop on its route.

    Stops are ordered by StopSequence within every
    (RouteUID, SubRouteUID, Direction) group and each stop is paired with its
    successor. The last stop of a group has no successor and is dropped.

    Returns the result of ``get_line`` for the From/To coordinate columns.
    """
    group_keys = ['RouteUID', 'SubRouteUID', 'Direction']
    stop_columns = ['StopUID', 'StopSequence', 'PositionLon', 'PositionLat']

    # Sort first so that "next row in the group" means "next stop".
    ordered = df_seq.sort_values(by=group_keys + ['StopSequence'])

    # Shift all stop attributes up by one row inside each group in one pass.
    # NOTE(review): rows whose grouping key is missing may receive no
    # successor under pandas' default groupby NaN handling - confirm.
    following = ordered.groupby(group_keys)[stop_columns].shift(-1)

    ordered = ordered.assign(
        # "To" side: the next stop.
        ToStop=following['StopUID'],
        ToSeq=following['StopSequence'],
        ToLon=following['PositionLon'],
        ToLat=following['PositionLat'],
        # "From" side: the current stop.
        FromSeq=ordered['StopSequence'],
        FromLon=ordered['PositionLon'],
        FromLat=ordered['PositionLat'],
        FromStop=ordered['StopUID'],
    )

    # Keep only the pair columns, in a fixed order.
    pairs = ordered.reindex(columns=[
        'RouteUID', 'SubRouteUID', 'Direction',
        'FromStop', 'FromSeq', 'FromLon', 'FromLat',
        'ToStop', 'ToSeq', 'ToLon', 'ToLat',
    ])

    # Terminal stops have no "To" values.
    pairs = pairs.dropna(subset=['ToSeq']).reset_index(drop=True)

    return get_line(pairs, x1='FromLon', x2='ToLon', y1='FromLat', y2='ToLat')

def buffer_distance(gdf, buffermeters):
    """Return a copy of ``gdf`` whose ``geometry`` column is buffered by ``buffermeters``.

    The input frame is left untouched. The distance is passed straight to
    ``geometry.buffer``.
    """
    buffered = gdf.copy()
    shapes = buffered.geometry.buffer(buffermeters)
    buffered['geometry'] = shapes
    return buffered

def get_gdf_route(routefolder):
    """Load the bus route shape files in ``routefolder`` as a GeoDataFrame.

    The ``Geometry`` column is parsed as WKT text into geometry objects and
    replaced by a ``geometry`` column; the result is tagged as EPSG:4326.
    """
    route_files = findfiles(routefolder)
    raw = read_combined_dataframe(route_files)

    # Parse the WKT text, then swap the text column for the parsed shapes.
    shapes = raw["Geometry"].apply(wkt.loads)
    attributes = raw.drop(columns=['Geometry'])
    attributes["geometry"] = shapes

    return gpd.GeoDataFrame(attributes, geometry="geometry", crs="EPSG:4326")



In [109]:
# 00_Setup: notebook-wide settings
# 1.) Parameters: buffer distances handed to buffer_distance
SL_buffer_meters, SP_buffer_meters = 500, 50

# 2.) Input data folders (abspath resolves relative paths against the current working directory)
referencefolder = os.path.abspath(os.path.join('..', '參考資料'))
seqfolder = os.path.abspath(os.path.join('..', '00_TDX資料下載', '01公車站序資料'))
routefolder = os.path.abspath(os.path.join('..', '00_TDX資料下載', '02公車路線資料'))
SLfolder = os.path.abspath(os.path.join('..', '..', 'TRTS5屏柵線'))


In [110]:
# 1. Load the data
# Screenlines and survey points supplied by the model revision team
gdf_sl, gdf_sp = get_screenline_and_surveypoint(SLfolder=SLfolder)
# Work in TWD97 (EPSG:3826) so that buffering can be used
gdf_sp_unique = get_gdf_sp_unique(gdf_sp).to_crs(epsg=3826) # the supplied points are split by road segment, so collapse them to one location first
gdf_sl_twd97 = gdf_sl.to_crs(epsg=3826).reindex(columns = ['TRTS5', 'Category', 'Number', 'geometry']) # reproject to TWD97 and keep only the needed columns
# Bus stop sequences
df_seq = get_df_seq(seqfolder=seqfolder)
gdf_seq = dataframe_to_point(df = df_seq, lon_col='PositionLon', lat_col= 'PositionLat', crs="EPSG:4326", target_crs="EPSG:3826")
# Original bus route shapes
gdf_route  = get_gdf_route(routefolder)
gdf_route = gdf_route.to_crs(epsg=3826)

# 2. Use buffers for a first check of which routes pass the survey points
gdf_routepair = get_gdfroutepair(df_seq) # straight stop-to-stop segments of every bus route
gdf_sl_buffer = buffer_distance(gdf=gdf_sl_twd97, buffermeters=SL_buffer_meters).reset_index(drop = True) # buffer around the screenlines
gdf_sp_buffer = buffer_distance(gdf=gdf_sp_unique, buffermeters=SP_buffer_meters).reset_index(drop = True) # buffer around the survey points

# Keep every stop-to-stop segment that intersects a survey-point buffer.
route_in_buffer = gpd.sjoin(gdf_routepair,
                            gdf_sp_buffer[['Category', 'TRTS5', 'SP_No', 'Name', 'PositionLon', 'PositionLat','Surveyname', 'geometry']],
                            how='inner',
                            predicate='intersects'
                            ).drop(columns='index_right')

# Alternative kept for reference: join against the screenline buffers instead.
# route_in_buffer = gpd.sjoin(gdf_routepair,
#                             gdf_sl_buffer[['TRTS5','Category','Number','geometry']],
#                             how='inner',
#                             predicate='intersects'
#                             ).drop(columns='index_right')

# Still being verified

In [111]:
# gdf_route_in_spbuffer = gpd.sjoin(gdf_route,
#                                   gdf_sp_buffer[['Category', 'TRTS5', 'SP_No', 'Name', 'PositionLon', 'PositionLat','Surveyname', 'geometry']],
#                                   how='inner',
#                                   predicate='intersects').drop(columns='index_right')

先重新確認我們的站序跟站名資訊是否可以核對成功

In [112]:
# Parse every route operating-info XML file into its own DataFrame.
busrouteinfofiles = findfiles(os.path.join('..', '00_TDX資料下載', '03公車路線營運資料'), 'xml')
routeinfo = []
for file in busrouteinfofiles:
    routeinfo.append(read_businfo_xml(file))

# Stack the frames, keep the identifying columns and align the name columns
# with the "_Zh" naming used by the other tables.
df_routeinfo = pd.concat(routeinfo)
df_routeinfo = df_routeinfo.loc[:, ['RouteUID', 'RouteNameZh', 'SubRouteUID', 'SubRouteNameZh','Headsign',  'Direction']]
df_routeinfo = df_routeinfo.rename(columns = {'RouteNameZh':'RouteName_Zh', 'SubRouteNameZh':'SubRouteName_Zh'})

# Combined key "<RouteUID>-<SubRouteUID>"; a missing part becomes an empty string.
df_routeinfo['R_S_UID'] = (
    df_routeinfo['RouteUID'].fillna('')
    .add('-')
    .add(df_routeinfo['SubRouteUID'].fillna(''))
)


In [113]:
# Reload the stop-sequence files and reduce them to one row per
# route / sub-route / direction. This rebinds df_seq.
df_seq = read_combined_dataframe(file_list=findfiles(seqfolder))
df_seq = df_seq.reindex(columns = ['RouteUID', 'RouteName_Zh', 'SubRouteUID', 'SubRouteName_Zh', 'Direction'])
df_seq = df_seq.drop_duplicates()
df_seq = df_seq.reset_index(drop = True)

# Combined key "<RouteUID>-<SubRouteUID>"; a missing part becomes an empty string.
df_seq['R_S_UID'] = (
    df_seq['RouteUID'].fillna('')
    .add('-')
    .add(df_seq['SubRouteUID'].fillna(''))
)



In [114]:
# Reload the route shapes in EPSG:3826 and keep only the identifying columns
# plus the geometry.
gdf_route = (
    get_gdf_route(routefolder)
    .to_crs(epsg=3826)
    .reindex(columns = ['RouteUID', 'RouteName_Zh', 'SubRouteUID', 'SubRouteName_Zh', 'Direction', 'geometry'])
)

# Combined key "<RouteUID>-<SubRouteUID>"; a missing part becomes an empty string.
gdf_route['R_S_UID'] = (
    gdf_route['RouteUID'].fillna('')
    .add('-')
    .add(gdf_route['SubRouteUID'].fillna(''))
)


In [115]:
def compare_uid_sets(A, B, Acol, Bcol, dropna=True):
    """
    Compare the sets of values found in one column of each DataFrame.

    Parameters
    ----------
    A, B : pandas.DataFrame
    Acol, Bcol : str
        Name of the column to compare in ``A`` and in ``B``.
    dropna : bool, default True
        Whether to ignore NaN values (True is recommended).

    Returns
    -------
    dict
        {
            'common': set,             values present in both columns
            'only_in_A': set,          values present only in A
            'only_in_B': set,          values present only in B
            'A_common_df': DataFrame,  rows of A whose value is in 'common'
            'A_only_df': DataFrame,    rows of A whose value is in 'only_in_A'
            'B_common_df': DataFrame,  rows of B whose value is in 'common'
            'B_only_df': DataFrame     rows of B whose value is in 'only_in_B'
        }
    """
    keys_a, keys_b = A[Acol], B[Bcol]
    if dropna:
        keys_a, keys_b = keys_a.dropna(), keys_b.dropna()
    set_A, set_B = set(keys_a), set(keys_b)

    shared = set_A.intersection(set_B)
    exclusive_a = set_A.difference(set_B)
    exclusive_b = set_B.difference(set_A)

    def rows_matching(frame, column, values):
        # Rows of ``frame`` whose ``column`` value is one of ``values``.
        return frame[frame[column].isin(values)]

    result = {
        'common': shared,
        'only_in_A': exclusive_a,
        'only_in_B': exclusive_b,
    }
    result['A_common_df'] = rows_matching(A, Acol, shared)
    result['A_only_df'] = rows_matching(A, Acol, exclusive_a)
    result['B_common_df'] = rows_matching(B, Bcol, shared)
    result['B_only_df'] = rows_matching(B, Bcol, exclusive_b)
    return result


In [116]:
# Compare the route shapes (A) with the stop sequences (B) on the combined key.
returnlist = compare_uid_sets(A=gdf_route, B=df_seq, Acol='R_S_UID', Bcol='R_S_UID', dropna=True)
commonlist, A_only_df, B_only_df, A_common_df, B_common_df = (
    returnlist['common'],
    returnlist['A_only_df'],
    returnlist['B_only_df'],
    returnlist['A_common_df'],
    returnlist['B_common_df'],
)

# Keys found only in the stop sequences, one row per route / sub-route.
onlyinseq = B_only_df.drop(columns='Direction')
onlyinseq = onlyinseq.drop_duplicates().reset_index(drop=True)
# Keys found only in the route shapes.
onlyinroute = A_only_df.drop_duplicates()
onlyinroute = onlyinroute.reset_index(drop=True)

In [117]:
# Count the sub-routes per route among the keys found only in the stop sequences.
temp = onlyinseq.groupby(['RouteUID'])['SubRouteUID'].count().reset_index()
# Routes with exactly one sub-route, and routes with more than one.
routeuidlist_1 = list(temp.loc[temp['SubRouteUID'] == 1, 'RouteUID'].unique())
routeuidlist_2up = list(temp.loc[temp['SubRouteUID'] > 1, 'RouteUID'].unique())
del temp

In [131]:
# Among the shape-only keys, ignore routes already known to have several
# sub-routes in the stop sequences, then count the shapes per route and
# direction. (An earlier grouping whose result was overwritten before being
# read has been removed.)
temp = (
    onlyinroute[~onlyinroute['RouteUID'].isin(routeuidlist_2up)]
    .groupby(['RouteUID', 'Direction'])
    .agg({'R_S_UID': 'count'})
    .reset_index()
)
# Routes that have more than one shape for the same direction.
routelist = temp[temp['R_S_UID'] > 1]['RouteUID'].unique()

In [147]:
# Show the operating-info rows of a single route.
# Other lookups used while exploring:
# df_routeinfo[df_routeinfo['RouteUID'].isin(routelist)]
# df_routeinfo[df_routeinfo['SubRouteUID'] == 'NWT159019']
df_routeinfo.loc[df_routeinfo['RouteUID'].eq('TPE19759')]

Unnamed: 0,RouteUID,RouteName_Zh,SubRouteUID,SubRouteName_Zh,Headsign,Direction,R_S_UID
1162,TPE19759,通勤30,TPE162482,通勤30,,0,TPE19759-TPE162482
1163,TPE19759,通勤30,TPE162482,通勤30,,1,TPE19759-TPE162482


In [146]:
# Show the route shapes that have no SubRouteUID.
# Other lookup used while exploring:
# gdf_route[gdf_route['RouteUID'] == 'NWT10116']
gdf_route.loc[gdf_route['SubRouteUID'].isna()]

Unnamed: 0,RouteUID,RouteName_Zh,SubRouteUID,SubRouteName_Zh,Direction,geometry,R_S_UID
1992,NWT10116,242,,,0,"LINESTRING (299553.61 2764778.762, 299582.627 ...",NWT10116-
1993,NWT10116,242,,,1,"LINESTRING (300812.599 2771018.674, 300989.041...",NWT10116-
1994,NWT10143,棕7,,,0,"LINESTRING (298806.806 2761550.822, 298820.173...",NWT10143-
1995,NWT10143,棕7,,,1,"LINESTRING (307359.914 2770432.049, 307343.592...",NWT10143-
1996,NWT10148,702,,,0,"LINESTRING (287071.597 2758898.241, 287069.852...",NWT10148-
...,...,...,...,...,...,...,...
4986,TPE19755,新莊-臺北車站,,,0,"LINESTRING (295676.049 2772380.709, 295637.168...",TPE19755-
4987,TPE19759,通勤30,,,0,"LINESTRING (305051.198 2776372.441, 305064.018...",TPE19759-
4988,TPE19759,通勤30,,,1,"LINESTRING (301371.246 2770198.956, 301475.833...",TPE19759-
4989,TPE810,敦化幹線,,,0,"LINESTRING (306599.127 2767657.692, 306588.029...",TPE810-


In [140]:
# Show every shape of the routes collected in routelist.
gdf_route.loc[gdf_route['RouteUID'].isin(routelist)]

Unnamed: 0,RouteUID,RouteName_Zh,SubRouteUID,SubRouteName_Zh,Direction,geometry,R_S_UID
2242,NWT16543,882,,,0,"LINESTRING (300018.017 2794012.885, 300053.036...",NWT16543-
2243,NWT16543,882,,,0,"LINESTRING (296322.089 2782851.695, 296309.184...",NWT16543-
2244,NWT16543,882,,,1,"LINESTRING (296322.089 2782851.695, 296309.184...",NWT16543-
2245,NWT16543,882,NWT161417,882捷運紅樹林站,1,"LINESTRING (296322.089 2782851.695, 296309.184...",NWT16543-NWT161417
2551,NWT16768,595,,,0,"LINESTRING (295006.35 2784411.519, 295055.844 ...",NWT16768-
2552,NWT16768,595,,,0,"LINESTRING (295011.994 2784406.52, 295056.201 ...",NWT16768-
2553,NWT16768,595,,,0,"LINESTRING (295056.513 2784372.097, 295067.42 ...",NWT16768-
2554,NWT16768,595,,,1,"LINESTRING (295056.513 2784372.097, 295067.42 ...",NWT16768-
2555,NWT16768,595,NWT157895,595新民去,0,"LINESTRING (295007.737 2784410.408, 295056.513...",NWT16768-NWT157895
2556,NWT16768,595,NWT158972,595新民返,1,"LINESTRING (295011.994 2784406.52, 295056.201 ...",NWT16768-NWT158972
