In [None]:
import os
import geopandas as gpd
import pandas as pd 
import logging
from pathlib import Path
from shapely.geometry import Point, LineString
from shapely import wkt # for WKT 轉幾何物件
from TDXdataframe import read_businfo_xml
from basicprocess import create_folder, findfiles, read_combined_dataframe, get_df_log, outputlog

# 函數庫

## Trash

In [None]:
# # ===== 後續不會用到 =====
# def get_gdfroutepair(df_seq):
#     # 先排序，確保順序正確
#     df_seq = df_seq.sort_values(
#         by=['RouteUID', 'SubRouteUID', 'Direction', 'StopSequence']
#     )

#     # 建立 To 欄位（往下一站）
#     df_seq['ToStop'] = df_seq.groupby(
#         ['RouteUID', 'SubRouteUID', 'Direction']
#     )['StopUID'].shift(-1)

#     df_seq['ToSeq'] = df_seq.groupby(
#         ['RouteUID', 'SubRouteUID', 'Direction']
#     )['StopSequence'].shift(-1)

#     df_seq['ToLon'] = df_seq.groupby(
#         ['RouteUID', 'SubRouteUID', 'Direction']
#     )['PositionLon'].shift(-1)

#     df_seq['ToLat'] = df_seq.groupby(
#         ['RouteUID', 'SubRouteUID', 'Direction']
#     )['PositionLat'].shift(-1)

#     # 建立 From 欄位（本站）
#     df_seq['FromSeq'] = df_seq['StopSequence']
#     df_seq['FromLon'] = df_seq['PositionLon']
#     df_seq['FromLat'] = df_seq['PositionLat']
#     df_seq['FromStop'] = df_seq['StopUID']

#     # 只保留需要的欄位
#     df_result = df_seq.reindex(columns = [
#         'RouteUID', 'SubRouteUID', 'Direction',
#         'FromStop', 'FromSeq', 'FromLon', 'FromLat',
#         'ToStop', 'ToSeq', 'ToLon', 'ToLat'])

#     # 移除最後一站（沒有 To）
#     df_result = df_result.dropna(subset=['ToSeq'])

#     df_result = df_result.reset_index(drop=True)

#     gdf_result = get_line(df_result, x1 = 'FromLon', x2 = 'ToLon', y1 = 'FromLat', y2 = 'ToLat')

#     return gdf_result


## Used

In [None]:
# 01_geodataframe 圖像處理
def dataframe_to_point(df, lon_col, lat_col, crs="EPSG:4326", target_crs="EPSG:3826"):
    '''
    Build a point GeoDataFrame from two coordinate columns and reproject it.

    Parameters:
    df (dataframe) : table that holds the coordinate columns
    lon_col (str) : name of the longitude (x) column
    lat_col (str) : name of the latitude (y) column
    crs (str) : CRS the coordinates are currently expressed in,
                e.g. "EPSG:4326" (WGS84) or "EPSG:3826" (TWD97)
    target_crs (str) : CRS the result is reprojected to

    Returns:
    GeoDataFrame : the columns of df plus a point `geometry` column, in target_crs
    '''
    # Vectorised point construction: x comes from lon_col, y from lat_col.
    geometry = gpd.points_from_xy(df[lon_col], df[lat_col])
    # Declare the CRS the raw coordinates are in ...
    gdf = gpd.GeoDataFrame(df, geometry=geometry, crs=crs)
    # ... then reproject. The CRS is handed over as-is rather than being split on ":",
    # so any CRS description geopandas understands is accepted.
    return gdf.to_crs(target_crs)

def get_line(df, x1 = 'Lon_o', x2 = 'Lon_d', y1 = 'Lat_o', y2 = 'Lat_d', target_crs = "EPSG:3826"):
    '''
    Build one straight two-point line per row (origin -> destination) and reproject it.

    Parameters:
    df (dataframe) : table that holds the four coordinate columns; it is left untouched
    x1 (str) : origin longitude column
    y1 (str) : origin latitude column
    x2 (str) : destination longitude column
    y2 (str) : destination latitude column
    target_crs (str) : CRS the result is reprojected to

    Assumption: the input coordinates are WGS84 longitude/latitude (EPSG:4326).

    Returns:
    GeoDataFrame : a copy of df with a LineString `geometry` column, in target_crs
    '''
    # A list comprehension (instead of df.apply(axis=1)) also works for an empty frame.
    lines = [
        LineString([(lon_o, lat_o), (lon_d, lat_d)])
        for lon_o, lat_o, lon_d, lat_d in zip(df[x1], df[y1], df[x2], df[y2])
    ]
    # Work on a copy so the caller's DataFrame does not silently gain a geometry column.
    gdf = gpd.GeoDataFrame(df.copy(), geometry=lines, crs="EPSG:4326")
    return gdf.to_crs(target_crs)


In [None]:
def get_screenline_and_surveypoint(SLfolder):
    '''
    Read the screenline and survey-point shapefiles found in SLfolder.

    Both layers get two helper columns derived from the `TRTS5` code:
    `Category` (0 when the code contains 'CD', 1 when it contains 'SL', otherwise None)
    and `Number` (the first run of digits in the code, as int).
    For the survey points, `Name` is split on '-' into `TRTS5` and `SP_No`,
    and the `X` / `Y` columns are renamed to `PositionLon` / `PositionLat`.

    Returns:
    (gdf_sl, gdf_sp) : screenlines sorted by Category, Number;
                       survey points sorted by Category, Number, SP_No
    '''
    def _category_of(code):
        # 'CD' is checked first, exactly as in the TRTS5 naming used by both layers.
        if 'CD' in code:
            return 0
        if 'SL' in code:
            return 1
        return None

    def _first_number(codes):
        return codes.str.extract(r'(\d+)').astype(int)

    # --- screenlines ---
    screenline_path = os.path.join(SLfolder, 'TRTS5_屏柵線線型.shp')
    gdf_sl = gpd.read_file(screenline_path)
    gdf_sl['Category'] = gdf_sl['TRTS5'].apply(_category_of)
    gdf_sl['Number'] = _first_number(gdf_sl['TRTS5'])
    gdf_sl = gdf_sl.sort_values(['Category', 'Number'])

    # --- survey points ---
    surveypoint_path = os.path.join(SLfolder, 'TRTS5_全部調查點位.shp')
    gdf_sp = gpd.read_file(surveypoint_path).rename(
        columns={'X': 'PositionLon', 'Y': 'PositionLat'}
    )
    # Name looks like "<TRTS5 code>-<point number>".
    gdf_sp[['TRTS5', 'SP_No']] = gdf_sp['Name'].str.split('-', expand=True)
    gdf_sp['SP_No'] = gdf_sp['SP_No'].astype('int64')
    gdf_sp['Category'] = gdf_sp['TRTS5'].apply(_category_of)
    gdf_sp['Number'] = _first_number(gdf_sp['TRTS5'])
    gdf_sp = gdf_sp.sort_values(['Category', 'Number', 'SP_No'])

    return gdf_sl, gdf_sp

def get_gdf_sp_unique(gdf_sp):
    '''
    Collapse survey points that share a name into a single point.

    The same survey point appears once per survey source, so rows are grouped by
    (Category, TRTS5, SP_No): coordinates are averaged, Name and Surveyname keep
    the first value. The averaged lon/lat (taken as EPSG:4326) are turned into
    points and reprojected to EPSG:3826.
    '''
    group_keys = ['Category', 'TRTS5', 'SP_No']
    aggregations = {
        'Name': 'first',
        'PositionLon': 'mean',
        'PositionLat': 'mean',
        'Surveyname': 'first',
    }
    df_sp_unique = (
        gdf_sp
        .sort_values(group_keys)
        .groupby(group_keys)
        .agg(aggregations)
        .reset_index()
    )
    return dataframe_to_point(
        df = df_sp_unique,
        lon_col = 'PositionLon',
        lat_col = 'PositionLat',
        crs="EPSG:4326",
        target_crs="EPSG:3826",
    )

def get_df_seq(seqfolder):
    '''
    Load the bus stop-sequence tables and return them as a plain DataFrame.

    Parameters:
    seqfolder (str) : folder that holds the stop-sequence csv files

    Returns:
    DataFrame with columns RouteUID, SubRouteUID, SRouteName, Direction, StopUID,
    StopSequence, PositionLon, PositionLat. Rows without a StopSequence are dropped
    when they make up less than 5% of the data; otherwise they are kept and a
    message asking for a manual check is printed.
    '''
    # NOTE(review): findfiles / read_combined_dataframe come from basicprocess;
    # presumably they list the files in the folder and concatenate them — confirm.
    df_seq = read_combined_dataframe(file_list=findfiles(seqfolder))
    df_seq = df_seq.reindex(
        columns=['RouteUID', 'SubRouteUID', 'SubRouteName_Zh', 'Direction',
                 'StopUID', 'StopSequence', 'PositionLon', 'PositionLat']
    ).rename(columns={'SubRouteName_Zh': 'SRouteName'})
    df_seq = df_seq.drop_duplicates(
        subset=['RouteUID', 'SubRouteUID', 'Direction', 'StopSequence', 'PositionLon', 'PositionLat']
    )

    # Nothing to check on an empty table (and the share below would divide by zero).
    if df_seq.empty:
        return df_seq

    # Share of rows that have no stop sequence (0.0 - 1.0).
    missing_share = df_seq['StopSequence'].isna().mean()
    if missing_share < 0.05:
        df_seq = df_seq[df_seq['StopSequence'].notna()]  # some stops come without a sequence
    else:
        percentage = missing_share * 100
        print(f"===== 需要檢查沒有站序的站點有哪些，共有{percentage}%的資料沒有站序 =====")

    return df_seq

def buffer_distance(gdf, buffermeters):
    '''
    Return a copy of gdf whose `geometry` column holds the buffered geometries.

    Parameters:
    gdf (GeoDataFrame) : input layer; it is copied, not modified
    buffermeters (number) : buffer distance, passed unchanged to `geometry.buffer`
                            (the distance is in the units of the layer's CRS)
    '''
    buffered = gdf.copy()
    buffered['geometry'] = buffered.geometry.buffer(buffermeters)
    return buffered

def get_gdf_route(routefolder):
    '''
    Load the bus route tables found in routefolder as a GeoDataFrame.

    The `Geometry` column is parsed from WKT text into geometry objects, stored
    as `geometry`, and the text column is dropped. The result is tagged as EPSG:4326.
    '''
    route_files = findfiles(routefolder)
    df_route = read_combined_dataframe(route_files)

    # WKT text -> geometry objects, then drop the raw text column.
    df_route["geometry"] = df_route["Geometry"].apply(wkt.loads)
    df_route = df_route.drop(columns=['Geometry'])

    return gpd.GeoDataFrame(df_route, geometry="geometry", crs="EPSG:4326")


# Setup

In [None]:
# 00_Setup: notebook-wide settings
# 0.) Location of the log file
logfile = os.path.abspath(os.path.join(os.getcwd(), '..', 'Log', '04_SelectSegment.log'))
# basicConfig opens the log file right away, so the folder has to exist beforehand.
os.makedirs(os.path.dirname(logfile), exist_ok=True)
logging.basicConfig(
    filename=logfile,
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s"
)

# 1.) Parameters: buffer distances applied to the screenlines (SL) and survey points (SP)
SL_buffer_meters = 500
SP_buffer_meters = 50

# 2.) Input data folders (all resolved relative to the current working directory)
referencefolder = os.path.abspath(os.path.join(os.getcwd(), '..', '參考資料'))
seqfolder = os.path.abspath(os.path.join(os.getcwd(), '..', '00_TDX資料下載', '01公車站序資料'))
routefolder = os.path.abspath(os.path.join(os.getcwd(), '..', '00_TDX資料下載', '02公車路線資料'))
SLfolder = os.path.abspath(os.path.join(os.getcwd(), '..', '..', 'TRTS5屏柵線'))
routesegmentfolder = os.path.abspath(os.path.join(os.getcwd(), '..', '03_處理後資料', '01_公車路線依站序拆分'))


# Main

In [None]:
# 1. Load the data
logging.info('Job started')
logging.info('Start reading Data')
# Read the screenlines & survey points supplied by the model-revision team
gdf_sl, gdf_sp = get_screenline_and_surveypoint(SLfolder=SLfolder)
logging.info(f'讀取屏柵線檔案 該線形crs為 {gdf_sl.crs.name}')
logging.info(f'讀取調查點檔案 該點位crs為 {gdf_sp.crs.name}')

# Convert to TWD97 (EPSG:3826) so that the buffer step can be used.
# get_gdf_sp_unique already returns EPSG:3826, so the to_crs here acts only as a safeguard.
gdf_sp_unique = get_gdf_sp_unique(gdf_sp).to_crs(epsg=3826) # the supplied points are split per road section, so merge them into one location first
logging.info(f'讀取調查點檔案 該點位crs為 {gdf_sp_unique.crs.name}')

gdf_sl_twd97 = gdf_sl.to_crs(epsg=3826).reindex(columns = ['TRTS5', 'Category', 'Number', 'geometry']) # reproject to TWD97 for processing
logging.info(f'讀取屏柵線檔案 該線形crs為 {gdf_sl_twd97.crs.name}')

# Read the bus stop sequences and turn them into points
df_seq = get_df_seq(seqfolder=seqfolder)
gdf_seq = dataframe_to_point(df = df_seq, lon_col='PositionLon', lat_col= 'PositionLat', crs="EPSG:4326", target_crs="EPSG:3826")
logging.info(f'讀取公車站序geodataframe 該點位crs為 {gdf_seq.crs.name}')

# Read the original bus route geometries
gdf_route  = get_gdf_route(routefolder)
gdf_route = gdf_route.to_crs(epsg=3826)
logging.info(f'讀取公車原始線形 該點位crs為 {gdf_route.crs.name}')

# Read the bus route segments (routes split by stop sequence)
gdf_route_segment = gpd.read_file(os.path.join(routesegmentfolder, 'gdf_segment_routes.shp'))
logging.info(f'讀取公車路線拆分片段 該點位crs為 {gdf_route_segment.crs.name}')
# gdf_route_segment.columns = ['RouteUID', 'SRouteUID', 'Direction', 'StartSeq', 'EndSeq', 'OriStart', 'OriEnd', 'geometry']
gdf_route_segment = gdf_route_segment.to_crs(epsg=3826)
logging.info(f'讀取公車路線拆分片段 該點位crs為 {gdf_route_segment.crs.name}')
# Length of each segment row, measured after the reprojection to EPSG:3826
gdf_route_segment["len_seg"] = gdf_route_segment.geometry.length
logging.info(f'新增公車路線的路線總長')


# 2. Buffers: first rough check of what passes the survey points
gdf_sl_buffer = buffer_distance(gdf=gdf_sl_twd97, buffermeters=SL_buffer_meters).reset_index(drop = True) # buffer around the screenlines
gdf_sp_buffer = buffer_distance(gdf=gdf_sp_unique, buffermeters=SP_buffer_meters).reset_index(drop = True) # buffer around the survey points


# route_in_buffer = gpd.sjoin(gdf_routepair,
#                             gdf_sl_buffer[['TRTS5','Category','Number','geometry']],
#                             how='inner',
#                             predicate='intersects'
#                             ).drop(columns='index_right')


# outputlog(logfile=logfile)

In [None]:
# NOTE(review): outputlog is imported from basicprocess; presumably it shows or exports
# the content of the log file written above — confirm against its definition.
outputlog(logfile=logfile)

# 確認中

In [None]:
# gdf_route_in_spbuffer = gpd.sjoin(gdf_route,
#                                   gdf_sp_buffer[['Category', 'TRTS5', 'SP_No', 'Name', 'PositionLon', 'PositionLat','Surveyname', 'geometry']],
#                                   how='inner',
#                                   predicate='intersects').drop(columns='index_right')

先重新確認我們的站序跟站名資訊是否可以核對成功

In [None]:
# Read every route-operation XML file and keep only the identifying columns.
busrouteinfofiles = findfiles(os.path.join('..', '00_TDX資料下載', '03公車路線營運資料'), 'xml')
routeinfo = [read_businfo_xml(file) for file in busrouteinfofiles]
df_routeinfo = (
    pd.concat(routeinfo)
    .loc[:, ['RouteUID', 'RouteNameZh', 'SubRouteUID', 'SubRouteNameZh', 'Headsign', 'Direction']]
    .rename(columns={'RouteNameZh': 'RouteName_Zh', 'SubRouteNameZh': 'SubRouteName_Zh'})
)
# Combined key "<RouteUID>-<SubRouteUID>"; a missing part becomes an empty string.
df_routeinfo['R_S_UID'] = df_routeinfo['RouteUID'].fillna('') + '-' + df_routeinfo['SubRouteUID'].fillna('')


In [None]:
# NOTE: this rebinds df_seq (the stop-level table loaded in the main cell) to a
# route-level lookup: one row per unique route / sub-route / direction.
df_seq = (
    read_combined_dataframe(file_list=findfiles(seqfolder))
    .reindex(columns=['RouteUID', 'RouteName_Zh', 'SubRouteUID', 'SubRouteName_Zh', 'Direction'])
    .drop_duplicates()
    .reset_index(drop=True)
)
# Combined key "<RouteUID>-<SubRouteUID>"; a missing part becomes an empty string.
df_seq['R_S_UID'] = df_seq['RouteUID'].fillna('') + '-' + df_seq['SubRouteUID'].fillna('')



In [None]:
# Reload the route geometries in EPSG:3826, keeping only the identifying columns and the geometry.
gdf_route = (
    get_gdf_route(routefolder)
    .to_crs(epsg=3826)
    .reindex(columns=['RouteUID', 'RouteName_Zh', 'SubRouteUID', 'SubRouteName_Zh', 'Direction', 'geometry'])
)
# Combined key "<RouteUID>-<SubRouteUID>"; a missing part becomes an empty string.
gdf_route['R_S_UID'] = gdf_route['RouteUID'].fillna('') + '-' + gdf_route['SubRouteUID'].fillna('')


In [None]:
def compare_uid_sets(A, B, Acol, Bcol, dropna=True):
    """
    Compare the sets of values held in one column of each of two DataFrames.

    Parameters
    ----------
    A, B : pandas.DataFrame
    Acol, Bcol : str
        Column of A / column of B whose values are compared.
    dropna : bool, default True
        Leave missing values out of the sets (recommended).

    Returns
    -------
    dict
        {
            'common': set of values present in both columns,
            'only_in_A': set of values present only in A[Acol],
            'only_in_B': set of values present only in B[Bcol],
            'A_common_df': rows of A whose value is in 'common',
            'A_only_df': rows of A whose value is in 'only_in_A',
            'B_common_df': rows of B whose value is in 'common',
            'B_only_df': rows of B whose value is in 'only_in_B'
        }
    """
    values_a, values_b = A[Acol], B[Bcol]
    if dropna:
        values_a, values_b = values_a.dropna(), values_b.dropna()
    set_A, set_B = set(values_a), set(values_b)

    common = set_A.intersection(set_B)
    only_A = set_A.difference(set_B)
    only_B = set_B.difference(set_A)

    def _rows_with(frame, col, wanted):
        # Rows of `frame` whose `col` value belongs to `wanted`.
        return frame[frame[col].isin(wanted)]

    return {
        'common': common,
        'only_in_A': only_A,
        'only_in_B': only_B,
        'A_common_df': _rows_with(A, Acol, common),
        'A_only_df': _rows_with(A, Acol, only_A),
        'B_common_df': _rows_with(B, Bcol, common),
        'B_only_df': _rows_with(B, Bcol, only_B),
    }


In [None]:
# Compare the combined route keys: A = route geometries, B = stop-sequence lookup.
returnlist = compare_uid_sets(A=gdf_route, B=df_seq, Acol='R_S_UID', Bcol='R_S_UID', dropna=True)
commonlist, A_only_df, B_only_df, A_common_df, B_common_df = (
    returnlist[key]
    for key in ('common', 'A_only_df', 'B_only_df', 'A_common_df', 'B_common_df')
)

# Keys that exist on one side only, de-duplicated (direction is ignored on the sequence side).
onlyinseq = (
    B_only_df
    .drop(columns='Direction')
    .drop_duplicates()
    .reset_index(drop=True)
)
onlyinroute = (
    A_only_df
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Per RouteUID, count the sequence-only rows that carry a SubRouteUID, then split
# the routes into "exactly one" and "more than one".
temp = onlyinseq.groupby(['RouteUID'])[['SubRouteUID']].count().reset_index()
routeuidlist_1 = list(temp.loc[temp['SubRouteUID'] == 1, 'RouteUID'].unique())
routeuidlist_2up = list(temp.loc[temp['SubRouteUID'] > 1, 'RouteUID'].unique())
del temp

In [None]:
# Among the route-only keys, leave out the RouteUIDs that have 2+ sequence-only
# sub-routes, then count the keys per (RouteUID, Direction).
# (A previous, immediately overwritten `temp` assignment was removed as dead code.)
temp = (
    onlyinroute[~onlyinroute['RouteUID'].isin(routeuidlist_2up)]
    .groupby(['RouteUID', 'Direction'])
    .agg({'R_S_UID': 'count'})
    .reset_index()
)
# RouteUIDs that still have more than one route-only key in the same direction.
routelist = temp[temp['R_S_UID'] > 1]['RouteUID'].unique()

In [None]:
# Alternative spot checks kept for reference:
# df_routeinfo[df_routeinfo['RouteUID'].isin(routelist)]
# df_routeinfo[df_routeinfo['SubRouteUID'] == 'NWT159019']
# Spot check: show the operation-info rows of one hard-coded route.
df_routeinfo[df_routeinfo['RouteUID'] == 'TPE19759']

In [None]:
# Route geometries without a SubRouteUID; their R_S_UID ends in '-' because the
# missing value was filled with an empty string when the key was built.
gdf_route[gdf_route['SubRouteUID'].isna()]
# gdf_route[gdf_route['RouteUID'] == 'NWT10116']

In [None]:
# Show the route geometries of every RouteUID collected in routelist.
gdf_route[gdf_route['RouteUID'].isin(routelist)]