In [1]:
import pandas as pd 
import numpy as np
import os 
import datetime
import geopandas as gpd
from collections import Counter   # 用來方便累加每個 chunk 的統計結果

In [2]:
# 00_setup_os處理函數
def create_folder(folder_name):
    """Create ``folder_name`` (with any missing parents) and return its absolute path.

    Args:
        folder_name (str): Directory path to create; may be relative.

    Returns:
        str: Absolute path of the directory.

    Raises:
        FileExistsError: If ``folder_name`` already exists but is not a directory.
    """
    # exist_ok=True replaces the separate exists() check, so there is no gap
    # between "check" and "create" in which the folder could appear.
    os.makedirs(folder_name, exist_ok=True)
    return os.path.abspath(folder_name)

def findfiles(filefolderpath, filetype='.csv', recursive=True):
    """
    Collect the paths of files whose names end with ``filetype``.

    Args:
        filefolderpath (str): Folder to search.
        filetype (str, optional): Required file-name suffix. Defaults to '.csv'.
        recursive (bool, optional): When True (default) every sub-folder is
            walked as well; when False only files directly inside
            ``filefolderpath`` are considered.

    Returns:
        list: Paths (folder joined with file name) of all matching files.
    """
    if recursive:
        # os.walk yields (folder, sub-folders, file names) for every level.
        return [
            os.path.join(folder, name)
            for folder, _subfolders, names in os.walk(filefolderpath)
            for name in names
            if name.endswith(filetype)
        ]

    # Non-recursive: listdir also returns directories, so keep regular files only.
    matches = []
    for name in os.listdir(filefolderpath):
        candidate = os.path.join(filefolderpath, name)
        if name.endswith(filetype) and os.path.isfile(candidate):
            matches.append(candidate)
    return matches

def read_combined_dataframe(file_list, filepath = True):
    """Read every supported file in ``file_list`` and stack the results.

    The reader is chosen from the file-name suffix: ``.csv`` (pandas),
    ``.shp`` (geopandas) and ``.xls``/``.xlsx`` (pandas). Files with another
    suffix, and files whose read raises, are reported with ``print`` and
    skipped so that one bad file does not abort the whole batch.

    Args:
        file_list (list): Paths of the files to read.
        filepath (bool, optional): When True (default) a ``FilePath`` column
            holding the source path is added to each frame before combining.

    Returns:
        DataFrame: All successfully read frames concatenated with a fresh
        index. An empty DataFrame is returned when nothing could be read.
    """
    dataframes = []

    for file in file_list:
        try:
            if file.endswith('.csv'):
                df = pd.read_csv(file)
            elif file.endswith('.shp'):
                df = gpd.read_file(file)
            elif file.endswith(('.xls', '.xlsx')):
                df = pd.read_excel(file)
            else:
                print(f"Unsupported file format: {file}")
                continue
            if filepath:
                df['FilePath'] = file  # remember which file each row came from
            dataframes.append(df)
        except Exception as e:
            # Deliberately best-effort: report the failure and keep going.
            print(f"Error reading {file}: {e}")

    # pd.concat raises ValueError on an empty list, so handle that case here.
    if not dataframes:
        print("No readable files found; returning an empty DataFrame.")
        return pd.DataFrame()

    return pd.concat(dataframes, ignore_index=True)

# 01_資料預處理
def filter_ticket_data(filepath, 
                       selectdate_start, 
                       selectdate_end, 
                       outputfolder,
                       skiprows=1, 
                       chunksize=1000,
                       on_time_column = 'BoardingTime', 
                       off_time_column = 'DeboardingTime', 
                       infodate_column = 'InfoDate',):
    """
    Stream a large ticket CSV in chunks and keep the rows inside a date range.

    Rows are kept when ``start <= infodate_column <= end`` after both bounds
    and the column are parsed with ``pd.to_datetime``. A bound given as a
    plain date is parsed as midnight of that day, so a value on the end date
    that carries a later time of day is not kept. Values that cannot be
    parsed become NaT and are dropped. The parsed datetime values (not the
    original text) are what is written to the output.

    Parameters
    ----------
    filepath : str
        Path of the source CSV.
    selectdate_start : str
        First date to keep (YYYY-MM-DD).
    selectdate_end : str
        Last date to keep (YYYY-MM-DD).
    outputfolder : str
        Folder for the filtered CSV; created when missing.
    skiprows : int
        Passed to ``pd.read_csv`` (rows to skip before the header).
    chunksize : int
        Number of rows read per chunk.
    on_time_column : str
        Boarding-time column name. Currently unused by the filter.
    off_time_column : str
        Alighting-time column name. Currently unused by the filter.
    infodate_column : str
        Column the date filter is applied to.

    Returns
    -------
    outputpath : str
        Full path of the filtered CSV. When no row matches, the file still
        exists and contains only the header row.
    """

    os.makedirs(outputfolder, exist_ok=True)

    # Build "<name>_<start>_to_<end><ext>". splitext only touches the real
    # extension, unlike str.replace(".csv", ...) which rewrites every match
    # and silently does nothing for other extensions.
    stem, ext = os.path.splitext(os.path.basename(filepath))
    filename = f"{stem}_{selectdate_start}_to_{selectdate_end}{ext or '.csv'}"
    outputpath = os.path.join(outputfolder, filename)

    start = pd.to_datetime(selectdate_start)
    end   = pd.to_datetime(selectdate_end)

    chunks = pd.read_csv(filepath, skiprows=skiprows, chunksize=chunksize)
    first_chunk = True       # True until the first rows have been written
    empty_template = None    # zero-row frame that remembers the columns

    for chunk in chunks:
        chunk[infodate_column] = pd.to_datetime(chunk[infodate_column], errors='coerce')
        if empty_template is None:
            empty_template = chunk.iloc[0:0]

        mask = (chunk[infodate_column] >= start) & (chunk[infodate_column] <= end)
        filtered_chunk = chunk[mask]

        if filtered_chunk.empty:
            continue

        # First write truncates any previous output; later chunks append.
        filtered_chunk.to_csv(
            outputpath,
            mode='w' if first_chunk else 'a',
            header=first_chunk,
            index=False,
            encoding='utf-8-sig'
        )
        first_chunk = False

    # Nothing matched: still (over)write the output with a header-only file so
    # the returned path exists and no result from an earlier run survives.
    if first_chunk and empty_template is not None:
        empty_template.to_csv(
            outputpath,
            mode='w',
            header=True,
            index=False,
            encoding='utf-8-sig'
        )

    return outputpath

# def tickets_cleaning(
#     tickets, 
#     on_time_column='on_time_column', 
#     off_time_column='off_time_column', 
#     getonstop='GetOnStop', 
#     getoffstop='GetOffStop', 
#     getonseq='GetOnSeq', 
#     getoffseq='GetOffSeq'):
#     """
#     清理票證資料，篩選出符合條件的票證並輸出統計結果。
#     可以用於檢查票證資料的正確性。
#     """
#     # 原始票證數量
#     original_count = len(tickets)

#     # 建立篩選條件
#     valid_conditions = (
#         (tickets[on_time_column] < tickets[off_time_column]) &  # 上車時間早於下車時間
#         (tickets[getonstop] != tickets[getoffstop]) &  # 上下車站不同
#         (tickets[getonseq] < tickets[getoffseq])  # 上下車序正確
#     )

#     # 檢查每個條件的異常數量
#     late_count = (tickets[on_time_column] > tickets[off_time_column]).sum()
#     same_stop_count = (tickets[getonstop] == tickets[getoffstop]).sum()
#     seq_error_count = (tickets[getonseq] >= tickets[getoffseq]).sum()
    

#     # 篩選出符合條件的票證
#     cleaned_tickets = tickets[valid_conditions]
#     canuse_count = len(cleaned_tickets)

#     # 統計結果
#     output = {
#         '原始票證數量': original_count,
#         '資料正常':canuse_count, 
#         '資料異常 - 上車晚於下車': late_count,
#         '資料異常 - 同站上下車': same_stop_count,
#         '資料異常 - 上下車次序錯誤': seq_error_count
#     }

#     correctrate = round((canuse_count / original_count) * 100, 1)
#     return cleaned_tickets, output, correctrate

def tickets_cleaning(
    tickets,
    on_time_column='BoardingTime',
    off_time_column='DeboardingTime',
    getonstop='BoardingStopUID',
    getoffstop='DeboardingStopUID',
    getonseq='BoardingStopSequence',
    getoffseq='DeboardingStopSequence'):
    """Drop tickets that are provably inconsistent and summarise what was found.

    A ticket is abnormal only when one of these can be established:
    boarding time later than alighting time, same boarding and alighting
    stop, or boarding sequence >= alighting sequence. Times and sequences
    are coerced (unparseable values become missing), and a comparison with a
    missing value is False, so tickets with missing data are kept. Kept
    tickets get an ``ErrorMsg`` column naming the missing alighting
    information (empty string when nothing is missing).

    Args:
        tickets (DataFrame): Ticket records; not modified.
        on_time_column (str): Boarding-time column.
        off_time_column (str): Alighting-time column.
        getonstop (str): Boarding-stop id column.
        getoffstop (str): Alighting-stop id column.
        getonseq (str): Boarding stop-sequence column.
        getoffseq (str): Alighting stop-sequence column.

    Returns:
        tuple: ``(cleaned, output, correctrate)`` where ``cleaned`` is a copy
        of the kept rows plus ``ErrorMsg``, ``output`` is a dict of counts
        computed over all input rows, and ``correctrate`` is the kept share
        in percent rounded to 2 decimals (0.0 for empty input).
    """
    n = len(tickets)

    # ---- Type coercion so that the comparisons below are meaningful ----
    on_time  = pd.to_datetime(tickets[on_time_column], errors='coerce')
    off_time = pd.to_datetime(tickets[off_time_column], errors='coerce')
    on_seq   = pd.to_numeric(tickets[getonseq], errors='coerce')
    off_seq  = pd.to_numeric(tickets[getoffseq], errors='coerce')
    on_stop  = tickets[getonstop]
    off_stop = tickets[getoffstop]

    # ---- The three definite anomalies (missing values never count) ----
    m_time_rev  = (on_time > off_time)               # boarded after alighting
    m_same_stop = (on_stop == off_stop)              # same stop on and off
    m_seq_err   = (on_seq >= off_seq)                # boarding seq >= alighting seq

    m_ok = ~(m_time_rev | m_same_stop | m_seq_err)

    cleaned = tickets[m_ok].copy()

    # ---- Describe missing alighting data on the kept rows ----
    miss_off_time = off_time.isna()
    miss_off_stop = off_stop.isna()

    # Built positionally from plain arrays: this stays correct when no row is
    # kept (a row-wise apply on an empty frame returns a DataFrame, which
    # cannot be assigned to a single column) and when index labels repeat.
    keep = m_ok.to_numpy(dtype=bool)
    missing_labels = ("沒有下車刷卡時間", "沒有下車站點資料")
    cleaned["ErrorMsg"] = [
        "；".join(label for is_missing, label in zip(flags, missing_labels) if is_missing)
        for flags in zip(miss_off_time.to_numpy()[keep], miss_off_stop.to_numpy()[keep])
    ]

    # ---- Summary counts ----
    output = {
        '原始票證數量': int(n),
        '資料正常': int(m_ok.sum()),
        '資料異常 - 上車晚於下車': int(m_time_rev.sum()),
        '資料異常 - 同站上下車': int(m_same_stop.sum()),
        '資料異常 - 上下車次序錯誤': int(m_seq_err.sum()),
        # Informational only: missing data is not treated as an anomaly.
        '資訊缺失 - 沒有下車刷卡時間': int(miss_off_time.sum()),
        '資訊缺失 - 沒有下車站點資料': int(miss_off_stop.sum())
    }

    correctrate = round((output['資料正常'] / n) * 100, 2) if n else 0.0
    return cleaned, output, correctrate

def mark_ticket_errors(
    tickets, 
    on_time_column='on_time_column', 
    off_time_column='off_time_column', 
    getonstop='GetOnStop', 
    getoffstop='GetOffStop', 
    getonseq='GetOnSeq', 
    getoffseq='GetOffSeq'):
    """
    Add 0/1 error flags to the ticket data without filtering any rows.

    Four columns are added to ``tickets`` in place:
    ``error_time`` (boarding later than alighting), ``error_same_stop``
    (same boarding and alighting stop), ``error_seq`` (boarding sequence >=
    alighting sequence) and ``error`` (1 when any of the three is 1).

    Times and sequences are coerced to datetime / numeric for the comparison
    only; the original columns keep their values. A missing or unparseable
    value makes the comparison False, so it is flagged 0.

    Args:
        tickets (DataFrame): Ticket records; modified in place.
        on_time_column (str): Boarding-time column.
        off_time_column (str): Alighting-time column.
        getonstop (str): Boarding-stop id column.
        getoffstop (str): Alighting-stop id column.
        getonseq (str): Boarding stop-sequence column.
        getoffseq (str): Alighting stop-sequence column.

    Returns:
        DataFrame: The same ``tickets`` object with the flag columns added.
    """
    # Data read from CSV holds times as text and missing values as NaN;
    # comparing those directly raises TypeError (str vs float) and compares
    # numeric text lexically, so coerce first.
    on_time = pd.to_datetime(tickets[on_time_column], errors='coerce')
    off_time = pd.to_datetime(tickets[off_time_column], errors='coerce')
    on_seq = pd.to_numeric(tickets[getonseq], errors='coerce')
    off_seq = pd.to_numeric(tickets[getoffseq], errors='coerce')

    time_reversed = on_time > off_time
    same_stop = tickets[getonstop] == tickets[getoffstop]
    seq_wrong = on_seq >= off_seq

    tickets['error_time'] = time_reversed.astype(int)
    tickets['error_same_stop'] = same_stop.astype(int)
    tickets['error_seq'] = seq_wrong.astype(int)
    tickets['error'] = (time_reversed | same_stop | seq_wrong).astype(int)

    return tickets

def export_ticketcorrectrate(filename, output, correctrate, txt_path):
    """Append one run's cleaning statistics to a CSV-formatted log file.

    One row ``filename,timestamp,key,value`` is written per entry of
    ``output``, followed by a row whose key is ``正確率`` holding
    ``correctrate``. The header row is written only when the log does not
    exist yet or is empty. All rows of one call share the same timestamp.

    Args:
        filename (str): Identifier of the processed file, stored in column 1.
        output (dict): Statistic name -> value.
        correctrate (float): Overall correct rate in percent.
        txt_path (str): Path of the log file (UTF-8).
    """
    # Local import: csv is not among this file's top-level imports.
    import csv

    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    needs_header = not os.path.exists(txt_path) or os.path.getsize(txt_path) == 0

    # Mode "a" creates the file when it is missing, so it covers both cases.
    with open(txt_path, "a", encoding="utf-8") as f:
        # csv.writer quotes fields that contain commas or quotes, which plain
        # string concatenation does not; "\n" keeps the line ending unchanged.
        writer = csv.writer(f, lineterminator="\n")

        if needs_header:
            writer.writerow(["filename", "timestamp", "key", "value"])

        writer.writerows(
            [filename, timestamp, key, value] for key, value in output.items()
        )
        writer.writerow([filename, timestamp, "正確率", correctrate])

    print(f"TXT (CSV 格式) 已輸出：{txt_path}")

def get_stop_fromtickets(df):
    """
    Count how often each boarding / alighting stop appears in the ticket data.

    The boarding and alighting stop columns are each combined with the route
    columns, renamed to the common names ``StopUID`` / ``StopName`` /
    ``StopSequence``, tagged ``OnorOff`` = 'On' or 'Off', stacked, and then
    counted per distinct row. Missing values are replaced with the string
    '-99' first so that such rows are still counted.

    Args:
        df (DataFrame): Ticket data containing the route columns and the
            Boarding*/Deboarding* stop columns listed below. Not modified.

    Returns:
        DataFrame: One row per distinct (route, stop, OnorOff) combination
        with its number of occurrences in ``Count``.
    """
    route_columns = ['Authority', 'OperatorNo',
                     'RouteUID', 'RouteName', 'SubRouteUID', 'SubRouteName', 'Direction']
    stop_fields = ['StopUID', 'StopName', 'StopSequence']

    sides = []
    for prefix, label in (('Boarding', 'On'), ('Deboarding', 'Off')):
        source_columns = route_columns + [prefix + field for field in stop_fields]
        # fillna returns a new frame, so nothing is assigned into a slice of
        # the caller's DataFrame (avoids SettingWithCopyWarning).
        side = df[source_columns].fillna('-99')
        side.columns = route_columns + stop_fields
        side['OnorOff'] = label
        sides.append(side)

    df_stops = pd.concat(sides, ignore_index=True)

    # Every column was filled above, so no value is missing at this point and
    # groupby will not drop any row.
    df_stops = (
        df_stops
        .groupby(df_stops.columns.tolist())
        .size()
        .reset_index(name='Count')
    )

    return df_stops

def match_stop_coordinates(
    dfstop, 
    stop_gdf, 
    col_uid="StopUID", 
    col_name="StopName", 
    col_lat="Lat", 
    col_lon="Lon"):
    """
    Attach coordinates to stops in two passes and build a text report.

    Pass 1 matches on stop id and stop name together. Rows left without a
    longitude or latitude go through pass 2, which matches on stop id alone;
    in that pass the name found in ``stop_gdf`` is added under
    ``<col_name>_gdf``. All totals in the report are sums of the ``Count``
    column, not row counts.

    Args:
        dfstop (DataFrame): Stops to look up; needs ``col_uid``, ``col_name``
            and ``Count``.
        stop_gdf (DataFrame): Reference stops with ``col_uid``, ``col_name``,
            ``col_lon`` and ``col_lat``.
        col_uid (str): Stop-id column name.
        col_name (str): Stop-name column name.
        col_lat (str): Latitude column name.
        col_lon (str): Longitude column name.

    Returns:
        tuple: ``(dfcount_final, text)`` — the pass-1 matches followed by all
        pass-2 rows (matched or not), and the report as one string.
    """

    def _missing_coords(frame):
        # A row is unmatched when either coordinate is absent.
        return frame[col_lon].isna() | frame[col_lat].isna()

    def _share(part, whole):
        # A zero denominator (empty input, or nothing left for pass 2) would
        # otherwise print "nan%" and emit a RuntimeWarning.
        return part / whole if whole else 0.0

    text_output = []

    # Pass 1: match on stop id AND stop name.
    dfcount = pd.merge(
        dfstop,
        stop_gdf[[col_uid, col_name, col_lon, col_lat]].drop_duplicates(subset=[col_uid, col_name]),
        on=[col_uid, col_name],
        how="left"
    )
    unmatched = _missing_coords(dfcount)

    total = dfstop["Count"].sum()
    abnormal = dfcount.loc[unmatched, "Count"].sum()

    text_output.append("第一次比對結果")
    text_output.append(f"總共有幾筆資料: {total:,}")
    text_output.append(f"沒有對應經緯度座標的資料異常數量: {abnormal:,}")
    text_output.append(f"影響比例: {_share(abnormal, total):.4%}")
    text_output.append("============================")

    # Pass 2: retry the unmatched rows on stop id only.
    dfcount_2ndround = pd.merge(
        dfcount[unmatched].drop(columns=[col_lon, col_lat]),
        stop_gdf[[col_uid, col_lon, col_lat, col_name]].drop_duplicates(subset=[col_uid]),
        on=[col_uid],
        how="left",
        suffixes=("", "_gdf")
    )

    total_2ndround = dfcount_2ndround["Count"].sum()
    abnormal_2ndround = dfcount_2ndround.loc[_missing_coords(dfcount_2ndround), "Count"].sum()

    text_output.append("第二次比對結果")
    text_output.append(f"第二次比對 - 總共有幾筆資料: {total_2ndround:,}")
    text_output.append(f"第二次比對 - 沒有對應經緯度座標的資料異常數量: {abnormal_2ndround:,}")
    text_output.append(f"第二次比對 - 影響比例: {_share(abnormal_2ndround, total_2ndround):.4%}")
    text_output.append(f"第二次比對 - 影響佔可用票證的原始比例: {_share(abnormal_2ndround, total):.4%}")
    text_output.append("============================")

    # Final result: pass-1 matches followed by every pass-2 row.
    dfcount_final = pd.concat(
        [dfcount[~unmatched], dfcount_2ndround],
        ignore_index=True
    )

    text = "\n".join(text_output)

    return dfcount_final, text


# 02_資料分析處理

In [30]:
# 00_Setup: global settings used by the later cells

# 1.) Date range used when filtering tickets
selectdate_start = '2024-10-01'
selectdate_end = '2024-11-30'

# 2.) Output folders (created when missing). Both roots sit next to the
#     current working directory.
_preprocess_root = os.path.join(os.getcwd(), '..', '01_初步篩選整理票證')
_analysis_root = os.path.join(os.getcwd(), '..', '02_初步分析')

# 01-01: tickets restricted to the selected date range
selecttime_ticket_folder = create_folder(os.path.join(_preprocess_root, '01_指定時間區間票證資料'))
# 01-02: tickets that passed the validity checks
checkok_ticketfolder = create_folder(os.path.join(_preprocess_root, '02_過濾可用票證資料'))
# 01-03: every stop referenced by the tickets
check_stopfolder = create_folder(os.path.join(_preprocess_root, '03_所有使用到的點位'))
# 01-04: tickets reformatted for traffic-volume calculation
reformat_folder = create_folder(os.path.join(_preprocess_root, '04_計算交通量格式'))

# 02-01: hourly counts
hourlycount_folder = create_folder(os.path.join(_analysis_root, '01_分時計次'))
# 02-02: all-day stop-to-stop volumes
dailybetweenstops_folder = create_folder(os.path.join(_analysis_root, '02_全日站間量'))


In [4]:
# 預處理01: 指定時間區間票證資料切分
def pre01_split_ticket_with_day(selectdate_start, selectdate_end, outputfolder, ticket_files=None):
    """Filter each raw ticket CSV down to the selected date range.

    Every source file is passed to ``filter_ticket_data`` with the
    ``InfoDate`` column as the filter column, and the path of each written
    file is printed.

    Args:
        selectdate_start (str): First date to keep (YYYY-MM-DD).
        selectdate_end (str): Last date to keep (YYYY-MM-DD).
        outputfolder (str): Folder that receives the filtered CSV files.
        ticket_files (list, optional): Source CSV paths. Defaults to the
            three raw TO1A files of this project.

    Returns:
        list: Output paths, in the same order as the source files.
    """
    if ticket_files is None:
        ticket_files = [
            r'D:\B-Project\2025\6800\Technical\12票證資料\2024_2025\公路客運電子票證資料(TO1A)\公路客運電子票證資料(TO1A).csv',
            r'D:\B-Project\2025\6800\Technical\12票證資料\2024_2025\新北市公車電子票證資料(TO1A)\新北市公車電子票證資料(TO1A).csv',
            r'D:\B-Project\2025\6800\Technical\12票證資料\2024_2025\桃園市公車電子票證資料(TO1A)\桃園市公車電子票證資料(TO1A).csv',
        ]

    outputs = []
    for file in ticket_files:
        output = filter_ticket_data(
            filepath=file,
            infodate_column='InfoDate',
            selectdate_start=selectdate_start,
            selectdate_end=selectdate_end,
            outputfolder=outputfolder,
            skiprows=1,
            chunksize=1000,
        )
        print("輸出路徑：", output)
        outputs.append(output)

    return outputs

# pre01_split_ticket_with_day(selectdate_start, selectdate_end, selecttime_ticket_folder)

In [None]:
# Extra diagnostic pass: write a copy of every date-filtered ticket file with
# the error-flag columns added, so problematic rows can be inspected.
marked_ticketfolder = create_folder(
    os.path.join(os.getcwd(), '..', '01_初步篩選整理票證', '01-01_指定時間區間票證資料_但有錯誤標記')
)

# Only files whose path contains 'TO1' are processed.
selecttime_ticket_files = [
    f for f in findfiles(selecttime_ticket_folder, filetype='.csv', recursive=False)
    if 'TO1' in f
]

for file in selecttime_ticket_files:
    marked_output_file = os.path.join(
        marked_ticketfolder,
        os.path.basename(file).replace(".csv", "_marked.csv")
    )

    # Flags are computed row by row, so the file is streamed in chunks.
    first_chunk = True
    reader = pd.read_csv(file, chunksize=1000)

    for chunk in reader:
        output = mark_ticket_errors(
            tickets=chunk,
            on_time_column='BoardingTime',
            off_time_column='DeboardingTime',
            getonstop='BoardingStopUID',
            getoffstop='DeboardingStopUID',
            getonseq='BoardingStopSequence',
            getoffseq='DeboardingStopSequence'
        )

        if first_chunk:
            # First chunk creates/truncates the file and writes the header.
            output.to_csv(marked_output_file, mode='w', header=True,
                          index=False, encoding='utf-8-sig')
            first_chunk = False
        else:
            output.to_csv(marked_output_file, mode='a', header=False,
                          index=False, encoding='utf-8-sig')


In [5]:
# 預處理02: 過濾可用票證資料（清洗票證並記錄正確率）
def pre02_get_correct_tickets(selecttime_ticket_folder, checkok_ticketfolder):
    """Clean every date-filtered ticket CSV and log its overall correct rate.

    Each CSV directly inside ``selecttime_ticket_folder`` is streamed in
    chunks through ``tickets_cleaning``. Kept rows are appended to
    ``<name>_cleaned.csv`` in ``checkok_ticketfolder``; the per-chunk
    statistics are summed and written, together with the whole-file correct
    rate, to the log file in the same folder.

    Args:
        selecttime_ticket_folder (str): Folder holding the input CSV files.
        checkok_ticketfolder (str): Folder for the cleaned CSV files and the log.
    """
    chunksize = 10000
    correctratelog_path = os.path.join(checkok_ticketfolder, '客運票證資料正確率記錄.txt')

    # Column roles handed to tickets_cleaning for every chunk.
    column_roles = {
        'on_time_column': 'BoardingTime',
        'off_time_column': 'DeboardingTime',
        'getonstop': 'BoardingStopUID',
        'getoffstop': 'DeboardingStopUID',
        'getonseq': 'BoardingStopSequence',
        'getoffseq': 'DeboardingStopSequence',
    }

    for file in findfiles(selecttime_ticket_folder, filetype='.csv', recursive=False):

        print(f"\n=== 開始處理：{file} ===")

        cleaned_output_path = os.path.join(
            checkok_ticketfolder,
            os.path.basename(file).replace(".csv", "_cleaned.csv")
        )

        # Sums the per-chunk statistics key by key.
        total_stat = Counter()
        header_pending = True  # True until the first rows have been written

        for chunk in pd.read_csv(file, chunksize=chunksize, encoding='utf-8-sig'):
            # The per-chunk rate is not needed; the rate is recomputed below
            # from the summed counts.
            cleaned_df, correct_stat_info, _ = tickets_cleaning(chunk, **column_roles)
            total_stat.update(correct_stat_info)

            if cleaned_df.empty:
                continue

            cleaned_df.to_csv(
                cleaned_output_path,
                mode='w' if header_pending else 'a',
                header=header_pending,
                index=False,
                encoding='utf-8-sig'
            )
            header_pending = False

        # Whole-file correct rate from the summed counts.
        original_count = total_stat.get('原始票證數量', 0)
        canuse_count = total_stat.get('資料正常', 0)
        final_correctrate = (
            round(canuse_count / original_count * 100, 2) if original_count > 0 else 0.0
        )

        export_ticketcorrectrate(
            filename=file,
            output=dict(total_stat),
            correctrate=final_correctrate,
            txt_path=correctratelog_path
        )

        print(f"清洗後資料輸出：{cleaned_output_path}")

# pre02_get_correct_tickets(selecttime_ticket_folder, checkok_ticketfolder)

In [6]:
# 預處理03: 確認所有站點的經緯度在TDX都可以被核對出來

def pre03_findstops(checkok_ticketfolder, 
                    seqfolder = r"D:\B-Project\2025\6800\Technical\12票證資料\TicketAnalysis\00_TDX資料下載\01公車站序資料",
                    stopfolder = None):
    """Extract the stops used by the cleaned tickets and check their coordinates.

    For every CSV under ``checkok_ticketfolder`` whose path contains 'TO1',
    the stop counts from ``get_stop_fromtickets`` are written to
    ``<name>_stops.csv`` in ``stopfolder``. All CSV files in ``stopfolder``
    are then combined and matched against the stop-sequence files in
    ``seqfolder`` with ``match_stop_coordinates``; the report is printed.

    Args:
        checkok_ticketfolder (str): Folder holding the cleaned ticket CSVs
            (searched recursively).
        seqfolder (str): Folder holding the stop-sequence CSV files.
        stopfolder (str, optional): Folder for the per-file stop lists.
            Defaults to the module-level ``check_stopfolder``.

    Returns:
        tuple: ``(df_final, report_text)`` as returned by
        ``match_stop_coordinates``.
    """
    # Previously the global was used unconditionally; keep it as the default
    # so existing calls behave the same.
    if stopfolder is None:
        stopfolder = check_stopfolder

    files = findfiles(checkok_ticketfolder)
    files = [f for f in files if 'TO1' in f]
    for file in files:
        df = pd.read_csv(file, encoding='utf-8-sig')
        stop = get_stop_fromtickets(df)
        stop['file_source'] = os.path.basename(file)

        outputfilename = os.path.join(stopfolder, os.path.basename(file).replace('_cleaned.csv', '_stops.csv'))
        stop.to_csv(outputfilename, index=False, encoding='utf-8-sig')
        print(f"站點資料輸出：{outputfilename}")

    df_stop = read_combined_dataframe(findfiles(stopfolder, filetype='csv', recursive=False), filepath=False)

    df_seq = read_combined_dataframe(findfiles(seqfolder, 
                                            filetype='csv', 
                                            recursive=False), filepath=False)
    # One reference row per stop id.
    df_stopfromseq = df_seq[['StopUID', 'StopName_Zh', 'PositionLon', 'PositionLat']].drop_duplicates(subset=['StopUID']).sort_values(['StopUID'])

    df_final, report_text = match_stop_coordinates(
        dfstop=df_stop.copy().rename(columns = {'StopName':'StopName_Zh'}),
        stop_gdf=df_stopfromseq,
        col_uid="StopUID",
        col_name="StopName_Zh",
        col_lat="PositionLat",
        col_lon="PositionLon"
    )

    print(report_text)

    # Returned so the unmatched stops can be inspected by the caller.
    return df_final, report_text



    # a = df_final[((df_final['PositionLon'].isna()) | (df_final['PositionLat'].isna())) & (df_final['StopUID'] != "-99")][['StopUID', 'StopName_Zh']].drop_duplicates()
    # a['Auth'] = a['StopUID'].str[:3]
    # a.sort_values(['Auth'])

# pre03_findstops(checkok_ticketfolder, seqfolder = r"D:\B-Project\2025\6800\Technical\12票證資料\TicketAnalysis\00_TDX資料下載\01公車站序資料")

In [None]:
# 預處理04: 加上必要欄位 (平假日欄位、刪除不重要的欄位）

def add_weekdayandweekendcolumns(df, 
                                 timecolumns='InfoDate',
                                 filterdate=None):
    """
    Parse the date column, add ``DaysofWeek`` and ``WDWK``, and optionally
    drop rows that fall on the given dates.

    ``DaysofWeek`` is 0 for Monday through 6 for Sunday. ``WDWK`` is 1 for
    Tuesday-Thursday, -1 for Saturday/Sunday and 0 otherwise.

    Args:
        df (pd.DataFrame): Source data; a copy is processed.
        timecolumns (str): Name of the date column. Defaults to 'InfoDate'.
        filterdate (list/None): Date strings (e.g. ['YYYY-MM-DD']) whose rows
            are removed; None or an empty list removes nothing.

    Returns:
        pd.DataFrame: The processed copy.
    """
    result = df.copy()

    # 1. Parse the date column; unparseable values become NaT.
    parsed_dates = pd.to_datetime(result[timecolumns], errors='coerce')
    result[timecolumns] = parsed_dates

    # 2. Day of week: 0 = Monday ... 6 = Sunday.
    result['DaysofWeek'] = parsed_dates.dt.dayofweek

    # 3. Optional date exclusion. normalize() zeroes the time of day so only
    #    the calendar date is compared.
    has_filter = bool(filterdate) and len(filterdate) > 0
    if has_filter:
        excluded_dates = pd.to_datetime(filterdate)
        on_excluded_date = parsed_dates.dt.normalize().isin(excluded_dates)
        result = result[~on_excluded_date].copy()

    # 4. WDWK code: Tue-Thu -> 1, Sat/Sun -> -1, every other day -> 0.
    day_of_week = result['DaysofWeek']
    conditions = [day_of_week.isin([1, 2, 3]), day_of_week.isin([5, 6])]
    codes = [1, -1]
    result['WDWK'] = np.select(conditions, codes, default=0)

    return result

def must_outputformat(df):
    """Derive date/hour columns from the ticket times and return the output layout.

    ``BoardingTime`` and ``DeboardingTime`` are parsed (unparseable values
    become NaT) and split into date and hour columns. The result contains
    exactly the columns listed in ``reindexcolumns`` in that order; listed
    columns absent from ``df`` come out filled with NaN and all other
    columns are dropped. ``df`` itself is not modified.

    Args:
        df (DataFrame): Ticket data with ``BoardingTime`` and
            ``DeboardingTime`` columns.

    Returns:
        DataFrame: New frame in the fixed output column order.
    """
    boarding_time = pd.to_datetime(df['BoardingTime'], errors='coerce')
    deboarding_time = pd.to_datetime(df['DeboardingTime'], errors='coerce')

    # assign() builds a new frame, so the caller's DataFrame keeps its
    # original columns. 'BoardinngDate' is the existing output column name.
    derived = df.assign(
        BoardinngDate=boarding_time.dt.date,
        DeboardingDate=deboarding_time.dt.date,
        BoardingHour=boarding_time.dt.hour,
        DeboardingHour=deboarding_time.dt.hour,
    )

    reindexcolumns = ['Authority', 'OperatorNo', 'HolderType', 'TicketType', 'SubTicketType', 
                    'RouteUID', 'RouteName', 'SubRouteUID', 'SubRouteName', 'Direction', 
                    'BoardingStopUID', 'BoardingStopName', 'BoardingStopSequence', 'BoardinngDate',  'BoardingHour', 
                    'DeboardingStopUID', 'DeboardingStopName', 'DeboardingStopSequence', 'DeboardingDate', 'DeboardingHour',
                    'InfoDate', 'DaysofWeek', 'WDWK']

    return derived.reindex(columns=reindexcolumns)

def pre04_reformat(checkok_ticketfolder, reformat_folder,
                   filterdate=('2024-10-09', '2024-10-10', '2024-10-11', '2024-10-12',
                               '2024-10-13', '2024-10-14', '2024-10-15'),
                   chunksize=1000):
    """Rewrite every cleaned ticket CSV in the traffic-volume input layout.

    Each file directly inside ``checkok_ticketfolder`` whose name ends with
    'csv' is streamed in chunks. For every chunk the weekday columns are
    added and the rows on ``filterdate`` removed
    (``add_weekdayandweekendcolumns``), then the columns are reduced and
    ordered by ``must_outputformat``. The result is written to
    ``reformat_folder`` with '_cleaned.csv' replaced by '_reformatted.csv'.

    Args:
        checkok_ticketfolder (str): Folder holding the cleaned CSV files.
        reformat_folder (str): Folder for the reformatted CSV files.
        filterdate (sequence of str, optional): Dates (YYYY-MM-DD) whose rows
            are dropped. Defaults to 2024-10-09 through 2024-10-15.
        chunksize (int, optional): Rows read per chunk. Defaults to 1000.
    """
    # The default is an immutable tuple; hand a list to the helper.
    excluded_dates = list(filterdate) if filterdate is not None else None

    filelist = findfiles(checkok_ticketfolder, filetype='csv', recursive=False)

    for file in filelist:

        reformat_output_file = os.path.join(
            reformat_folder,
            os.path.basename(file).replace("_cleaned.csv", "_reformatted.csv")
        )

        reader = pd.read_csv(file, chunksize=chunksize)

        first_chunk = True  # first write truncates the file and adds the header
        for chunk in reader:

            output = add_weekdayandweekendcolumns(df=chunk,
                                                  timecolumns='InfoDate',
                                                  filterdate=excluded_dates)
            output = must_outputformat(output)

            output.to_csv(
                reformat_output_file,
                mode='w' if first_chunk else 'a',
                header=first_chunk,
                index=False,
                encoding='utf-8-sig'
            )
            first_chunk = False

# Run preprocessing step 04 on the cleaned tickets.
pre04_reformat(
    checkok_ticketfolder=checkok_ticketfolder,
    reformat_folder=reformat_folder,
)

In [None]:
# Cell-level reformatting pass: stream each cleaned ticket CSV, add the
# weekday columns (dropping the listed dates), reduce to the output column
# layout and write '<name>_reformatted.csv' into reformat_folder.
filelist = findfiles(checkok_ticketfolder, filetype='csv', recursive=False)

for file in filelist:

    reformat_output_file = os.path.join(
        reformat_folder,
        os.path.basename(file).replace("_cleaned.csv", "_reformatted.csv")
    )

    reader = pd.read_csv(file, chunksize=1000)
    first_chunk = True

    for chunk in reader:

        output = add_weekdayandweekendcolumns(
            df=chunk,
            timecolumns='InfoDate',
            filterdate=['2024-10-09', '2024-10-10', '2024-10-11', '2024-10-12', '2024-10-13', '2024-10-14', '2024-10-15'],
        )
        output = must_outputformat(output)

        if first_chunk:
            # First chunk creates/truncates the file and writes the header.
            output.to_csv(reformat_output_file, mode='w', header=True,
                          index=False, encoding='utf-8-sig')
            first_chunk = False
        else:
            output.to_csv(reformat_output_file, mode='a', header=False,
                          index=False, encoding='utf-8-sig')

In [29]:
df.head()

Unnamed: 0,Authority,OperatorNo,HolderType,TicketType,SubTicketType,RouteUID,RouteName,SubRouteUID,SubRouteName,Direction,...,BoardinngDate,BoardingHour,DeboardingStopUID,DeboardingStopName,DeboardingStopSequence,DeboardingDate,DeboardingHour,InfoDate,DaysofWeek,WDWK
0,NewTaipei,301,A,4,#NOR-1200,NWT18258,712副,NWT160167,712副,1,...,2024-10-01,16,NWT221599,迴龍派出所,28,2024-10-01,16,2024-10-01,1,1
1,NewTaipei,301,B,1,,NWT16515,858,NWT157550,858,0,...,2024-10-01,12,NWT131311,泰山區公所,37,2024-10-01,12,2024-10-01,1,1
2,NewTaipei,301,A,4,#NOR-1200,NWT18119,967,NWT159907,967體育大學行政教學大樓,1,...,2024-10-01,14,NWT203915,空間樂園社區,40,2024-10-01,15,2024-10-01,1,1
3,NewTaipei,301,A,1,,NWT10474,636,NWT10474,636,1,...,2024-10-01,20,NWT10566,捷運新莊站(新莊郵局),20,2024-10-01,21,2024-10-01,1,1
4,NewTaipei,301,A,4,#NOR-1200,NWT16591,857,NWT157639,857,0,...,2024-10-01,17,NWT139066,菜寮(重新路),65,2024-10-01,18,2024-10-01,1,1


In [None]:
# 分析01: 確認資料各票種、各路線、平假日、起點、迄點筆數

def analytics01_hourlycount(checkok_ticketfolder, 
                            hourlycount_folder, 
                            seqfolder = r"D:\B-Project\2025\6800\Technical\12票證資料\TicketAnalysis\00_TDX資料下載\01公車站序資料",
                            returndf = True):
    """Count tickets per holder type, route, stop pair, date and hour.

    All CSV files under ``checkok_ticketfolder`` whose path contains 'TO1'
    are loaded into one frame. Boarding/alighting date and hour are derived
    from the time columns, missing grouping values are replaced with '-99'
    so those rows are still counted, and the rows are counted per distinct
    combination. Boarding and alighting coordinates are attached by stop id
    from the stop-sequence files in ``seqfolder``. The result is written to
    '上下車區分票種分時計次.csv' in ``hourlycount_folder``.

    Args:
        checkok_ticketfolder (str): Folder holding the cleaned ticket CSVs
            (searched recursively).
        hourlycount_folder (str): Folder that receives the output CSV.
        seqfolder (str): Folder holding the stop-sequence CSV files.
        returndf (bool): When True the count table is returned.

    Returns:
        DataFrame or None: The count table when ``returndf`` is True.
    """
    files = findfiles(checkok_ticketfolder)
    files = [f for f in files if 'TO1' in f]
    df = read_combined_dataframe(files)
    df['BoardingTime'] = pd.to_datetime(df['BoardingTime'], errors='coerce')
    df['DeboardingTime'] = pd.to_datetime(df['DeboardingTime'], errors='coerce')
    df['BoardinngDate'] = df['BoardingTime'].dt.date
    df['DeboardingDate'] = df['DeboardingTime'].dt.date
    df['BoardingHour'] = df['BoardingTime'].dt.hour
    df['DeboardingHour'] = df['DeboardingTime'].dt.hour

    groupbycolumns = ['HolderType', 
                    'RouteUID', 'RouteName', 'SubRouteUID', 'SubRouteName',
                    'BoardingStopUID', 'BoardingStopName', 'BoardingStopSequence', 'BoardinngDate', 'BoardingHour',
                    'DeboardingStopUID', 'DeboardingStopName', 'DeboardingStopSequence', 'DeboardingDate', 'DeboardingHour', 'FilePath']

    # groupby drops rows with a missing key by default, so fill them first.
    df[groupbycolumns] = df[groupbycolumns].fillna('-99')
    df_count = df.groupby(groupbycolumns).size().reset_index(name='Count')

    df_seq = read_combined_dataframe(findfiles(seqfolder, 
                                            filetype='csv', 
                                            recursive=False), filepath=False)
    # One reference row per stop id.
    df_stopfromseq = df_seq[['StopUID', 'StopName_Zh', 'PositionLon', 'PositionLat']].drop_duplicates(subset=['StopUID']).sort_values(['StopUID'])

    # Attach coordinates for the boarding stop, then for the alighting stop.
    for side in ('Boarding', 'Deboarding'):
        side_coords = df_stopfromseq[['StopUID', 'PositionLon', 'PositionLat']].rename(
            columns={'StopUID': f'{side}StopUID',
                     'PositionLon': f'{side}Lon',
                     'PositionLat': f'{side}Lat'}
        )
        df_count = pd.merge(df_count, side_coords, on=f'{side}StopUID', how='left')

    df_count = df_count.reindex(columns= ['HolderType', 'RouteUID', 'RouteName', 'SubRouteUID', 'SubRouteName', 
                                        'BoardingStopUID', 'BoardingStopName', 'BoardingStopSequence','BoardinngDate', 'BoardingHour', 'BoardingLon', 'BoardingLat', 
                                        'DeboardingStopUID','DeboardingStopName', 'DeboardingStopSequence', 'DeboardingDate', 'DeboardingHour', 'DeboardingLon', 'DeboardingLat', 
                                        'FilePath', 'Count'])

    outputfile = os.path.join(hourlycount_folder, '上下車區分票種分時計次.csv')
    # utf-8-sig matches every other CSV written by this file.
    df_count.to_csv(outputfile, index=False, encoding='utf-8-sig')

    if returndf:
        return df_count

# Run analysis 01; the table is only written to disk, not returned.
analytics01_hourlycount(
    checkok_ticketfolder=checkok_ticketfolder,
    hourlycount_folder=hourlycount_folder,
    seqfolder=r"D:\B-Project\2025\6800\Technical\12票證資料\TicketAnalysis\00_TDX資料下載\01公車站序資料",
    returndf=False,
)

In [None]:
# 