In [None]:
import pandas as pd 
import numpy as np
import os 
import datetime
from collections import Counter   # 用來方便累加每個 chunk 的統計結果

In [None]:
# 00_setup_os處理函數
def create_folder(folder_name):
    """Create a folder (including missing parents) and return its absolute path.

    Args:
        folder_name (str): Path of the folder to create; may be relative.

    Returns:
        str: Absolute path of ``folder_name``.

    Raises:
        FileExistsError: If ``folder_name`` already exists but is not a directory.
    """
    # exist_ok=True replaces the separate exists() check, so there is no window
    # in which another process can create the folder between check and creation.
    os.makedirs(folder_name, exist_ok=True)
    return os.path.abspath(folder_name)

def findfiles(filefolderpath, filetype='.csv', recursive=True):
    """
    Collect the paths of files under a folder whose names end with a given suffix.

    Args:
        filefolderpath (str): Folder to search.
        filetype (str, optional): Suffix the file name must end with. Defaults to '.csv'.
        recursive (bool, optional): When True (default) every sub-folder is
            searched as well; when False only files directly inside
            ``filefolderpath`` are considered.

    Returns:
        list: Paths of all matching files.
    """
    if not recursive:
        # Only look at the direct children of the folder.
        children = [
            (name, os.path.join(filefolderpath, name))
            for name in os.listdir(filefolderpath)
        ]
        return [
            full_path
            for name, full_path in children
            if os.path.isfile(full_path) and name.endswith(filetype)
        ]

    # Walk the folder and all of its sub-folders.
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(filefolderpath)
        for name in names
        if name.endswith(filetype)
    ]

def read_combined_dataframe(file_list, filepath = True):
    """Read every supported file in ``file_list`` and concatenate the results.

    Supported suffixes are '.csv' (``pd.read_csv``), '.shp'
    (``geopandas.read_file``) and '.xls' / '.xlsx' (``pd.read_excel``).
    Unsupported files are reported and skipped; files that fail to load are
    reported and skipped as well, so one bad file does not abort the batch.

    Args:
        file_list (list): Paths of the files to read.
        filepath (bool, optional): When True (default) a 'FilePath' column
            holding the source path is added to each frame before merging.

    Returns:
        pandas.DataFrame: Result of ``pd.concat(..., ignore_index=True)`` over
        all frames that were read, or an empty DataFrame when nothing could be
        read.
    """
    dataframes = []

    for file in file_list:
        try:
            if file.endswith('.csv'):
                df = pd.read_csv(file)
            elif file.endswith('.shp'):
                # Imported lazily: geopandas is only needed for shapefiles and
                # is not among the notebook's top-level imports. If it is not
                # installed, the ImportError is reported by the handler below.
                import geopandas as gpd
                df = gpd.read_file(file)
            elif file.endswith(('.xls', '.xlsx')):
                df = pd.read_excel(file)
            else:
                print(f"Unsupported file format: {file}")
                continue
            if filepath:
                df['FilePath'] = file  # record which file each row came from
            dataframes.append(df)
        except Exception as e:
            print(f"Error reading {file}: {e}")

    # pd.concat raises ValueError on an empty list, so return an empty frame
    # when no file was read successfully.
    if not dataframes:
        return pd.DataFrame()

    # Merge all frames into one
    combined_df = pd.concat(dataframes, ignore_index=True)
    return combined_df


In [None]:
# Show the column labels of `df`.
# NOTE(review): in the visible notebook `df` is first assigned in the data-loading
# cell further down, so on a fresh kernel this cell raises NameError unless that
# cell has been run first.
df.columns

In [None]:
# Load the marked ticket CSV and keep only the rows whose 'error' flag equals 1
filepath = r"D:\B-Project\2025\6800\Technical\12票證資料\TicketAnalysis\01_初步篩選整理票證\01-01_指定時間區間票證資料_但有錯誤標記\新北市公車電子票證資料(TO1A)_2024-10-01_to_2024-11-30_marked.csv"
df = pd.read_csv(filepath).loc[lambda frame: frame['error'] == 1]

In [None]:
# Selecting with an empty column list displays a frame that has df's index but no columns.
# NOTE(review): looks like an unfinished column selection — fill in the columns or remove the cell.
df[[]]

In [None]:
# Check 01: boarding time later than deboarding time
# .copy() gives an independent frame, so adding the derived columns below does
# not write into a slice of `df` (avoids SettingWithCopyWarning).
check = df[df['error_time'] == 1].copy()

# Parse each timestamp column once and reuse it for both the date and the hour.
boarding_ts = pd.to_datetime(check['BoardingTime'])
deboarding_ts = pd.to_datetime(check['DeboardingTime'])
check['BoardingDate'] = boarding_ts.dt.date
check['BoardingHour'] = boarding_ts.dt.hour
check['DeboardingDate'] = deboarding_ts.dt.date
check['DeboardingHour'] = deboarding_ts.dt.hour

# NOTE(review): 'ErroTimestampNone' looks like a misspelled column name; reindex
# silently creates an all-NaN column for a label that does not exist — confirm
# against the CSV header.
check.reindex(columns = ['RouteUID', 'RouteName', 'SubRouteUID', 'SubRouteName', 'Direction', 
                         'BoardingTime', 'BoardingStopUID', 'BoardingStopName', 'BoardingDate', 'BoardingHour', 
                         'DeboardingTime', 'DeboardingStopUID', 'DeboardingStopName', 'DeboardingDate', 'DeboardingHour',
                         'ErroTimestampNone'])

# Confirmed so far: every record flagged error_time (boarding later than
# deboarding) is one that has no deboarding card-tap data.

In [20]:
# Check 02: boarding and deboarding at the same stop
df.loc[df['error_same_stop'] == 1].reindex(
    columns=[
        'RouteUID', 'RouteName', 'SubRouteUID', 'SubRouteName', 'Direction',
        'BoardingTime', 'BoardingStopUID', 'BoardingStopName',
        'DeboardingTime', 'DeboardingStopUID', 'DeboardingStopName',
    ]
)

Unnamed: 0,RouteUID,RouteName,SubRouteUID,SubRouteName,Direction,BoardingTime,BoardingStopUID,BoardingStopName,DeboardingTime,DeboardingStopUID,DeboardingStopName
14,NWT18248,捷運七張站-全球工業區,NWT160147,捷運七張站-全球工業區去,0,2024-10-01 07:00:00,NWT205463,捷運七張站,2024-10-01 08:00:00,NWT205463,捷運七張站
28,NWT17519,916,NWT158687,916,0,2024-10-01 11:00:00,NWT178750,三峽國小(三鶯國民運動中心),2024-10-01 12:00:00,NWT178750,三峽國小(三鶯國民運動中心)
117,NWT10474,636,NWT10474,636,0,2024-10-01 06:00:00,NWT10512,捷運迴龍站,2024-10-01 06:00:00,NWT10512,捷運迴龍站
171,NWT17523,806,NWT158923,806蘆洲總站,1,2024-10-01 14:00:00,NWT179005,致理科技大學,2024-10-01 14:00:00,NWT179005,致理科技大學
220,NWT16282,紅38,NWT157277,紅38,0,2024-10-01 16:00:00,NWT150914,鳳翔區,2024-10-01 16:00:00,NWT150914,鳳翔區
...,...,...,...,...,...,...,...,...,...,...,...
29550932,NWT17516,藍18,NWT158684,藍18中和站,0,2024-11-30 10:00:00,NWT193232,中和站,2024-11-30 11:00:00,NWT193232,中和站
29551170,NWT16604,925,NWT160357,925,0,2024-11-30 07:00:00,NWT206646,林口站,2024-11-30 07:00:00,NWT206646,林口站
29551198,NWT16113,658,NWT16113,658,0,2024-11-30 12:00:00,NWT194524,藝香公園(果菜市場),2024-11-30 12:00:00,NWT194524,藝香公園(果菜市場)
29551423,NWT10172,243,NWT101720,243,0,2024-11-30 10:00:00,NWT34490,中和站,2024-11-30 10:00:00,NWT34490,中和站


In [None]:
# Preview: read only the first 10,000 rows of the ticket CSV at the path below,
# restricted to the route / boarding / deboarding columns.
check = (
    pd.read_csv(
        r"D:\B-Project\2025\6800\Technical\12票證資料\TicketAnalysis\01_初步篩選整理票證\01_指定時間區間票證資料\公路客運電子票證資料(TO1A)_2024-10-01_to_2024-11-30.csv",
        nrows=10000,
    )
    .reindex(
        columns=[
            'RouteUID', 'RouteName', 'SubRouteUID', 'SubRouteName', 'Direction',
            'BoardingTime', 'BoardingStopUID', 'BoardingStopName',
            'DeboardingTime', 'DeboardingStopUID', 'DeboardingStopName',
        ]
    )
)