In [5]:
import pandas as pd 
import numpy as np
import os 
import datetime
from collections import Counter   # 用來方便累加每個 chunk 的統計結果

In [6]:
# 00_setup_os處理函數
def create_folder(folder_name):
    """Create a directory (and any missing parents) and return its absolute path.

    Args:
        folder_name (str): Path of the directory to create.

    Returns:
        str: Absolute path of ``folder_name``.

    Raises:
        FileExistsError: If ``folder_name`` already exists but is not a directory.
    """
    # exist_ok=True replaces the separate os.path.exists() check: a directory that
    # appears between the check and the creation no longer raises, and a regular
    # file sitting at the path is reported instead of being returned as a folder.
    os.makedirs(folder_name, exist_ok=True)
    return os.path.abspath(folder_name)

def findfiles(filefolderpath, filetype='.csv', recursive=True):
    """
    Collect the paths of files under a folder whose names end with a given suffix.

    Args:
        filefolderpath (str): Folder to search.
        filetype (str, optional): Suffix a file name must end with. Defaults to '.csv'.
        recursive (bool, optional): When True (default) every sub-folder is searched
            as well; when False only the files directly inside ``filefolderpath``
            are considered.

    Returns:
        list: Paths of all matching files, each joined onto the folder it was found in.
    """
    if not recursive:
        # Top level only: skip anything that is not a regular file.
        return [
            os.path.join(filefolderpath, name)
            for name in os.listdir(filefolderpath)
            if os.path.isfile(os.path.join(filefolderpath, name))
            and name.endswith(filetype)
        ]

    # Walk the folder and every sub-folder beneath it.
    return [
        os.path.join(dirpath, name)
        for dirpath, _subdirs, names in os.walk(filefolderpath)
        for name in names
        if name.endswith(filetype)
    ]

def read_combined_dataframe(file_list, filepath=True):
    """Read every supported file in ``file_list`` and stack the results row-wise.

    Supported suffixes are '.csv', '.shp', '.xls' and '.xlsx'. Files with any other
    suffix are reported and skipped; files that fail to load are reported and
    skipped as well, so one bad file does not abort the whole batch.

    Args:
        file_list (list): Paths of the files to read.
        filepath (bool, optional): When True (default) a 'FilePath' column holding
            the source path is added to each frame before combining.

    Returns:
        pandas.DataFrame: All successfully read frames concatenated with a fresh
        index. An empty DataFrame is returned when nothing could be read.
    """
    dataframes = []

    for file in file_list:
        try:
            if file.endswith('.csv'):
                frame = pd.read_csv(file)
            elif file.endswith('.shp'):
                # NOTE(review): `gpd` is not imported in the visible cells, so this
                # branch currently ends in a NameError that the except below turns
                # into an "Error reading" message. Presumably geopandas is intended
                # (`import geopandas as gpd`) - confirm and add it to the import cell.
                frame = gpd.read_file(file)
            elif file.endswith(('.xls', '.xlsx')):
                frame = pd.read_excel(file)
            else:
                print(f"Unsupported file format: {file}")
                continue
            if filepath:
                frame['FilePath'] = file  # record which file each row came from
            dataframes.append(frame)
        except Exception as e:
            print(f"Error reading {file}: {e}")

    # pd.concat raises ValueError on an empty list; return an empty frame instead
    # so an empty or fully unreadable file_list does not crash the caller.
    if not dataframes:
        return pd.DataFrame()

    # Combine all frames into one
    return pd.concat(dataframes, ignore_index=True)


In [7]:
# Load the marked ticket CSV and keep only the rows whose 'error' flag equals 1
filepath = r"D:\B-Project\2025\6800\Technical\12票證資料\TicketAnalysis\01_初步篩選整理票證\01-01_指定時間區間票證資料_但有錯誤標記\新北市公車電子票證資料(TO1A)_2024-10-01_to_2024-11-30_marked.csv"
df = pd.read_csv(filepath).loc[lambda tickets: tickets['error'] == 1]

In [23]:
# Display the column labels of df
df.columns

Index(['Authority', 'OperatorNo', 'IDType', 'HolderType', 'TicketType',
       'SubTicketType', 'RouteUID', 'RouteName', 'SubRouteUID', 'SubRouteName',
       'Direction', 'FarePricingType', 'BoardingStopUID', 'BoardingStopName',
       'BoardingStopSequence', 'BoardingTime', 'DeboardingStopUID',
       'DeboardingStopName', 'DeboardingStopSequence', 'DeboardingTime',
       'Price', 'Discount', 'TransferCode', 'DiscountInfo', 'PaymentPrice',
       'IsAbnormal', 'ErrorCode', 'Result', 'TicketCount', 'SrcUpdateTime',
       'UpdateTime', 'InfoDate', 'error_time', 'error_same_stop', 'error_seq',
       'error'],
      dtype='object')

In [24]:
# Check 01: boarding time later than deboarding time (rows with error_time == 1)
df.loc[df['error_time'].eq(1)]

Unnamed: 0,Authority,OperatorNo,IDType,HolderType,TicketType,SubTicketType,RouteUID,RouteName,SubRouteUID,SubRouteName,...,ErrorCode,Result,TicketCount,SrcUpdateTime,UpdateTime,InfoDate,error_time,error_same_stop,error_seq,error
0,NewTaipei,301,EasyCard,A,4,#NOR-1200,NWT18258,712副,NWT160167,712副,...,0,3,1,2024-11-29 09:37:51,2024-12-13 10:21:47,2024-10-01,1,0,0,1
1,NewTaipei,301,EasyCard,B,1,,NWT16515,858,NWT157550,858,...,0,3,1,2024-11-29 09:37:51,2024-12-13 10:21:47,2024-10-01,1,0,0,1
5,NewTaipei,301,EasyCard,A,4,#NOR-1200,NWT16694,786,NWT160792,786不繞林口轉運站,...,0,3,1,2024-11-29 09:37:51,2024-12-13 10:21:47,2024-10-01,1,0,0,1
6,NewTaipei,410,EasyCard,B,1,,NWT18318,918區,NWT160228,918區,...,0,3,1,2024-11-21 09:13:41,2024-11-27 12:18:08,2024-10-01,1,0,0,1
7,NewTaipei,410,EasyCard,A,1,,NWT16296,藍15,NWT159525,藍15寵物友善公車(新北市)(先行取消),...,0,3,1,2024-11-21 09:13:41,2024-11-27 12:18:08,2024-10-01,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29554072,NewTaipei,303,EasyCard,A,1,,NWT16655,937,NWT157719,937,...,0,3,1,2024-12-17 20:21:02,2025-04-30 11:28:14,2024-11-30,1,0,0,1
29554073,NewTaipei,303,EasyCard,A,1,,NWT16593,橘18,NWT157641,橘18,...,0,3,1,2024-12-17 20:21:02,2025-04-30 11:28:14,2024-11-30,1,0,0,1
29554074,NewTaipei,303,EasyCard,A,1,,NWT16593,橘18,NWT157641,橘18,...,0,3,1,2024-12-17 20:21:02,2025-04-30 11:28:14,2024-11-30,1,0,0,1
29554076,NewTaipei,303,EasyCard,A,4,#NOR-1200,NWT16655,937,NWT157719,937,...,0,3,1,2024-12-17 20:21:02,2025-04-30 11:28:14,2024-11-30,1,0,0,1


In [30]:
# Check 02: boarding and deboarding at the same stop
# NOTE(review): the mask below keeps rows where error_same_stop is NOT 1, i.e. the
# rows without the same-stop flag. Check 01 filters with == 1, so this may be
# inverted relative to the heading - confirm which subset is intended.
df.loc[df['error_same_stop'].ne(1)].reindex(
    columns=[
        'RouteUID', 'RouteName', 'SubRouteUID', 'SubRouteName', 'Direction',
        'BoardingTime', 'BoardingStopUID', 'BoardingStopName',
        'DeboardingTime', 'DeboardingStopUID', 'DeboardingStopName',
    ]
)

Unnamed: 0,RouteUID,RouteName,SubRouteUID,SubRouteName,Direction,BoardingTime,BoardingStopUID,BoardingStopName,DeboardingTime,DeboardingStopUID,DeboardingStopName
0,NWT18258,712副,NWT160167,712副,1,2024-10-01 16:00:00,NWT205562,崑崙里,2024-10-01 16:00:00,NWT221599,迴龍派出所
1,NWT16515,858,NWT157550,858,0,2024-10-01 12:00:00,NWT131308,義學里,2024-10-01 12:00:00,NWT131311,泰山區公所
5,NWT16694,786,NWT160792,786不繞林口轉運站,1,2024-10-01 17:00:00,NWT150268,綜合運動場,2024-10-01 17:00:00,NWT150271,港尾
6,NWT18318,918區,NWT160228,918區,1,2024-10-01 07:00:00,NWT205962,新莊高中(幸福路),2024-10-01 07:00:00,NWT205969,福泰里
7,NWT16296,藍15,NWT159525,藍15寵物友善公車(新北市)(先行取消),1,2024-10-01 21:00:00,NWT115015,捷運南港展覽館站(南港路),2024-10-01 21:00:00,NWT115019,東方科學園區
...,...,...,...,...,...,...,...,...,...,...,...
29554073,NWT16593,橘18,NWT157641,橘18,0,2024-11-30 18:00:00,NWT139235,捷運三民高中站(復興路),2024-11-30 18:00:00,NWT139239,中興街(復興路)
29554074,NWT16593,橘18,NWT157641,橘18,0,2024-11-30 09:00:00,NWT139231,光華和平路口,2024-11-30 09:00:00,NWT139235,捷運三民高中站(復興路)
29554076,NWT16655,937,NWT157719,937,0,2024-11-30 16:00:00,NWT145135,尚品社區,2024-11-30 16:00:00,NWT145141,瑞士花園9
29554077,NWT16593,橘18,NWT157641,橘18,0,2024-11-30 18:00:00,NWT139249,溪尾街口,2024-11-30 19:00:00,NWT139242,信義路
