# 站間量分析

需要準備資料：
1. 票證資料：須包含所有站位點的資料
2. 站序資料：需帶有'Direction'欄位
3. 班表資料：需帶有'Direction'、'IsWorkday'欄位
4. 營運月報 (optional)：做票證放大率佐證用 

## 基礎設定

包含環境設定，以及指定對應資料夾路徑（input、process、output）

In [1]:
import os
import pandas as pd
import plotly.graph_objects as go
import ipywidgets as widgets
from IPython.display import display
from tickets_cleaning import tickets_cleaning, date_defined , getDaysCount, getMagnification, tickets_match_shift

In [2]:
# Resolve the three working folders (input / output / process) located one
# level above the current working directory.
parent_dir = os.path.join(os.getcwd(), '..')
inputfolder_path = os.path.join(parent_dir, 'input')
outputfolder_path = os.path.join(parent_dir, 'output')
processfolder_path = os.path.join(parent_dir, 'process')

# Create whichever folders are missing; existing ones are left untouched.
for folder_path in (inputfolder_path, outputfolder_path, processfolder_path):
    os.makedirs(folder_path, exist_ok=True)


In [3]:
date_turn_holiday=[20230929] # Dates to treat as holidays: make-up days off, national holidays, typhoon days
date_turn_workday=[20230923] # Dates to treat as workdays: make-up workdays
# Analysis period (values look like YYYYMMDD integers -- confirm against getDaysCount)
startdate = 20230701
enddate = 20230930
# Count the holidays and workdays in each month
dayscount = getDaysCount(startdate, enddate, date_turn_holiday, date_turn_workday)


In [11]:
# Column names used in the ticket data; later cells refer to the ticket
# columns through these variables.
direction_col = 'DIRECTION'
getontime_col ='GETON_DATE'
getofftime_col ='GETOFF_DATE'
getonstop_col ='GETON_STOP_NAME'
getoffstop_col ='GETOFF_STOP_NAME'
getonseq_col ='GETON_STOP_SEQ'
getoffseq_col ='GETOFF_STOP_SEQ'

## 資料前處理

1. 票證清洗(去除不可用資料)
2. 票證定義日期欄位 (年月、平假日)
3. 處理票證放大率

In [None]:
'''進行基礎的票證清洗
1. 找到上車時間 < 下車時間
2. 上車站序 < 下車站序
3. 上下車站名不同'''
# Basic ticket cleaning. Per the note above, the intended checks are:
# boarding time earlier than alighting time, boarding stop sequence lower
# than alighting stop sequence, and different boarding / alighting stop names.

# Path to tickets.csv inside the input folder
tickets_path = os.path.join(inputfolder_path , 'tickets.csv')
tickets = pd.read_csv(tickets_path)

# Run the cleaning helper on the loaded tickets. It returns the cleaned
# tickets, a mapping of error statistics (iterated with .items() below) and
# a rate that a later cell prints as the usable-data percentage.
tickets, errorstat, correctrate = tickets_cleaning(tickets, 
    getontime=getontime_col, 
    getofftime=getofftime_col, 
    getonstop=getonstop_col, 
    getoffstop=getoffstop_col, 
    getonseq=getonseq_col, 
    getoffseq=getoffseq_col)

# Save the cleaned data to the process folder
# NOTE(review): to_csv is called without index=False, so the DataFrame index
# is written as an extra first column -- confirm that is intended.
tickets.to_csv(os.path.join(processfolder_path , 'tickets_cleaned.csv'))
# tickets.to_csv(os.path.join(os.path.dirname(__file__), '..', 'process', 'tickets_cleaned.csv'))

# Write the cleaning statistics to a text file, one "key: value" per line
errorstat_path = os.path.join(outputfolder_path , 'ErrorDataStat.txt')
with open(errorstat_path , 'w', encoding='utf-8') as file:
    for key, value in errorstat.items():
        file.write(f"{key}: {value}\n")
# del errorstat
# Add the date-derived columns to the tickets (presumably the year-month and
# workday/holiday fields that later cells read as 'DataYearMonth' and
# 'IsWorkday' -- confirm against date_defined).
tickets = date_defined(tickets, getontime_columns=getontime_col, date_turn_holiday=date_turn_holiday,\
                       date_turn_workday=date_turn_workday)


In [None]:
# tickets = pd.read_csv(os.path.join(processfolder_path , 'tickets_cleaned.csv'))
# tickets = tickets.rename(columns = {'RouteName':'ROUTE_NAME'})

In [6]:
'''處理票證資料放大率'''

# Load the monthly operation report from the input folder.
operation_path = os.path.join(inputfolder_path, 'operation.csv')
operation = pd.read_csv(operation_path)

# Build DataYearMonth as a 'YYYYMM' string. YEAR is offset by 1911 first
# (presumably an ROC-calendar year converted to Gregorian -- confirm).
gregorian_yyyymm = (operation['YEAR'] + 1911) * 100 + operation['MONTH']
operation['DataYearMonth'] = pd.to_datetime(gregorian_yyyymm, format='%Y%m').dt.strftime('%Y%m')

# Project helper: relates the tickets to the operation report using the
# route-name, year-month and passenger columns named below.
tickets_magnification = getMagnification(
    tickets=tickets,
    tickets_routename_col='RouteName',
    tickets_yearmonth_col='DataYearMonth',  # year-month column of the ticket data
    operation=operation,
    operation_routename_col='ROUTE_NAME',
    operation_yearmonth_col='DataYearMonth',  # year-month column of the operation data
    operation_passengers_col='PASSENGERS'
)

# Routes flagged as abnormal: magnification >= 1.3 or <= 0.8.
magnification = tickets_magnification['Magnification']
routes_too_high = set(tickets_magnification[magnification >= 1.3]['RouteName'].unique())
routes_too_low = set(tickets_magnification[magnification <= 0.8]['RouteName'].unique())
ooc_route_list = list(routes_too_high.union(routes_too_low))

KeyError: 'RouteName'

In [None]:
# # 讀取班表資料
# shift = pd.read_excel(os.path.join(inputfolder_path, 'shift.xlsx'))
# shift.columns = ['RouteName', 'Direction', 'Shift', 'IsWorkday']
# shift['IsWorkday'] = shift['IsWorkday'].replace({'假日': '0', '平日': '1'})
# shift['Shift'] = shift['Shift'].astype(str)
# shift['Shift'] = pd.to_datetime(shift['Shift'], format='%H:%M').dt.time
# shift = shift.sort_values(['RouteName', 'IsWorkday','Shift', 'Direction'], ascending=[True, True, True, True])

# Load the shift timetable and normalise 'Shift' ("HH:MM" text) to datetime.time
shift = pd.read_csv(os.path.join(inputfolder_path, 'shift.csv'))
shift['Shift'] = shift['Shift'].astype(str)
shift['Shift'] = pd.to_datetime(shift['Shift'], format='%H:%M').dt.time
shift = shift.sort_values(['RouteName', 'IsWorkday','Shift', 'Direction'], ascending=[True, True, True, True]).reset_index(drop = True)

# Load the stop-sequence data
seq = pd.read_csv(os.path.join(inputfolder_path,'seq.csv'))

# Route names present in each of the three inputs
shift_routename_list = list(shift['RouteName'].unique())
tickets_routename_list = list(tickets['RouteName'].unique())
# Fixed: this list was previously built from `shift`, so routes missing from
# seq.csv were never excluded from common_routes.
seq_routename_list = list(seq['RouteName'].unique())

shift_routes = set(shift_routename_list)
ticket_routes = set(tickets_routename_list)
seq_routes = set(seq_routename_list)

# Routes that have a timetable but no tickets
only_in_shift = list(shift_routes - ticket_routes)
# Routes that have tickets but no timetable
only_in_tickets = list(ticket_routes - shift_routes)
# Routes present in tickets, timetable and stop sequence: these can be analysed
common_routes = list(ticket_routes & shift_routes & seq_routes)

# Print the result
print("缺票證資料:", only_in_shift)
print("缺班表資料:", only_in_tickets)
print("本次可算的路線:", common_routes)

### 基本判讀指標：是否繼續往下做

1. 列出本次正常（可用）資料的佔比
2. 列出本次放大率異常的路線 ( 可以進一步以plotly 圖表檢視長條圖)

In [7]:
# Report the share of usable ticket rows and whether it clears the 95% bar.
print(f'資料可用比例 = {correctrate}%',end=' ')
if correctrate <= 95:
    print('本次取得的資料錯誤率太高，建議重新檢視')
else:
    print('本次的資料可以使用')

# ooc_route_list only exists when the magnification cell has been run (the
# operation report is an optional input), so a missing name is tolerated.
# Any other error is no longer swallowed.
try:
    abnormal_routes = ooc_route_list
except NameError:
    abnormal_routes = []

if len(abnormal_routes) > 0:
    print(f'本次放大率異常路線共{len(abnormal_routes)}條')
    print('票證放大率異常的路線編號', end= ':')
    print(abnormal_routes)

NameError: name 'correctrate' is not defined

In [8]:
# unique_year_months = tickets_magnification["DataYearMonth"].unique()
# # 創建篩選器 (Dropdown)
# dropdown = widgets.Dropdown(
#     options=unique_year_months,
#     value=unique_year_months[0],
#     description="月份:"
# )

# # 定義繪圖函數
# def plot_barchart(selected_month):
#     # 篩選 DataFrame
#     filtered_df = tickets_magnification[tickets_magnification["DataYearMonth"] == selected_month]
    
#     if filtered_df.empty:
#         print(f"No data available for {selected_month}")
#         return
    
#     # 創建條形圖
#     fig = go.Figure()

#     # 定義顯示在 hover 上的格式
#     hover_text_tickets = [
#     f"RouteName: {row['RouteName']}<br>Magnification: {row['Magnification'] * 100:.2f}%<br>Tickets: {row['Tickets']:,}"  # Magnification 顯示為百分比，Tickets 顯示為實際數字
#     for _, row in filtered_df.iterrows()
#     ]
#     hover_text_passengers = [
#     f"RouteName: {row['RouteName']}<br>Magnification: {row['Magnification'] * 100:.2f}%<br>Passengers: {row['Passengers']:,}"  # Magnification 顯示為百分比，Passengers 顯示為實際數字
#     for _, row in filtered_df.iterrows()
#     ]

#     # 添加 Tickets 的長條圖
#     fig.add_trace(go.Bar(
#         x=filtered_df["RouteName"],
#         y=filtered_df["Tickets"],
#         name="Tickets",
#         marker_color="#84C1FF",
#         hovertext=hover_text_tickets,  # 顯示格式化過的 hovertext
#         hoverinfo="text"  # 只顯示 hovertext 的內容
#     ))

#     # 添加 Passengers 的長條圖
#     fig.add_trace(go.Bar(
#         x=filtered_df["RouteName"],
#         y=filtered_df["Passengers"],  # 更新欄位名稱為 Passengers
#         name="Passengers",
#         marker_color="#FF8000",
#         hovertext=hover_text_passengers,  # 顯示格式化過的 hovertext
#         hoverinfo="text"  # 只顯示 hovertext 的內容
#     ))

#     # 設定標題與軸標籤
#     fig.update_layout(
#         title=f"Tickets and Passengers for {selected_month}",
#         xaxis_title="路線編號",
#         yaxis_title="人次",
#         barmode="group",  # 並列顯示長條圖
#         xaxis_tickangle=-90,
#         template="plotly_white"  # 使用白色背景的模板
#     )

#     # 顯示圖表
#     fig.show()

# # 綁定事件到篩選器
# dropdown.observe(lambda change: plot_barchart(change.new), names="value")

# # 初始顯示
# display(dropdown)
# plot_barchart(dropdown.value)


## 資料運算

1. 把班表資料黏上

In [9]:
# Pick a single route (the second entry of common_routes) to work through in
# the cells below.
route = common_routes[1]
print(route)

NameError: name 'common_routes' is not defined

In [None]:
# 要開始用for 迴圈套入route 進行計算

# route 以 common_routes 可以的進行 

In [None]:


# Stop sequence and timetable of the chosen route
seq_select = seq[seq['RouteName'] == route].reset_index(drop = True)
shift_select = shift[shift['RouteName'] == route].reset_index(drop = True)

# Tickets of the chosen route, ordered by boarding time
tickets_select = tickets[tickets['RouteName'] == route].sort_values(getontime_col).reset_index(drop = True)

# Months available for THIS route. Fixed: the list was previously taken from
# all tickets, so the chosen month could have no rows for the selected route.
yearmonthlist = list(tickets_select['DataYearMonth'].unique())
yearmonth = yearmonthlist[0]

# Keep the chosen month and only the directions that appear in the route's
# timetable; the direction column is addressed through direction_col so this
# cell agrees with the column-name settings.
in_month = tickets_select['DataYearMonth'] == yearmonth
in_scheduled_direction = tickets_select[direction_col].isin(list(shift_select['Direction'].unique()))
tickets_select_month = tickets_select[in_month & in_scheduled_direction].sort_values(getontime_col).reset_index(drop = True)
# tickets_select_month = tickets_select_month.rename(columns= {direction_col:'Direction'})

以下嘗試

In [None]:
def tickets_match_shift(tickets, shifts, routename_col = "ROUTE_NAME" ,getontime_col='GETON_DATE', direction_col='DIRECTION'):
    """
    Match every ticket (card swipe) to a scheduled shift of its route.

    A ticket is assigned the latest shift that departs at or before its
    boarding time-of-day, among the shifts sharing its route, workday flag
    and direction. A ticket that boards before the first shift is assigned
    that first shift. A ticket with no candidate shift, or with a missing
    boarding time, gets ``None``.

    Parameters:
        tickets (DataFrame): ticket data containing ``routename_col``,
            ``getontime_col``, ``direction_col`` and an "IsWorkday" column.
            Modified in place: ``getontime_col`` is converted to datetime
            and a "Matched_Shift" column is added.
        shifts (DataFrame): timetable with "RouteName", "IsWorkday",
            "Direction" and "Shift" columns; "Shift" is parsed with the
            "%H:%M:%S" format. This frame is NOT modified.
        routename_col (str): route-name column in ``tickets``,
            default "ROUTE_NAME".
        getontime_col (str): boarding-time column in ``tickets``,
            default 'GETON_DATE'.
        direction_col (str): direction column in ``tickets``,
            default 'DIRECTION'.

    Returns:
        DataFrame: the same ``tickets`` object with "Matched_Shift" added.
    """
    from bisect import bisect_right
    from collections import defaultdict

    import pandas as pd

    # Convert the boarding time in place (callers rely on the returned frame
    # carrying datetime values).
    tickets[getontime_col] = pd.to_datetime(tickets[getontime_col])

    # Parse the shift times into a separate Series so the caller's timetable
    # is left untouched.
    shift_times = pd.to_datetime(shifts["Shift"], format="%H:%M:%S").dt.time

    # Index the timetable once: (route, workday flag, direction) -> sorted
    # departure times. This replaces filtering and sorting per ticket row.
    timetable = defaultdict(list)
    for route, workday, direction, departure in zip(
        shifts["RouteName"], shifts["IsWorkday"], shifts["Direction"], shift_times
    ):
        if not pd.isna(departure):
            timetable[(route, workday, direction)].append(departure)
    for departures in timetable.values():
        departures.sort()

    matched_shifts = []
    for route, workday, direction, boarding in zip(
        tickets[routename_col], tickets["IsWorkday"], tickets[direction_col], tickets[getontime_col]
    ):
        departures = timetable.get((route, workday, direction))
        if not departures or pd.isna(boarding):
            # No shift for this route/day type/direction, or no boarding time
            matched_shifts.append(None)
            continue
        # Number of departures at or before the boarding time; 0 means the
        # ticket precedes the first shift, which then falls back to it.
        position = bisect_right(departures, boarding.time())
        matched_shifts.append(departures[max(0, position - 1)])

    # Attach the result column
    tickets["Matched_Shift"] = matched_shifts
    return tickets

In [None]:
# Match the selected tickets to the route's timetable. The route column is
# renamed to 'ROUTE_NAME', the default routename_col of tickets_match_shift.
# NOTE(review): after this cell the frame carries 'ROUTE_NAME' instead of
# 'RouteName', while later cells still select 'RouteName' -- confirm which
# path through the notebook is meant to be run.
tickets_select_month = tickets_match_shift(tickets_select_month.rename(columns={'RouteName':'ROUTE_NAME'}), shift_select)
tickets_select_month

In [None]:
# Column names read by match_shift / assign_matched_shifts below
getontime_col = 'GETON_DATE'
direction_col = 'DIRECTION'

# Convert the time columns: boarding time to datetime, shift time to datetime.time
tickets_select_month[getontime_col] = pd.to_datetime(tickets_select_month[getontime_col])
shift_select["Shift"] = pd.to_datetime(shift_select["Shift"], format="%H:%M:%S").dt.time

# 定義函數來匹配班次
def match_shift(row, shifts):
    """Return the shift a ticket row belongs to.

    The boarding time-of-day is read from ``row[getontime_col]`` (a
    notebook-level name). The result is the latest shift at or before that
    time, or the earliest shift when the ticket precedes all of them.
    ``shifts`` must be non-empty.
    """
    boarding = row[getontime_col].time()
    ordered = sorted(shifts)
    for position, departure in enumerate(ordered):
        if boarding < departure:
            # First shift later than the boarding time: take the one before
            # it, or this one when it is already the earliest shift.
            return ordered[position - 1] if position > 0 else ordered[0]
    # Boarding at or after every shift: the last shift applies
    return ordered[-1]

# 匹配班次函數
def assign_matched_shifts(tickets_df, shifts_df):
    """Add a "Matched_Shift" column to ``tickets_df`` and return it.

    For each ticket, the candidate shifts are the rows of ``shifts_df`` with
    the same "RouteName", "IsWorkday" and direction (the ticket's direction
    is read through the notebook-level ``direction_col``). The shift is
    chosen by ``match_shift``; tickets without candidates get ``None``.
    ``tickets_df`` is modified in place.
    """
    def _shift_for(ticket):
        same_service = (
            (shifts_df["RouteName"] == ticket["RouteName"])
            & (shifts_df["IsWorkday"] == ticket["IsWorkday"])
            & (shifts_df["Direction"] == ticket[direction_col])
        )
        candidates = shifts_df.loc[same_service, "Shift"].tolist()
        # No shift for this route / day type / direction -> no match
        return match_shift(ticket, candidates) if candidates else None

    tickets_df["Matched_Shift"] = [_shift_for(ticket) for _, ticket in tickets_df.iterrows()]
    return tickets_df

# Run the matching; the result replaces tickets_select_month and carries the
# added "Matched_Shift" column.
tickets_select_month = assign_matched_shifts(tickets_select_month, shift_select)


In [None]:
shift_select

In [None]:
tickets_select_month[ (tickets_select_month['IsWorkday'] == 0) & (tickets_select_month[direction_col] == 1) ][['RouteName','GETON_DATE','IsWorkday',direction_col,'GETON_STOP_SEQ','GETON_STOP_NAME', 'GETOFF_STOP_SEQ','GETOFF_STOP_NAME','Matched_Shift']].head(30)

In [None]:
# getontime_col = 'GETON_DATE'
# direction_col = 'DIRECTION'

# # 轉換時間欄位格式
# tickets_select_month["GETON_DATE"] = pd.to_datetime(tickets_select_month["GETON_DATE"])
# shift_select["Shift"] = pd.to_datetime(shift_select["Shift"], format="%H:%M:%S").dt.time

# # 定義函數來匹配班次
# def match_shift(row, shifts):
#     geton_time = row["GETON_DATE"].time()
#     shifts = sorted(shifts)
#     for i in range(len(shifts)):
#         if geton_time < shifts[i]:  # 比最早的 Shift 早
#             return shifts[max(0, i - 1)]  # 返回上一個班次（或第一個班次）
#     return shifts[-1]  # 晚於所有班次，返回最後一個班次

# # 開始比對
# matched_shifts = []
# for _, ticket in tickets_select_month.iterrows():
#     # 找到對應的班次
#     route_shifts = shift_select[
#         (shift_select["RouteName"] == ticket["RouteName"]) &
#         (shift_select["IsWorkday"] == ticket["IsWorkday"]) &
#         (shift_select["Direction"] == ticket["Direction"])
#     ]["Shift"].tolist()

#     # 如果有班次，匹配
#     if route_shifts:
#         matched_shift = match_shift(ticket, route_shifts)
#         matched_shifts.append(matched_shift)
#     else:
#         matched_shifts.append(None)  # 如果沒有匹配的班次

# # 新增匹配結果欄位
# tickets_select_month["Matched_Shift"] = matched_shifts

# # 輸出結果
# print(tickets_select_month)


In [None]:
# Inspect the matched result. The direction column is addressed through
# direction_col: the rename to 'Direction' only exists in commented-out code,
# so selecting 'Direction' would raise KeyError.
tickets_select_month[['RouteName','GETON_DATE','IsWorkday',direction_col,'GETON_STOP_SEQ','GETON_STOP_NAME', 'GETOFF_STOP_SEQ','GETOFF_STOP_NAME','Matched_Shift']]

In [None]:
tickets_select_month[tickets_select_month['DIRECTION'] == 0][['RouteName','GETON_DATE','IsWorkday','DIRECTION','GETON_STOP_SEQ','GETON_STOP_NAME', 'GETOFF_STOP_SEQ','GETOFF_STOP_NAME']]

In [None]:
tickets_select_month[['RouteName','GETON_DATE','IsWorkday','DIRECTION','GETON_STOP_SEQ', 'GETOFF_STOP_SEQ']]

In [None]:
tickets_select_month.info()

In [None]:
tickets

In [None]:
def find_bus_shift(swipe_time, is_weekend, schedule):
    """Return the shift in ``schedule`` that a swipe at ``swipe_time`` belongs to.

    The result is the latest shift at or before ``swipe_time``; a swipe
    earlier than every shift is assigned the earliest one. The shift times
    are sorted first, so the order of the rows in ``schedule`` does not
    matter.

    Args:
        swipe_time: boarding time, comparable with the values in
            ``schedule['Shift']``.
        is_weekend: unused; kept so existing callers keep working (the
            caller is expected to pass a schedule already filtered by day
            type).
        schedule: DataFrame with a 'Shift' column.

    Returns:
        The matched shift value, or ``None`` when ``schedule`` has no rows.
    """
    from bisect import bisect_right

    shifts = sorted(schedule['Shift'].tolist())
    if not shifts:
        # Nothing scheduled for this day type: no shift can be assigned
        return None
    # Number of shifts at or before the swipe; 0 means the swipe precedes
    # the first shift, which then falls back to it.
    position = bisect_right(shifts, swipe_time)
    return shifts[max(0, position - 1)]

# Assign each swipe to its shift.
# NOTE(review): `df` and `sch` are not defined in the visible part of this
# notebook -- confirm where they are loaded before running this cell.

# Filter the schedule once per day type instead of once per swipe.
schedule_by_daytype = {
    day_label: sch[sch['WeekendorNot'] == day_label]
    for day_label in ('假日', '平日')
}

results = []
for _, row in df.iterrows():
    swipe_time = row['上車時間']
    is_weekend = row['平日假日'] == '假日'
    filtered_schedule = schedule_by_daytype['假日' if is_weekend else '平日']
    # Fixed: the result used to be stored in a variable named `shift`, which
    # overwrote the notebook-level `shift` timetable DataFrame.
    matched_shift = find_bus_shift(swipe_time, is_weekend, filtered_schedule)
    results.append(matched_shift)

df['Shift'] = results