# 站間量分析

需要準備資料：
1. 票證資料：須包含所有佔位點的資料
2. 站序資料：需帶有'Direction'欄位
3. 班表資料：需帶有'Direction'、'IsWorkday'欄位
4. 營運月報 (optional)：做票證放大率佐證用 

## 基礎設定

包含環境設定，以及指定對應資料夾路徑（input、process、output）

In [1]:
import os
import pandas as pd
import plotly.graph_objects as go
import ipywidgets as widgets
from IPython.display import display
from tickets_cleaning import tickets_cleaning, date_defined , getDaysCount, getMagnification, tickets_match_shift

In [2]:
inputfolder_path = os.path.join(os.getcwd(),'..', 'input')
outputfolder_path = os.path.join(os.getcwd(),'..', 'output')
processfolder_path = os.path.join(os.getcwd(),'..', 'process')

# 確保資料夾存在
os.makedirs(inputfolder_path, exist_ok=True)
os.makedirs(outputfolder_path, exist_ok=True)
os.makedirs(processfolder_path, exist_ok=True)


In [None]:
# 加入補班、國定假日等日期  Note:需要手動調整
date_turn_holiday=[20230929] # 補假、國定假日、颱風天
date_turn_workday=[20230923] # 補班
startdate = 20230701
enddate = 20230930
# 計算每月的假日與平日數
dayscount = getDaysCount(startdate, enddate, date_turn_holiday, date_turn_workday)


In [14]:
# 定義票證資料的欄位名稱 Note:需要手動調整
direction_col = 'DIRECTION'
getontime_col ='GETON_DATE'
getofftime_col ='GETOFF_DATE'
getonstop_col ='GETON_STOP_NAME'
getoffstop_col ='GETOFF_STOP_NAME'
getonseq_col ='GETON_STOP_SEQ'
getoffseq_col ='GETOFF_STOP_SEQ'
routename_col = "ROUTE_NAME"

## 資料前處理

1. 票證清洗(去除不可用資料)
2. 票證定義日期欄位 (年月、平假日)
3. 處理票證放大率

In [12]:
'''進行基礎的票證清洗
1. 找到上車時間 < 下車時間
2. 上車站序 < 下車站序
3. 上下車站名不同'''

# 定義 tickets.csv 的相對路徑
tickets_path = os.path.join(inputfolder_path , 'tickets.csv')
tickets = pd.read_csv(tickets_path)

# 讀取資料並進行清理
tickets, errorstat, correctrate = tickets_cleaning(tickets, 
    getontime=getontime_col, 
    getofftime=getofftime_col, 
    getonstop=getonstop_col, 
    getoffstop=getoffstop_col, 
    getonseq=getonseq_col, 
    getoffseq=getoffseq_col)

# 把清洗過的資料轉存至process
tickets.to_csv(os.path.join(processfolder_path , 'tickets_cleaned.csv'))
# tickets.to_csv(os.path.join(os.path.dirname(__file__), '..', 'process', 'tickets_cleaned.csv'))

# 輸出數據清洗統計
errorstat_path = os.path.join(outputfolder_path , 'ErrorDataStat.txt')
with open(errorstat_path , 'w', encoding='utf-8') as file:
    for key, value in errorstat.items():
        file.write(f"{key}: {value}\n")
# del errorstat
tickets = date_defined(tickets, getontime_columns=getontime_col, date_turn_holiday=date_turn_holiday,\
                       date_turn_workday=date_turn_workday)


  tickets = pd.read_csv(tickets_path)


In [None]:
# tickets = pd.read_csv(os.path.join(processfolder_path , 'tickets_cleaned.csv'))
# tickets = tickets.rename(columns = {'RouteName':'ROUTE_NAME'})

In [15]:
'''處理票證資料放大率'''

operation = pd.read_csv(os.path.join(inputfolder_path, 'operation.csv'))

# 計算 DataYearMonth 並格式化
operation['DataYearMonth'] = (
    pd.to_datetime((operation['YEAR'] + 1911) * 100 + operation['MONTH'], format='%Y%m')
    .dt.strftime('%Y%m')
)

tickets_magnification = getMagnification(
    tickets=tickets,
    tickets_routename_col=routename_col,
    tickets_yearmonth_col='DataYearMonth',  # 指定票證數據的年月欄位 Note:需要手動調整
    operation=operation,
    operation_routename_col='ROUTE_NAME', # 這個Operation須要手動調整欄位名稱 Note:需要手動調整
    operation_yearmonth_col='DataYearMonth',  # 指定運營數據的年月欄位 Note:需要手動調整
    operation_passengers_col='PASSENGERS' # Note:需要手動調整
)

# 列出所有放大率會有異常的路線
ooc_route_list = list(set(tickets_magnification[tickets_magnification['Magnification'] >= 1.3]['RouteName'].unique()).union(
    set(tickets_magnification[tickets_magnification['Magnification'] <= 0.8]['RouteName'].unique())
))

In [17]:
# # 讀取班表資料
# shift = pd.read_excel(os.path.join(inputfolder_path, 'shift.xlsx'))
# shift.columns = ['RouteName', 'Direction', 'Shift', 'IsWorkday']
# shift['IsWorkday'] = shift['IsWorkday'].replace({'假日': '0', '平日': '1'})
# shift['Shift'] = shift['Shift'].astype(str)
# shift['Shift'] = pd.to_datetime(shift['Shift'], format='%H:%M').dt.time
# shift = shift.sort_values(['RouteName', 'IsWorkday','Shift', 'Direction'], ascending=[True, True, True, True])

# 讀取班表資料
shift = pd.read_csv(os.path.join(inputfolder_path, 'shift.csv'))
shift['Shift'] = shift['Shift'].astype(str)
shift['Shift'] = pd.to_datetime(shift['Shift'], format='%H:%M').dt.time
shift = shift.sort_values(['RouteName', 'IsWorkday','Shift', 'Direction'], ascending=[True, True, True, True]).reset_index(drop = True)

# 讀取相關的站序 
seq = pd.read_csv(os.path.join(inputfolder_path,'seq.csv'))

# 具有班表的RouteName_list
shift_routename_list = list(shift['RouteName'].unique()) 
tickets_routename_list = list(tickets[routename_col].unique())
seq_routename_list = list(shift['RouteName'].unique())

# 不在 tickets_routename_list 中但在 shift_routename_list 中的項目
only_in_shift = list(set(shift_routename_list) - set(tickets_routename_list))
# 不在 shift_routename_list 中但在 tickets_routename_list 中的項目
only_in_tickets = list(set(tickets_routename_list) - set(shift_routename_list))
common_routes = list(set(tickets_routename_list) & set(shift_routename_list) & set(seq_routename_list))

# 印出結果
print("缺票證資料:", only_in_shift)
print("缺班表資料:", only_in_tickets)
print("本次可算的路線:", common_routes)

缺票證資料: ['T513', '5050A', '5008', 'T516A', '724', '5096A', '5035', '201', '5081A', '181', '262', '723', '309C', '5096', '5011', '5109A', 'T515', '266', '261', 'T517A', 'T517', '609', 'T516B', '722', '119', '5086C', '506', '5010', '5019', '208C', '265A', '5616B', '5106A', '608', '263', '176', '5107C', '5616A', 'T512A', '5022A', '722A', '5616C', '265B', '5022', '5018', '5107A', '5006', 'T516', 'T512', '5106B', '5107B', '723A', '5099A']
缺班表資料: ['大溪高中(中壢C+E線)', '南崁高中(大園線)', '壽山高中三鶯線學生專車', '桃園高中南祥線學生專車', '5646B', 'S315', '桃/壽高中-林口線', '大園國際高中(永安大竹線)', '南崁高中-寶慶線', '南崁高中(八德大有線)', '大園高中(楊梅線)', '503', '720', '內壢高中竹圍甲線學生專車', '壽山高中大園線學生專車', '內壢高中觀音線學生專車', '桃園高中(武陵線)', '南崁高中(青溪中正線)', '大溪高中(中壢BD線)', 'F902A', '中壢高中大溪線學生專車', 602, 603, 606, 607, '內壢高中(龍岡線)', '大溪高遶經羅浮學專車', '陽明高中大園線學生專車', '大園高中中正Ａ線學生專車', '青埔國中(大江線)', '大溪高中桃園Ａ線學生專車', '505', '觀音高中永安線學生專車', '桃園高中-大溪線', '中大壢中(大湳線)', '內壢高中渴望中豐線學生專車', '永豐高中龍潭線學生專車', 703, '內壢高中(新屋線)', '大園高中士校內壢線學生專車', 708, 'F903', '永豐高中龜山線學生專車', '大溪高中蝙蝠洞線學生專車', '武陵高中-大溪線', '平鎮高中

### 基本判讀指標：是否繼續往下做

1. 列出本次資料正常資料的佔比
2. 列出本次放大率異常的路線 ( 可以進一步以plotly 圖表檢視長條圖)

In [18]:
print(f'資料可用比例 = {correctrate}%',end=' ')
if correctrate <= 95:
    print('本次取得的資料錯誤率太高，建議重新檢視')
else : 
    print('本次的資料可以使用')

try:
    if len(ooc_route_list) > 0:
        print(f'本次放大率異常路線共{len(ooc_route_list)}條')
        print('票證放大率異常的路線編號', end= ':')
        print(ooc_route_list)
except:
    pass

資料可用比例 = 98.9% 本次的資料可以使用
本次放大率異常路線共327條
票證放大率異常的路線編號:['5078', '5020', '大溪高中(中壢C+E線)', '253', '715', '桃園高中南祥線學生專車', '5646B', '113', '南崁高中(八德大有線)', '大園高中(楊梅線)', '503', '5071', '125', '720', '5057C', '壽山高中大園線學生專車', '115A', '內壢高中觀音線學生專車', 'BR', '5014', '5083', '5646', '大溪高中(中壢BD線)', '105A', '5049', 'F902A', '709', '中壢高中大溪線學生專車', '5038', '5094', '5099', '129', '5112', '5028', '5104', '213', '5057B', '大溪高中桃園Ａ線學生專車', '5623', '708', '5118', '152', '5039', '5104A', '觀音高中永安線學生專車', '5055', '5107', '中大壢中(大湳線)', '173', '5068', '5081', '171', '252A', '5623A', '大園高中士校內壢線學生專車', 'F903', '永豐高中龜山線學生專車', '5073', '139', '大溪高中蝙蝠洞線學生專車', '709B', '武陵高中-大溪線', '平鎮高中-火車站線(下課)', 217, '602', '706B', '231', '710A', '5672', '1', 'F907', '武陵高中東興下課', '116', '5014A', '5025', '120', '5623B', '168', '5021', '5050', '5063', '710', '東興國中(華勛線)', '135', '117A', '5644', '603', '桃園高中-龍潭線', '301', '5098', '711', '117', '5015', '5105', '102', '大溪高中龍潭Ｂ線學生專車', '內壢高中(東社線)', '169A', '5647', '5084', '220A', '5032', '5617A', '桃園高中崎海線學

In [8]:
# unique_year_months = tickets_magnification["DataYearMonth"].unique()
# # 創建篩選器 (Dropdown)
# dropdown = widgets.Dropdown(
#     options=unique_year_months,
#     value=unique_year_months[0],
#     description="月份:"
# )

# # 定義繪圖函數
# def plot_barchart(selected_month):
#     # 篩選 DataFrame
#     filtered_df = tickets_magnification[tickets_magnification["DataYearMonth"] == selected_month]
    
#     if filtered_df.empty:
#         print(f"No data available for {selected_month}")
#         return
    
#     # 創建條形圖
#     fig = go.Figure()

#     # 定義顯示在 hover 上的格式
#     hover_text_tickets = [
#     f"RouteName: {row['RouteName']}<br>Magnification: {row['Magnification'] * 100:.2f}%<br>Tickets: {row['Tickets']:,}"  # Magnification 顯示為百分比，Tickets 顯示為實際數字
#     for _, row in filtered_df.iterrows()
#     ]
#     hover_text_passengers = [
#     f"RouteName: {row['RouteName']}<br>Magnification: {row['Magnification'] * 100:.2f}%<br>Passengers: {row['Passengers']:,}"  # Magnification 顯示為百分比，Passengers 顯示為實際數字
#     for _, row in filtered_df.iterrows()
#     ]

#     # 添加 Tickets 的長條圖
#     fig.add_trace(go.Bar(
#         x=filtered_df["RouteName"],
#         y=filtered_df["Tickets"],
#         name="Tickets",
#         marker_color="#84C1FF",
#         hovertext=hover_text_tickets,  # 顯示格式化過的 hovertext
#         hoverinfo="text"  # 只顯示 hovertext 的內容
#     ))

#     # 添加 Passengers 的長條圖
#     fig.add_trace(go.Bar(
#         x=filtered_df["RouteName"],
#         y=filtered_df["Passengers"],  # 更新欄位名稱為 Passengers
#         name="Passengers",
#         marker_color="#FF8000",
#         hovertext=hover_text_passengers,  # 顯示格式化過的 hovertext
#         hoverinfo="text"  # 只顯示 hovertext 的內容
#     ))

#     # 設定標題與軸標籤
#     fig.update_layout(
#         title=f"Tickets and Passengers for {selected_month}",
#         xaxis_title="路線編號",
#         yaxis_title="人次",
#         barmode="group",  # 並列顯示長條圖
#         xaxis_tickangle=-90,
#         template="plotly_white"  # 使用白色背景的模板
#     )

#     # 顯示圖表
#     fig.show()

# # 綁定事件到篩選器
# dropdown.observe(lambda change: plot_barchart(change.new), names="value")

# # 初始顯示
# display(dropdown)
# plot_barchart(dropdown.value)


## 資料運算

逐條路線執行
1. 比對班次：依據上車時間比對發車班次（適用班次兼具較少的班次）
2. 班表 - 站序配對
3. 計算**上下車人次**及**站間量**

In [19]:
route = common_routes[1]
print(route)

5078


In [None]:
# 要開始用for 迴圈套入route 進行計算

# route 以 common_routes 可以的進行 

In [21]:
seq_select = seq[seq['RouteName'] == route].reset_index(drop = True)
shift_select = shift[shift['RouteName'] == route].reset_index(drop = True)

# 先挑選特定路線
tickets_select = tickets[tickets[routename_col] == route].sort_values(getontime_col).reset_index(drop = True)

# 根據這個路線 挑選他有提供的月份進行計算
yearmonthlist = list(tickets['DataYearMonth'].unique())
yearmonth = yearmonthlist[0]

tickets_select_month = tickets_select[tickets_select['DataYearMonth'] == yearmonth].sort_values(getontime_col).reset_index(drop = True)
tickets_select_month = tickets_select_month[tickets_select_month[direction_col].isin(list(shift_select['Direction'].unique()))].sort_values(getontime_col).reset_index(drop = True)
# tickets_select_month = tickets_select_month.rename(columns= {direction_col:'Direction'})

In [23]:
tickets_select_month = tickets_match_shift(tickets=tickets_select_month, shifts=shift_select, routename_col=routename_col, getontime_col=getontime_col, direction_col=direction_col)
tickets_select_month

Unnamed: 0,OPERATOR_NAME,ROUTE_TYPE,OPERATOR_ID,CAR_NUMBER,ROUTE_ID,ROUTE_NAME,CARD_OPERATOR,CARDID_DEC,CARDID_HEX,CITIZENCARD,...,GETOFF_LON,GETOFF_COUNTY,GETOFF_TOWN,GETOFF_VILL,SHOULD_DEDUCTED,CONSUMPTION_DEDUCTED,SUBTOTAL,IsWorkday,DataYearMonth,Matched_Shift
0,桃園客運,市區公車,1,587-FY,5078,5078,ECC,1836327489,6D742241,0,...,121.236239,桃園市,中壢區,永福里,18,,14.0,1,202309,06:45:00
1,桃園客運,市區公車,1,587-FY,5078,5078,ECC,1836444705,6D75EC21,0,...,121.236239,桃園市,中壢區,永福里,18,,14.0,1,202309,06:45:00
2,桃園客運,市區公車,1,587-FY,5078,5078,ECC,284364062,10F30D1E,0,...,121.236100,桃園市,中壢區,永福里,18,,0.0,1,202309,06:45:00
3,桃園客運,市區公車,1,587-FY,5078,5078,ECC,1836426193,6D75A3D1,0,...,121.236239,桃園市,中壢區,永福里,18,,14.0,1,202309,06:45:00
4,桃園客運,市區公車,1,587-FY,5078,5078,ECC,1836644913,6D78FA31,0,...,121.236239,桃園市,中壢區,永福里,18,,0.0,1,202309,06:45:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2469,桃園客運,市區公車,1,099-FX,5078,5078,ECC,1388903137,52C8FAE1,0,...,121.223944,桃園市,中壢區,石頭里,18,,0.0,0,202309,17:40:00
2470,桃園客運,市區公車,1,KKA-3733,5078,5078,ECC,1200687797,47910AB5,0,...,121.229598,桃園市,中壢區,內定里,18,,18.0,0,202309,16:40:00
2471,桃園客運,市區公車,1,KKA-3733,5078,5078,ECC,1405901649,53CC5B51,0,...,121.229598,桃園市,中壢區,內定里,18,,14.0,0,202309,16:40:00
2472,桃園客運,市區公車,1,KKA-3733,5078,5078,ECC,1115289445,4279F765,0,...,121.228350,桃園市,大園區,五權里,18,,18.0,0,202309,18:45:00


以下嘗試

# 站序 - 配對班表 
