# 班表下載（爬蟲）

In [2]:
import requests
from requests.auth import HTTPBasicAuth
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
import time
from datetime import datetime
import random
import os 


In [9]:
def extract_routes(url):
    # 設定 Chrome 瀏覽器選項
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # 啟用無頭模式
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')

    # 初始化儲存路線名稱和路徑的清單
    RouteNameList = []
    RoutePathList = []

    # 啟動 ChromeDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        # 訪問網站
        driver.get(url)

        # 等待路線名稱和路徑元素出現
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class^="MuiBox-root"] a.jss104'))
        )

        # 獲取所有路線名稱和路徑元素
        route_elements = driver.find_elements(By.CSS_SELECTOR, 'div[class^="MuiBox-root"] a.jss104')

        # 提取每個路線名稱和路徑
        for elem in route_elements:
            route_name = elem.find_element(By.CSS_SELECTOR, 'p.MuiTypography-root').text.strip()
            route_path = elem.get_attribute('href')

            RouteNameList.append(route_name)
            RoutePathList.append(route_path)

    finally:
        # 關閉瀏覽器
        driver.quit()

    return RouteNameList, RoutePathList
def extract_freeroutes(url):
    # 設定 Chrome 瀏覽器選項
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # 啟用無頭模式
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')

    # 初始化儲存路線名稱和路徑的清單
    RouteNameList = []
    RoutePathList = []

    # 啟動 ChromeDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        # 訪問網站
        driver.get(url)

        # 等待路線名稱和路徑元素出現
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class^="MuiBox-root"] a.jss105'))
        )

        # 獲取所有路線名稱和路徑元素
        route_elements = driver.find_elements(By.CSS_SELECTOR, 'div[class^="MuiBox-root"] a.jss105')

        # 提取每個路線名稱和路徑
        for elem in route_elements:
            route_name = elem.find_element(By.CSS_SELECTOR, 'p.MuiTypography-root').text.strip()
            route_path = elem.get_attribute('href')

            RouteNameList.append(route_name)
            RoutePathList.append(route_path)

    finally:
        # 關閉瀏覽器
        driver.quit()

    return RouteNameList, RoutePathList
def find_all_files(path, file_extension):
    import glob
    import os
    path = os.path.abspath(path)
    # 搜尋該路徑下所有匹配文件
    matching_files = glob.glob(os.path.join(path, f"*{file_extension}"))
    return matching_files

def scrape_bus_timetable(routemap_url, date):
    # 創建一個空的 DataFrame，並指定列名
    BusTimeTable = pd.DataFrame(columns=['RouteName','Direction','Shift'])
    Bus_cycle_route = []
    Bus_cycle_status = []

    # 設定 Chrome 驅動程式
    options = Options()
    options.add_argument("--start-maximized")
    service = Service(ChromeDriverManager().install())

    # 啟動 Chrome 瀏覽器
    driver = webdriver.Chrome(service=service, options=options)

    YYYY, MM, DD = date.split('-')

    for i in range(len(routemap_url)):
        RouteName = routemap_url.loc[i, 'RouteName']
        url = routemap_url.loc[i, 'URL']
        print(RouteName)
        try:
            # 第一步：開啟網頁
            driver.get(url)
            time.sleep(2)

            # 第二步：點擊 "時刻表" 按鈕
            WebDriverWait(driver, 2).until(
                EC.element_to_be_clickable((By.ID, "timeTable"))
            ).click()

            # 定位日期輸入框
            def get_date_input_element(driver):
                return WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, "date"))
                )

            # 確保每次操作都重新獲取元素
            date_input = get_date_input_element(driver)
            date_input.clear()
            date_input.send_keys(YYYY)
            date_input.send_keys(Keys.RIGHT)  
            date_input.send_keys(MM)
            date_input.send_keys(Keys.RIGHT)
            date_input.send_keys(DD)

            # 第三步：抓取去程時刻表數據
            BusTimeTable = scrape_direction(driver, BusTimeTable, RouteName, 0)

            # 第四步：抓取返程時刻表數據
            BusTimeTable = scrape_direction(driver, BusTimeTable, RouteName, 1)

            # 第五步：檢查是否循環線
            Bus_cycle_route, Bus_cycle_status = check_cycle_route(driver, RouteName, Bus_cycle_route, Bus_cycle_status)

        except Exception as e:
            print(f"發生錯誤: {e}")

    driver.quit()

    BusTimeTable['DataDate'] = date
    BusTimeTable['DataUpdateTime'] = datetime.now()
    
    return BusTimeTable, Bus_cycle_route, Bus_cycle_status

def scrape_direction(driver, BusTimeTable, RouteName, direction):
    try:
        xpath = f'//*[@id="root"]/div/div[1]/div[2]/div[2]/div[2]/div[1]/div[2]/div/div[2]/div/div/div[{direction+1}]/div[2]/div/p'
        time_table_elements = WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located((By.XPATH, xpath))
        )
        time_table = [element.text for element in time_table_elements]
        timetabledf = pd.DataFrame({
            'Shift': time_table,
            'Direction': direction,
            'RouteName': RouteName
        })
        BusTimeTable = pd.concat([BusTimeTable, timetabledf], ignore_index=True)  
    except Exception as e:
        pass
    return BusTimeTable

def check_cycle_route(driver, RouteName, Bus_cycle_route, Bus_cycle_status):
    try:
        cycle_element = driver.find_element(By.XPATH, '//*[@id="root"]/div/div[1]/div[2]/div[2]/div[2]/div[1]/div[2]/div/div[2]/div/div/div[2]/div[2]/p')
        cycle_text = cycle_element.text
        Bus_cycle_route.append(RouteName)
        Bus_cycle_status.append(cycle_text)
    except Exception as e:
        pass
    return Bus_cycle_route, Bus_cycle_status

def process_and_save_bus_data(BusTimeTable, Bus_cycle_route, Bus_cycle_status, output_filename):
    # Process the data
    def process_bus_data(BusTimeTable, Bus_cycle_route, Bus_cycle_status):
        # Create BusCycle DataFrame
        BusCycle = pd.DataFrame({
            'RouteName': Bus_cycle_route,
            'Status': Bus_cycle_status
        })

        # Count shifts per route
        RouteShiftCount = BusTimeTable.groupby(['RouteName']).size().reset_index(name='班次數')

        # Get unique cycle routes
        cycle_routes = BusCycle['RouteName'].unique()

        # Add '循環線' column
        RouteShiftCount['循環線'] = RouteShiftCount['RouteName'].apply(lambda x: 'V' if x in cycle_routes else '')

        # Update '班次數' for cycle routes
        RouteShiftCount['班次數'] = RouteShiftCount.apply(
            lambda row: row['班次數'] * 2 if row['循環線'] == 'V' else row['班次數'], axis=1
        )

        return BusCycle, RouteShiftCount

    # Process the data
    BusCycle, RouteShiftCount = process_bus_data(BusTimeTable, Bus_cycle_route, Bus_cycle_status)

    # Save to Excel
    with pd.ExcelWriter(output_filename) as writer:
        BusTimeTable.to_excel(writer, sheet_name='各路線班表列表', index=False)
        BusCycle.to_excel(writer, sheet_name='循環路線', index=False)
        RouteShiftCount.to_excel(writer, sheet_name='平日路線數', index=False)
    
    print(f"文件 '{output_filename}' 已成功創建。")


In [38]:
try:
    citybus_names, citybus_paths = extract_routes('https://ebus.tycg.gov.tw/ebus/driving-map?route-group=cityBus')
    df_citybus = pd.DataFrame({
    'RouteName': citybus_names,
    'URL': citybus_paths
    })
    df_citybus['RouteType'] = 'Citybus'
except:
    pass
time.sleep(10)
try:
    freebus_names, freebus_paths = extract_routes(f'https://ebus.tycg.gov.tw/ebus/driving-map?route-group=freeBus')
    df_freebus = pd.DataFrame({
    'RouteName': freebus_names,
    'URL': freebus_paths
    })
    df_freebus['RouteType'] = 'Freebus'
except:
    pass
# 將結果轉換為 DataFrame
try:
    routemap_url = pd.concat([df_citybus, df_freebus]).reset_index(drop = True)
    del df_citybus
    del df_freebus
except:
    pass

# Trial

In [49]:
routemap_url = df_citybus

In [3]:
inputfolder_path = os.path.join(os.getcwd(),'..', 'input','Shift')
os.makedirs(inputfolder_path, exist_ok=True)

In [53]:
# 使用函數
date = '2024-10-16'
BusTimeTable, Bus_cycle_route, Bus_cycle_status = scrape_bus_timetable(routemap_url, date)
outputname = os.path.join(inputfolder_path , '202410平日班表整理.xlsx')
process_and_save_bus_data(BusTimeTable, Bus_cycle_route, Bus_cycle_status, outputname)

1
1A
1B
101
102
103
105
105A
105B
106
107
109
111
112N
112S
113
115A
115B
116
117
117A
118
119
120
122
125
129
130
132
133
135
137
139
151
152
155
156
157
167
167A
168
168A
169
169A
170
171
172
173
176
181
188
189
201
202
206
206A
206B
206C
206D
206E
208
208A
208C
212
212A
213
220
220A
221
222
223
225
225A
226
227
227A
228
229
230
231
231A
232
232A
233
235
236
237
238
239
251
252
252A
253
261
262
263
265A
265B
266
301
301A
302
302A
307
307A
309C
501
502
502(大溪站發車)
503
505
506
5000
5006
5008
5009
5010
5011
5014
5014A
5015
5016
5017
5018
5019
5020
5021
5022
5022A
5023
5025
5026
5027
5027A
5028
5030
5031
5032
5033
5035
5038
5039
5040
5040A
5040B
5041
5042
5043
5043A
5044
5048
5049
5050
5050A
5051
5053
5053A
5055
5056
5057
5057A
5057B
5057C
5059
5060
5061
5063
5065
5068
5069
5071
5071A
5071B
5073
5073A
5077
5078
5081
5081A
5082
5083
5084
5085
5086
5086A
5086C
5087
5087A
5089
5090
5091
5093
5094
5096
5096A
5097
5098
5099
5099A
5101
5104
5104A
5104B
5105
5106
5106A
5106B
5107
5107A
5107B
510

In [54]:
# 使用函數
date = '2024-10-05'
BusTimeTable, Bus_cycle_route, Bus_cycle_status = scrape_bus_timetable(routemap_url, date)
outputname = os.path.join(inputfolder_path , '202410假日班表整理.xlsx')
process_and_save_bus_data(BusTimeTable, Bus_cycle_route, Bus_cycle_status, outputname)

1
1A
1B
101
102
103
105
105A
105B
106
107
109
111
112N
112S
113
115A
115B
116
117
117A
118
119
120
122
125
129
130
132
133
135
137
139
151
152
155
156
157
167
167A
168
168A
169
169A
170
171
172
173
176
181
188
189
201
202
206
206A
206B
206C
206D
206E
208
208A
208C
212
212A
213
220
220A
221
222
223
225
225A
226
227
227A
228
229
230
231
231A
232
232A
233
235
236
237
238
239
251
252
252A
253
261
262
263
265A
265B
266
301
301A
302
302A
307
307A
309C
501
502
502(大溪站發車)
503
505
506
5000
5006
5008
5009
5010
5011
5014
5014A
5015
5016
5017
5018
5019
5020
5021
5022
5022A
5023
5025
5026
5027
5027A
5028
5030
5031
5032
5033
5035
5038
5039
5040
5040A
5040B
5041
5042
5043
5043A
5044
5048
5049
5050
5050A
5051
5053
5053A
5055
5056
5057
5057A
5057B
5057C
5059
5060
5061
5063
5065
5068
5069
5071
5071A
5071B
5073
5073A
5077
5078
5081
5081A
5082
5083
5084
5085
5086
5086A
5086C
5087
5087A
5089
5090
5091
5093
5094
5096
5096A
5097
5098
5099
5099A
5101
5104
5104A
5104B
5105
5106
5106A
5106B
5107
5107A
5107B
510

In [79]:
found_files = find_all_files(inputfolder_path, '.xlsx')
shifts = []
for file in found_files:
    shift = pd.read_excel(file,sheet_name="各路線班表列表")
    shift['DataDate'] = pd.to_datetime(shift['DataDate'])
    shift['IsWorkday'] = shift['DataDate'].apply(lambda x: 1 if x.weekday() < 5 else 0)
    shift = shift[['RouteName','IsWorkday', 'Direction','Shift']].sort_values(['RouteName','IsWorkday', 'Direction','Shift'])
    cycle = pd.read_excel(file, sheet_name="循環路線")
    cycle = list(cycle['RouteName'].unique())

    shift_turn = shift[shift['RouteName'].isin(cyclelist)].copy()
    shift_turn['Direction'] = 1- shift_turn['Direction']
    shift = pd.concat([shift, shift_turn])
    shift = shift.sort_values(['RouteName','IsWorkday', 'Direction','Shift'])

    shifts.append(shift)
shifts = pd.concat(shifts)

shifts.to_csv(os.path.join(os.getcwd(),'..', 'input','shift.csv'), index = False)

In [78]:
shifts

Unnamed: 0,RouteName,IsWorkday,Direction,Shift
0,1,0,0,05:25
1,1,0,0,05:36
2,1,0,0,05:44
3,1,0,0,05:52
4,1,0,0,06:00
...,...,...,...,...
3805,T517A,1,0,10:10
3806,T517A,1,0,17:10
3807,T517A,1,1,09:10
3808,T517A,1,1,11:10


In [None]:
found_files = find_all_files(inputfolder_path, '.xlsx')
shifts = []
for file in found_files:
    shift = pd.read_excel(file,sheet_name="各路線班表列表")
    shifts.append(shift)
    shift['IsWorkday'] = shift['DataDate'].apply(lambda x: 1 if x.weekday() < 5 else 0)
    shift = shift[['RouteName','IsWorkday', 'Direction','Shift']].sort_values(['RouteName','IsWorkday', 'Direction','Shift'])
    cycle = pd.read_excel(file, sheet_name="循環路線")
    cycle = list(cycle['RouteName'].unique())

# shifts = pd.concat(shifts)
# cyclelist = list(set(cyclelist))

# shifts['DataDate'] = pd.to_datetime(shifts['DataDate'])

In [72]:
file = found_files[1]
shift = pd.read_excel(file,sheet_name="各路線班表列表")
shift['DataDate'] = pd.to_datetime(shift['DataDate'])
shift['IsWorkday'] = shift['DataDate'].apply(lambda x: 1 if x.weekday() < 5 else 0)
shift = shift[['RouteName','IsWorkday', 'Direction','Shift']].sort_values(['RouteName','IsWorkday', 'Direction','Shift'])
cycle = pd.read_excel(file, sheet_name="循環路線")
cycle = list(cycle['RouteName'].unique())

In [73]:
shift_turn = shift[shift['RouteName'].isin(cyclelist)].copy()
shift_turn['Direction'] = 1- shift_turn['Direction']
shift = pd.concat([shift, shift_turn])
shift = shift.sort_values(['RouteName','IsWorkday', 'Direction','Shift'])
shift

Unnamed: 0,RouteName,IsWorkday,Direction,Shift
0,1,1,0,05:25
1,1,1,0,05:36
2,1,1,0,05:44
3,1,1,0,05:52
4,1,1,0,06:00
...,...,...,...,...
3805,T517A,1,0,10:10
3806,T517A,1,0,17:10
3807,T517A,1,1,09:10
3808,T517A,1,1,11:10


In [None]:
# for i in cyclelist:
#     # 篩選出該路線的資料
#     route_data = shift[shift['RouteName'] == i]
#     # 獲取 Direction 的唯一值
#     unique_directions = route_data['Direction'].unique()
#     # 如果方向數量大於 1，則輸出該路線及方向
#     if len(unique_directions) == 1:
#         if unique_directions[0] == 1:
#             print(f"{i}: {unique_directions}")

In [50]:
shift

Unnamed: 0,RouteName,IsWorkday,Direction,Shift
0,1,0,0,05:25
1,1,0,0,05:36
2,1,0,0,05:44
3,1,0,0,05:52
4,1,0,0,06:00
...,...,...,...,...
3818,T517A,0,0,10:10
3819,T517A,0,0,17:10
3820,T517A,0,1,09:10
3821,T517A,0,1,11:10


In [None]:
found_files = find_all_files(inputfolder_path, '.xlsx')
shifts = []
cyclelist = []
for file in found_files:
    shift = pd.read_excel(file,sheet_name="各路線班表列表")
    shifts.append(shift)
    shift['IsWorkday'] = shift['DataDate'].apply(lambda x: 1 if x.weekday() < 5 else 0)
    shift = shift[['RouteName','IsWorkday', 'Direction','Shift']].sort_values(['RouteName','IsWorkday', 'Direction','Shift'])
    cycle = pd.read_excel(file, sheet_name="循環路線")
    cycle
    cyclelist.extend(cycle['RouteName'].unique())  # 使用 extend 展平加入一維列表

# shifts = pd.concat(shifts)
# cyclelist = list(set(cyclelist))

# shifts['DataDate'] = pd.to_datetime(shifts['DataDate'])
# shifts['IsWorkday'] = shifts['DataDate'].apply(lambda x: 1 if x.weekday() < 5 else 0)
# shifts = shifts[['RouteName','IsWorkday', 'Direction','Shift']].sort_values(['RouteName','IsWorkday', 'Direction','Shift'])

In [44]:
for i in cyclelist:
    if(shifts[shifts['RouteName']==i]['Direction'].unique() > 1):
        print(i,end=":")
        print(shifts[shifts['RouteName']==i]['Direction'].unique())
    else:
        pass

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [None]:
shifts['IsWorkday'] = 

In [26]:
shifts

Unnamed: 0,RouteName,Direction,Shift,DataDate,DataUpdateTime,IsWorkday
0,1,0,05:25,2024-10-05,2024-12-09 19:51:26.766,0
1,1,0,05:36,2024-10-05,2024-12-09 19:51:26.766,0
2,1,0,05:44,2024-10-05,2024-12-09 19:51:26.766,0
3,1,0,05:52,2024-10-05,2024-12-09 19:51:26.766,0
4,1,0,06:00,2024-10-05,2024-12-09 19:51:26.766,0
...,...,...,...,...,...,...
3805,T517A,0,10:10,2024-10-16,2024-12-09 19:17:09.107,1
3806,T517A,0,17:10,2024-10-16,2024-12-09 19:17:09.107,1
3807,T517A,1,09:10,2024-10-16,2024-12-09 19:17:09.107,1
3808,T517A,1,11:10,2024-10-16,2024-12-09 19:17:09.107,1


In [22]:
shifts.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7633 entries, 0 to 3809
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   RouteName       7633 non-null   object        
 1   Direction       7633 non-null   int64         
 2   Shift           7633 non-null   object        
 3   DataDate        7633 non-null   object        
 4   DataUpdateTime  7633 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 357.8+ KB
