# 班表下載（爬蟲）

In [26]:
import os
import random
import time
from datetime import datetime

import pandas as pd
import requests
from requests.auth import HTTPBasicAuth
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager


In [27]:
def _collect_route_links(url, link_selector, timeout):
    """Open *url* in headless Chrome and collect the route links on the page.

    Args:
        url: Page to load.
        link_selector: CSS selector matching the route anchor elements.
        timeout: Seconds to wait for the first matching anchor to be present.

    Returns:
        Two parallel lists ``(names, hrefs)``: the stripped text of the
        ``p.MuiTypography-root`` element inside each anchor, and the anchor's
        ``href`` attribute.

    Raises:
        selenium.common.exceptions.TimeoutException: If no matching anchor is
            present within *timeout* seconds.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # run without a visible window
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    names = []
    hrefs = []
    try:
        driver.get(url)

        # Wait until at least one route link is in the DOM before reading them.
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, link_selector))
        )

        for anchor in driver.find_elements(By.CSS_SELECTOR, link_selector):
            label = anchor.find_element(By.CSS_SELECTOR, 'p.MuiTypography-root')
            names.append(label.text.strip())
            hrefs.append(anchor.get_attribute('href'))
    finally:
        # Always close the browser, even when the wait or extraction fails.
        driver.quit()

    return names, hrefs


def extract_routes(url):
    """Return ``(route_names, route_urls)`` scraped from the route list at *url*.

    Matches anchors with class ``jss104`` and waits up to 20 seconds for them.
    NOTE(review): the ``jss###`` class looks auto-generated and may change when
    the site is rebuilt; confirm the selector if this starts timing out.
    """
    return _collect_route_links(url, 'div[class^="MuiBox-root"] a.jss104', 20)


def extract_freeroutes(url):
    """Return ``(route_names, route_urls)`` scraped from the route list at *url*.

    Same as ``extract_routes`` but matches anchors with class ``jss105`` and
    waits up to 10 seconds for them.
    """
    return _collect_route_links(url, 'div[class^="MuiBox-root"] a.jss105', 10)


In [28]:
# Build the route table (RouteName, URL, RouteType) one route group at a time.
# A group that fails to scrape is reported and skipped, so the groups that did
# succeed are still merged into routemap_url.
route_frames = []

try:
    citybus_names, citybus_paths = extract_routes(
        'https://ebus.tycg.gov.tw/ebus/driving-map?route-group=cityBus'
    )
    df_citybus = pd.DataFrame({
        'RouteName': citybus_names,
        'URL': citybus_paths,
    })
    df_citybus['RouteType'] = 'Citybus'
    route_frames.append(df_citybus)
except Exception as e:
    print(f"Citybus 路線清單擷取失敗: {e}")

# Pause between the two page loads.
time.sleep(10)

try:
    # NOTE(review): extract_freeroutes() exists with a different link selector
    # (a.jss105); confirm which of the two functions matches the freeBus page.
    freebus_names, freebus_paths = extract_routes(
        'https://ebus.tycg.gov.tw/ebus/driving-map?route-group=freeBus'
    )
    df_freebus = pd.DataFrame({
        'RouteName': freebus_names,
        'URL': freebus_paths,
    })
    df_freebus['RouteType'] = 'Freebus'
    route_frames.append(df_freebus)
except Exception as e:
    print(f"Freebus 路線清單擷取失敗: {e}")

# Merge whatever was scraped into a single table with a fresh 0..n-1 index.
# The per-group frames are kept because a later cell reads df_citybus directly.
if route_frames:
    routemap_url = pd.concat(route_frames).reset_index(drop=True)
else:
    print("未取得任何路線清單，routemap_url 未建立。")

# Trial

In [29]:
# Trial run: restrict the scrape to the city-bus routes only.
try:
    routemap_url = df_citybus
except NameError:
    # The previous cell deletes df_citybus after a successful merge; in that
    # case recover the city-bus rows from the merged table instead.
    routemap_url = routemap_url[routemap_url['RouteType'] == 'Citybus'].reset_index(drop=True)

In [30]:
def scrape_bus_timetable(routemap_url, date):
    """Scrape the timetable of every route in *routemap_url* for one date.

    Args:
        routemap_url: DataFrame with at least the columns ``RouteName`` and
            ``URL`` (one row per route page to visit).
        date: Date string in ``YYYY-MM-DD`` form.

    Returns:
        A tuple ``(BusTimeTable, Bus_cycle_route, Bus_cycle_status)``:
        ``BusTimeTable`` is a DataFrame with columns ``RouteName``,
        ``Direction``, ``Shift``, ``DataDate`` and ``DataUpdateTime``; the two
        lists are the route names and texts collected by ``check_cycle_route``.

    Raises:
        ValueError: If *date* does not split into exactly three ``-`` separated
            parts. This is checked before the browser is started.
    """
    # Parse the date first so a malformed value cannot leave a browser running.
    YYYY, MM, DD = date.split('-')

    BusTimeTable = pd.DataFrame(columns=['RouteName', 'Direction', 'Shift'])
    Bus_cycle_route = []
    Bus_cycle_status = []

    options = Options()
    options.add_argument("--start-maximized")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    try:
        # Iterate the column values directly so the frame's index labels do
        # not matter (a filtered or concatenated frame need not be 0..n-1).
        for RouteName, url in zip(routemap_url['RouteName'], routemap_url['URL']):
            print(RouteName)
            try:
                # Step 1: open the route page.
                driver.get(url)
                time.sleep(2)

                # Step 2: open the timetable tab.
                WebDriverWait(driver, 2).until(
                    EC.element_to_be_clickable((By.ID, "timeTable"))
                ).click()

                # Look the date field up again on every page load.
                date_input = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, "date"))
                )
                date_input.clear()
                # The parts are typed separately with RIGHT in between;
                # presumably the field is a segmented date input - confirm
                # against the page if the date is not applied.
                date_input.send_keys(YYYY)
                date_input.send_keys(Keys.RIGHT)
                date_input.send_keys(MM)
                date_input.send_keys(Keys.RIGHT)
                date_input.send_keys(DD)

                # Step 3: outbound timetable (direction 0).
                BusTimeTable = scrape_direction(driver, BusTimeTable, RouteName, 0)

                # Step 4: return timetable (direction 1).
                BusTimeTable = scrape_direction(driver, BusTimeTable, RouteName, 1)

                # Step 5: record whether this looks like a loop route.
                Bus_cycle_route, Bus_cycle_status = check_cycle_route(
                    driver, RouteName, Bus_cycle_route, Bus_cycle_status
                )

            except Exception as e:
                # Best effort per route: report the failure and continue.
                print(f"發生錯誤: {e}")
    finally:
        # Close the browser even if the loop is interrupted or raises.
        driver.quit()

    BusTimeTable['DataDate'] = date
    BusTimeTable['DataUpdateTime'] = datetime.now()

    return BusTimeTable, Bus_cycle_route, Bus_cycle_status

def scrape_direction(driver, BusTimeTable, RouteName, direction):
    """Append one direction's departure times for the current page to *BusTimeTable*.

    Args:
        driver: Selenium WebDriver already showing the route's timetable.
        BusTimeTable: DataFrame accumulated so far.
        RouteName: Route name stored with every appended row.
        direction: 0 or 1; selects the ``direction + 1``-th timetable panel in
            the page (the caller uses 0 for outbound and 1 for return).

    Returns:
        A new DataFrame with one row per scraped time appended, or
        *BusTimeTable* unchanged when the panel could not be read.
    """
    xpath = f'//*[@id="root"]/div/div[1]/div[2]/div[2]/div[2]/div[1]/div[2]/div/div[2]/div/div/div[{direction+1}]/div[2]/div/p'
    try:
        time_table_elements = WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located((By.XPATH, xpath))
        )
        time_table = [element.text for element in time_table_elements]
    except WebDriverException:
        # Best effort: a panel that never appears (or goes stale while being
        # read) contributes no rows. Non-Selenium errors are not hidden here
        # and reach the caller's per-route handler.
        return BusTimeTable

    timetabledf = pd.DataFrame({
        'Shift': time_table,
        'Direction': direction,
        'RouteName': RouteName
    })
    return pd.concat([BusTimeTable, timetabledf], ignore_index=True)

def check_cycle_route(driver, RouteName, Bus_cycle_route, Bus_cycle_status):
    """Record *RouteName* if the second timetable panel holds a direct ``<p>`` element.

    When the element at the fixed XPath exists, *RouteName* is appended to
    *Bus_cycle_route* and the element's text to *Bus_cycle_status* (both lists
    are modified in place). Presumably that paragraph is only rendered for
    loop routes - confirm against the page.

    Returns:
        The same two lists, ``(Bus_cycle_route, Bus_cycle_status)``.
    """
    try:
        cycle_element = driver.find_element(By.XPATH, '//*[@id="root"]/div/div[1]/div[2]/div[2]/div[2]/div[1]/div[2]/div/div[2]/div/div/div[2]/div[2]/p')
        cycle_text = cycle_element.text
    except WebDriverException:
        # Element absent (or unreadable): nothing to record for this route.
        # Non-Selenium errors are left to the caller's per-route handler.
        return Bus_cycle_route, Bus_cycle_status

    # Append only after both reads succeeded so the lists stay the same length.
    Bus_cycle_route.append(RouteName)
    Bus_cycle_status.append(cycle_text)
    return Bus_cycle_route, Bus_cycle_status

def _summarize_bus_data(BusTimeTable, Bus_cycle_route, Bus_cycle_status):
    """Build the loop-route table and the per-route shift counts.

    Returns:
        ``(BusCycle, RouteShiftCount)``. ``BusCycle`` has the columns
        ``RouteName`` and ``Status``. ``RouteShiftCount`` has ``RouteName``,
        ``班次數`` (row count per route, doubled for routes listed in
        *Bus_cycle_route*) and ``循環線`` (``'V'`` for those routes, else ``''``).
    """
    BusCycle = pd.DataFrame({
        'RouteName': Bus_cycle_route,
        'Status': Bus_cycle_status
    })

    # Number of timetable rows per route.
    RouteShiftCount = BusTimeTable.groupby(['RouteName']).size().reset_index(name='班次數')

    # Vectorised instead of a row-wise apply: apply(axis=1) on an empty frame
    # returns a DataFrame, which cannot be assigned to a single column.
    is_cycle = RouteShiftCount['RouteName'].isin(BusCycle['RouteName'])
    RouteShiftCount['班次數'] = RouteShiftCount['班次數'].mask(
        is_cycle, RouteShiftCount['班次數'] * 2
    )
    RouteShiftCount['循環線'] = is_cycle.map({True: 'V', False: ''})

    return BusCycle, RouteShiftCount


def process_and_save_bus_data(BusTimeTable, Bus_cycle_route, Bus_cycle_status, output_filename):
    """Summarise the scraped timetable and write it to an Excel workbook.

    Args:
        BusTimeTable: DataFrame with a ``RouteName`` column, one row per shift.
        Bus_cycle_route: Route names recorded as loop routes.
        Bus_cycle_status: Text recorded for each entry of *Bus_cycle_route*
            (same length).
        output_filename: Path of the workbook to create.

    The workbook gets three sheets: the full timetable, the loop-route table
    and the per-route shift counts. An empty *BusTimeTable* produces empty
    sheets rather than an error.
    """
    BusCycle, RouteShiftCount = _summarize_bus_data(
        BusTimeTable, Bus_cycle_route, Bus_cycle_status
    )

    with pd.ExcelWriter(output_filename) as writer:
        BusTimeTable.to_excel(writer, sheet_name='各路線班表列表', index=False)
        BusCycle.to_excel(writer, sheet_name='循環路線', index=False)
        RouteShiftCount.to_excel(writer, sheet_name='平日路線數', index=False)

    print(f"文件 '{output_filename}' 已成功創建。")


In [32]:
# Folder the workbooks are written to: "input", next to the current working
# directory. The path is only built here; nothing is created on disk.
inputfolder_path = os.path.join(os.getcwd(), os.pardir, 'input')

In [None]:
# Weekday run: scrape every route in routemap_url for the given date, then
# export the result to the weekday workbook in inputfolder_path.
date = '2024-10-16'
BusTimeTable, Bus_cycle_route, Bus_cycle_status = scrape_bus_timetable(
    routemap_url=routemap_url,
    date=date,
)
outputname = os.path.join(inputfolder_path, '202410平日班表整理.xlsx')
process_and_save_bus_data(
    BusTimeTable=BusTimeTable,
    Bus_cycle_route=Bus_cycle_route,
    Bus_cycle_status=Bus_cycle_status,
    output_filename=outputname,
)

In [None]:
# Holiday run: scrape every route in routemap_url for the given date, then
# export the result to the holiday workbook in inputfolder_path.
date = '2024-10-05'
BusTimeTable, Bus_cycle_route, Bus_cycle_status = scrape_bus_timetable(
    routemap_url=routemap_url,
    date=date,
)
outputname = os.path.join(inputfolder_path, '202410假日班表整理.xlsx')
process_and_save_bus_data(
    BusTimeTable=BusTimeTable,
    Bus_cycle_route=Bus_cycle_route,
    Bus_cycle_status=Bus_cycle_status,
    output_filename=outputname,
)

RouteName  Direction
1          0            105
           1            108
101        0             18
102        0             33
1A         1              4
1B         0              7
dtype: int64