### 方法1. requests (較快)

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Target page: TWSE daily trading report for stock 2330 (query date 20231001).
# Use HTTPS rather than plain HTTP so the request is encrypted in transit.
url = 'https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=html&date=20231001&stockNo=2330'

# Send the HTTP GET request. Always pass a timeout: without one, requests
# can wait indefinitely on an unresponsive server and hang the notebook.
response = requests.get(url, timeout=10)

# 200 means the page was returned normally. Otherwise report the actual
# status code so the failure can be diagnosed before the parsing cells run.
if response.status_code == 200:
    print("Successfully retrieved data")
else:
    print(f"Failed to retrieve data (HTTP {response.status_code})")


Successfully retrieved data


In [3]:
# Raw HTML text of the response; show only the first 200 characters
response.text[:200]

'<!doctype html>\n<html lang="zh">\n<head>\n    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n    <meta http-equiv="X-UA-Compatible" content="IE=edge">\n    <title> 報表 - TWSE 臺灣證券交易所<'

In [4]:
# Parse the raw HTML with BeautifulSoup (lxml parser) so the document can be
# navigated as a tree of tags, attributes and text.
soup = BeautifulSoup(markup=response.text, features='lxml')
soup

<!DOCTYPE html>
<html lang="zh">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<title> 報表 - TWSE 臺灣證券交易所</title>
<style>
    table { border-collapse: collapse; margin: 1em 0; }
    table, th, td { border: 1px solid lightgray; }
    table thead th, table thead td { text-align: center; background-color: #eee; }
    table thead div { font-size: 1.5em; padding: 10px; }
    table tbody tr:nth-child(even){ background-color:#f8f8f8; }
    td { padding: 4px; }
    td>p { margin: 0; text-align: center; }
    div.notes { margin: 2em 0 4em 0; line-height: 1.5em; }
    table tbody td { text-align: right; }
    table tbody td:first-child { text-align: center; }
    table tbody td:nth-child() { text-align: left;}
    
    
    
    
    
    
    /*table tbody td:first-child + td { text-align: right; }*/
    /*table tbody td:last-child { text-align: left; }*/
    </style>
</head>
<body>
<div>
<table>
<thead>
<tr>
<t

In [5]:
# Find every <table> element in the page
tables = soup.find_all('table')
tables

[<table>
 <thead>
 <tr>
 <th colspan="9">
 <div>112年10月 2330 台積電           各日成交資訊</div>
 </th>
 </tr>
 <tr>
 <th>日期</th>
 <th>成交股數</th>
 <th>成交金額</th>
 <th>開盤價</th>
 <th>最高價</th>
 <th>最低價</th>
 <th>收盤價</th>
 <th>漲跌價差</th>
 <th>成交筆數</th>
 </tr>
 </thead>
 <tbody>
 <tr align="center" style="font-size:14px;">
 <td>112/10/02</td>
 <td>26,891,996</td>
 <td>14,300,676,237</td>
 <td>530.00</td>
 <td>534.00</td>
 <td>528.00</td>
 <td>533.00</td>
 <td>+10.00</td>
 <td>23,151</td>
 </tr>
 <tr align="center" style="font-size:14px;">
 <td>112/10/03</td>
 <td>17,601,440</td>
 <td>9,335,110,434</td>
 <td>528.00</td>
 <td>533.00</td>
 <td>528.00</td>
 <td>529.00</td>
 <td>-4.00</td>
 <td>17,430</td>
 </tr>
 <tr align="center" style="font-size:14px;">
 <td>112/10/04</td>
 <td>29,808,729</td>
 <td>15,519,741,014</td>
 <td>521.00</td>
 <td>523.00</td>
 <td>519.00</td>
 <td>520.00</td>
 <td>-9.00</td>
 <td>49,176</td>
 </tr>
 <tr align="center" style="font-size:14px;">
 <td>112/10/05</td>
 <td>25,749,184

In [6]:
# Assume the data we need is in the first table on the page
table = tables[0]
table

<table>
<thead>
<tr>
<th colspan="9">
<div>112年10月 2330 台積電           各日成交資訊</div>
</th>
</tr>
<tr>
<th>日期</th>
<th>成交股數</th>
<th>成交金額</th>
<th>開盤價</th>
<th>最高價</th>
<th>最低價</th>
<th>收盤價</th>
<th>漲跌價差</th>
<th>成交筆數</th>
</tr>
</thead>
<tbody>
<tr align="center" style="font-size:14px;">
<td>112/10/02</td>
<td>26,891,996</td>
<td>14,300,676,237</td>
<td>530.00</td>
<td>534.00</td>
<td>528.00</td>
<td>533.00</td>
<td>+10.00</td>
<td>23,151</td>
</tr>
<tr align="center" style="font-size:14px;">
<td>112/10/03</td>
<td>17,601,440</td>
<td>9,335,110,434</td>
<td>528.00</td>
<td>533.00</td>
<td>528.00</td>
<td>529.00</td>
<td>-4.00</td>
<td>17,430</td>
</tr>
<tr align="center" style="font-size:14px;">
<td>112/10/04</td>
<td>29,808,729</td>
<td>15,519,741,014</td>
<td>521.00</td>
<td>523.00</td>
<td>519.00</td>
<td>520.00</td>
<td>-9.00</td>
<td>49,176</td>
</tr>
<tr align="center" style="font-size:14px;">
<td>112/10/05</td>
<td>25,749,184</td>
<td>13,563,941,939</td>
<td>523.00</td>
<td>529.00

In [7]:
# Collect every row of the table (each <tr> element is one row)
rows = table.find_all('tr')
rows

[<tr>
 <th colspan="9">
 <div>112年10月 2330 台積電           各日成交資訊</div>
 </th>
 </tr>,
 <tr>
 <th>日期</th>
 <th>成交股數</th>
 <th>成交金額</th>
 <th>開盤價</th>
 <th>最高價</th>
 <th>最低價</th>
 <th>收盤價</th>
 <th>漲跌價差</th>
 <th>成交筆數</th>
 </tr>,
 <tr align="center" style="font-size:14px;">
 <td>112/10/02</td>
 <td>26,891,996</td>
 <td>14,300,676,237</td>
 <td>530.00</td>
 <td>534.00</td>
 <td>528.00</td>
 <td>533.00</td>
 <td>+10.00</td>
 <td>23,151</td>
 </tr>,
 <tr align="center" style="font-size:14px;">
 <td>112/10/03</td>
 <td>17,601,440</td>
 <td>9,335,110,434</td>
 <td>528.00</td>
 <td>533.00</td>
 <td>528.00</td>
 <td>529.00</td>
 <td>-4.00</td>
 <td>17,430</td>
 </tr>,
 <tr align="center" style="font-size:14px;">
 <td>112/10/04</td>
 <td>29,808,729</td>
 <td>15,519,741,014</td>
 <td>521.00</td>
 <td>523.00</td>
 <td>519.00</td>
 <td>520.00</td>
 <td>-9.00</td>
 <td>49,176</td>
 </tr>,
 <tr align="center" style="font-size:14px;">
 <td>112/10/05</td>
 <td>25,749,184</td>
 <td>13,563,941,939</td>
 

In [8]:
# Get the column-name cells (<th> marks a header cell).
# rows[0] is the title banner spanning all columns; rows[1] is the row that
# holds the actual column names.
columns = rows[1]
cols = columns.find_all('th')
cols

[<th>日期</th>,
 <th>成交股數</th>,
 <th>成交金額</th>,
 <th>開盤價</th>,
 <th>最高價</th>,
 <th>最低價</th>,
 <th>收盤價</th>,
 <th>漲跌價差</th>,
 <th>成交筆數</th>]

In [9]:
# Extract the stripped text of each header cell.
# Read the <th> tags from `columns` (the header row) rather than from `cols`:
# rebinding `cols` from itself would turn the Tag list into strings, and a
# second run of this cell would then fail because str has no `.text`.
cols = [th.text.strip() for th in columns.find_all('th')]
cols

['日期', '成交股數', '成交金額', '開盤價', '最高價', '最低價', '收盤價', '漲跌價差', '成交筆數']

In [10]:
# For every <tr>, collect the stripped text of each <td> cell.
# Rows that contain no <td> (the header rows built from <th>) yield an empty list.
toatl_values = [
    [cell.text.strip() for cell in row.find_all('td')]
    for row in rows
]
toatl_values

[[],
 [],
 ['112/10/02',
  '26,891,996',
  '14,300,676,237',
  '530.00',
  '534.00',
  '528.00',
  '533.00',
  '+10.00',
  '23,151'],
 ['112/10/03',
  '17,601,440',
  '9,335,110,434',
  '528.00',
  '533.00',
  '528.00',
  '529.00',
  '-4.00',
  '17,430'],
 ['112/10/04',
  '29,808,729',
  '15,519,741,014',
  '521.00',
  '523.00',
  '519.00',
  '520.00',
  '-9.00',
  '49,176'],
 ['112/10/05',
  '25,749,184',
  '13,563,941,939',
  '523.00',
  '529.00',
  '523.00',
  '528.00',
  '+8.00',
  '19,027'],
 ['112/10/06',
  '16,160,314',
  '8,587,684,448',
  '530.00',
  '533.00',
  '529.00',
  '532.00',
  '+4.00',
  '14,047'],
 ['112/10/11',
  '61,670,562',
  '33,477,968,593',
  '542.00',
  '544.00',
  '540.00',
  '544.00',
  '+12.00',
  '43,385'],
 ['112/10/12',
  '36,350,241',
  '19,916,786,786',
  '545.00',
  '550.00',
  '544.00',
  '550.00',
  '+6.00',
  '35,333'],
 ['112/10/13',
  '34,704,924',
  '19,128,100,250',
  '550.00',
  '554.00',
  '548.00',
  '553.00',
  '+3.00',
  '31,159'],
 ['112

In [11]:
pd.DataFrame(toatl_values[2:], columns=cols)  # the first two rows (title row and column-name row) contain no <td> cells, so the data starts at index 2

Unnamed: 0,日期,成交股數,成交金額,開盤價,最高價,最低價,收盤價,漲跌價差,成交筆數
0,112/10/02,26891996,14300676237,530.0,534.0,528.0,533.0,10.0,23151
1,112/10/03,17601440,9335110434,528.0,533.0,528.0,529.0,-4.0,17430
2,112/10/04,29808729,15519741014,521.0,523.0,519.0,520.0,-9.0,49176
3,112/10/05,25749184,13563941939,523.0,529.0,523.0,528.0,8.0,19027
4,112/10/06,16160314,8587684448,530.0,533.0,529.0,532.0,4.0,14047
5,112/10/11,61670562,33477968593,542.0,544.0,540.0,544.0,12.0,43385
6,112/10/12,36350241,19916786786,545.0,550.0,544.0,550.0,6.0,35333
7,112/10/13,34704924,19128100250,550.0,554.0,548.0,553.0,3.0,31159
8,112/10/16,21867048,11909042136,546.0,547.0,542.0,545.0,-8.0,21225
9,112/10/17,19462529,10712565922,550.0,552.0,548.0,551.0,6.0,19538


### 方法2. selenium(較慢，但容易上手，有介面可以看)
範例，沒有實際跑，懶得裝 ChromeDriver……

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import pandas as pd

# Path to the ChromeDriver executable
driver_path = 'path_to_your_chromedriver'

# Selenium 4 takes the driver path through a Service object; passing
# executable_path directly to webdriver.Chrome() is no longer supported.
service = Service(executable_path=driver_path)
driver = webdriver.Chrome(service=service)

# Target page to open (e.g. a specific TWSE page)
url = 'target_website_url'

try:
    # Implicit wait: element lookups keep retrying for up to 10 seconds
    # before giving up. Set it before the first lookup; adjust as needed.
    driver.implicitly_wait(10)

    # Open the target page
    driver.get(url)

    # Locate the first <table> element and all of its rows (<tr>)
    table = driver.find_element(By.TAG_NAME, 'table')
    rows = table.find_elements(By.TAG_NAME, 'tr')

    # Collect the text of every data cell (<td>), one list per row.
    # The text must be read while the browser session is still open.
    data = []
    for row in rows:
        cells = [ele.text.strip() for ele in row.find_elements(By.TAG_NAME, 'td')]
        # Rows without <td> cells would only add empty rows to the DataFrame.
        if cells:
            data.append(cells)
finally:
    # Always close the browser, even if a lookup above raised.
    driver.quit()

# Build the pandas DataFrame and show it as the cell's output
df = pd.DataFrame(data)
df
