<a href="https://colab.research.google.com/github/LoTzuChin/113-1-FinancialBigData-yahooCrawler/blob/main/yahooFinanceCrawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time  # 用於控制抓取速度，防止過於頻繁的請求

# Crawl the Yahoo Finance Taiwan quote page of every four-digit stock code,
# collect the price-detail fields of each page that has them, and save the
# combined table to 'all_stock_data.csv'.

# Inclusive bounds of the stock codes to probe: '0001' through '9962'.
FIRST_STOCK_CODE = 1
LAST_STOCK_CODE = 9962

# Seconds to wait on a single HTTP request; without a timeout one stalled
# connection would block the whole crawl indefinitely.
REQUEST_TIMEOUT = 10

# One dict per scraped stock. Rows are accumulated in a plain list and turned
# into a DataFrame once at the end, because concatenating onto a DataFrame
# inside the loop copies all previously collected rows on every iteration.
stock_records = []

for stock_code in range(FIRST_STOCK_CODE, LAST_STOCK_CODE + 1):
    # Zero-pad the code to four digits, e.g. '0001', '2330'.
    stock_code_str = f"{stock_code:04d}"

    url = f"https://tw.stock.yahoo.com/quote/{stock_code_str}.TW"

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)

        # Non-200 responses are skipped without a message: the loop probes
        # every code in the range, so reporting each miss would flood the log.
        if response.status_code != 200:
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        # Outer container of the quote overview section.
        layer_1 = soup.find('div', {'id': 'main-2-QuoteOverview-Proxy'})
        if layer_1 is None:
            print(f"無法找到 {stock_code_str} 的股價數據區塊")
            continue

        # Panel inside the overview that holds the price details.
        layer_2 = layer_1.find('div', class_="Fx(n) W(316px) Bxz(bb) Pstart(16px) Pt(12px)")
        if layer_2 is None:
            print(f"無法找到 {stock_code_str} 的股價層級 2")
            continue

        # Wrapper around the list of price-detail items.
        layer_3 = layer_2.find('div', class_="Pos(r)")
        if layer_3 is None:
            print(f"無法找到 {stock_code_str} 的 Pos(r) 層")
            continue

        # Each <li> carries one label/value pair (open, high, low, ...).
        rows = layer_3.find_all('li', class_="price-detail-item")

        # The record always carries the stock code, even if no row parses.
        data = {"股票代碼": stock_code_str}

        for row in rows:
            label_td = row.find('span', class_='C(#232a31)')
            value_td = row.find('span', class_='Fw(600)')

            # Skip items where either the label or the value is missing.
            if label_td is None or value_td is None:
                continue

            data[label_td.text.strip()] = value_td.text.strip()

        stock_records.append(data)

        # Throttle after each scraped page to reduce the risk of being blocked.
        time.sleep(1)

    except Exception as e:
        # Deliberately broad: this is the per-stock boundary of a long-running
        # best-effort crawl, so any failure is reported and the loop moves on.
        print(f"抓取股票代碼 {stock_code_str} 時出錯: {e}")

# Build the result table in one step; columns are the union of all labels seen.
all_stock_data = pd.DataFrame(stock_records)

# Save the result to a CSV file.
all_stock_data.to_csv('all_stock_data.csv', index=False)

# Display the result as the notebook cell's output.
all_stock_data


Unnamed: 0,股票代碼,成交,開盤,最高,最低,均價,成交金額(億),昨收,漲跌幅,漲跌,總量,昨量,振幅
0,0050,183.25,182.55,183.65,182.55,183.18,14.44,182.45,0.44%,0.80,7883,13617,0.60%
1,0051,80.70,80.50,80.80,80.50,80.70,0.027,80.35,0.44%,0.35,33,35,0.37%
2,0052,178.60,178.00,178.90,177.80,178.32,0.574,177.80,0.45%,0.80,322,679,0.62%
3,0053,97.90,97.75,98.10,97.75,97.89,0.025,97.75,0.15%,0.15,26,15,0.36%
4,0055,29.23,29.14,29.24,29.10,29.18,0.060,29.04,0.65%,0.19,205,176,0.48%
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1821,9105,5.01,5.04,5.06,5.00,5.03,0.069,5.04,0.60%,0.03,1378,13802,1.19%
1822,9110,7.15,7.10,7.15,7.10,7.10,0.001,7.17,0.28%,0.02,11,62,0.70%
1823,9136,6.50,6.49,6.53,6.47,6.50,0.002,6.45,0.78%,0.05,26,266,0.93%
1824,9573,-,-,-,-,-,-,-,-,-,-,-,0.00%
