In [None]:
!pip install requests beautifulsoup4


In [None]:
pip install selenium

In [None]:
from bs4 import BeautifulSoup
import re

def scrape_teachers_expertise():
    """
    Extract the expertise of Asia University CSIE department teachers.

    Reads the previously saved ``page_source.html`` from the current working
    directory, flattens it to plain text with BeautifulSoup, and uses a
    regular expression to pull out each teacher's name and research field.
    A fixed list of hard-coded special-case teachers is appended afterwards.

    Returns:
        list: One formatted string per teacher, shaped like
        ``**name**：expertise。``. The regex matches come first, followed by
        the hard-coded special cases. An empty list is returned (and a
        message is printed) when ``page_source.html`` cannot be read.
    """
    # Load the HTML that was saved to disk earlier.
    try:
        with open("page_source.html", "r", encoding="utf-8") as f:
            html_content = f.read()
    except (OSError, UnicodeDecodeError):
        # Only real read failures are treated as "no data"; a bare except
        # would also swallow KeyboardInterrupt / SystemExit.
        print("找不到page_source.html文件")
        return []

    # Parse the HTML and reduce it to its visible text.
    soup = BeautifulSoup(html_content, 'html.parser')
    text = soup.get_text()

    # Capture groups: (1) name, (2) job title, (3) research field. The
    # research field runs lazily up to the next "Office hour", "網站" or
    # "分機" marker.
    pattern = r'姓名[\s\:：]+([\w\(\)\-\,\. ]+)\s*\n+\s*職稱[\s\:：]+([^\n]+)\s*\n+[\s\S]*?研究領域[\s\:：]+([\s\S]*?)(?=\s*Office hour|網站|分機)'

    formatted_output = []

    # The job title is captured by the pattern but not used in the output.
    for name, _title, expertise in re.findall(pattern, text):
        name = name.strip()
        expertise = expertise.strip().replace('\n', ' ')

        # Normalise comma separators to the ideographic comma "、".
        expertise = expertise.replace(', ', '、').replace('，', '、').replace(' ,', '、')
        # Clean up a leftover ",、" sequence (fix for teacher NGUYEN's entry).
        expertise = expertise.replace(',、', '、')

        # Emit in the required "**name**：expertise。" format.
        formatted_output.append(f"**{name}**：{expertise}。")

    # Hard-coded entries for teachers treated as special cases
    # (presumably not captured by the pattern above; verify against the page).
    special_cases = [
        "**Tadao Murata**：分散式通訊軟體、網路協議、邏輯與規則基礎AI系統、製造系統、平行計算系統和具有模糊延遲的系統的Petri網應用。",
        "**曾憲章(Zeng Xianzhang)**：計算機科學。",
        "**李錦輝(Chin-Hui Lee)**：語音訊號處理、機器學習。",
        "**黃光彩**：電機工程。",
        "**林一平(Jason Yi-Bing Lin)**：個人通信網路、行動計算、系統模擬。",
        "**張嘉淵(Zhang Jiayuan)**：雲端運算、大數據分析、演算法、社群媒體、人工智慧物聯網、人本創新應用。",
        "**許健(Gene Sheu)**：電子電路、微電子、產品研發、積體電路。",
        "**梁文隆(Wen-Lung Liang)**：物聯網技術、嵌入式系統、智慧家庭。",
        "**林詠章(Lin Yongzhang)**：資訊安全、區塊鏈應用、精準健康、智慧醫療、工控安全。"
    ]

    formatted_output.extend(special_cases)

    return formatted_output

def main():
    """
    Run the scraper, save its output to a text file, and echo it.

    When the scraper returns entries, they are written one per line to
    ``teachers_expertise_formatted.txt`` (UTF-8) and then printed. When it
    returns nothing, a notice is printed instead and no file is written.
    """
    results = scrape_teachers_expertise()

    # Nothing to report: print the notice and stop before touching the file.
    if not results:
        print("沒有找到教師資訊")
        return

    # Persist the results, one entry per line.
    with open('teachers_expertise_formatted.txt', 'w', encoding='utf-8') as out_file:
        out_file.writelines(f"{entry}\n" for entry in results)

    # Show the same entries on standard output.
    print(*results, sep='\n')

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()