# In [None]:
### 組合程式
import os

import requests
from bs4 import BeautifulSoup as bs


def remove_dirty_tag(soup):
    """Detach unwanted tags from ``soup`` and return it.

    Every tag matched by one of these selectors is extracted, selector by
    selector in the order listed:

    - ``div.article-metaline``
    - ``div.article-metaline-right``
    - ``span.f2``

    ``soup`` is modified in place; the same object is returned.
    """
    unwanted_selectors = (
        "div.article-metaline",
        "div.article-metaline-right",
        "span.f2",
    )

    for selector in unwanted_selectors:
        # A selector may match several tags, or none at all.
        for node in soup.select(selector):
            node.extract()

    return soup

def get_resp_data(ele):
    """Turn one ``div.push`` element into a dict describing the comment.

    The first four ``span`` descendants of ``ele`` are read by position and
    mapped to ``tag``, ``author``, ``content`` and ``time``.

    Args:
        ele: An element exposing ``select("span")`` whose results have a
            ``.text`` attribute.

    Returns:
        dict with the keys ``"tag"``, ``"author"``, ``"content"`` and
        ``"time"``; every value is whitespace-stripped text.  A field whose
        span is missing is returned as an empty string instead of raising
        ``IndexError``.
    """
    texts = [span.text for span in ele.select("span")[:4]]
    # Pad so that an element with fewer than four spans still yields a full
    # record.  NOTE(review): presumably such elements are notice rows rather
    # than real comments - confirm whether callers would rather skip them.
    texts.extend([""] * (4 - len(texts)))
    tag, author, content, timestamp = texts

    # Only the leading ": " marker is dropped; a ": " that appears later in
    # the comment body is part of what the user wrote and must be preserved.
    if content.startswith(": "):
        content = content[2:]

    return {
        "tag": tag.strip(),
        "author": author.strip(),
        "content": content.strip(),
        "time": timestamp.strip(),
    }


def get_data(soup, url):
    """Extract header fields, comments and body text from one article page.

    Args:
        soup: Parsed article page; it is modified in place (comment rows and
            the tags removed by ``remove_dirty_tag`` are detached from it).
        url: Address of the page, copied into the result unchanged.

    Returns:
        dict with the keys ``author``, ``category``, ``title``, ``time``,
        ``resp_data`` (list of dicts from ``get_resp_data``), ``content``
        and ``url``, inserted in that order.

    Raises:
        IndexError: If fewer than four ``span.article-meta-value`` elements
            or no ``div#main-content`` element is present.
    """
    # Header values are read purely by position: author, board, title, time.
    meta_values = soup.select("div#main-content span.article-meta-value")
    header_fields = ("author", "category", "title", "time")
    article = {
        field: meta_values[position].text
        for position, field in enumerate(header_fields)
    }

    # Comment rows are detached from the main content before the body text is
    # read, so they do not end up inside "content"; each detached row is then
    # parsed into a comment record.
    comments = []
    for push in soup.select("div#main-content div.push"):
        push.extract()
        comments.append(get_resp_data(push))
    article["resp_data"] = comments

    # Whatever text is left in the main container after cleaning is the body.
    cleaned = remove_dirty_tag(soup)
    main_blocks = cleaned.select("div#main-content")
    article["content"] = main_blocks[0].text.strip()

    article["url"] = url
    return article



### main 程式

## Fetch the source code of the board's index page
base_url = "https://www.ptt.cc/bbs/Stock/index.html"

headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}

# requests has no default timeout, so without one a stalled connection would
# block the script forever.  raise_for_status() stops early on an HTTP error
# instead of parsing an error page that contains no article links.
res = requests.get(base_url, headers=headers, timeout=10)
res.raise_for_status()
soup = bs(res.text, "lxml")

print("* 取得首頁 source code 完成！")


## Collect article links from the index page and from the pages before it
PTT_ROOT = "https://www.ptt.cc"

# Articles whose title contains one of these keywords are skipped.
SKIPPED_TITLE_KEYWORDS = ("股票板板規", "盤後閒聊", "盤中閒聊")


def collect_article_links(page_soup):
    """Return the absolute URLs of the wanted articles on one listing page.

    Every ``a`` tag under ``div#main-container div.r-ent div.title`` is
    considered; those whose text contains one of ``SKIPPED_TITLE_KEYWORDS``
    are left out.
    """
    found = []
    for a_tag in page_soup.select("div#main-container div.r-ent div.title a"):
        if any(keyword in a_tag.text for keyword in SKIPPED_TITLE_KEYWORDS):
            continue
        found.append(PTT_ROOT + a_tag["href"])
    return found


links = collect_article_links(soup)

print("* 取得首頁文章連結 完成！")


## Walk back through up to five earlier listing pages
for _ in range(5):

    # The second paging button is used as the "previous page" link.  Stop
    # cleanly instead of raising when the button or its href is missing.
    paging_buttons = soup.select("div#action-bar-container div.btn-group-paging a")
    if len(paging_buttons) < 2 or not paging_buttons[1].get("href"):
        break
    previous_link = PTT_ROOT + paging_buttons[1]["href"]

    # A timeout prevents an indefinite hang on a stalled connection.
    res = requests.get(previous_link, headers=headers, timeout=10)
    res.raise_for_status()
    soup = bs(res.text, "lxml")

    links.extend(collect_article_links(soup))

    print("{} is ok.".format(previous_link))

print("* 抓取分頁文章連結 完成！")


## Download each article page, then parse and clean it

dataList = []

# Only the first ten collected links are processed.
for url in links[:10]:
    # A timeout prevents an indefinite hang on a stalled connection.
    res2 = requests.get(url, headers=headers, timeout=10)
    soup2 = bs(res2.text, "lxml")

    try:
        data = get_data(soup2, url)
    except IndexError:
        # get_data indexes into its select() results, so a page that lacks
        # the expected elements raises IndexError.  Skip that page rather
        # than aborting the remaining downloads.
        print("{} is skipped.".format(url))
        continue

    dataList.append(data)

    print("{} is ok.".format(url))

print("* 抓取本文資料 完成！")

### Persist every article to sample/<title>.txt

# The output directory is created on demand so the first write cannot fail
# because it is missing.
os.makedirs("sample", exist_ok=True)

# Characters that are path separators or are rejected in file names on common
# platforms; each is replaced with "_" so the title stays a single file name.
UNSAFE_FILENAME_CHARS = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})

for data in dataList:

    file_name = data["title"].translate(UNSAFE_FILENAME_CHARS)

    # An explicit encoding keeps the non-ASCII text writable regardless of the
    # platform's default encoding.
    with open("sample/{}.txt".format(file_name), "w", encoding="utf-8") as out_file:

        # Every field except the comments, one "key: value" line each.
        header_lines = [
            "{}: {}\n".format(key, value)
            for key, value in data.items()
            if key != "resp_data"
        ]

        # Separator line between the article and its comments.
        separator = ("=" * 80) + "\n"

        # One comma-separated line per comment.
        resp_lines = [
            "{},{},{},{}\n".format(ele["tag"], ele["author"], ele["content"], ele["time"])
            for ele in data["resp_data"]
        ]

        out_file.write("".join(header_lines) + separator + "".join(resp_lines))

print("* 資料落地 完成！")

print("Done.")