### 載入套件 和定義函式

dcard 標籤要多注意，因為很常被更換

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
import urllib.request as req
import numpy as np
from scipy.linalg import norm
import pandas as pd
import warnings
import bs4

def tfidf_similarity(s1, s2):
    def add_space(s):
        return ' '.join(list(s))
    # 將字中間加入空格
    s1, s2 = add_space(s1), add_space(s2)
    # 轉化為TF矩陣
    cv = TfidfVectorizer(tokenizer=lambda s: s.split())
    corpus = [s1, s2]
    vectors = cv.fit_transform(corpus).toarray()
    
    # 計算TF係數
    fiend = np.dot(vectors[0], vectors[1]) / (norm(vectors[0]) * norm(vectors[1])) 
    np.seterr(invalid='ignore') # 當計算結果為無意義(分母為0)，忽略此警告
    return fiend
    
def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords

def movestopwords(sentence):
    stopwords = stopwordslist('stopword.txt')  # 這裏加載停用詞的路徑
    outstr = ''
    for word in sentence:           
        if word not in stopwords:  
            if word != '\t'and'\n':
                outstr += word
    return outstr


In [52]:
def dcardCraw(url):
    # 建立一個Request 物件，附加Request Headers 的資訊
    request = req.Request(url, headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"
    })
    with req.urlopen(request) as response:
        data = response.read().decode("utf-8")
    # 「解析」原始碼，取得每篇文章的問題
    # utf-8(比較省空間)有部分的漢字不能轉換所以要用GB18030編碼

    # 讓beautifulSoup協助我們解析HTML格式文件
    root = bs4.BeautifulSoup(data, "html.parser")
    titles = root.find("div", class_="sc-ba53eaa8-0 hKkUKs")  # 用列表顯示全部爬蟲下來的標題

    for title in titles:
        result = title.text.strip().replace('\n', '').replace(' ', '')
        #印出內文
        print(result)
    
    return result
    # titles代表div標籤
    # 尋找class = "title" 的div 標籤，因為class是保留字，所以寫成class_
    # root 代表整個網頁、title是網頁標籤也是網頁標題
    # cls 是清空終端機(terminal)
    # mode = "a"是以附加的方式打開並寫入文件，因為mode = "w"會將檔案清空在寫入，mode="a"不會清空
    
def pttCraw(url):
    #建立一個Request 物件，附加Request Headers 的資訊
    request = req.Request(url, headers={
        "cookie":"over18=1",
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
    })
    with req.urlopen(request) as response:
        data = response.read().decode("utf-8")

    # print(data)
    #「解析」原始碼，取得每篇文章的問題
    # utf-8(比較省空間)有部分的漢字不能轉換所以要用GB18030編碼

    root = bs4.BeautifulSoup(data, "html.parser") # 讓beautifulSoup協助我們解析HTML格式文件
    titles = root.find("div", class_ = "bbs-screen bbs-content").text # 用爬蟲抓內文
    
    #去除掉 target_content
    target_content = '※ 發信站: 批踢踢實業坊(ptt.cc),'
    content = titles.split(target_content)
    
    #去除掉 作者 看板 標題 時間
    results = root.select('span.article-meta-value')

    if len(results)>3:
        #作者 看板 標題 時間
        firstLine = "作者" + results[0].text + "看板" + results[1].text + "標題" + results[2].text + "時間" + results[3].text

    content = content[0].split(firstLine)
    
    #去除掉文末 --
    main_content = content[1].replace('--', '')

    #去除掉換行
    main_content = main_content.replace('\n', '')
    
    #印出內文
    print(main_content)
    
    return main_content

### Dcard API(沒用到)

In [None]:
import urllib.request
import json
from bs4 import BeautifulSoup

# url = "https://www.dcard.tw/service/api/v2/posts/238632575"

def get(url):
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'})
    response = urllib.request.urlopen(req).read().decode('utf-8')
    return response
        
def get_single_post(pid):
    url = "https://www.dcard.tw/service/api/v2/posts/{}".format(pid)
    reqsjson = json.loads(get(url)) 
    for title in reqsjson['content']:
        result = title.replace('\n', '').replace(' ', '') # .text.strip()
    print(result) 
    return reqsjson

get_single_post(238632575)
    

### 計算配適度

In [3]:
train = pd.read_csv('done_2021-08to12.csv')

def find_song(url):
    if url[12:15] == "dca":
        article = dcardCraw(url)
    else:
        article = pttCraw(url)

    lyrics=train['lyrics']
    i=0
    num=0
    highpri=0
    for text in lyrics:
        text=movestopwords(text)
        text=text.replace(' ','')
        text=text.replace(',','，')
        if tfidf_similarity(text, article)>highpri:
            highpri=tfidf_similarity(text, article)
            num=i
        i+=1
    print('配適度:',highpri,'作者:',train.singer.iloc[num],'歌名:',train.name.iloc[num], '情緒:',train.moodCat.iloc[num])


### 抓取檔案

In [53]:
# article = find_song('https://www.dcard.tw/f/relationship/p/238632575') #youtube有
# article = find_song('https://www.dcard.tw/f/talk/p/239984330') #youtube有
article = find_song('https://www.dcard.tw/f/talk/p/239983442') #youtube有
# article = find_song("http://www.ptt.cc/bbs/Boy-Girl/M.1664277279.A.9AA.html") #youtube有
# article = find_song("https://www.ptt.cc/bbs/Gossiping/M.1664530650.A.4E3.html") #youtube有
# article = find_song("http://www.ptt.cc/bbs/Boy-Girl/M.1660356781.A.365.html") #youtube有

我先…..1.把冰箱雞蛋🥚壓在枕頭下希望孵出小雞🐤（結果想當然爾是被我壓到爛發臭後被我媽發現就被追著打2.在外婆家門口賣外婆的拖鞋一雙100$（差點害外婆沒拖鞋穿…3.小時候特愛買戳戳樂戳了好幾盒後把不喜歡的小玩具放在空的保麗龍戳戳樂盒子貼上白紙畫格子拿去學校賣一格5$…（被老師發現後寫聯絡簿通知家長結果是我被沒收了一個禮拜的零用錢…4.跟弟弟說我是魔女不可以跟別人講這個秘密不然我會變成青蛙🐸（絕對是小魔女抖蕊咪看太多….剛剛洗澡時候突發奇想就想起來這四個我記得小時候鬼靈精怪鬼點子超多！不知道大家小時候有沒有類似這樣很好玩的事情🤣🤣

送上我家肥橘🍊寫真一張


配適度: 0 作者: 金玟岐 歌名: 你註定會遇見我 情緒: 愛


In [None]:
train