In [1]:
import pandas as pd
import numpy as np
import datetime

import warnings
warnings.filterwarnings("ignore")

## 1. 處理股價

In [4]:
stock = pd.read_excel('../../bda2023_mid_dataset/stock_data_2019-2023.xlsx', sheet_name = '上市2022')

* 我們選擇日月光 (3711) 作為此次預測標的

In [5]:
ase = stock[stock['證券代碼'].str.contains('2330')]
ase['年月日'] = pd.to_datetime(ase['年月日']).dt.date
ase = ase.sort_values(by = '年月日').reset_index(drop = True)

* 首先先整理表格內容，增加兩個欄位分別為
    1. 單日漲跌幅(元)：表示今日與前日收盤價的真實價格變化
    2. label：表示今日與前日收盤價的漲跌標籤，我們直接以 >0 或 <=0 作為判別依據

In [6]:
ase = ase[['證券代碼', '年月日', '收盤價(元)']]
ase['單日漲跌幅(%)'] = ase['收盤價(元)'].rolling(window=2).apply(lambda x: (x.iloc[1]-x.iloc[0])/x.iloc[0])
ase['label'] = ase['單日漲跌幅(%)'].apply(lambda x: (x>0 and '漲') or '跌')

In [7]:
ase

Unnamed: 0,證券代碼,年月日,收盤價(元),單日漲跌幅(%),label
0,M2330 資訊服務業指數,2022-01-03,130.9100,,跌
1,2330 台積電,2022-01-03,614.0438,3.690580,漲
2,2330 台積電,2022-01-04,638.3720,0.039620,漲
3,M2330 資訊服務業指數,2022-01-04,130.5100,-0.795558,跌
4,2330 台積電,2022-01-05,632.5332,3.846626,漲
...,...,...,...,...,...
487,2330 台積電,2022-12-28,448.5727,2.674416,漲
488,M2330 資訊服務業指數,2022-12-29,122.2100,-0.727558,跌
489,2330 台積電,2022-12-29,443.5996,2.629814,漲
490,M2330 資訊服務業指數,2022-12-30,122.3800,-0.724121,跌


* 最後整理出在 2022 年 246 個交易日中，漲跌變化的個數

In [8]:
ase['label'].value_counts()

漲    247
跌    245
Name: label, dtype: int64

## 2. 處理文章資料

In [10]:
bbs1921 = pd.read_csv('../../bda2023_mid_dataset/bda2023_mid_bbs_2019-2021.csv') # 104466
bbs2223 = pd.read_csv('../../bda2023_mid_dataset/bda2023_mid_bbs_2022-2023.csv') #  34630
bbs23 = pd.concat([bbs1921, bbs2223]).reset_index().drop(["index"], axis=1)
bbs23

Unnamed: 0,id,p_type,s_name,s_area_name,post_time,title,author,content,page_url
0,1546274852018_PTT02R,bbs,Ptt,Stock,2019-01-01 00:31:32,[公告] n199808m HitMaker 警告一次,eyespot,1. 主旨：n199808m 違反板規4-2-1 警告一次 HitMake...,http://www.ptt.cc/bbs/Stock/M.1546273895.A.81F...
1,1546278287622_PTT02R,bbs,Ptt,Stock,2019-01-01 01:28:28,Re: [新聞] 貿戰讓台商錢匯不出？ 海基會：漣漪效應,CGDGAD,小弟有個想法不知可不可行 如果有人民幣想洗出來 出國一趟，比方去歐洲 用海外刷卡買黃金，存在...,http://www.ptt.cc/bbs/Stock/M.1546277311.A.1D3...
2,1546278288500_PTT02R,bbs,Ptt,Stock,2019-01-01 01:32:39,Re: [新聞] 貿易戰搶出口 透支效應2019衝擊中國經濟!,americ,分身帳號好像要連坐水桶 《ＩＤ暱稱》tangolosss (配息配股變成大富翁)《經濟狀況...,http://www.ptt.cc/bbs/Stock/M.1546277562.A.F7E...
3,1546298530556_PTT02R,bbs,Ptt,Stock,2019-01-01 07:07:37,Re: [新聞] 陸媒：俄羅斯想聯手中國去美元化,taco13,所以說不要小看俄羅斯的險惡奸詐 俄國一直鼓勵中國發展人民幣石油 去美元化的種種行為 俄羅...,http://www.ptt.cc/bbs/Stock/M.1546297660.A.928...
4,1546299585726_PTT02R,bbs,Ptt,Stock,2019-01-01 07:35:29,[標的] (伺機作多)日元正二,hrma,1. 標的：元大日元指數正二 2. 分類：(伺機作多)多 3. 分析/正文： (...,http://www.ptt.cc/bbs/Stock/M.1546299333.A.8D3...
...,...,...,...,...,...,...,...,...,...
139091,1679410888379_PTT02R,bbs,Ptt,Stock,2023-03-21 23:00:47,[情報] 3/21全市場當沖虧損最多的股票,offeree,3/21 全市場當沖虧損最多的股票 1. 創意 -999.8萬 2. 華孚 -...,http://www.ptt.cc/bbs/Stock/M.1679410849.A.80F...
139092,1679411198168_PTT02R,bbs,Ptt,Stock,2023-03-21 23:05:30,[新聞] 葉倫一番話提振市場信心 美股道瓊早盤漲2,humbler,原文標題： 葉倫一番話提振市場信心 美股道瓊早盤漲230點 原文連結： https://m...,http://www.ptt.cc/bbs/Stock/M.1679411133.A.D55...
139093,1679412591612_PTT02R,bbs,Ptt,Stock,2023-03-21 23:28:59,[新聞] 社子近三萬戶無預警停電！王美花道歉了,DrowningPool,原文標題：社子近三萬戶無預警停電！王美花道歉了：要求台電提早換設備 原文連結：https:...,http://www.ptt.cc/bbs/Stock/M.1679412541.A.011...
139094,1679413649631_PTT02R,bbs,Ptt,Stock,2023-03-21 23:45:22,[新聞] 聯電新加坡擴廠 亞翔再接82.59億元大單,changjie,原文标题：聯電新加坡擴廠 亞翔再接82.59億元大單 原文连结：https://amp-...,http://www.ptt.cc/bbs/Stock/M.1679413525.A.B90...


* 篩選出標題及內文提及「日月光」三字的文章

In [11]:
bbs23_ase = bbs23[bbs23['title'].str.contains('台積電') | bbs23['content'].str.contains('台積電')].reset_index(drop = True)
bbs23_ase

Unnamed: 0,id,p_type,s_name,s_area_name,post_time,title,author,content,page_url
0,1546301706528_PTT02R,bbs,Ptt,Stock,2019-01-01 08:07:04,[新聞] 廣州攜港澳台 拚千億半導體產業,nightwing,1.原文連結： https://tinyurl.com/y9y7p3h3 2.原文內容...,http://www.ptt.cc/bbs/Stock/M.1546301228.A.050...
1,1546390288567_PTT02R,bbs,Ptt,Stock,2019-01-02 08:35:42,[新聞] 挖礦ASIC商機曇花一現　半導體供應鏈看淡,kaube,1.原文連結： 挖礦ASIC商機曇花一現　半導體供應鏈看淡後市 https://goo.gl...,http://www.ptt.cc/bbs/Stock/M.1546389344.A.A3C...
2,1546400460272_PTT02R,bbs,Ptt,Stock,2019-01-02 11:30:29,[新聞] 14檔高殖利率 外資回頭搶補,ESunBoy,1.原文連結： https://goo.gl/K9mTJb 2.原文內容： https...,http://www.ptt.cc/bbs/Stock/M.1546399832.A.6C6...
3,1546410609352_PTT02R,bbs,Ptt,Stock,2019-01-02 14:20:07,[其他] 108/01/02 加權股價指數成分股暨市值比重,BreezeCat,大家新年快樂～ 雖然今天大盤有點糟就是了= = 然後我因為空手限制，期貨什麼空單都沒有=...,http://www.ptt.cc/bbs/Stock/M.1546410010.A.7EE...
4,1546440294641_PTT02R,bbs,Ptt,Stock,2019-01-02 22:27:02,[其他] 1/2 台灣集中市場三大法人買賣超前30名,l75cm,1/2 台灣集中市場三大法人買賣超前30名 買超 代碼 名稱 ...,http://www.ptt.cc/bbs/Stock/M.1546439226.A.4AB...
...,...,...,...,...,...,...,...,...,...
17370,1679407690398_PTT02R,bbs,Ptt,Stock,2023-03-21 22:06:25,[標的] PXD US,yamakazi,1. 標的： (例 2330.TW 台積電) PXD US 2. 分類：多/空...,http://www.ptt.cc/bbs/Stock/M.1679407589.A.990...
17371,1679409517038_PTT02R,bbs,Ptt,Stock,2023-03-21 22:35:40,[標的] BRK.B US,yamakazi,1. 標的： (例 2330.TW 台積電) BRK.B US 2. 分類：多/...,http://www.ptt.cc/bbs/Stock/M.1679409343.A.D03...
17372,1679410273480_PTT02R,bbs,Ptt,Stock,2023-03-21 22:50:00,[標的] CAT.US,yamakazi,1. 標的： (例 2330.TW 台積電) CAT 2. 分類：多/空/討論...,http://www.ptt.cc/bbs/Stock/M.1679410204.A.B1C...
17373,1679410724371_PTT02R,bbs,Ptt,Stock,2023-03-21 22:54:19,[新聞] 紫光前董事長趙偉國涉貪被訴 曾揚言買台,Su22,原文標題：紫光前董事長趙偉國涉貪被訴 曾揚言買台積電 原文連結：https://bit....,http://www.ptt.cc/bbs/Stock/M.1679410462.A.C62...


* 將第 n 天的文章與第 n+1 的股市漲跌標籤合併，並且只保留我們需要的欄位資訊

In [12]:
bbs23_ase['post_time'] = pd.to_datetime(bbs23_ase['post_time']).dt.date
ase['年月日-1'] = ase['年月日'] - datetime.timedelta(days=1)
bbs23_ase = pd.merge(bbs23_ase, ase, left_on='post_time', right_on='年月日-1')[['post_time', 'title', 'content', 'label']]
bbs23_ase

Unnamed: 0,post_time,title,content,label
0,2022-01-02,[標的] 2610.TW 華航 新年快樂,1. 標的： 2610.TW (例 2330.TW 台積電) 2. 分類：多 3. ...,跌
1,2022-01-02,[標的] 2610.TW 華航 新年快樂,1. 標的： 2610.TW (例 2330.TW 台積電) 2. 分類：多 3. ...,漲
2,2022-01-02,Re: [請益] 2021台股投報率的中位數不知是幾%?,股票真的很難玩啊～ 壞人很多 像長榮.陽明 公司賺大錢 就讓他慢慢從20漲到140就...,跌
3,2022-01-02,Re: [請益] 2021台股投報率的中位數不知是幾%?,股票真的很難玩啊～ 壞人很多 像長榮.陽明 公司賺大錢 就讓他慢慢從20漲到140就...,漲
4,2022-01-02,[請益] 台積電1年漲到800買認購權證好嗎？,以前我就有炒過台積電認購權證了 而且我炒台積電是比較理性的炒法 我挑300天到期的 行使比...,跌
...,...,...,...,...
7527,2022-12-29,[情報] 1229八大公股銀行買賣超排行,手機介面圖片好讀版： 以下資訊依張數排列 買超 ...,漲
7528,2022-12-29,[標的] 00893 國泰電動車 斷頭貪婪多,----------------------------------------------...,跌
7529,2022-12-29,[標的] 00893 國泰電動車 斷頭貪婪多,----------------------------------------------...,漲
7530,2022-12-29,[情報] 111年12月29日信用交易統計,1. 標題：111年12月29日信用交易統計 2. 來源：臺灣證券交易所、證券櫃檯買賣中心 ...,跌


# 3. 訓練集文章向量化處理

* 首先先進行斷詞處理，以下我們會先實作幾個步驟：
    1. 正則表示法清除多餘字元：先移除文章中符號、英數字，只保留中文字元
    2. 斷句：由於 monpa 在處理 200 字以上字串的斷詞時可能會出現錯誤結果，因此我們統一對長文章先進行斷句拆成較短的句子組成的 list
    3. 斷詞：透過 monpa 對斷句結果中的所有句子進行斷詞

In [13]:
import monpa
from monpa import utils
import re

+---------------------------------------------------------------------+
  Welcome to MONPA: Multi-Objective NER POS Annotator for Chinese
+---------------------------------------------------------------------+
已找到 model檔。Found model file.


In [14]:
# 這個function用來將字串以正則化處理去除中文字元以外的字元
def clearSentence(sentence):
    return re.sub(r'[^\u4e00-\u9fa5]+', '', sentence)

# 我們從stopwords_zh.txt這個檔案中匯入繁體中文的停用詞
with open('./stopwords_zh.txt', 'r') as file:
    stopwords = file.read().splitlines() 
file.close()

* 我們想嘗試用 2022 年 7 月至 9 月的文章資料來訓練一個分類器，並用於 10 月的股價變動預測當中

In [15]:
# 設定訓練資料集的開始日期與結束日期
train_startDate = datetime.date(2022,7,1)
train_endDate = datetime.date(2022,9,30)

# 透過monpa對文章進行斷詞處理，並將個別斷詞結果寫在tokenStr這個字串變數中，最後再將訓練集中所有字串存在tokenStr_list中
train_tokenStr_list = []
for i in list(bbs23_ase[bbs23_ase['post_time'].between(train_startDate, train_endDate)].index):
    try:
#         txt = clearSentence(bbs23_ase['content'][i])
        sentence_list = utils.short_sentence(bbs23_ase['content'][i])
        tokenStr = str()
        for sentence in sentence_list:
            sentence = clearSentence(sentence)
            tokens = monpa.cut(sentence)
            tokenStr += ' '.join(tokens)
        train_tokenStr_list.append(tokenStr)
    except:
        train_tokenStr_list.append('')

In [16]:
train_tokenStr_list

['電價 今 調漲 台積電 籲 供 電 穩定 經長 科學 園區 電網 採 雙 迴路 今 起 國內 電價 平均 調漲 產業 用 電 大戶 大漲 台積電 等 產業 關切 供電 穩定 經濟部長 王美花 今 表示 台積電 在 科學 園區 內 供電 來源 為 超高 壓 科學 園區 電網 都 有 雙 迴路 長期 來 看 用電 穩定 長久 以來 都 沒 問題經濟部長 王美花 今 出席 財政部 民間 參與 公共 建設 招商 大會 暨 卓越獎 頒獎 典禮 經濟部 去年 因 促 參 簽約 案件件 民間 投資 金額 億 元 而 獲得 招商 卓越獎 王美花 今 出席 領獎 於 活動 前 說明 電價 股市 及 供電 議題 經濟部 月日 宣布 今天 起 國內 電價 平均 調漲 產業 用電 大戶 則 大漲 引起 產業界 不滿 認為 調漲 幅度 太多 並 要求 政府 穩定 供 電王美花 表示 科學 園區 以外 的 其他 區域 經濟部 會 盡量 確保 電網 韌性 至於 民眾 關心 的 住宅 公設 可能 受到 電價 調漲 影響 王美花 指出 大 社區 的 公設 比較 大 通常 都 會 申請 低壓 電表 這 次 並未 在 調漲 範圍 內 因此 不 受 影響 王美花 說 較 小 的 社區 公設 通常 每 月 用 電量 多數 未 超過 度 若 逾度 建議 申請 時間 電價 因為 不同 時間 電價 有 高 有 低 用 這樣 的 管理 方式 可 節電 又 可 省 電費許麗珍 台北 報導 心得 推文 自己 斟酌 新板規 上路 檢舉板 一 堆 磨刀 霍霍 台積電 不用 擔心 電力 成本 吧 反而 某些 用 電凶 的 傳產 比較 另 人 擔憂',
 '電價 今 調漲 台積電 籲 供 電 穩定 經長 科學 園區 電網 採 雙 迴路 今 起 國內 電價 平均 調漲 產業 用 電 大戶 大漲 台積電 等 產業 關切 供電 穩定 經濟部長 王美花 今 表示 台積電 在 科學 園區 內 供電 來源 為 超高 壓 科學 園區 電網 都 有 雙 迴路 長期 來 看 用電 穩定 長久 以來 都 沒 問題經濟部長 王美花 今 出席 財政部 民間 參與 公共 建設 招商 大會 暨 卓越獎 頒獎 典禮 經濟部 去年 因 促 參 簽約 案件件 民間 投資 金額 億 元 而 獲得 招商 卓越獎 王美花 今 出席 領獎 於 活動 前 說明 電價

* 文章向量化處理：我們透過 sklearn 套件中 TfidfVectorizer 將斷詞結果去除停用詞後轉為空間向量

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [20]:
vectorizer = TfidfVectorizer(stop_words=stopwords)
X_train = vectorizer.fit_transform(train_tokenStr_list)
X_train = pd.DataFrame(X_train.toarray(),columns=vectorizer.get_feature_names_out())
X_train

Unnamed: 0,一一一,一一一年,一一三四,一一九,一一九七六,一一九五億,一一五三七,一一八三一,一一六,一七一九,...,龍頭廠,龍頭最近,龍頭聯發科,龍頭股,龐大,龐大一,龜山,龜山區,龜縮,龜苓膏
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1743,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1744,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


* 可以看到使用全部的斷詞結果去組成空間向量時稀疏性會非常大，在後續預測時效率會很低，因此我們需要選擇對分類結果有較顯著影響的詞彙作為向量空間的維度，以下我們透過 Chi-square 計算各詞彙與漲跌標籤的獨立性作為選擇向量空間維度的依據。

In [21]:
y_train = bbs23_ase[bbs23_ase['post_time'].between(train_startDate, train_endDate)]['label']

chi2_selector = SelectKBest(chi2, k = 2000)
chi2_selector.fit(X_train, y_train)
kbest_vocabs = X_train.columns[chi2_selector.get_support()]
X_train = X_train[kbest_vocabs]
X_train

Unnamed: 0,一併,一再,一千億,一千萬,一口,一同,一年,一截至,一月,一次次,...,黃崇仁,黃董,黑手,默克,點數,點日,點此外,點納,龍頭,龍頭最近
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1743,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1744,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 4. 測試集文章向量化處理

* 接著我們可以透過 7-9 月的向量維度將 10 月份的文章也轉成 tf-idf 的向量空間

In [22]:
# 首先先計算4月份的文章個別的tfidf
test_startDate = datetime.date(2022,10,1)
test_endDate = datetime.date(2022,10,31)

test_tokenStr_list = []
for i in list(bbs23_ase[bbs23_ase['post_time'].between(test_startDate, test_endDate)].index):
    try:
        txt = clearSentence(bbs23_ase['content'][i])
        sentence_list = utils.short_sentence(txt)
        tokenStr = str()
        for sentence in sentence_list:
            tokens = monpa.cut(sentence)
            tokenStr += ' '.join(tokens)
        test_tokenStr_list.append(tokenStr)
    except:
        test_tokenStr_list.append('')

In [25]:
# 接著將4月份的結果透過df.reindex這個方法映射到訓練集的向量空間中
y_test = bbs23_ase[bbs23_ase['post_time'].between(test_startDate, test_endDate)]['label']

vectorizer = TfidfVectorizer(stop_words=stopwords)
X_test = vectorizer.fit_transform(test_tokenStr_list)
X_test = pd.DataFrame(X_test.toarray(),columns=vectorizer.get_feature_names_out())
X_test = X_test.reindex(kbest_vocabs, axis=1, fill_value=0)
X_test

Unnamed: 0,一併,一再,一千億,一千萬,一口,一同,一年,一截至,一月,一次次,...,黃崇仁,黃董,黑手,默克,點數,點日,點此外,點納,龍頭,龍頭最近
0,0.0,0.0,0,0,0,0,0.0,0,0.0,0,...,0,0.0,0,0,0,0,0,0,0.0,0
1,0.0,0.0,0,0,0,0,0.0,0,0.0,0,...,0,0.0,0,0,0,0,0,0,0.0,0
2,0.0,0.0,0,0,0,0,0.0,0,0.0,0,...,0,0.0,0,0,0,0,0,0,0.0,0
3,0.0,0.0,0,0,0,0,0.0,0,0.0,0,...,0,0.0,0,0,0,0,0,0,0.0,0
4,0.0,0.0,0,0,0,0,0.0,0,0.0,0,...,0,0.0,0,0,0,0,0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
829,0.0,0.0,0,0,0,0,0.0,0,0.0,0,...,0,0.0,0,0,0,0,0,0,0.0,0
830,0.0,0.0,0,0,0,0,0.0,0,0.0,0,...,0,0.0,0,0,0,0,0,0,0.0,0
831,0.0,0.0,0,0,0,0,0.0,0,0.0,0,...,0,0.0,0,0,0,0,0,0,0.0,0
832,0.0,0.0,0,0,0,0,0.0,0,0.0,0,...,0,0.0,0,0,0,0,0,0,0.0,0


# 5. 建立預測模型

In [26]:
from sklearn.ensemble import GradientBoostingClassifier

In [27]:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=7, random_state=0)
clf.fit(X_train, y_train)
clf.score(X_train, y_train)

0.5337528604118993

# 6. 檢視預測結果

In [28]:
test_label = ase[ase['年月日-1'].between(test_startDate, test_endDate)]['label']

test_data = bbs23_ase[bbs23_ase['post_time'].between(test_startDate, test_endDate)]
test_data['predict_label'] = clf.predict(X_test)
predict_label = pd.merge(
    ase[ase['年月日-1'].between(test_startDate, test_endDate)], 
    test_data.groupby(['post_time', 'predict_label']).count().sort_values('label', ascending = False).sort_index(level=[0], sort_remaining=False).groupby(level=0).head(1).reset_index(), 
    left_on='年月日-1', right_on='post_time', how='left').fillna(method='ffill').fillna(method='bfill')['predict_label']

from sklearn.metrics import accuracy_score
print('{}月份預測準確率:'.format(test_startDate.month), accuracy_score(test_label, predict_label))

10月份預測準確率: 0.47619047619047616


In [29]:
pd.DataFrame([test_label.reset_index(drop=True), predict_label]).T

Unnamed: 0,label,predict_label
0,跌,跌
1,漲,跌
2,跌,跌
3,漲,跌
4,跌,跌
5,漲,跌
6,跌,跌
7,漲,跌
8,跌,跌
9,漲,跌


# 4. 同學們可以嘗試調整

1. 漲跌標籤的判斷%數（重要！！）
2. 文章與股價時間區間的移動天數（小時數）
3. 使用不同斷詞工具（推薦中研院CKIPTransformer）
4. 特徵選擇的其他方法（lift、、MI、、LLR...）
5. 特徵選擇的數量（太少會有很高的 false positive，太高則效率差）
6. 嘗試用看看不同分類模型
7. 改變投票方法，漲跌平三者的權重應該一樣嗎？

      GOOD LUCK!!!