In [87]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [88]:
# import 斷詞（要放 topwords_zh.txt 進資料夾）
with open('stopwords_zh.txt', 'r') as file:
    stopwords = file.read().splitlines() 
file.close()

In [93]:
# import stock excel
stock_excel = pd.read_excel("../../bda2023_mid_dataset/stock_data_2019-2023.xlsx", sheet_name = None)

In [94]:
# 只留日期跟收盤價
# 按照日期由最早往最晚排
df_stock = pd.DataFrame()
for i in range(2020, 2023):
    df_stock = pd.concat([df_stock, stock_excel.get(f"上市{i}").loc[::-1]]) # 上市 or 上櫃 -> 記得改

In [96]:
df_stock = df_stock[df_stock["證券代碼"] == stock_name][['證券代碼', '年月日', '收盤價(元)']].reset_index(drop = True)
df_stock

Unnamed: 0,證券代碼,年月日,收盤價(元)
0,2330 台積電,2020/01/02,328.2957
1,2330 台積電,2020/01/03,328.7799
2,2330 台積電,2020/01/06,321.5168
3,2330 台積電,2020/01/07,319.0957
4,2330 台積電,2020/01/08,319.0957
...,...,...,...
730,2330 台積電,2022/12/26,454.0431
731,2330 台積電,2022/12/27,454.5404
732,2330 台積電,2022/12/28,448.5727
733,2330 台積電,2022/12/29,443.5996


In [90]:
df_stock

Unnamed: 0,證券代碼,年月日,收盤價(元)
281566,ZGT50 富櫃50指數,2021/01/04,231.44
281565,Z50TR 富櫃50報酬指數,2021/01/04,336.72
281564,TPCIR 櫃薪TR,2021/01/04,125.67
281563,TPCI 櫃買薪酬,2021/01/04,101.80
281562,TPCGR 櫃 CG TR,2021/01/04,212.29
...,...,...,...
4,00695B 富邦美債7-10,2023/03/24,35.00
3,00694B 富邦美債1-3,2023/03/24,38.57
2,00687B 國泰20年美債,2023/03/24,33.68
1,00679B 元大美債20年,2023/03/24,32.10


In [107]:
# 設定要用的股票的資料
stock_name = '2330 台積電'
news_path = "../../TSMC_news_20-22.xlsx"

# stock_name = '6261 久元'
# news_path = "../../ND_news_21-23.xlsx"

In [97]:
# 只篩出特定股票
df_stock = df_stock[df_stock['證券代碼'] == stock_name]

In [98]:
df_stock = df_stock.reset_index(drop = True)
df_stock

Unnamed: 0,證券代碼,年月日,收盤價(元)
0,2330 台積電,2020/01/02,328.2957
1,2330 台積電,2020/01/03,328.7799
2,2330 台積電,2020/01/06,321.5168
3,2330 台積電,2020/01/07,319.0957
4,2330 台積電,2020/01/08,319.0957
...,...,...,...
730,2330 台積電,2022/12/26,454.0431
731,2330 台積電,2022/12/27,454.5404
732,2330 台積電,2022/12/28,448.5727
733,2330 台積電,2022/12/29,443.5996


# Requirement 1

In [99]:
# Requirement 1 的第二點

# 決定參數
day_n = 1 # 以 day_n 天後來看是漲還是跌
sigma = 0 # 以 sigma 決定漲（跌）幅超過幾 % 是漲（跌）

label = []
for i in range(len(df_stock) - day_n):
    rate = (df_stock['收盤價(元)'][i + day_n] - df_stock['收盤價(元)'][i]) / df_stock['收盤價(元)'][i]
    if rate > sigma:
       label.append('漲')
    elif rate < -sigma:
        label.append('跌')
    else:
        label.append('持平')
for i in range(day_n):
    label.append(0)
# label.extend([0, 0, 0])

df_stock['label'] = label
df_stock.tail(10)

Unnamed: 0,證券代碼,年月日,收盤價(元),label
725,2330 台積電,2022/12/19,463.9893,跌
726,2330 台積電,2022/12/20,455.0377,漲
727,2330 台積電,2022/12/21,456.5297,漲
728,2330 台積電,2022/12/22,465.4812,跌
729,2330 台積電,2022/12/23,452.5512,漲
730,2330 台積電,2022/12/26,454.0431,漲
731,2330 台積電,2022/12/27,454.5404,跌
732,2330 台積電,2022/12/28,448.5727,跌
733,2330 台積電,2022/12/29,443.5996,漲
734,2330 台積電,2022/12/30,446.0862,0


In [102]:
# 更改年月日的日期格式
df_stock['年月日'] = pd.to_datetime(df_stock['年月日']).dt.date
df_stock.tail(10)

Unnamed: 0,證券代碼,年月日,收盤價(元),label
725,2330 台積電,2022-12-19,463.9893,跌
726,2330 台積電,2022-12-20,455.0377,漲
727,2330 台積電,2022-12-21,456.5297,漲
728,2330 台積電,2022-12-22,465.4812,跌
729,2330 台積電,2022-12-23,452.5512,漲
730,2330 台積電,2022-12-26,454.0431,漲
731,2330 台積電,2022-12-27,454.5404,跌
732,2330 台積電,2022-12-28,448.5727,跌
733,2330 台積電,2022-12-29,443.5996,漲
734,2330 台積電,2022-12-30,446.0862,0


In [103]:
upCnt = 0
downCnt = 0
flatCnt = 0
for i in range(len(df_stock)):
    if df_stock['label'][i] == '漲':
        upCnt += 1
    elif df_stock['label'][i] == '跌':
        downCnt += 1 
    else :
        flatCnt += 1
print('預估漲的天數：', upCnt)
print('預估跌的天數：', downCnt)
print('預估持平的天數：', flatCnt)

預估漲的天數： 350
預估跌的天數： 352
預估持平的天數： 33


In [104]:
# 丟掉持平的資料
df_stock = df_stock[df_stock.label != '持平']
df_stock = df_stock[df_stock.label != 0]
df_stock = df_stock.reset_index(drop = True)
df_stock

Unnamed: 0,證券代碼,年月日,收盤價(元),label
0,2330 台積電,2020-01-02,328.2957,漲
1,2330 台積電,2020-01-03,328.7799,跌
2,2330 台積電,2020-01-06,321.5168,跌
3,2330 台積電,2020-01-08,319.0957,漲
4,2330 台積電,2020-01-09,326.8431,漲
...,...,...,...,...
698,2330 台積電,2022-12-26,454.0431,漲
699,2330 台積電,2022-12-27,454.5404,跌
700,2330 台積電,2022-12-28,448.5727,跌
701,2330 台積電,2022-12-29,443.5996,漲


In [108]:
# import news excel 
df_news = pd.read_excel(news_path)
df_news.head()

Unnamed: 0.1,Unnamed: 0,post_time,title,content,token
0,0,2020-01-02,美股：美中協議月中簽署，四大指數週二齊漲約0.3%封關，費半全年狂升逾六成,美股週二開盤延續前一天拉回格局，主要指數以平低盤開出，，盤中美國總統川普發文宣稱1月15日會...,"['美股', '週二', '開盤', '延續', '前', '一', '天', '拉回', ..."
1,1,2020-01-02,《熱門族群》5G新機連發，帶旺璟德、聯發科,美國消費性電子展（CES）將於元月7日∼10日接續舉辦，法人預期5G仍將是本次展期焦點，加上...,"['美國', '消費性', '電子展', '將', '於', '元月', '日日', '接續..."
2,2,2020-01-02,《熱門族群》台積電走高，半導體設備股跟漲,台積電 (2330) 股價今早開高走高，帶領台積電供應鏈及半導體設備股再度轉強。隨著美中貿易...,"['台積電', '股價', '今', '早', '開', '高', '走高', '帶領', ..."
3,3,2020-01-02,台灣科技能打趴歐洲大廠？專家分析「還太早」,近年來台灣不論科技、經濟等發展，都大幅的成長，也逐漸讓國際看到台灣。不過在科技的成長方面，就...,"['近年', '來', '台灣', '不論', '科技', '經濟', '等', '發展',..."
4,4,2020-01-02,京元電Q1營收估創同期高；全年看增雙位數,MoneyDJ新聞 2020-01-02 14:23:09 記者 王怡茹 報導<BR>京元電...,"['新聞', '記者', '王怡茹', '報導', '京元電子', '年月', '營收', ..."


In [109]:
# 幫 news 標 label（用上面的日期去對照哪一天的 news 是漲/跌）
label_news = []
for i in range(len(df_news)):
    for j in range(len(df_stock)):
        if df_news['post_time'][i] == df_stock['年月日'][j]:
            label_news.append(df_stock['label'][j])
    if len(label_news) == i:
        label_news.append(0) # 日期沒有對到的情況（e.g. 週末） 

df_news['label'] = label_news
df_news.head(10)
df_news.tail(10)

  if df_news['post_time'][i] == df_stock['年月日'][j]:


Unnamed: 0.1,Unnamed: 0,post_time,title,content,token,label
7442,7442,2022-12-27,宏觀明年毛利率有機會向上，持審慎樂觀看待,財訊快報宏觀明年毛利率有機會向上，持審慎樂觀看待閱讀全文 2022年12月27日 下午3:5...,"['財訊', '快報', '宏觀', '明年', '毛利率', '有', '機會', '向'...",跌
7443,7443,2022-12-27,駁斥台積電赴美投資是去台化 蔡總統：門兒都沒有,中央廣播電台駁斥台積電赴美投資是去台化 蔡總統：門兒都沒有閱讀全文 劉玉秋採訪2022年12...,"['中央廣播電台', '駁斥', '台積電', '赴', '美', '投資', '是', '...",跌
7444,7444,2022-12-28,蔡總統首度公開肯定：賴清德是最佳總統接班人選,工商時報蔡總統首度公開肯定：賴清德是最佳總統接班人選閱讀全文 彭媁琳／台北報導2022年12...,"['工商時報', '蔡', '總統', '首度', '公開', '肯定', '賴清德', '...",跌
7445,7445,2022-12-28,台積電股價沒有一家猜對！謝金河曝半導體變慘業：存在1危險性,中時財經即時台積電股價沒有一家猜對！謝金河曝半導體變慘業：存在1危險性閱讀全文 吳美觀202...,"['中時', '財經', '即時', '台積電', '股價', '沒有', '一', '家'...",跌
7446,7446,2022-12-28,《美股》公債殖利率攀高 那斯達克指數下跌1.38%,Moneydj理財網《美股》公債殖利率攀高 那斯達克指數下跌1.38%閱讀全文 2022年1...,"['理財', '網美股', '公債', '殖利率', '攀高', '那斯達克', '指數',...",跌
7447,7447,2022-12-28,道瓊聖誕假期後小漲37點，費半、那指各挫1.8%和1.4%，特斯拉再瀉11%,財訊快報道瓊聖誕假期後小漲37點，費半、那指各挫1.8%和1.4%，特斯拉再瀉11%閱讀全文...,"['財訊', '快報', '道瓊', '聖誕', '假期', '後', '小漲', '點',...",跌
7448,7448,2022-12-28,《美股掃瞄》債息急升打壓成長股 標普收黑、那指費半大跌(2-2),時報資訊《美股掃瞄》債息急升打壓成長股 標普收黑、那指費半大跌(2-2)閱讀全文 2022年...,"['時報', '資訊', '美股', '掃瞄', '債息', '急升', '打壓', '成長...",跌
7449,7449,2022-12-28,《半導體》宏觀今年營收拚增2成 明年毛利率向上,時報資訊《半導體》宏觀今年營收拚增2成 明年毛利率向上閱讀全文 2022年12月28日 上午...,"['時報', '資訊', '半導體', '宏觀', '今年', '營收', '拚', '增成...",跌
7450,7450,2022-12-28,《半導體》聯發科毛利率跌破45% 外資曝時間點,時報資訊《半導體》聯發科毛利率跌破45% 外資曝時間點閱讀全文 2022年12月28日 上午...,"['時報', '資訊', '半導體', '聯發科', '毛利率', '跌破', '外資', ...",跌
7451,7451,2022-12-28,擔憂市占及毛利率下滑，美系外資調降聯發科目標價至649元,財訊快報擔憂市占及毛利率下滑，美系外資調降聯發科目標價至649元閱讀全文 2022年12月2...,"['財訊', '快報', '擔憂', '市占', '及', '毛利率', '下滑', '美系...",跌


In [110]:
# 刪掉 label 是 0 的資料
df_news_no_zero = df_news[df_news.label != 0]
df_news_no_zero = df_news_no_zero.reset_index(drop = True)

In [111]:
# token 型態轉換，fit in 套件
df_news_list = []
for i in range(len(df_news_no_zero)):
    df_news_list_tmp = []
    df_news_str = ''
    df_news_list_tmp = eval(df_news_no_zero.token[i])
    df_news_str = ' '.join(df_news_list_tmp)
    df_news_list.append(df_news_str)
df_news_list

['美股 週二 開盤 延續 前 一 天 拉回 格局 主要 指數 以 平 低盤 開出 盤 中 美國 總統 川普 發文 宣稱 月日會 簽署 首 階段 美中 貿易 協議 市場 吃 了 定心丸 後 再 上 四 大 指數 在 年 最後 一 個 交易日 尾盤 齊步 走 揚收 小紅封 關 唯一 的 遺憾 是 所有 指數 未 能 扳回 前 一 天 所有 失土 在 封關 的 當下 標普 和那 指 同步 締造 六 年 來 年度 最 大 漲幅 紀錄 分析 人士 認為 美股 年 的 升勢 主要 源起 聯準會 三 度 降息 在 此 之前 在 年 先後 加 息 次 聯邦 基金 目標 利率 最 高 見至川普 週二 透過 推特 發文 透露 將 於 月日 在 白宮 與 中方 高級別 代表 舉行 首 階段 貿易 協議 簽署 儀式 爾後 他 會 親身 前往 北京 就 第二 階段 協議 展開 談判 惟 未 有 交代 時間 這 項 利好 因 美國 駐 巴格達 大使館 外 發生 暴力 抗議 的 消息 而 淡化 紐約 股市 年終 收盤 道瓊 工業 指數 上漲點 或 以 點 作收 當天 思科 和 蘋果 三 檔 最 旺 依序 漲 和 蘋果 收 在 美元 是 陣營 收 唯一 在 封關日 改寫 新猷 的 個股 標普 指數 收 漲點 或 來到 點 那 指 漲點 或 收 在 點 以 晶片 為主 的 費城 半導體 指數 小漲點 或 報點 成份股 中英偉達 和 美光 各 漲 和 最 旺台 積電 漲 至 美元 也 在 前 十 強 之 列 凌雲 邏輯 跌 是 最 大 魯蛇 也 唯一 跌幅 逾 的 成份股 四 大 股指 月 月線 第四 季 和 年 均 大幅 上漲 就 月線 而言 道瓊 漲 標普 指數 急漲 為 九 年 來 最 旺 的 月 那 指彈 升費半 大漲 為 月 以來 最 旺年 第四季 道瓊 漲 標普 指數 勁揚 那 指 猛漲 費半 飆升 為 年 第三季 強彈 以來 最 旺 累計 年 全 年 道瓊 漲幅 為 兩 年 來 最 旺 標普 指數 漲 那 指漲 皆 是 年來 年度 最 旺費半狂升 則 是 年暴衝 以來 的 十 年 最 旺 統計 在 過去年 道瓊 指數 累計 大漲 標普 指數 急漲 那 指 強彈 費半 更 飆出 的 漲幅 年 全年 計算 蘋果 飆升 微軟 急彈 是 道瓊 且 是 美股 兩 大 攻頂 火車頭 週二 成交量 

In [147]:
X_data = df_news_list
Y_data = df_news_no_zero['label']

In [148]:
# 隨機切分資料
# text_size = 0.2 --> train 80%、test 20%
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size = 0.2, random_state = 2, stratify = Y_data)
print("train data length:", len(X_train), ",", len(Y_train))
print("test data length:",len(X_test), ",", len(Y_test))

train data length: 5190 , 5190
test data length: 1298 , 1298


In [149]:
#將所有資料向量化，轉成 tfidf vector
vectorizer = TfidfVectorizer(stop_words=stopwords)
X_train = vectorizer.fit_transform(X_train)
X_train = pd.DataFrame(X_train.toarray(), columns = vectorizer.get_feature_names_out())
display(X_train)



Unnamed: 0,一一一億,一一七五,一一七八,一一七六億,一一九九七,一一五,一一兆,一一八九點九四,一一八五,一一六,...,龐培歐,龐大,龐雜,龔培元,龔明鑫,龜尾,龜山,龜山廠,龜牙,龜速
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.081743,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [150]:
chi2_selector = SelectKBest(chi2, k = 10000)
chi2_selector.fit(X_train, Y_train)
kbest_vocabs = X_train.columns[chi2_selector.get_support()]
X_train = X_train[kbest_vocabs]
X_train

Unnamed: 0,一三,一些些,一兩,一出,一劑,一動,一哥三星,一城,一堂,一家,...,龍潭廠,龍科,龍頭公司,龍頭市,龍頭特斯拉,龍頭聯電,龍頭股,龍頭輝,龐佩奧,龔明鑫
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.053507,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
5186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
5187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
5188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [151]:
# 計算 test 個別的 tfidf，再將結果透過 df.reindex 這個方法映射到訓練集的向量空間中
X_test = vectorizer.fit_transform(X_test)
X_test = pd.DataFrame(X_test.toarray(), columns=vectorizer.get_feature_names_out())
X_test = X_test.reindex(kbest_vocabs, axis=1, fill_value=0)
X_test

Unnamed: 0,一三,一些些,一兩,一出,一劑,一動,一哥三星,一城,一堂,一家,...,龍潭廠,龍科,龍頭公司,龍頭市,龍頭特斯拉,龍頭聯電,龍頭股,龍頭輝,龐佩奧,龔明鑫
0,0,0,0.0,0.0,0,0.0,0.0,0.0,0,0,...,0.0,0,0,0,0.0,0,0.0,0,0,0.0
1,0,0,0.0,0.0,0,0.0,0.0,0.0,0,0,...,0.0,0,0,0,0.0,0,0.0,0,0,0.0
2,0,0,0.0,0.0,0,0.0,0.0,0.0,0,0,...,0.0,0,0,0,0.0,0,0.0,0,0,0.0
3,0,0,0.0,0.0,0,0.0,0.0,0.0,0,0,...,0.0,0,0,0,0.0,0,0.0,0,0,0.0
4,0,0,0.0,0.0,0,0.0,0.0,0.0,0,0,...,0.0,0,0,0,0.0,0,0.0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1293,0,0,0.0,0.0,0,0.0,0.0,0.0,0,0,...,0.0,0,0,0,0.0,0,0.0,0,0,0.0
1294,0,0,0.0,0.0,0,0.0,0.0,0.0,0,0,...,0.0,0,0,0,0.0,0,0.0,0,0,0.0
1295,0,0,0.0,0.0,0,0.0,0.0,0.0,0,0,...,0.0,0,0,0,0.0,0,0.0,0,0,0.0
1296,0,0,0.0,0.0,0,0.0,0.0,0.0,0,0,...,0.0,0,0,0,0.0,0,0.0,0,0,0.0


# Requirement 2

### 預測模型

In [152]:
from sklearn.ensemble import GradientBoostingClassifier

In [154]:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=7, max_features='sqrt', random_state=0)
clf.fit(X_train, Y_train)
clf.score(X_train, Y_train)

0.9724470134874759

In [155]:
test_label = Y_test

test_data = X_test
predict_label = clf.predict(X_test)
# predict_label = pd.merge(
#     ase[ase['年月日-1'].between(test_startDate, test_endDate)], 
#     test_data.groupby(['post_time', 'predict_label']).count().sort_values('label', ascending = False).sort_index(level=[0], sort_remaining=False).groupby(level=0).head(1).reset_index(), 
#     left_on='年月日-1', right_on='post_time', how='left').fillna(method='ffill').fillna(method='bfill')['predict_label']

from sklearn.metrics import accuracy_score
print('預測準確率:', accuracy_score(test_label, predict_label))

預測準確率: 0.5562403697996918


In [156]:
#建立預測模型（NB）
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
NB_model = BernoulliNB()

In [157]:
NB_model.fit(X_train,Y_train)

In [158]:
predicted_results = []
expected_results = []
expected_results.extend(Y_test)
predicted_results.extend(NB_model.predict(X_test))
print(predicted_results)

['漲', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '跌', '跌', '跌', '漲', '漲', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '漲', '跌', '漲', '漲', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '漲', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '漲', '漲', '跌', '漲', '跌', '漲', '跌', '跌', '漲', '漲', '漲', '漲', '跌', '跌', '漲', '漲', '漲', '跌', '跌', '漲', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '漲', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '漲', '跌', '漲', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '跌', '漲', '漲', '漲', '跌', '漲', '跌', '漲', '漲', '跌', '漲', '漲', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '漲', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '跌',

In [159]:
report = metrics.classification_report(expected_results, predicted_results)
print(report)

              precision    recall  f1-score   support

           漲       0.57      0.51      0.54       618
           跌       0.59      0.65      0.62       680

    accuracy                           0.58      1298
   macro avg       0.58      0.58      0.58      1298
weighted avg       0.58      0.58      0.58      1298



In [160]:
#建立預測模型（SVM Linear）
from sklearn.svm import SVC
SVM_model = SVC(kernel = 'linear', C = 50.0, probability = True)
SVM_model.fit(X_train, Y_train)

In [161]:
predicted_results = []
expected_results = []
expected_results.extend(Y_test)
predicted_results.extend(SVM_model.predict(X_test))
print(predicted_results)

['跌', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '漲', '跌', '跌', '漲', '漲', '漲', '漲', '跌', '漲', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '漲', '跌', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '漲', '跌', '漲', '跌', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '漲', '漲', '跌', '跌', '漲', '跌', '漲', '跌', '漲', '漲', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '漲', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '漲', '漲', '漲', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '漲', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '漲', '漲', '漲', '漲', '跌', '跌', '跌', '漲', '漲', '漲', '漲', '漲', '跌', '漲', '跌', '漲', '漲', '漲', '漲', '漲', '跌',

In [162]:
report = metrics.classification_report(expected_results, predicted_results)
print(report)

              precision    recall  f1-score   support

           漲       0.56      0.54      0.55       618
           跌       0.60      0.62      0.61       680

    accuracy                           0.58      1298
   macro avg       0.58      0.58      0.58      1298
weighted avg       0.58      0.58      0.58      1298



In [163]:
#建立預測模型（SVM Rbf）
SVM_model = SVC(kernel = 'rbf', C = 1.0, probability=True)
SVM_model.fit(X_train,Y_train)

In [164]:
predicted_results = []
expected_results = []
expected_results.extend(Y_test)
predicted_results.extend(SVM_model.predict(X_test))
print(predicted_results)

['漲', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '漲', '漲', '漲', '跌', '跌', '漲', '跌', '漲', '漲', '漲', '跌', '跌', '漲', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '漲', '漲', '漲', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '漲', '漲', '跌', '跌', '漲', '跌', '漲', '漲', '漲', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '漲', '漲', '跌', '跌', '跌', '漲', '跌', '漲', '漲', '漲', '漲', '跌', '漲', '跌', '漲', '跌', '漲', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '漲', '漲', '跌', '跌', '跌',

In [165]:
report = metrics.classification_report(expected_results, predicted_results)
print(report)

              precision    recall  f1-score   support

           漲       0.57      0.51      0.54       618
           跌       0.59      0.65      0.62       680

    accuracy                           0.58      1298
   macro avg       0.58      0.58      0.58      1298
weighted avg       0.58      0.58      0.58      1298



#### example 6 

In [166]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB() # naive bayes classifier

In [167]:
# 以下將自身資料切成 train 及 test 兩組，重新訓練一次，測試模型準確率
classifier = MultinomialNB()
classifier.fit(X_train, Y_train) #訓練

In [168]:
from sklearn import metrics 
from sklearn.metrics import accuracy_score
Y_pred = classifier.predict(X_test) #用測試資料預測
print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred)) #比對答案，計算準確率

Accuracy: 0.5624036979969184


In [169]:
from sklearn.metrics import classification_report  
print(classification_report(Y_test, Y_pred)) #印出分類報告

              precision    recall  f1-score   support

           漲       0.59      0.28      0.37       618
           跌       0.56      0.82      0.66       680

    accuracy                           0.56      1298
   macro avg       0.57      0.55      0.52      1298
weighted avg       0.57      0.56      0.53      1298



In [170]:
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(Y_test, Y_pred, labels = ['漲', '跌'])) # 印出混淆矩陣
# test/predicted 看漲   看跌   
#    看漲         TN     FP
#    看跌         FN     TP

[[170 448]
 [120 560]]


In [171]:
Y_test

4921    跌
3857    跌
2735    跌
4731    跌
4378    跌
       ..
1389    跌
2861    漲
4593    漲
693     漲
5289    跌
Name: label, Length: 1298, dtype: object

#### Example 7

In [172]:
X_data = df_news_list

In [173]:
# 將所有資料(X_data)向量化，轉成 tfidf vector
vectorizer = TfidfVectorizer(stop_words=stopwords)
X_data = vectorizer.fit_transform(X_data)
X_data = pd.DataFrame(X_data.toarray(), columns = vectorizer.get_feature_names_out())

chi2_selector = SelectKBest(chi2, k = 5000)
chi2_selector.fit(X_data, Y_data)
kbest_vocabs = X_data.columns[chi2_selector.get_support()]
X_data = X_data[kbest_vocabs]
X_data

display(X_data)



Unnamed: 0,一三,一併,一兩,一哥三星,一城,一擊,一早,一流,一站式,一聯發科,...,齊賣,齊跌,龍潭,龍潭廠,龍頭公司,龍頭特斯拉,龍頭聯電,龍頭股,龐佩奧,龔明鑫
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.086038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6484,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [174]:
# DecisionTree
from sklearn.tree import DecisionTreeClassifier                          
classifier = DecisionTreeClassifier(criterion = "entropy")

from sklearn import metrics 
from sklearn.metrics import accuracy_score
classifier.fit(X_train, Y_train) # 訓練
Y_pred = classifier.predict(X_test) # 用測試資料預測
print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred)) # 比對答案，計算準確率

Accuracy: 0.5416024653312789


In [175]:
from sklearn.metrics import classification_report  
print(classification_report(Y_test, Y_pred)) # 印出分類報告

              precision    recall  f1-score   support

           漲       0.52      0.53      0.52       618
           跌       0.56      0.56      0.56       680

    accuracy                           0.54      1298
   macro avg       0.54      0.54      0.54      1298
weighted avg       0.54      0.54      0.54      1298



In [None]:
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(Y_test, Y_pred, labels = ['漲', '跌'])) #印出混淆矩陣
# test/predicted 看漲   看跌
#    看漲         TN     FP
#    看跌         FN     TP

[[314 304]
 [300 380]]


In [None]:
Y_data

0       漲
1       漲
2       漲
3       漲
4       漲
       ..
6483    跌
6484    跌
6485    跌
6486    跌
6487    跌
Name: label, Length: 6488, dtype: object

In [None]:
# KNeighbors
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 7)

from sklearn.model_selection import cross_val_score
scores = cross_val_score(classifier, X_data, Y_data, cv = 5, scoring = 'accuracy') # 交叉驗證，計算準確率
print(scores)
print("Avg. Accuracy:", scores.mean())

[0.52157165 0.54083205 0.50770416 0.53199692 0.51734773]
Avg. Accuracy: 0.5238904999447581


In [None]:
classifier.fit(X_train, Y_train) # 訓練
Y_pred = classifier.predict(X_test) # 用測試資料預測
print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred)) # 比對答案，計算準確率

Accuracy: 0.5446841294298922


In [None]:
print(classification_report(Y_test, Y_pred)) # 印出分類報告

              precision    recall  f1-score   support

           漲       0.53      0.39      0.45       618
           跌       0.55      0.69      0.61       680

    accuracy                           0.54      1298
   macro avg       0.54      0.54      0.53      1298
weighted avg       0.54      0.54      0.53      1298



In [None]:
print(confusion_matrix(Y_test, Y_pred, labels = ['漲', '跌'])) # 印出混淆矩陣
# test/predicted 看漲   看跌
#    看漲         TN     FP
#    看跌         FN     TP

[[239 379]
 [212 468]]


In [None]:
# SVC
from sklearn.svm import SVC
classifier = SVC(kernel='linear')

classifier.fit(X_train, Y_train) # 訓練
Y_pred = classifier.predict(X_test) # 用測試資料預測
print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred)) # 比對答案，計算準確率

KeyboardInterrupt: 

In [None]:
print(classification_report(Y_test, Y_pred)) # 印出分類報告

In [None]:
print(confusion_matrix(Y_test, Y_pred, labels = ['漲', '跌'])) # 印出混淆矩陣
# test/predicted 看漲   看跌
#    看漲         TN     FP
#    看跌         FN     TP