In [53]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [54]:
# impoort 斷詞（要放 topwords_zh.txt 進資料夾）
with open('stopwords_zh.txt', 'r') as file:
    stopwords = file.read().splitlines() 
file.close()

In [55]:
# import stock excel
# 只留台積電的日期跟收盤價
# 按照日期由最早往最晚排
df_stock = pd.DataFrame()
for i in range(2021, 2024):
    x = pd.read_excel("../../bda2023_mid_dataset/stock_data_2019-2023.xlsx", sheet_name=f'上市{i}', usecols=['證券代碼', '年月日', '收盤價(元)'])
    x = x[x['證券代碼'] == '2330 台積電']
    x = x.loc[::-1]
    df_stock = pd.concat([df_stock, x])

In [56]:
df_stock = df_stock.reset_index(drop = True)
df_stock

Unnamed: 0,證券代碼,年月日,收盤價(元)
0,2330 台積電,2021/01/04,526.7744
1,2330 台積電,2021/01/05,532.6711
2,2330 台積電,2021/01/06,539.5506
3,2330 台積電,2021/01/07,555.2752
4,2330 台積電,2021/01/08,570.0170
...,...,...,...
534,2330 台積電,2023/03/20,512.0000
535,2330 台積電,2023/03/21,517.0000
536,2330 台積電,2023/03/22,533.0000
537,2330 台積電,2023/03/23,538.0000


# Requirement 1

In [57]:
# Requirement 1 的第二點

day_n = 3 # 以 day_n 天後來看是漲還是跌
sigma = 0.01 # 以 sigma 決定漲（跌）幅超過幾 % 是漲（跌）
label = []
for i in range(len(df_stock) - day_n):
    rate = (df_stock['收盤價(元)'][i + day_n] - df_stock['收盤價(元)'][i]) / df_stock['收盤價(元)'][i]
    if rate > sigma:
       label.append('漲')
    elif rate < -sigma:
        label.append('跌')
    else:
        label.append('持平')
for i in range(day_n):
    label.append(0)
# label.extend([0, 0, 0])

df_stock['label'] = label
df_stock.tail(10)

Unnamed: 0,證券代碼,年月日,收盤價(元),label
529,2330 台積電,2023/03/13,513.2229,跌
530,2330 台積電,2023/03/14,507.2552,漲
531,2330 台積電,2023/03/15,508.2498,持平
532,2330 台積電,2023/03/16,505.0,漲
533,2330 台積電,2023/03/17,518.0,漲
534,2330 台積電,2023/03/20,512.0,漲
535,2330 台積電,2023/03/21,517.0,漲
536,2330 台積電,2023/03/22,533.0,0
537,2330 台積電,2023/03/23,538.0,0
538,2330 台積電,2023/03/24,539.0,0


In [58]:
# 更改年月日的日期格式
df_stock['年月日'] = pd.to_datetime(df_stock['年月日']).dt.date
df_stock.tail(10)

Unnamed: 0,證券代碼,年月日,收盤價(元),label
529,2330 台積電,2023-03-13,513.2229,跌
530,2330 台積電,2023-03-14,507.2552,漲
531,2330 台積電,2023-03-15,508.2498,持平
532,2330 台積電,2023-03-16,505.0,漲
533,2330 台積電,2023-03-17,518.0,漲
534,2330 台積電,2023-03-20,512.0,漲
535,2330 台積電,2023-03-21,517.0,漲
536,2330 台積電,2023-03-22,533.0,0
537,2330 台積電,2023-03-23,538.0,0
538,2330 台積電,2023-03-24,539.0,0


In [59]:
upCnt = 0
downCnt = 0
flatCnt = 0
for i in range(len(df_stock)):
    if df_stock['label'][i] == '漲':
        upCnt += 1
    elif df_stock['label'][i] == '跌':
        downCnt += 1 
    else :
        flatCnt += 1
print('預估漲的天數：', upCnt)
print('預估跌的天數：', downCnt)
print('預估持平的天數：', flatCnt)

預估漲的天數： 180
預估跌的天數： 200
預估持平的天數： 159


In [60]:
# 丟掉持平的資料
df_stock = df_stock[df_stock.label != '持平']
df_stock = df_stock[df_stock.label != '0']
df_stock = df_stock.reset_index(drop = True)
df_stock

Unnamed: 0,證券代碼,年月日,收盤價(元),label
0,2330 台積電,2021-01-04,526.7744,漲
1,2330 台積電,2021-01-05,532.6711,漲
2,2330 台積電,2021-01-06,539.5506,漲
3,2330 台積電,2021-01-07,555.2752,漲
4,2330 台積電,2021-01-08,570.0170,漲
...,...,...,...,...
378,2330 台積電,2023-03-20,512.0000,漲
379,2330 台積電,2023-03-21,517.0000,漲
380,2330 台積電,2023-03-22,533.0000,0
381,2330 台積電,2023-03-23,538.0000,0


In [61]:
# import news excel 
df_news = pd.read_excel("../../TSMC_news_21-23.xlsx")
df_news.head()

Unnamed: 0.1,Unnamed: 0,post_time,title,content,token
0,0,2021-01-01,明年買哪幾檔晶片股?分析師首選 Nvidia 和 AMD,週四 (31 日) 華爾街分析師表示，Nvidia 和 AMD 本月漲勢落後於其他半導體類股...,"['週四', '日', '華爾街', '分析師', '表示', '和', '本', '月',..."
1,1,2021-01-01,《大陸產業》中芯成熟製程 獲美放行,【時報-台北電】遭美制裁的大陸晶圓代工龍頭中芯國際迎來曙光。業界傳出，中芯國際成熟製程獲得美...,"['時報', '台北電', '遭', '美', '制裁', '的', '大陸', '晶圓',..."
2,2,2021-01-04,《各報要聞》台積今年資本支出上看200億美元,【時報-台北電】晶圓代工龍頭台積電2020年繳出亮麗成績單，預期全年美元營收年成長率逾三成並...,"['時報', '台北', '電', '晶圓', '代工', '龍頭', '台積電', '年'..."
3,3,2021-01-04,《半導體》台積電等供應鏈力挺 聯發科Q1拚淡季不淡,【時報記者王逸芯台北報導】聯發科(2454)在去年第三季一舉超車高通，在行動晶片市占率衝上3...,"['時報', '記者', '王逸芯', '台北', '報導', '聯發科', '在', '去..."
4,4,2021-01-04,《半導體》2021年拚翻身 創意鎖漲停,【時報記者王逸芯台北報導】創意(3443)營運最壞時期已經過去，去年第四季在NRE案認列入帳...,"['時報', '記者', '王逸芯', '台北', '報導', '創意', '營運', '最..."


In [62]:
# 幫 news 標 label（用上面的日期去對照哪一天的 news 是漲/跌）
label_news = []
for i in range(len(df_news)):
    for j in range(len(df_stock)):
        if df_news['post_time'][i] == df_stock['年月日'][j]:
            label_news.append(df_stock['label'][j])
    if len(label_news) == i:
        label_news.append(0) # 日期沒有對到的情況（e.g. 週末） 

df_news['label'] = label_news
df_news.head(10)
df_news.tail(10)

  if df_news['post_time'][i] == df_stock['年月日'][j]:


Unnamed: 0.1,Unnamed: 0,post_time,title,content,token,label
6822,6822,2023-03-20,算力大戰 台積日月光受惠,隨著微軟轉投資OpenAI推出的聊天機器人ChatGPT全球爆紅，包括Google、阿里巴巴...,"['隨著', '微軟', '轉', '投資', '推出', '的', '聊天', '機器人'...",漲
6823,6823,2023-03-20,銀行股再見殺氣，美股四巫日齊黑，道瓊急跌385點，微軟逆勢漲約1.2%,【財訊快報／陳孟朔】被華府接管的矽谷銀行(SVB)，其母公司SVB金融集團申請破產保護的同時...,"['財訊', '快報', '陳孟朔', '被', '華府', '接管', '的', '矽谷銀...",漲
6824,6824,2023-03-20,《熱門族群》外資釋出AI口袋名單股 台積電等3檔入列,【時報記者王逸芯台北報導】ChatGPT颳起AI旋風，美系外資針對AI產業也出具最新研究報告...,"['時報', '記者', '王逸芯', '台北', '報導', '颳起', '旋風', '美...",漲
6825,6825,2023-03-20,台積電蟬聯百大創新企業，專利與營業秘密雙軌保護成果，研發營收比8%,【財訊快報／記者李純君報導】台積電(2330)今年繼續蟬聯百大創新企業，其副法務長陳碧莉提到...,"['財訊快報', '記者', '李純君', '報導', '台積電', '今年', '繼續',...",漲
6826,6826,2023-03-20,台積電／台積電「創新」再獲獎 副法務長陳碧莉：去年在台、美專利獲准百發百中,台積電（2330）再次入選「2023全球百大創新機構獎」，出席領獎的台積電副法務漲陳碧莉表示...,"['台積電', '再次', '入選', '全球', '百', '大', '創新', '機構獎...",漲
6827,6827,2023-03-21,晶片業起死回生？日媒揭實際庫存真相：反轉訊號來了,半導體景氣反轉向下，從「供不應求」轉變成「供應過剩」，市場關注產業景氣何時才會翻轉向上，日媒...,"['半導體', '景氣', '反轉', '向', '下', '從', '供', '不', '...",漲
6828,6828,2023-03-21,《盤前掃瞄-基本面》高通加速轉單台灣；國發會估景氣4月探底,【時報-台北電】基本面：1.前一交易日新台幣以30.593元兌一美元收市，貶值3.9分，成交...,"['時報', '台北', '電', '基本面', '前', '一', '交易日', '新台幣...",漲
6829,6829,2023-03-21,《各報要聞》高通去中化 加速轉單台灣,【時報-台北電】為了因應半導體市場出現美國陣營及中國陣營的兩極化地緣政治風險，手機晶片大廠高...,"['時報', '台北電', '為了', '因應', '半導體', '市場', '出現', '...",漲
6830,6830,2023-03-21,《科技》台積電兩招…擴大專利版圖,【時報-台北電】晶圓代工龍頭台積電20日獲頒2023年科睿唯安全球百大創新機構獎，副法務長陳...,"['時報', '台北', '電晶圓', '代工', '龍頭', '台積電', '日', '獲...",漲
6831,6831,2023-03-21,《熱門族群》高通轉單喜訊 精測笑開懷、這兩檔卻擺臭臉,【時報-台北電】台股今日隨美股彈升，在金融股回神、AI等具題材族群續強下，大盤指數盤中彈升約...,"['時報', '台北電', '台股', '今日', '隨', '美股', '彈升', '在'...",漲


In [63]:
# 刪掉 label 是 0 的資料
df_news_no_zero = df_news[df_news.label != 0]
df_news_no_zero = df_news_no_zero.reset_index(drop = True)

In [64]:
# token 型態轉換，fit in 套件
df_news_list = []
for i in range(len(df_news_no_zero)):
    df_news_list_tmp = []
    df_news_str = ''
    df_news_list_tmp = eval(df_news_no_zero.token[i])
    df_news_str = ' '.join(df_news_list_tmp)
    df_news_list.append(df_news_str)
df_news_list

['時報 台北 電 晶圓 代工 龍頭 台積電 年 繳出 亮麗 成績單 預期 全 年 美元 營收 年 成長率 逾 三成 並 創下 歷史 新高年 雖然 仍 有 新冠 肺炎 疫情 蔓延 地緣 政治 及 貿易戰 等 外在 環境 變數 但 晶圓 代工 訂單 強勁 第二季 前 產能 利用率 維持 滿載 設備 業者 及 市場 法人 看好 台積電 年 奈米 產能 建置 及 奈米 技術 研發 加速 進行 全 年 資本 支出 將 上看 億 美元 再 創 新高 台積電 年 資本 支出 估 達 億 美元 創下 新高 無虞 但 奈米 及 奈米 等 先 進 製程 奈米 以上 成熟 製程 等 產能 仍 供 不 應 求 台積電 年 持續 擴增 產能 以 因應 強勁 需求 包括 廠 第三 期 奈米 產能 將 在 第一季 開出 廠 奈米 產能 建置 正 加速 進行 同時 提升 奈米 閘極 全 球 場效電 晶體 技術 研發 速度 等 設備 業者 指出 受惠 於 及 人工 智慧 及 高效能 運算車 用 及 物聯網 晶片 等 晶圓 代工 需求 強勁 台積電 看 好 年 產能 持續 供 不 應求 奈米 及 奈米 接單 幾乎 全 滿成熟 製程 產能 全 線 吃緊 加上 奈米 產能 建置 及 美國 奈米 新廠 動工 將 同步 展開 持續 增加 極 紫外光 曝光機 及 相關 設備 採購 規模 預期年 資本 支出 將 上看 億 美元 連續 三年 創下 歷史 新高 以 年 新廠 建置 進度 來 看 台積電 針對 奈米 製程 打造 的 廠 共 三 期 工程 已 動工 美國 奈米 建廠 即將 展開 南科 廠會 再 興建 廠 做為 特殊 製程 晶圓廠 並 在 同 一 廠 區 內 興建 先進 封裝 技術 生產 基地 再者 竹科 廠區 將 興建 擁有 兩 座 研發晶圓廠 的 研發中心 其中 研發 晶圓廠 預 計年 完工 做為 奈米 及 更 先進 製程 的 研發 基地 竹南 先進 封裝廠 則 如期 進行 台積電 奈米 將 於 年 第二季 進入 量產 及 應用 會 是 主要 產品線 根據 台積電 規畫 奈米 晶圓廠 廠 房 基地 面積 約 為 公頃 潔淨室 面積 將 超過 萬 平方 公尺 大約 是 座 標準 足球場 大小 奈米 進入 量產 時 當年 產能 預估 將 超過 每 年 萬 片 吋 晶圓 業界 預期 台積電 會 是 全球 第

In [65]:
X_data = df_news_list
Y_data = df_news_no_zero['label']

In [66]:
# 隨機切分資料
# text_size = 0.2 --> train 80%、test 20%
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size = 0.2, random_state = 2, stratify = Y_data)
print("train data length:", len(X_train), ",", len(Y_train))
print("test data length:",len(X_test), ",", len(Y_test))

train data length: 3410 , 3410
test data length: 853 , 853


In [67]:
#將所有資料向量化，轉成 tfidf vector
vectorizer = TfidfVectorizer(stop_words=stopwords)
X_train = vectorizer.fit_transform(X_train)
X_train = pd.DataFrame(X_train.toarray(), columns = vectorizer.get_feature_names_out())
display(X_train)



Unnamed: 0,一一一億,一一七五,一一七六億,一一九九七,一一五,一一八五,一一六,一一六億,一一六八三兆,一一出爐,...,龍頭輝,龍鳳配,龍鷹,龐大,龐雜,龔明鑫,龔明鑫日,龜尾,龜山,龜山廠
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
chi2_selector = SelectKBest(chi2, k = 5000)
chi2_selector.fit(X_train, Y_train)
kbest_vocabs = X_train.columns[chi2_selector.get_support()]
X_train = X_train[kbest_vocabs]
X_train

Unnamed: 0,一三,一事,一兆,一城,一帶,一成,一百四,一線廠,一般板,一角,...,齊漲,齊發,齊締,齊聯發科,齊讚,齊賣超,齊黑特斯拉,龍潭,龍潭廠,龍燈
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:
# 計算 test 個別的 tfidf，再將結果透過 df.reindex 這個方法映射到訓練集的向量空間中
X_test = vectorizer.fit_transform(X_test)
X_test = pd.DataFrame(X_test.toarray(), columns=vectorizer.get_feature_names_out())
X_test = X_test.reindex(kbest_vocabs, axis=1, fill_value=0)
X_test

Unnamed: 0,一三,一事,一兆,一城,一帶,一成,一百四,一線廠,一般板,一角,...,齊漲,齊發,齊締,齊聯發科,齊讚,齊賣超,齊黑特斯拉,龍潭,龍潭廠,龍燈
0,0,0.0,0.0,0.00000,0,0.0,0,0,0,0.0,...,0.0,0.000000,0,0,0,0,0,0.0,0.0,0
1,0,0.0,0.0,0.00000,0,0.0,0,0,0,0.0,...,0.0,0.000000,0,0,0,0,0,0.0,0.0,0
2,0,0.0,0.0,0.05297,0,0.0,0,0,0,0.0,...,0.0,0.054377,0,0,0,0,0,0.0,0.0,0
3,0,0.0,0.0,0.00000,0,0.0,0,0,0,0.0,...,0.0,0.000000,0,0,0,0,0,0.0,0.0,0
4,0,0.0,0.0,0.00000,0,0.0,0,0,0,0.0,...,0.0,0.000000,0,0,0,0,0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
848,0,0.0,0.0,0.00000,0,0.0,0,0,0,0.0,...,0.0,0.000000,0,0,0,0,0,0.0,0.0,0
849,0,0.0,0.0,0.00000,0,0.0,0,0,0,0.0,...,0.0,0.000000,0,0,0,0,0,0.0,0.0,0
850,0,0.0,0.0,0.00000,0,0.0,0,0,0,0.0,...,0.0,0.000000,0,0,0,0,0,0.0,0.0,0
851,0,0.0,0.0,0.00000,0,0.0,0,0,0,0.0,...,0.0,0.000000,0,0,0,0,0,0.0,0.0,0


### 預測模型

In [70]:
from sklearn.ensemble import GradientBoostingClassifier

In [71]:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=7, random_state=0)
clf.fit(X_train, Y_train)
clf.score(X_train, Y_train)

1.0

In [72]:
test_label = Y_test

test_data = X_test
predict_label = clf.predict(X_test)
# predict_label = pd.merge(
#     ase[ase['年月日-1'].between(test_startDate, test_endDate)], 
#     test_data.groupby(['post_time', 'predict_label']).count().sort_values('label', ascending = False).sort_index(level=[0], sort_remaining=False).groupby(level=0).head(1).reset_index(), 
#     left_on='年月日-1', right_on='post_time', how='left').fillna(method='ffill').fillna(method='bfill')['predict_label']

from sklearn.metrics import accuracy_score
print('預測準確率:', accuracy_score(test_label, predict_label))

預測準確率: 0.6201641266119577


In [73]:
#建立預測模型（NB）
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
NB_model = BernoulliNB()

In [74]:
NB_model.fit(X_train,Y_train)

In [75]:
predicted_results = []
expected_results = []
expected_results.extend(Y_test)
predicted_results.extend(NB_model.predict(X_test))
print(predicted_results)

['跌', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '漲', '漲', '漲', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '跌', '漲', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '漲', '跌', '跌', '漲', '漲', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '漲', '跌', '跌', '漲', '跌', '漲', '跌', '漲', '跌', '漲', '漲', '漲', '跌', '漲', '漲', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '漲', '跌', '跌', '漲', '漲', '漲', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '漲', '跌', '漲', '漲', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '漲', '漲', '漲', '跌', '漲', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '漲', '漲', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌',

In [76]:
report = metrics.classification_report(expected_results, predicted_results)
print(report)

              precision    recall  f1-score   support

           漲       0.63      0.55      0.59       396
           跌       0.65      0.72      0.68       457

    accuracy                           0.64       853
   macro avg       0.64      0.63      0.63       853
weighted avg       0.64      0.64      0.64       853



In [77]:
#建立預測模型（SVM Linear）
from sklearn.svm import SVC
SVM_model = SVC(kernel = 'linear', C = 50.0, probability = True)
SVM_model.fit(X_train, Y_train)

In [78]:
predicted_results = []
expected_results = []
expected_results.extend(Y_test)
predicted_results.extend(SVM_model.predict(X_test))
print(predicted_results)

['跌', '漲', '漲', '跌', '跌', '跌', '漲', '漲', '漲', '漲', '漲', '漲', '漲', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '漲', '跌', '漲', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '漲', '漲', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '跌', '漲', '跌', '漲', '漲', '漲', '跌', '漲', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '跌', '漲', '漲', '漲', '漲', '漲', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '跌', '跌', '漲', '漲', '跌', '漲', '跌', '漲', '漲', '跌', '漲', '漲', '漲', '跌', '漲', '跌', '漲', '跌', '漲', '跌', '漲', '漲', '漲', '跌', '漲', '漲', '跌', '漲', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '跌', '跌', '漲', '漲', '跌', '漲', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '漲', '漲', '漲', '跌', '漲', '跌', '跌', '跌', '跌',

In [79]:
report = metrics.classification_report(expected_results, predicted_results)
print(report)

              precision    recall  f1-score   support

           漲       0.64      0.59      0.61       396
           跌       0.66      0.71      0.69       457

    accuracy                           0.65       853
   macro avg       0.65      0.65      0.65       853
weighted avg       0.65      0.65      0.65       853



In [80]:
#建立預測模型（SVM Rbf）
SVM_model = SVC(kernel = 'rbf', C = 1.0, probability=True)
SVM_model.fit(X_train,Y_train)

In [81]:
predicted_results = []
expected_results = []
expected_results.extend(Y_test)
predicted_results.extend(SVM_model.predict(X_test))
print(predicted_results)

['跌', '跌', '漲', '跌', '跌', '漲', '漲', '跌', '漲', '漲', '漲', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '漲', '漲', '跌', '漲', '漲', '漲', '漲', '漲', '漲', '跌', '跌', '漲', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '漲', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '漲', '漲', '漲', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '漲', '跌', '漲', '漲', '漲', '跌', '跌', '漲', '跌', '漲', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '漲', '跌', '跌', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌',

In [82]:
report = metrics.classification_report(expected_results, predicted_results)
print(report)

              precision    recall  f1-score   support

           漲       0.68      0.49      0.57       396
           跌       0.65      0.80      0.72       457

    accuracy                           0.66       853
   macro avg       0.67      0.65      0.64       853
weighted avg       0.66      0.66      0.65       853



#### example 6 

In [83]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB() # naive bayes classifier

In [84]:
# 以下將自身資料切成 train 及 test 兩組，重新訓練一次，測試模型準確率
classifier = MultinomialNB()
classifier.fit(X_train, Y_train) #訓練

In [85]:
from sklearn import metrics 
from sklearn.metrics import accuracy_score
Y_pred = classifier.predict(X_test) #用測試資料預測
print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred)) #比對答案，計算準確率

Accuracy: 0.6131301289566237


In [86]:
from sklearn.metrics import classification_report  
print(classification_report(Y_test, Y_pred)) #印出分類報告

              precision    recall  f1-score   support

           漲       0.72      0.28      0.40       396
           跌       0.59      0.91      0.72       457

    accuracy                           0.61       853
   macro avg       0.65      0.59      0.56       853
weighted avg       0.65      0.61      0.57       853



In [87]:
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(Y_test, Y_pred, labels = ['漲', '跌'])) # 印出混淆矩陣
# test/predicted 看漲   看跌   
#    看漲         TN     FP
#    看跌         FN     TP

[[109 287]
 [ 43 414]]


In [88]:
Y_test

3758    跌
2090    跌
3174    跌
1553    漲
1428    跌
       ..
2267    跌
3708    跌
2303    漲
3020    跌
2807    跌
Name: label, Length: 853, dtype: object

#### Example 7

In [113]:
X_data = df_news_list

In [114]:
# 將所有資料(X_data)向量化，轉成 tfidf vector
vectorizer = TfidfVectorizer(stop_words=stopwords)
X_data = vectorizer.fit_transform(X_data)
X_data = pd.DataFrame(X_data.toarray(), columns = vectorizer.get_feature_names_out())

chi2_selector = SelectKBest(chi2, k = 5000)
chi2_selector.fit(X_data, Y_data)
kbest_vocabs = X_data.columns[chi2_selector.get_support()]
X_data = X_data[kbest_vocabs]
X_data

display(X_data)



Unnamed: 0,一三,一事,一代,一兆,一城,一帶,一成,一現,一百四,一線廠,...,齊締,齊讚,齊賣超,齊高歌,龍城,龍潭,龍潭廠,龍燈,龐佩奧,龔明鑫
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.054647,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4258,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4259,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4260,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4261,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [115]:
# DecisionTree
from sklearn.tree import DecisionTreeClassifier                          
classifier = DecisionTreeClassifier(criterion = "entropy")

from sklearn import metrics 
from sklearn.metrics import accuracy_score
classifier.fit(X_train, Y_train) # 訓練
Y_pred = classifier.predict(X_test) # 用測試資料預測
print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred)) # 比對答案，計算準確率

Accuracy: 0.5943728018757327


In [116]:
from sklearn.metrics import classification_report  
print(classification_report(Y_test, Y_pred)) # 印出分類報告

              precision    recall  f1-score   support

           漲       0.56      0.56      0.56       396
           跌       0.62      0.62      0.62       457

    accuracy                           0.59       853
   macro avg       0.59      0.59      0.59       853
weighted avg       0.59      0.59      0.59       853



In [117]:
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(Y_test, Y_pred, labels = ['漲', '跌'])) #印出混淆矩陣
# test/predicted 看漲   看跌
#    看漲         TN     FP
#    看跌         FN     TP

[[222 174]
 [172 285]]


In [118]:
Y_data

0       漲
1       漲
2       漲
3       漲
4       漲
       ..
4258    漲
4259    漲
4260    漲
4261    漲
4262    漲
Name: label, Length: 4263, dtype: object

In [119]:
# KNeighbors
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 7)

from sklearn.model_selection import cross_val_score
scores = cross_val_score(classifier, X_data, Y_data, cv = 5, scoring = 'accuracy') # 交叉驗證，計算準確率
print(scores)
print("Avg. Accuracy:", scores.mean())

[0.52051583 0.56154748 0.56271981 0.54342723 0.52816901]
Avg. Accuracy: 0.5432758725074165


In [120]:
classifier.fit(X_train, Y_train) # 訓練
Y_pred = classifier.predict(X_test) # 用測試資料預測
print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred)) # 比對答案，計算準確率

Accuracy: 0.5732708089097304


In [121]:
print(classification_report(Y_test, Y_pred)) # 印出分類報告

              precision    recall  f1-score   support

           漲       0.63      0.20      0.30       396
           跌       0.56      0.90      0.69       457

    accuracy                           0.57       853
   macro avg       0.60      0.55      0.50       853
weighted avg       0.59      0.57      0.51       853



In [122]:
print(confusion_matrix(Y_test, Y_pred, labels = ['漲', '跌'])) # 印出混淆矩陣
# test/predicted 看漲   看跌
#    看漲         TN     FP
#    看跌         FN     TP

[[ 79 317]
 [ 47 410]]


In [123]:
# SVC
from sklearn.svm import SVC
classifier = SVC(kernel='linear')

classifier.fit(X_train, Y_train) # 訓練
Y_pred = classifier.predict(X_test) # 用測試資料預測
print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred)) # 比對答案，計算準確率

Accuracy: 0.64947245017585


In [124]:
print(classification_report(Y_test, Y_pred)) # 印出分類報告

              precision    recall  f1-score   support

           漲       0.66      0.50      0.57       396
           跌       0.64      0.78      0.70       457

    accuracy                           0.65       853
   macro avg       0.65      0.64      0.64       853
weighted avg       0.65      0.65      0.64       853



In [125]:
print(confusion_matrix(Y_test, Y_pred, labels = ['漲', '跌'])) # 印出混淆矩陣
# test/predicted 看漲   看跌
#    看漲         TN     FP
#    看跌         FN     TP

[[199 197]
 [102 355]]
