In [157]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [158]:
with open('stopwords_zh.txt', 'r') as file:
    stopwords = file.read().splitlines() 
file.close()

In [159]:
df_stock = pd.DataFrame()
for i in range(2021, 2024):
    x = pd.read_excel("../../bda2023_mid_dataset/stock_data_2019-2023.xlsx", sheet_name=f'上市{i}', usecols=['證券代碼', '年月日', '收盤價(元)'])
    x = x[x['證券代碼'] == '2330 台積電']
    x = x.loc[::-1]
    df_stock = pd.concat([df_stock, x])

In [160]:
df_stock = df_stock.reset_index(drop = True)
df_stock

Unnamed: 0,證券代碼,年月日,收盤價(元)
0,2330 台積電,2021/01/04,526.7744
1,2330 台積電,2021/01/05,532.6711
2,2330 台積電,2021/01/06,539.5506
3,2330 台積電,2021/01/07,555.2752
4,2330 台積電,2021/01/08,570.0170
...,...,...,...
534,2330 台積電,2023/03/20,512.0000
535,2330 台積電,2023/03/21,517.0000
536,2330 台積電,2023/03/22,533.0000
537,2330 台積電,2023/03/23,538.0000


In [161]:
label = []
for i in range(len(df_stock) - 4):
    rate = (df_stock['收盤價(元)'][i + 4] - df_stock['收盤價(元)'][i]) / df_stock['收盤價(元)'][i]
    if rate > 0.005:
       label.append('漲')
    elif rate < -0.005:
        label.append('跌')
    else:
        label.append('持平')

label.extend([0, 0, 0, 0])
df_stock['label'] = label
df_stock.tail(10)

Unnamed: 0,證券代碼,年月日,收盤價(元),label
529,2330 台積電,2023/03/13,513.2229,漲
530,2330 台積電,2023/03/14,507.2552,漲
531,2330 台積電,2023/03/15,508.2498,漲
532,2330 台積電,2023/03/16,505.0,漲
533,2330 台積電,2023/03/17,518.0,漲
534,2330 台積電,2023/03/20,512.0,漲
535,2330 台積電,2023/03/21,517.0,0
536,2330 台積電,2023/03/22,533.0,0
537,2330 台積電,2023/03/23,538.0,0
538,2330 台積電,2023/03/24,539.0,0


In [162]:
df_stock['年月日'] = pd.to_datetime(df_stock['年月日']).dt.date
df_stock.tail(10)

Unnamed: 0,證券代碼,年月日,收盤價(元),label
529,2330 台積電,2023-03-13,513.2229,漲
530,2330 台積電,2023-03-14,507.2552,漲
531,2330 台積電,2023-03-15,508.2498,漲
532,2330 台積電,2023-03-16,505.0,漲
533,2330 台積電,2023-03-17,518.0,漲
534,2330 台積電,2023-03-20,512.0,漲
535,2330 台積電,2023-03-21,517.0,0
536,2330 台積電,2023-03-22,533.0,0
537,2330 台積電,2023-03-23,538.0,0
538,2330 台積電,2023-03-24,539.0,0


In [163]:
upCnt = 0
downCnt = 0
for i in range(len(df_stock)):
    if df_stock['label'][i] == '漲':
        upCnt += 1
    elif df_stock['label'][i] == '跌':
        downCnt += 1 
print('預估漲的天數：', upCnt)
print('預估跌的天數：', downCnt)

預估漲的天數： 222
預估跌的天數： 231


In [164]:
df_news = pd.read_excel("../../TSMC_news_21-23.xlsx")
df_news.head()

Unnamed: 0.1,Unnamed: 0,post_time,title,content,token
0,0,2021-01-01,明年買哪幾檔晶片股?分析師首選 Nvidia 和 AMD,週四 (31 日) 華爾街分析師表示，Nvidia 和 AMD 本月漲勢落後於其他半導體類股...,"['週四', '日', '華爾街', '分析師', '表示', '和', '本', '月',..."
1,1,2021-01-01,《大陸產業》中芯成熟製程 獲美放行,【時報-台北電】遭美制裁的大陸晶圓代工龍頭中芯國際迎來曙光。業界傳出，中芯國際成熟製程獲得美...,"['時報', '台北電', '遭', '美', '制裁', '的', '大陸', '晶圓',..."
2,2,2021-01-04,《各報要聞》台積今年資本支出上看200億美元,【時報-台北電】晶圓代工龍頭台積電2020年繳出亮麗成績單，預期全年美元營收年成長率逾三成並...,"['時報', '台北', '電', '晶圓', '代工', '龍頭', '台積電', '年'..."
3,3,2021-01-04,《半導體》台積電等供應鏈力挺 聯發科Q1拚淡季不淡,【時報記者王逸芯台北報導】聯發科(2454)在去年第三季一舉超車高通，在行動晶片市占率衝上3...,"['時報', '記者', '王逸芯', '台北', '報導', '聯發科', '在', '去..."
4,4,2021-01-04,《半導體》2021年拚翻身 創意鎖漲停,【時報記者王逸芯台北報導】創意(3443)營運最壞時期已經過去，去年第四季在NRE案認列入帳...,"['時報', '記者', '王逸芯', '台北', '報導', '創意', '營運', '最..."


In [165]:
label_news = []
for i in range(len(df_news)):
    for j in range(len(df_stock)):
        if df_news['post_time'][i] == df_stock['年月日'][j]:
            label_news.append(df_stock['label'][j])
    if len(label_news) == i:
        label_news.append(0) # 日期沒有對到的情況（e.g. 週末） 

df_news['label'] = label_news
df_news.head(10)
df_news.tail(10)

  if df_news['post_time'][i] == df_stock['年月日'][j]:


Unnamed: 0.1,Unnamed: 0,post_time,title,content,token,label
6822,6822,2023-03-20,算力大戰 台積日月光受惠,隨著微軟轉投資OpenAI推出的聊天機器人ChatGPT全球爆紅，包括Google、阿里巴巴...,"['隨著', '微軟', '轉', '投資', '推出', '的', '聊天', '機器人'...",漲
6823,6823,2023-03-20,銀行股再見殺氣，美股四巫日齊黑，道瓊急跌385點，微軟逆勢漲約1.2%,【財訊快報／陳孟朔】被華府接管的矽谷銀行(SVB)，其母公司SVB金融集團申請破產保護的同時...,"['財訊', '快報', '陳孟朔', '被', '華府', '接管', '的', '矽谷銀...",漲
6824,6824,2023-03-20,《熱門族群》外資釋出AI口袋名單股 台積電等3檔入列,【時報記者王逸芯台北報導】ChatGPT颳起AI旋風，美系外資針對AI產業也出具最新研究報告...,"['時報', '記者', '王逸芯', '台北', '報導', '颳起', '旋風', '美...",漲
6825,6825,2023-03-20,台積電蟬聯百大創新企業，專利與營業秘密雙軌保護成果，研發營收比8%,【財訊快報／記者李純君報導】台積電(2330)今年繼續蟬聯百大創新企業，其副法務長陳碧莉提到...,"['財訊快報', '記者', '李純君', '報導', '台積電', '今年', '繼續',...",漲
6826,6826,2023-03-20,台積電／台積電「創新」再獲獎 副法務長陳碧莉：去年在台、美專利獲准百發百中,台積電（2330）再次入選「2023全球百大創新機構獎」，出席領獎的台積電副法務漲陳碧莉表示...,"['台積電', '再次', '入選', '全球', '百', '大', '創新', '機構獎...",漲
6827,6827,2023-03-21,晶片業起死回生？日媒揭實際庫存真相：反轉訊號來了,半導體景氣反轉向下，從「供不應求」轉變成「供應過剩」，市場關注產業景氣何時才會翻轉向上，日媒...,"['半導體', '景氣', '反轉', '向', '下', '從', '供', '不', '...",0
6828,6828,2023-03-21,《盤前掃瞄-基本面》高通加速轉單台灣；國發會估景氣4月探底,【時報-台北電】基本面：1.前一交易日新台幣以30.593元兌一美元收市，貶值3.9分，成交...,"['時報', '台北', '電', '基本面', '前', '一', '交易日', '新台幣...",0
6829,6829,2023-03-21,《各報要聞》高通去中化 加速轉單台灣,【時報-台北電】為了因應半導體市場出現美國陣營及中國陣營的兩極化地緣政治風險，手機晶片大廠高...,"['時報', '台北電', '為了', '因應', '半導體', '市場', '出現', '...",0
6830,6830,2023-03-21,《科技》台積電兩招…擴大專利版圖,【時報-台北電】晶圓代工龍頭台積電20日獲頒2023年科睿唯安全球百大創新機構獎，副法務長陳...,"['時報', '台北', '電晶圓', '代工', '龍頭', '台積電', '日', '獲...",0
6831,6831,2023-03-21,《熱門族群》高通轉單喜訊 精測笑開懷、這兩檔卻擺臭臉,【時報-台北電】台股今日隨美股彈升，在金融股回神、AI等具題材族群續強下，大盤指數盤中彈升約...,"['時報', '台北電', '台股', '今日', '隨', '美股', '彈升', '在'...",0


In [166]:
#刪掉 label 是 0 的資料
df_news_no_zero = df_news[df_news.label != 0]
df_news_no_zero = df_news_no_zero.reset_index(drop = True)

In [180]:
config = {
    'feature_size': 5000,
    'test_size': 0.2,
    'seed': 2,
    'SVM_Linear_C': 100,
    'SVM_rbf_C': 100,
}

In [181]:
#隨資切分資料，train 80%、test 20%
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(df_news_no_zero, df_news_no_zero['label'], test_size = config['test_size'], random_state = config['seed'], stratify = df_news_no_zero['label'])

### train data 向量轉換

In [188]:
#token 型態轉換，fit in 套件
df_news_list = []
for i in range(len(X_train)):
    df_news_list_tmp = []
    df_news_str = ''
    df_news_list_tmp = eval(X_train.token[X_train.token.index[i]])
    df_news_str = ' '.join(df_news_list_tmp)
    df_news_list.append(df_news_str)
df_news_list

['新聞 記者 陳苓 報導 三星 電子 的 晶圓 代工 業務 再 下 一城 據傳 該 公司 贏得 意法 半導體 的 微控制器 訂單 將 替 意法 半導體 生產 供應 給 次 世代 的 晶片 韓國 經濟日報日 報導 半導體 業界 人士 透露 這 是 意法 半導體 首 次 把 大 客戶 蘋果 的 訂單 委外 生產 全球 大 缺貨 意法 半導體 委託 三星 代工 的 將 採 奈米 製程 這 代表 和 傳統 相比 新品 體積 更 小功率 密度 更 強 三星 對 新 訂單 不 置 可否 目前 也 不 清楚 詳細 金額 用於 各 種 系統 與 裝置 為 系統 單 晶片 的 一 種 能 控制 大 零件 的 小 功能 智慧機 內嵌 的 可以 處理 多 個 感測器 的 數據 智慧機 休眠 時 也 會 持續 運作 以 減少 電力 損耗 三星 接獲 的 新 訂單 可望 增加 該 公司 的 晶圓 代工 市佔 台積電 不只 是 晶圓 代工 霸主 也 在 生產 稱雄 全球 委外 代工 的 成由 台積電 生產 台 積續 領先 拉大 與 三星 市 佔 差距 先前 報導 台積電 擴大 與 三星 電子 的 差距 進一步 鞏固 晶圓 代工 的 龍頭 地位 數據 顯示 今年 第三季 台積電 的 晶圓 代工 營收 季增 至 億 美元 市 佔 增至 三星 電子 的 晶圓 代工 營收 季增 至 億 美元 但是 市 佔 降至 編者 按 本文 僅 供 參考 之 用 並 不 構成 要 約 招攬 或 邀請 誘使 任何 不論 種類 或 形式 之 申述 或 訂立 任何 建議 及 推薦 讀者 務請 運用 個人 獨立 思考 能力 自行 作出 投資 決定 如 因 相關 建議 招致 損失 概與 精實 財經 媒體 編者 及 作者 無涉 傳 三星 奪 晶圓 代工 訂單 將 生產 愛瘋 用 傳 三星 奪 晶圓 代工 訂單 將 生產 愛瘋 用 傳 三星 奪 晶圓 代工 訂單 將 生產 愛瘋 用 傳 三星 奪 晶圓 代工 訂單 將 生產 愛瘋 用 傳 三星 奪 晶圓 代工 訂單 將 生產 愛瘋 用',
 '開盤 日出刊 台 積電法 說 日 登場 法人 關注 多 焦點 看好 前景 國際 股市 回檔 台股 受 牽連檔 年常 勝軍 撐盤 年前 運 價現 高峰 貨櫃 三雄 加船 加班 旺季 提早 到 平均 得 分分 國際 股市 分道瓊 指數 漲跌 

In [189]:
#將 train 資料向量化，轉成 tfidf vector
vectorizer = TfidfVectorizer(stop_words=stopwords)
x_train = vectorizer.fit_transform(df_news_list)
x_train = pd.DataFrame(x_train.toarray(),columns=vectorizer.get_feature_names_out())
x_train



Unnamed: 0,一一七五,一一七八,一一九九七,一一五,一一兆,一一八五,一一六,一一六億,一一六八三兆,一一出爐,...,龍鳳配,龍鷹,龐佩奧,龐大,龐雜,龔培元,龔明鑫,龔明鑫日,龜山,龜牙
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.042631,0.0,0.0,0.0,0.0,0.0,0.0
4917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [224]:
y_train = X_train['label']

chi2_selector = SelectKBest(chi2, k = config['feature_size'])
chi2_selector.fit(x_train, y_train)
kbest_vocabs = x_train.columns[chi2_selector.get_support()]
x_train = x_train[kbest_vocabs]
x_train

Unnamed: 0,一年鏡,一廠,一晃,一級,一線廠,一職,一角,一語,一起來,一飛,...,齊締,齊賀,龍城,龍潭,龍燈,龍芯,龍頭市,龍鷹,龐佩奧,龔明鑫
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [226]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()

from sklearn.model_selection import cross_val_score
scores = cross_val_score(classifier,x_train,y_train,cv=5,scoring='accuracy') #交叉驗證，計算準確率
print(scores)
print("Avg. Accuracy:",scores.mean())

[0.53963415 0.54065041 0.52439024 0.5304878  0.53509664]
Avg. Accuracy: 0.5340518489111646


In [227]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=7)

from sklearn.model_selection import cross_val_score
scores = cross_val_score(classifier,x_train,y_train,cv=5,scoring='accuracy') #交叉驗證，計算準確率
print(scores)
print("Avg. Accuracy:",scores.mean())

[0.47764228 0.49186992 0.4695122  0.47154472 0.46693795]
Avg. Accuracy: 0.4755014101514362


In [228]:
from sklearn.svm import SVC
classifier = SVC(kernel='linear')

from sklearn.model_selection import cross_val_score
scores = cross_val_score(classifier, x_train, y_train, cv=5,scoring='accuracy') #交叉驗證，計算準確率
print(scores)
print("Avg. Accuracy:",scores.mean())

[0.54471545 0.54369919 0.55487805 0.56504065 0.54628688]
Avg. Accuracy: 0.550924042048152


### 測試訓練結果

In [229]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=7, random_state=0)
clf.fit(x_train, y_train)
clf.score(x_train, y_train)

KeyboardInterrupt: 

### test data 向量轉換

In [210]:
#token 型態轉換，fit in 套件
df_news_list = []
for i in range(len(X_test)):
    df_news_list_tmp = []
    df_news_str = ''
    df_news_list_tmp = eval(X_test.token[X_test.token.index[i]])
    df_news_str = ' '.join(df_news_list_tmp)
    df_news_list.append(df_news_str)
df_news_list

['知名 傳奇 跑車 品牌 獨特 的 風格 與 極 佳 的 耐用性 享譽 全球 其 所 代表 的 是 品質 頂尖 與 絕佳 的 引擎 性能 半導體 中 的 跑車 洲際 半導體 指數 成份股 網羅 全球 半導體 頂尖 企業 亦 代表 品質 頂尖 與 絕佳 的 成長 動能 看好 全球 半導體長 發展 潛力 兆豐投信 首發 兆豐 洲際 半導體 募集 期間 自 月日 新台幣 元 即 可 輕鬆 駕馭 全球 半導體 龍頭 廠商 全球 半導體 股價 遭遇 逆風 主因 國際 地緣 政經 局勢 各 國 央行 升息 對抗 通膨 及 市場 認為 科技股 估值 過 高等 因素 造成 資金 抽離 然 從 近期 半導體 大廠 公布 的 財報 與 展望 角度 高 效能 運算物 聯網 及 車 用 市場 等 需求 依然 強勁 美國 的 通訊 晶片 大廠 高通 除 第一季 營收 創 新 高 外 並 調高 第二 季 優於 市場 預期 的 財測 超微 車用 晶片 大廠 安森美 亦 是 第一季 營收 創 新 高 並 調升 全 年 業績 展望 顯示 半導體 產業 具 利基性 的 廠商 仍 具 成長性 今年 以來 洲際 半導體 指數 下修 近 三成 評價面 已 相對 具 吸引力 洲際 半導體 指數 的 優勢 為 指數 篩選 邏輯 較 能 掌握到 科技 趨勢 的 脈動 每年 調整 四季 權重 足以 捕捉 科技股 輪動 相較 於 美股 費半 指數 成份股 亦 涵蓋 更多 全球 的 半導體 龍頭 企業 除 美國 的 博通高通 英特爾輝達荷蘭 的 艾司摩爾恩智浦 台灣 有 台積電 日月光 等 由於 洲際 半導體 指數 具 調整 彈性 與 效率 全球 第一 檔 也 是 規模 最 大 在 美國 掛牌 的 半導體 去年 月 將 追蹤 的 費半 指數 換成 洲際 半導體 指數 轉換 後 至今 指數 的 報酬 表現 優異 洲際 半導體 指數 可 視為 費半 指數 的 進階版 美股 正 處於 利率 政策 轉換 股市 調整 階段 短線 市場 容易 受 消息面 影響 而 劇烈 震盪 就 投資 角度 來 看 現 階段 應 重視 產業 篩選 具備 業績 成長 及 趨勢 創新 題材 的 科技股 近期 各 半導體 龍頭 大廠 對 未來 看法 仍 保持 審慎 樂觀 投資人 可 把握 近期 回檔 時 的 進場 機會 洲際 半導體 費半 再 進階 

In [211]:
#將 test 資料向量化，轉成 tfidf vector
vectorizer_test = TfidfVectorizer(stop_words=stopwords)
x_test = vectorizer.fit_transform(df_news_list)
x_test = pd.DataFrame(x_test.toarray(),columns=vectorizer.get_feature_names_out())
x_test

Unnamed: 0,一一一億,一一七六億,一七,一七三,一七二,一七五八一,一七兆,一七六四三,一三九五,一三二六五億,...,龍頭美光日,龍頭聯發科,龍頭股,龍頭輝,龐大,龐然,龔明鑫,龔說,龜尾,龜山廠
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1225,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [217]:
y_test = X_test['label']

chi2_selector = SelectKBest(chi2, k = config['feature_size'])
chi2_selector.fit(x_test, y_test)
kbest_vocabs = x_test.columns[chi2_selector.get_support()]
x_test = x_test[kbest_vocabs]
x_test

Unnamed: 0,一事,一劫,一千億,一向,一呂淑美,一喜一憂喜大於憂對,一季度,一技之長,一掃,一柯宗沅,...,齊挫,齊揚,齊攻,齊殺,齊發,齊紅特斯,齊黑,齊黑特斯拉,龍潭,龔明鑫
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.086913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1225,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1226,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1227,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1228,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [218]:
print('x len', len(x_train), 'y len', len(y_train))
print('x len', len(X_test), 'y len', len(Y_test))

x len 4919 y len 4919
x len 1230 y len 1230


In [219]:
#建立預測模型（NB 牛逼演算法）
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
NB_model = BernoulliNB()

In [220]:
NB_model.fit(x_train,y_train)

In [221]:
predicted_results = []
expected_results = []
expected_results.extend(Y_test)
predicted_results.extend(NB_model.predict(x_test))
print(predicted_results)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- 一事
- 一劫
- 一千億
- 一向
- 一呂淑美
- ...
Feature names seen at fit time, yet now missing:
- 一年鏡
- 一廠
- 一晃
- 一級
- 一線廠
- ...


In [None]:
report = metrics.classification_report(expected_results, predicted_results)
print(report)

              precision    recall  f1-score   support

          持平       0.93      0.32      0.48       205
           漲       0.62      0.67      0.64       480
           跌       0.63      0.74      0.68       545

    accuracy                           0.64      1230
   macro avg       0.72      0.58      0.60      1230
weighted avg       0.67      0.64      0.63      1230



In [None]:
#建立預測模型（SVM Linear）
from sklearn.svm import SVC
SVM_model = SVC(kernel = 'linear', C = config['SVM_Linear_C'], probability=True)
SVM_model.fit(X_train,Y_train)

In [None]:
predicted_results = []
expected_results = []
expected_results.extend(Y_test)
predicted_results.extend(SVM_model.predict(X_test))
print(predicted_results)

['漲', '漲', '跌', '漲', '跌', '漲', '漲', '跌', '跌', '漲', '跌', '漲', '漲', '漲', '漲', '漲', '跌', '跌', '持平', '持平', '漲', '漲', '漲', '跌', '漲', '跌', '漲', '漲', '跌', '漲', '持平', '漲', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '持平', '持平', '跌', '漲', '跌', '跌', '跌', '漲', '漲', '漲', '持平', '漲', '漲', '跌', '跌', '漲', '持平', '持平', '漲', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '跌', '跌', '跌', '持平', '跌', '跌', '持平', '漲', '跌', '跌', '跌', '漲', '持平', '跌', '跌', '漲', '持平', '跌', '跌', '跌', '跌', '跌', '持平', '持平', '跌', '跌', '跌', '漲', '漲', '漲', '漲', '跌', '持平', '跌', '持平', '跌', '跌', '持平', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '漲', '持平', '持平', '跌', '漲', '跌', '跌', '漲', '跌', '持平', '漲', '持平', '跌', '漲', '漲', '跌', '跌', '跌', '持平', '漲', '持平', '漲', '漲', '持平', '漲', '漲', '漲', '漲', '跌', '漲', '漲', '跌', '漲', '漲', '漲', '持平', '漲', '跌', '漲', '漲', '跌', '漲', '漲', '漲', '漲', '漲', '漲', '跌',

In [None]:
report = metrics.classification_report(expected_results, predicted_results)
print(report)

              precision    recall  f1-score   support

          持平       0.55      0.42      0.48       205
           漲       0.67      0.64      0.65       480
           跌       0.66      0.74      0.70       545

    accuracy                           0.65      1230
   macro avg       0.62      0.60      0.61      1230
weighted avg       0.64      0.65      0.64      1230



In [None]:
#建立預測模型（SVM Rbf）
SVM_model = SVC(kernel = 'rbf', C = config['SVM_rbf_C'], probability=True)
SVM_model.fit(X_train,Y_train)

In [None]:
predicted_results = []
expected_results = []
expected_results.extend(Y_test)
predicted_results.extend(SVM_model.predict(X_test))
print(predicted_results)

['漲', '跌', '跌', '漲', '漲', '跌', '漲', '跌', '漲', '漲', '漲', '漲', '漲', '漲', '跌', '漲', '跌', '跌', '持平', '持平', '漲', '漲', '漲', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '跌', '漲', '跌', '持平', '漲', '漲', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '漲', '漲', '跌', '持平', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '漲', '跌', '漲', '跌', '漲', '漲', '跌', '漲', '漲', '漲', '漲', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '持平', '漲', '漲', '跌', '跌', '漲', '跌', '漲', '漲', '漲', '漲', '跌', '跌', '跌', '持平', '跌', '跌', '持平', '漲', '跌', '跌', '漲', '漲', '漲', '跌', '跌', '漲', '持平', '跌', '持平', '持平', '漲', '跌', '持平', '跌', '漲', '跌', '漲', '漲', '漲', '漲', '漲', '跌', '持平', '跌', '持平', '漲', '漲', '持平', '跌', '跌', '漲', '跌', '漲', '跌', '漲', '漲', '跌', '跌', '漲', '漲', '漲', '跌', '漲', '跌', '漲', '跌', '漲', '跌', '持平', '漲', '漲', '持平', '跌', '跌', '跌', '漲', '漲', '持平', '漲', '持平', '持平', '漲', '漲', '跌', '跌', '跌', '持平', '跌', '漲', '漲', '漲', '漲', '持平', '漲', '跌', '漲', '跌', '漲', '漲', '跌', '漲', '漲', '漲', '持平', '漲', '跌', '漲', '跌', '跌', '持平', '漲', '漲', '跌', '跌', '漲', '跌', '

In [None]:
report = metrics.classification_report(expected_results, predicted_results)
print(report)

              precision    recall  f1-score   support

          持平       0.64      0.42      0.51       205
           漲       0.60      0.69      0.64       480
           跌       0.68      0.68      0.68       545

    accuracy                           0.64      1230
   macro avg       0.64      0.60      0.61      1230
weighted avg       0.64      0.64      0.64      1230



### 漲

In [None]:
df_up = df_news[df_news['label'] == '漲'] 
df_up = df_up.reset_index(drop = True)
df_up.head() # 漲的文章
# df_up.to_excel('upNews21-23.xlsx')

Unnamed: 0.1,Unnamed: 0,post_time,title,content,token,label
0,2,2021-01-04,《各報要聞》台積今年資本支出上看200億美元,【時報-台北電】晶圓代工龍頭台積電2020年繳出亮麗成績單，預期全年美元營收年成長率逾三成並...,"['時報', '台北', '電', '晶圓', '代工', '龍頭', '台積電', '年'...",漲
1,3,2021-01-04,《半導體》台積電等供應鏈力挺 聯發科Q1拚淡季不淡,【時報記者王逸芯台北報導】聯發科(2454)在去年第三季一舉超車高通，在行動晶片市占率衝上3...,"['時報', '記者', '王逸芯', '台北', '報導', '聯發科', '在', '去...",漲
2,4,2021-01-04,《半導體》2021年拚翻身 創意鎖漲停,【時報記者王逸芯台北報導】創意(3443)營運最壞時期已經過去，去年第四季在NRE案認列入帳...,"['時報', '記者', '王逸芯', '台北', '報導', '創意', '營運', '最...",漲
3,5,2021-01-04,聯發科外資買盤助攻 股價衝上798元創新高 市值緊追鴻海,台股今 (14) 日開紅盤，在各國寬鬆貨幣政策下，資金瘋狗浪效應不斷擴散，聯發科 (2454...,"['台股', '今日', '開', '紅盤', '在', '各', '國', '寬鬆', '...",漲
4,6,2021-01-05,《半導體》外資升升升！台積目標價衝到新境界,【時報-台北電】台積電（2330）股價新年正開啟新一輪多頭大進擊，外資券商立刻助陣，看好四大...,"['時報', '台北電台', '積電', '股價', '新年', '正', '開啟', '新...",漲


In [None]:
df_up.token

0       ['時報', '台北', '電', '晶圓', '代工', '龍頭', '台積電', '年'...
1       ['時報', '記者', '王逸芯', '台北', '報導', '聯發科', '在', '去...
2       ['時報', '記者', '王逸芯', '台北', '報導', '創意', '營運', '最...
3       ['台股', '今日', '開', '紅盤', '在', '各', '國', '寬鬆', '...
4       ['時報', '台北電台', '積電', '股價', '新年', '正', '開啟', '新...
                              ...                        
2395    ['隨著', '微軟', '轉', '投資', '推出', '的', '聊天', '機器人'...
2396    ['財訊', '快報', '陳孟朔', '被', '華府', '接管', '的', '矽谷銀...
2397    ['時報', '記者', '王逸芯', '台北', '報導', '颳起', '旋風', '美...
2398    ['財訊快報', '記者', '李純君', '報導', '台積電', '今年', '繼續',...
2399    ['台積電', '再次', '入選', '全球', '百', '大', '創新', '機構獎...
Name: token, Length: 2400, dtype: object

### 跌

In [None]:
df_down = df_news[df_news['label'] == '跌']
df_down = df_down.reset_index(drop = True)
df_down.head() # 跌的文章
# df_down.to_excel('downNews21-23.xlsx')

Unnamed: 0.1,Unnamed: 0,post_time,title,content,token,label
0,135,2021-01-20,〈台股盤前要聞〉鴻海傳打進Apple Car、義隆攜聚積奪日系遊戲機訂單 今日必看財經新聞,台股盤前要聞共有九大重點，鴻海傳出已打進 Apple Car 供應鏈，正趕工生產關鍵零組件；...,"['台股', '盤', '前', '要', '聞', '共有', '九', '大', '重點...",跌
1,136,2021-01-20,【Y早報】拜登就職典禮 市場看好2族群,（開盤日09:00出刊）拜登就職典禮 市場看好2族群 14檔概念股蓄勢；半導體封測產能吃緊 ...,"['開盤', '日出刊', '拜登', '就職', '典禮', '市場', '看好', '族...",跌
2,137,2021-01-20,台積電再新天價639元 聯電攀18年高,（中央社記者張建中新竹2021年1月20日電）晶圓代工廠台積電 (2330) 與聯電 (23...,"['中央社', '記者', '張建中', '新竹', '年月', '日', '電', '晶圓...",跌
3,138,2021-01-20,《熱門族群》晶圓雙雄好猛！台積電天價650元 聯電續闖18年高,【時報記者沈培華台北報導】台積電與聯電美國存託憑證（ADR）同步大漲，帶動兩公司今天股價走高...,"['時報', '記者', '沈培華', '台北', '報導', '台積電', '與', '聯...",跌
4,139,2021-01-20,台積電今年來股價漲117元 貢獻指數近1000點,（中央社記者張建中新竹2021年1月20日電）晶圓代工廠台積電 (2330) 股價再創歷史新...,"['中央社', '記者', '張建中', '新竹', '年月日', '電', '晶圓', '...",跌


In [None]:
df_down.token

0       ['台股', '盤', '前', '要', '聞', '共有', '九', '大', '重點...
1       ['開盤', '日出刊', '拜登', '就職', '典禮', '市場', '看好', '族...
2       ['中央社', '記者', '張建中', '新竹', '年月', '日', '電', '晶圓...
3       ['時報', '記者', '沈培華', '台北', '報導', '台積電', '與', '聯...
4       ['中央社', '記者', '張建中', '新竹', '年月日', '電', '晶圓', '...
                              ...                        
2719    ['日期', '年', '月日', '盤勢', '分析', '等待', '非', '農', ...
2720    ['財訊', '快報', '陳孟朔', '美國', '上週', '首', '次', '申領'...
2721    ['時報', '編譯', '柯婉琇', '綜合', '外電', '報導', '分析', '資...
2722    ['新聞', '記者', '李彥瑾', '報導', '近期', '半導體', '寒流', '...
2723    ['時報', '編譯', '柯婉琇', '綜合', '外電', '報導', '在', '傳出...
Name: token, Length: 2724, dtype: object

In [None]:
df_down_list = []
for i in range(len(df_down)):
    df_down_list_tmp = []
    df_down_str = ''
    df_down_list_tmp = eval(df_down.token[i])
    df_down_str = ' '.join(df_down_list_tmp)
    df_down_list.append(df_down_str)
df_down_list

['台股 盤 前 要 聞 共有 九 大 重點 鴻海 傳出 已 打進 供應 鏈 正 趕工 生產 關鍵 零組件 義隆 聚積 合作 研發 拿下 新一代 大單 以及 國內 首 家 純網銀 正式 營運 樂天 推出 兩 大 引擎 以下 是 今日 必 看 重要 財經 新聞 外資 鴻海 已 打進 供應鏈 趕工 生產 關鍵 零組件 鴻海 傳出 已 打入 蘋果 供應 鏈 日系 外資 指出 鴻海 正 積極 趕工 製造 蘋果 需要 的 零組件 有望 成為 長線 營運 引擎 閱讀 全文 義隆聚積 合作 報捷 拿下 新一代 大單 任 天堂 積極 籌備 下一代 遊戲機 傳出 將 採用 新 世代 面板 技術 除了 找上 台 廠群 創 合作 據悉 義隆 聚積 分別 拿下 面板 時序 控制 晶片 驅動 晶片 訂單 大單 也 是 雙方 合作 首 個 捷報 具 象徵 與 實質 效益 閱讀 全文 鴻海 平台 成員數 近 個 月 倍增 至 家未來 擬 舉辦 開發 者 大會 鴻海 技術長 暨 聯盟 軟體 平台 負責人 魏國章 昨日 透露 目前 平台 成員數 最近 一 個 月 倍增 已經 超過 家 未來 也 規劃 舉辦 開發者 大會 閱讀 全文 首 家 純網銀 正式 營運 樂天 首 推 兩 大 引擎 祭 台幣 天高利 定存 台灣 首 家 純網銀樂天國際銀行 昨日 正式 對 外 營運 為 台灣 開啟 了 純網銀 數位 金融 的 全新 紀元 樂天國際銀行 一 上線 就 推出 兩 大 殺手級 的 金融 創新 引擎 快樂 會員 制度 和 樂天 生態圈 其中 快樂 會員 制度 的 會員 可以 與 日本 同步 享有 樂天 點數 平台 堪稱 一 大 突破 此外 推出 台幣 天定存 提供 年利率 高達 倍 定存 的 年 利率 閱讀 全文 集邦 需求量 爆增 晶片 價格 調漲 集邦旗下光電研究處 表示 由於 蘋果 三星 等 品牌 計畫 在 年 推出 搭載 背光 顯示 的 筆電 平板 電視 等 產品 因此 提前 在 年 第四季 開始 拉貨 使 晶片 需求 暴增 進而 排擠 常規 晶片 產能 結構性 缺貨 下 部分 業者 已 調漲 非 核心 客戶 和 低 毛利 產品 的 晶片 價格 約 閱讀 全文 疫情 衝擊 比 金融 海嘯 還 慘 去年 上市 櫃 家數 創 近年 新 低 金管會 昨日 公布 去年 家數 上市櫃 合計 僅 家創 近年 