In [48]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [49]:
with open('stopwords_zh.txt', 'r') as file:
    stopwords = file.read().splitlines() 
file.close()

In [50]:
df_stock = pd.DataFrame()
for i in range(2021, 2024):
    x = pd.read_excel("../../bda2023_mid_dataset/stock_data_2019-2023.xlsx", sheet_name=f'上市{i}', usecols=['證券代碼', '年月日', '收盤價(元)'])
    x = x[x['證券代碼'] == '2330 台積電']
    x = x.loc[::-1]
    df_stock = pd.concat([df_stock, x])

In [51]:
df_stock = df_stock.reset_index(drop = True)
df_stock

Unnamed: 0,證券代碼,年月日,收盤價(元)
0,2330 台積電,2021/01/04,526.7744
1,2330 台積電,2021/01/05,532.6711
2,2330 台積電,2021/01/06,539.5506
3,2330 台積電,2021/01/07,555.2752
4,2330 台積電,2021/01/08,570.0170
...,...,...,...
534,2330 台積電,2023/03/20,512.0000
535,2330 台積電,2023/03/21,517.0000
536,2330 台積電,2023/03/22,533.0000
537,2330 台積電,2023/03/23,538.0000


In [52]:
day_n = 3 # 以 day_n 天後來看是漲還是跌
sigma = 0.03 # 以 sigma 決定漲（跌）幅超過幾 % 是漲（跌）
label = []
for i in range(len(df_stock) - day_n):
    rate = (df_stock['收盤價(元)'][i + day_n] - df_stock['收盤價(元)'][i]) / df_stock['收盤價(元)'][i]
    if rate > sigma:
       label.append('漲')
    elif rate < -sigma:
        label.append('跌')
    else:
        label.append('持平')
for i in range(day_n):
    label.append(0)
# label.extend([0, 0, 0])

df_stock['label'] = label
df_stock.tail(10)

Unnamed: 0,證券代碼,年月日,收盤價(元),label
529,2330 台積電,2023/03/13,513.2229,跌
530,2330 台積電,2023/03/14,507.2552,持平
531,2330 台積電,2023/03/15,508.2498,跌
532,2330 台積電,2023/03/16,505.0,漲
533,2330 台積電,2023/03/17,518.0,跌
534,2330 台積電,2023/03/20,512.0,漲
535,2330 台積電,2023/03/21,517.0,漲
536,2330 台積電,2023/03/22,533.0,漲
537,2330 台積電,2023/03/23,538.0,持平
538,2330 台積電,2023/03/24,539.0,0


In [53]:
# 更改年月日的日期格式
df_stock['年月日'] = pd.to_datetime(df_stock['年月日']).dt.date
df_stock.tail(10)

Unnamed: 0,證券代碼,年月日,收盤價(元),label
529,2330 台積電,2023-03-13,513.2229,跌
530,2330 台積電,2023-03-14,507.2552,持平
531,2330 台積電,2023-03-15,508.2498,跌
532,2330 台積電,2023-03-16,505.0,漲
533,2330 台積電,2023-03-17,518.0,跌
534,2330 台積電,2023-03-20,512.0,漲
535,2330 台積電,2023-03-21,517.0,漲
536,2330 台積電,2023-03-22,533.0,漲
537,2330 台積電,2023-03-23,538.0,持平
538,2330 台積電,2023-03-24,539.0,0


In [54]:
upCnt = 0
downCnt = 0
flatCnt = 0
for i in range(len(df_stock)):
    if df_stock['label'][i] == '漲':
        upCnt += 1
    elif df_stock['label'][i] == '跌':
        downCnt += 1 
    else :
        flatCnt += 1
print('預估漲的天數：', upCnt)
print('預估跌的天數：', downCnt)
print('預估持平的天數：', flatCnt)

預估漲的天數： 193
預估跌的天數： 209
預估持平的天數： 137


In [55]:
df_news = pd.read_excel("../../TSMC_news_21-23.xlsx")
df_news.head()

Unnamed: 0.1,Unnamed: 0,post_time,title,content,token
0,0,2021-01-01,明年買哪幾檔晶片股?分析師首選 Nvidia 和 AMD,週四 (31 日) 華爾街分析師表示，Nvidia 和 AMD 本月漲勢落後於其他半導體類股...,"['週四', '日', '華爾街', '分析師', '表示', '和', '本', '月',..."
1,1,2021-01-01,《大陸產業》中芯成熟製程 獲美放行,【時報-台北電】遭美制裁的大陸晶圓代工龍頭中芯國際迎來曙光。業界傳出，中芯國際成熟製程獲得美...,"['時報', '台北電', '遭', '美', '制裁', '的', '大陸', '晶圓',..."
2,2,2021-01-04,《各報要聞》台積今年資本支出上看200億美元,【時報-台北電】晶圓代工龍頭台積電2020年繳出亮麗成績單，預期全年美元營收年成長率逾三成並...,"['時報', '台北', '電', '晶圓', '代工', '龍頭', '台積電', '年'..."
3,3,2021-01-04,《半導體》台積電等供應鏈力挺 聯發科Q1拚淡季不淡,【時報記者王逸芯台北報導】聯發科(2454)在去年第三季一舉超車高通，在行動晶片市占率衝上3...,"['時報', '記者', '王逸芯', '台北', '報導', '聯發科', '在', '去..."
4,4,2021-01-04,《半導體》2021年拚翻身 創意鎖漲停,【時報記者王逸芯台北報導】創意(3443)營運最壞時期已經過去，去年第四季在NRE案認列入帳...,"['時報', '記者', '王逸芯', '台北', '報導', '創意', '營運', '最..."


In [56]:
label_news = []
for i in range(len(df_news)):
    for j in range(len(df_stock)):
        if df_news['post_time'][i] == df_stock['年月日'][j]:
            label_news.append(df_stock['label'][j])
    if len(label_news) == i:
        label_news.append(0) # 日期沒有對到的情況（e.g. 週末） 

df_news['label'] = label_news
df_news.head(10)
df_news.tail(10)

  if df_news['post_time'][i] == df_stock['年月日'][j]:


Unnamed: 0.1,Unnamed: 0,post_time,title,content,token,label
6822,6822,2023-03-20,算力大戰 台積日月光受惠,隨著微軟轉投資OpenAI推出的聊天機器人ChatGPT全球爆紅，包括Google、阿里巴巴...,"['隨著', '微軟', '轉', '投資', '推出', '的', '聊天', '機器人'...",漲
6823,6823,2023-03-20,銀行股再見殺氣，美股四巫日齊黑，道瓊急跌385點，微軟逆勢漲約1.2%,【財訊快報／陳孟朔】被華府接管的矽谷銀行(SVB)，其母公司SVB金融集團申請破產保護的同時...,"['財訊', '快報', '陳孟朔', '被', '華府', '接管', '的', '矽谷銀...",漲
6824,6824,2023-03-20,《熱門族群》外資釋出AI口袋名單股 台積電等3檔入列,【時報記者王逸芯台北報導】ChatGPT颳起AI旋風，美系外資針對AI產業也出具最新研究報告...,"['時報', '記者', '王逸芯', '台北', '報導', '颳起', '旋風', '美...",漲
6825,6825,2023-03-20,台積電蟬聯百大創新企業，專利與營業秘密雙軌保護成果，研發營收比8%,【財訊快報／記者李純君報導】台積電(2330)今年繼續蟬聯百大創新企業，其副法務長陳碧莉提到...,"['財訊快報', '記者', '李純君', '報導', '台積電', '今年', '繼續',...",漲
6826,6826,2023-03-20,台積電／台積電「創新」再獲獎 副法務長陳碧莉：去年在台、美專利獲准百發百中,台積電（2330）再次入選「2023全球百大創新機構獎」，出席領獎的台積電副法務漲陳碧莉表示...,"['台積電', '再次', '入選', '全球', '百', '大', '創新', '機構獎...",漲
6827,6827,2023-03-21,晶片業起死回生？日媒揭實際庫存真相：反轉訊號來了,半導體景氣反轉向下，從「供不應求」轉變成「供應過剩」，市場關注產業景氣何時才會翻轉向上，日媒...,"['半導體', '景氣', '反轉', '向', '下', '從', '供', '不', '...",漲
6828,6828,2023-03-21,《盤前掃瞄-基本面》高通加速轉單台灣；國發會估景氣4月探底,【時報-台北電】基本面：1.前一交易日新台幣以30.593元兌一美元收市，貶值3.9分，成交...,"['時報', '台北', '電', '基本面', '前', '一', '交易日', '新台幣...",漲
6829,6829,2023-03-21,《各報要聞》高通去中化 加速轉單台灣,【時報-台北電】為了因應半導體市場出現美國陣營及中國陣營的兩極化地緣政治風險，手機晶片大廠高...,"['時報', '台北電', '為了', '因應', '半導體', '市場', '出現', '...",漲
6830,6830,2023-03-21,《科技》台積電兩招…擴大專利版圖,【時報-台北電】晶圓代工龍頭台積電20日獲頒2023年科睿唯安全球百大創新機構獎，副法務長陳...,"['時報', '台北', '電晶圓', '代工', '龍頭', '台積電', '日', '獲...",漲
6831,6831,2023-03-21,《熱門族群》高通轉單喜訊 精測笑開懷、這兩檔卻擺臭臉,【時報-台北電】台股今日隨美股彈升，在金融股回神、AI等具題材族群續強下，大盤指數盤中彈升約...,"['時報', '台北電', '台股', '今日', '隨', '美股', '彈升', '在'...",漲


In [57]:
# 刪掉 label 是 0 的資料
df_news_no_zero = df_news[df_news.label != 0]
df_news_no_zero = df_news_no_zero.reset_index(drop = True)

In [58]:
# token 型態轉換，fit in 套件
df_news_list = []
for i in range(len(df_news_no_zero)):
    df_news_list_tmp = []
    df_news_str = ''
    df_news_list_tmp = eval(df_news_no_zero.token[i])
    df_news_str = ' '.join(df_news_list_tmp)
    df_news_list.append(df_news_str)
# df_news_list

In [59]:
X_data = df_news_list
Y_data = df_news_no_zero['label']

In [60]:
# 隨機切分資料，train 80%、test 20%
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size = 0.2, random_state = 1, stratify = Y_data)
print("train data length:", len(X_train), ",", len(Y_train))
print("test data length:",len(X_test), ",", len(Y_test))

train data length: 4923 , 4923
test data length: 1231 , 1231


In [61]:
#將所有資料向量化，轉成 tfidf vector
vectorizer = TfidfVectorizer(stop_words=stopwords)
X_train = vectorizer.fit_transform(X_train)
X_train = pd.DataFrame(X_train.toarray(), columns = vectorizer.get_feature_names_out())
display(X_train)



Unnamed: 0,一一一億,一一七八,一一七六億,一一兆,一一六億,一一六八三兆,一一出爐,一七,一七三,一七二,...,龐然,龐雜,龔培元,龔明鑫,龔明鑫日,龔說,龜尾,龜山,龜山廠,龜牙
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4920,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4921,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
chi2_selector = SelectKBest(chi2, k = 2000)
chi2_selector.fit(X_train, Y_train)
kbest_vocabs = X_train.columns[chi2_selector.get_support()]
X_train = X_train[kbest_vocabs]
X_train

Unnamed: 0,一三,一現,一般板,七千金,七雄旺,七黑,三強,三成五,三福化旺宏鴻海上銀,三路,...,齊減碼,齊漲,齊發聯發科,齊紅,齊紅特斯,齊締,齊賀,龍頭市,龐佩奧,龔明鑫
0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4918,0.0,0.0,0.13376,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4919,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4920,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4921,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
# 計算 test 個別的 tfidf，再將結果透過 df.reindex 這個方法映射到訓練集的向量空間中
X_test = vectorizer.fit_transform(X_test)
X_test = pd.DataFrame(X_test.toarray(), columns=vectorizer.get_feature_names_out())
X_test = X_test.reindex(kbest_vocabs, axis=1, fill_value=0)
X_test

Unnamed: 0,一三,一現,一般板,七千金,七雄旺,七黑,三強,三成五,三福化旺宏鴻海上銀,三路,...,齊減碼,齊漲,齊發聯發科,齊紅,齊紅特斯,齊締,齊賀,龍頭市,龐佩奧,龔明鑫
0,0,0.0,0,0.0,0,0.0,0.0,0,0,0,...,0,0.0,0,0,0,0,0,0,0,0.0
1,0,0.0,0,0.0,0,0.0,0.0,0,0,0,...,0,0.0,0,0,0,0,0,0,0,0.0
2,0,0.0,0,0.0,0,0.0,0.0,0,0,0,...,0,0.0,0,0,0,0,0,0,0,0.0
3,0,0.0,0,0.0,0,0.0,0.0,0,0,0,...,0,0.0,0,0,0,0,0,0,0,0.0
4,0,0.0,0,0.0,0,0.0,0.0,0,0,0,...,0,0.0,0,0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1226,0,0.0,0,0.0,0,0.0,0.0,0,0,0,...,0,0.0,0,0,0,0,0,0,0,0.0
1227,0,0.0,0,0.0,0,0.0,0.0,0,0,0,...,0,0.0,0,0,0,0,0,0,0,0.0
1228,0,0.0,0,0.0,0,0.0,0.0,0,0,0,...,0,0.0,0,0,0,0,0,0,0,0.0
1229,0,0.0,0,0.0,0,0.0,0.0,0,0,0,...,0,0.0,0,0,0,0,0,0,0,0.0


### 預測模型

In [64]:
from sklearn.ensemble import GradientBoostingClassifier

In [65]:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=7, random_state=0)
clf.fit(X_train, Y_train)
clf.score(X_train, Y_train)

0.9882185659150924

In [66]:
test_label = Y_test

test_data = X_test
predict_label = clf.predict(X_test)
# predict_label = pd.merge(
#     ase[ase['年月日-1'].between(test_startDate, test_endDate)], 
#     test_data.groupby(['post_time', 'predict_label']).count().sort_values('label', ascending = False).sort_index(level=[0], sort_remaining=False).groupby(level=0).head(1).reset_index(), 
#     left_on='年月日-1', right_on='post_time', how='left').fillna(method='ffill').fillna(method='bfill')['predict_label']

from sklearn.metrics import accuracy_score
print('預測準確率:', accuracy_score(test_label, predict_label))

預測準確率: 0.44435418359057677


In [67]:
#建立預測模型（NB）
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
NB_model = BernoulliNB()

In [68]:
NB_model.fit(X_train,Y_train)

In [69]:
predicted_results = []
expected_results = []
expected_results.extend(Y_test)
predicted_results.extend(NB_model.predict(X_test))
print(predicted_results)

['漲', '漲', '跌', '持平', '漲', '持平', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '持平', '跌', '漲', '漲', '持平', '漲', '跌', '跌', '跌', '漲', '漲', '持平', '跌', '持平', '跌', '跌', '漲', '漲', '持平', '持平', '跌', '跌', '跌', '跌', '持平', '跌', '跌', '跌', '漲', '跌', '漲', '漲', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '漲', '跌', '漲', '跌', '漲', '跌', '跌', '漲', '漲', '漲', '跌', '跌', '跌', '漲', '跌', '漲', '持平', '漲', '漲', '跌', '跌', '跌', '跌', '漲', '漲', '漲', '持平', '跌', '漲', '漲', '跌', '漲', '漲', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '漲', '跌', '漲', '持平', '跌', '跌', '跌', '跌', '跌', '持平', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '漲', '漲', '跌', '跌', '漲', '漲', '漲', '漲', '漲', '漲', '持平', '漲', '漲', '跌', '漲', '漲', '漲', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '漲', '持平', '漲', '漲', '跌', '跌', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '持平'

In [70]:
report = metrics.classification_report(expected_results, predicted_results)
print(report)

              precision    recall  f1-score   support

          持平       0.35      0.19      0.25       288
           漲       0.39      0.38      0.38       420
           跌       0.47      0.61      0.53       523

    accuracy                           0.43      1231
   macro avg       0.41      0.39      0.39      1231
weighted avg       0.42      0.43      0.42      1231



In [71]:
#建立預測模型（SVM Linear）
from sklearn.svm import SVC
SVM_model = SVC(kernel = 'linear', C = 50.0, probability = True)
SVM_model.fit(X_train, Y_train)

In [72]:
predicted_results = []
expected_results = []
expected_results.extend(Y_test)
predicted_results.extend(SVM_model.predict(X_test))
print(predicted_results)

['跌', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '漲', '漲', '持平', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '持平', '跌', '跌', '持平', '跌', '跌', '跌', '漲', '漲', '漲', '跌', '跌', '跌', '跌', '持平', '漲', '跌', '跌', '漲', '持平', '漲', '跌', '漲', '跌', '持平', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '持平', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '持平', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '持平', '持平', '跌', '持平', '漲', '跌', '持平', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '漲', '跌', '持平', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '漲', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '持平', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '漲', '漲', '持平', '跌', '漲', '跌', '跌', '漲', '漲', '漲', '持平', '漲', '漲', '持平', '跌', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '持平', '漲', '跌', '漲', '漲', '持平', '跌', '漲', '跌', '跌', '漲', '跌', '跌', '跌', 

In [73]:
report = metrics.classification_report(expected_results, predicted_results)
print(report)

              precision    recall  f1-score   support

          持平       0.44      0.23      0.30       288
           漲       0.45      0.31      0.37       420
           跌       0.48      0.73      0.58       523

    accuracy                           0.47      1231
   macro avg       0.46      0.42      0.42      1231
weighted avg       0.46      0.47      0.44      1231



In [74]:
#建立預測模型（SVM Rbf）
SVM_model = SVC(kernel = 'rbf', C = 1.0, probability=True)
SVM_model.fit(X_train,Y_train)

In [75]:
predicted_results = []
expected_results = []
expected_results.extend(Y_test)
predicted_results.extend(SVM_model.predict(X_test))
print(predicted_results)

['跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '漲', '持平', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '持平', '跌', '持平', '漲', '跌', '跌', '跌', '漲', '跌', '持平', '持平', '跌', '持平', '跌', '持平', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '持平', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '持平', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '漲', '漲', '跌', '漲', '漲', '跌', '持平', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '持平', '漲', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '持平', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '持平', '漲', '漲', '持平', '跌', '跌', '跌', '跌', '持平', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌',

In [76]:
report = metrics.classification_report(expected_results, predicted_results)
print(report)

              precision    recall  f1-score   support

          持平       0.43      0.15      0.22       288
           漲       0.45      0.24      0.31       420
           跌       0.45      0.79      0.57       523

    accuracy                           0.45      1231
   macro avg       0.45      0.39      0.37      1231
weighted avg       0.45      0.45      0.40      1231



#### example 6 

In [77]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB() # naive bayes classifier

In [78]:
# 以下將自身資料切成 train 及 test 兩組，重新訓練一次，測試模型準確率
classifier = MultinomialNB()
classifier.fit(X_train, Y_train) #訓練

In [79]:
from sklearn import metrics 
from sklearn.metrics import accuracy_score
Y_pred = classifier.predict(X_test) #用測試資料預測
print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred)) #比對答案，計算準確率

Accuracy: 0.43623070674248576


In [80]:
from sklearn.metrics import classification_report  
print(classification_report(Y_test, Y_pred)) #印出分類報告

              precision    recall  f1-score   support

          持平       0.50      0.00      0.01       288
           漲       0.56      0.07      0.13       420
           跌       0.43      0.97      0.60       523

    accuracy                           0.44      1231
   macro avg       0.50      0.35      0.24      1231
weighted avg       0.49      0.44      0.30      1231



In [81]:
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(Y_test, Y_pred, labels = ['漲', '跌', '持平'])) #印出混淆矩陣
# test/predicted 看漲   看跌   持平
#    看漲         TN     FP    FF
#    看跌         FN     TP    FF
#    持平         FN     FP    TF

[[ 30 390   0]
 [ 16 506   1]
 [  8 279   1]]


In [82]:
Y_test

4664     跌
1465    持平
2566    持平
1412     漲
4502     漲
        ..
3976    持平
4312     跌
3009    持平
1610    持平
3395     跌
Name: label, Length: 1231, dtype: object

#### Example 7

In [85]:
# 將所有資料(X_data)向量化，轉成 tfidf vector
vectorizer = TfidfVectorizer(stop_words=stopwords)
X_data = vectorizer.fit_transform(X_data)
X_data = pd.DataFrame(X_data.toarray(), columns = vectorizer.get_feature_names_out())
display(X_data)



Unnamed: 0,一一一億,一一七五,一一七八,一一七六億,一一九九七,一一五,一一兆,一一八五,一一六,一一六億,...,龐然,龐雜,龔培元,龔明鑫,龔明鑫日,龔說,龜尾,龜山,龜山廠,龜牙
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6149,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6150,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [87]:
# DecisionTree
from sklearn.tree import DecisionTreeClassifier                          
classifier = DecisionTreeClassifier(criterion = "entropy")

from sklearn import metrics 
from sklearn.metrics import accuracy_score
classifier.fit(X_train, Y_train) # 訓練
Y_pred = classifier.predict(X_test) # 用測試資料預測
print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred)) # 比對答案，計算準確率

Accuracy: 0.41186027619821286


In [88]:
from sklearn.metrics import classification_report  
print(classification_report(Y_test, Y_pred)) # 印出分類報告

              precision    recall  f1-score   support

          持平       0.32      0.32      0.32       288
           漲       0.39      0.40      0.39       420
           跌       0.48      0.47      0.48       523

    accuracy                           0.41      1231
   macro avg       0.40      0.40      0.40      1231
weighted avg       0.41      0.41      0.41      1231



In [89]:
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(Y_test, Y_pred, labels = ['漲', '跌', '持平'])) #印出混淆矩陣
# test/predicted 看漲   看跌   持平
#    看漲         TN     FP    FF
#    看跌         FN     TP    FF
#    持平         FN     FP    TF

[[166 158  96]
 [171 248 104]
 [ 87 108  93]]


In [90]:
# KNeighbors
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 7)

scores = cross_val_score(classifier, X_data, Y_data, cv = 5, scoring = 'accuracy') # 交叉驗證，計算準確率
print(scores)
print("Avg. Accuracy:", scores.mean())

[0.33631194 0.33468725 0.34199838 0.34524777 0.34715447]
Avg. Accuracy: 0.3410799601091055


In [91]:
classifier.fit(X_train, Y_train) # 訓練
Y_pred = classifier.predict(X_test) # 用測試資料預測
print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred)) # 比對答案，計算準確率

Accuracy: 0.40048740861088544


In [92]:
print(classification_report(Y_test, Y_pred)) # 印出分類報告

              precision    recall  f1-score   support

          持平       0.28      0.10      0.15       288
           漲       0.38      0.33      0.35       420
           跌       0.43      0.62      0.51       523

    accuracy                           0.40      1231
   macro avg       0.36      0.35      0.34      1231
weighted avg       0.38      0.40      0.37      1231



In [93]:
print(confusion_matrix(Y_test, Y_pred, labels = ['漲', '跌', '持平'])) #印出混淆矩陣
# test/predicted 看漲   看跌   持平
#    看漲         TN     FP    FF
#    看跌         FN     TP    FF
#    持平         FN     FP    TF

[[138 252  30]
 [151 325  47]
 [ 79 179  30]]
