In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [24]:
# impoort 斷詞（要放 topwords_zh.txt 進資料夾）
with open('../../bda2023_mid_dataset/stopwords_zh.txt', 'r') as file:
    stopwords = file.read().splitlines() 
file.close()

In [3]:
df = pd.read_excel("../../bda2023_mid_dataset/stock_data_2019-2023.xlsx", sheet_name=["上市2020", "上市2021", "上市2022"], usecols=['證券代碼', '年月日', '收盤價(元)'])

In [62]:
# import stock excel
# 只留台積電的日期跟收盤價
# 按照日期由最早往最晚排
df_stock = pd.DataFrame()
for i in range(2020, 2023):
    df_stock = pd.concat([df_stock, df[(f"上市{i}")].loc[::-1]])

In [63]:
df_stock = df_stock[df_stock["證券代碼"] == "2330 台積電"][['證券代碼', '年月日', '收盤價(元)']].reset_index(drop = True)
df_stock

Unnamed: 0,證券代碼,年月日,收盤價(元)
0,2330 台積電,2020/01/02,328.2957
1,2330 台積電,2020/01/03,328.7799
2,2330 台積電,2020/01/06,321.5168
3,2330 台積電,2020/01/07,319.0957
4,2330 台積電,2020/01/08,319.0957
...,...,...,...
730,2330 台積電,2022/12/26,454.0431
731,2330 台積電,2022/12/27,454.5404
732,2330 台積電,2022/12/28,448.5727
733,2330 台積電,2022/12/29,443.5996


# Requirement 1

In [96]:
# Requirement 1 的第二點

day_n = 3 # 以 day_n 天後來看是漲還是跌
sigma = 0 # 以 sigma 決定漲（跌）幅超過幾 % 是漲（跌）
label = []
for i in range(len(df_stock) - day_n):
    rate = (df_stock['收盤價(元)'][i + day_n] - df_stock['收盤價(元)'][i]) / df_stock['收盤價(元)'][i]
    if rate > sigma:
       label.append('漲')
    elif rate < -sigma:
        label.append('跌')
    else:
        label.append('持平')
for i in range(day_n):
    label.append(0)
# label.extend([0, 0, 0])

df_stock['label'] = label
df_stock.tail(10)

Unnamed: 0,證券代碼,年月日,收盤價(元),label
369,2330 台積電,2022-12-01,492.9796,跌
370,2330 台積電,2022-12-05,483.5848,跌
371,2330 台積電,2022-12-08,466.2786,漲
372,2330 台積電,2022-12-09,476.1679,跌
373,2330 台積電,2022-12-13,466.2786,跌
374,2330 台積電,2022-12-15,477.9139,跌
375,2330 台積電,2022-12-19,463.9893,跌
376,2330 台積電,2022-12-21,456.5297,0
377,2330 台積電,2022-12-22,465.4812,0
378,2330 台積電,2022-12-27,454.5404,0


In [97]:
# 更改年月日的日期格式
df_stock['年月日'] = pd.to_datetime(df_stock['年月日']).dt.date
df_stock.tail(10)

Unnamed: 0,證券代碼,年月日,收盤價(元),label
369,2330 台積電,2022-12-01,492.9796,跌
370,2330 台積電,2022-12-05,483.5848,跌
371,2330 台積電,2022-12-08,466.2786,漲
372,2330 台積電,2022-12-09,476.1679,跌
373,2330 台積電,2022-12-13,466.2786,跌
374,2330 台積電,2022-12-15,477.9139,跌
375,2330 台積電,2022-12-19,463.9893,跌
376,2330 台積電,2022-12-21,456.5297,0
377,2330 台積電,2022-12-22,465.4812,0
378,2330 台積電,2022-12-27,454.5404,0


In [98]:
upCnt = 0
downCnt = 0
flatCnt = 0
for i in range(len(df_stock)):
    if df_stock['label'][i] == '漲':
        upCnt += 1
    elif df_stock['label'][i] == '跌':
        downCnt += 1 
    else :
        flatCnt += 1
print('預估漲的天數：', upCnt)
print('預估跌的天數：', downCnt)
print('預估持平的天數：', flatCnt)

預估漲的天數： 189
預估跌的天數： 186
預估持平的天數： 4


In [99]:
# 丟掉持平的資料
df_stock = df_stock[df_stock.label != '持平']
df_stock = df_stock[df_stock.label != 0]
df_stock = df_stock.reset_index(drop = True)
df_stock

Unnamed: 0,證券代碼,年月日,收盤價(元),label
0,2330 台積電,2020-01-03,328.7799,漲
1,2330 台積電,2020-01-08,319.0957,漲
2,2330 台積電,2020-01-13,330.7168,跌
3,2330 台積電,2020-01-14,335.0747,跌
4,2330 台積電,2020-01-15,329.2642,跌
...,...,...,...,...
370,2330 台積電,2022-12-08,466.2786,漲
371,2330 台積電,2022-12-09,476.1679,跌
372,2330 台積電,2022-12-13,466.2786,跌
373,2330 台積電,2022-12-15,477.9139,跌


In [100]:
# import news excel 
df_news = pd.read_excel("../../TSMC_news_20-22.xlsx")
df_news.head()

Unnamed: 0.1,Unnamed: 0,post_time,title,content,token
0,0,2020-01-02,美股：美中協議月中簽署，四大指數週二齊漲約0.3%封關，費半全年狂升逾六成,美股週二開盤延續前一天拉回格局，主要指數以平低盤開出，，盤中美國總統川普發文宣稱1月15日會...,"['美股', '週二', '開盤', '延續', '前', '一', '天', '拉回', ..."
1,1,2020-01-02,《熱門族群》5G新機連發，帶旺璟德、聯發科,美國消費性電子展（CES）將於元月7日∼10日接續舉辦，法人預期5G仍將是本次展期焦點，加上...,"['美國', '消費性', '電子展', '將', '於', '元月', '日日', '接續..."
2,2,2020-01-02,《熱門族群》台積電走高，半導體設備股跟漲,台積電 (2330) 股價今早開高走高，帶領台積電供應鏈及半導體設備股再度轉強。隨著美中貿易...,"['台積電', '股價', '今', '早', '開', '高', '走高', '帶領', ..."
3,3,2020-01-02,台灣科技能打趴歐洲大廠？專家分析「還太早」,近年來台灣不論科技、經濟等發展，都大幅的成長，也逐漸讓國際看到台灣。不過在科技的成長方面，就...,"['近年', '來', '台灣', '不論', '科技', '經濟', '等', '發展',..."
4,4,2020-01-02,京元電Q1營收估創同期高；全年看增雙位數,MoneyDJ新聞 2020-01-02 14:23:09 記者 王怡茹 報導<BR>京元電...,"['新聞', '記者', '王怡茹', '報導', '京元電子', '年月', '營收', ..."


In [101]:
# 更改年月日的日期格式
df_news['post_time'] = pd.to_datetime(df_news['post_time']).dt.date

In [102]:
from collections import defaultdict
dic=defaultdict(int)
for i in range(len(df_stock)):
    dic[(df_stock['年月日'][i])] = (df_stock['label'][i])

In [103]:
label_news = []
for i in range(len(df_news)):
    label_news.append(dic[df_news["post_time"][i]])
df_news['label'] = label_news
df_news.head(10)

Unnamed: 0.1,Unnamed: 0,post_time,title,content,token,label
0,0,2020-01-02,美股：美中協議月中簽署，四大指數週二齊漲約0.3%封關，費半全年狂升逾六成,美股週二開盤延續前一天拉回格局，主要指數以平低盤開出，，盤中美國總統川普發文宣稱1月15日會...,"['美股', '週二', '開盤', '延續', '前', '一', '天', '拉回', ...",0
1,1,2020-01-02,《熱門族群》5G新機連發，帶旺璟德、聯發科,美國消費性電子展（CES）將於元月7日∼10日接續舉辦，法人預期5G仍將是本次展期焦點，加上...,"['美國', '消費性', '電子展', '將', '於', '元月', '日日', '接續...",0
2,2,2020-01-02,《熱門族群》台積電走高，半導體設備股跟漲,台積電 (2330) 股價今早開高走高，帶領台積電供應鏈及半導體設備股再度轉強。隨著美中貿易...,"['台積電', '股價', '今', '早', '開', '高', '走高', '帶領', ...",0
3,3,2020-01-02,台灣科技能打趴歐洲大廠？專家分析「還太早」,近年來台灣不論科技、經濟等發展，都大幅的成長，也逐漸讓國際看到台灣。不過在科技的成長方面，就...,"['近年', '來', '台灣', '不論', '科技', '經濟', '等', '發展',...",0
4,4,2020-01-02,京元電Q1營收估創同期高；全年看增雙位數,MoneyDJ新聞 2020-01-02 14:23:09 記者 王怡茹 報導<BR>京元電...,"['新聞', '記者', '王怡茹', '報導', '京元電子', '年月', '營收', ...",0
5,5,2020-01-02,【Y晚報】元旦假後開市 台積電領軍大漲百點,（開盤日15:30出刊）美股在前日(12/31)封關日，四大指數全數收漲，漲幅介於0.27%...,"['開盤', '日出刊', '美股', '在', '前日', '封關日', '四', '大'...",0
6,6,2020-01-03,《各報要聞》2020報喜，歐美股聯袂走揚,2020年第一個交易日，歐美股市齊聲歡唱。泛歐STOXX 600指數早盤勁升0.9％，美股三...,"['年', '第一', '個', '交易日', '歐美', '股市', '齊聲', '歡唱'...",漲
7,7,2020-01-03,美股：四大指數週四開春首日齊攻頂，道瓊大漲330點，FAANG及中概股表現亮眼,美股送走六年來漲勢最凌厲的2019年後，2020年新年第一個交易日，迎來人行降準，美中簽署第...,"['美股', '送走', '六', '年', '來', '漲勢', '最', '凌厲', '...",漲
8,8,2020-01-03,【Y早報】進入5G元年 佈局CES商機 族群行情再發威,（開盤日09:00出刊）MLCC供貨吃緊恐漲價，國巨營收看俏；iPad降價搶大陸市場，帶旺主...,"['開盤', '日出刊', '供貨', '吃緊', '恐', '漲價', '國巨營', '收...",漲
9,9,2020-01-03,【日盛金控晨訊】結構有利盤勢 支撐看月線,日期：2020年 1月 3日<BR>※盤勢分析<BR>1.隨著中國新經濟刺激措施增加、以及美...,"['日期', '年月日', '盤勢', '分析', '隨著', '中國', '新', '經濟...",漲


In [104]:
# 刪掉 label 是 0 的資料
df_news_no_zero = df_news[df_news.label != 0]
df_news_no_zero = df_news_no_zero.reset_index(drop = True)

In [105]:
# token 型態轉換，fit in 套件
df_news_list = []
for i in range(len(df_news_no_zero)):
    df_news_list_tmp = []
    df_news_str = ''
    df_news_list_tmp = eval(df_news_no_zero.token[i])
    df_news_str = ' '.join(df_news_list_tmp)
    df_news_list.append(df_news_str)
len(df_news_list)

3428

In [111]:
X_data = df_news_list
Y_data = df_news_no_zero['label']

In [107]:
from sklearn.decomposition import TruncatedSVD
svd_model = TruncatedSVD(n_components=300)
vectorizer = TfidfVectorizer(stop_words=stopwords)
TFIDF_vectors = vectorizer.fit_transform(X_data)
SVD_vectors = svd_model.fit_transform(TFIDF_vectors)



In [112]:
# 隨機切分資料
# text_size = 0.2 --> train 80%、test 20%
# X_data = SVD_vectors
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size = 0.2, random_state = 2, stratify = Y_data)
print("train data length:", len(X_train), ",", len(Y_train))
print("test data length:",len(X_test), ",", len(Y_test))

train data length: 2742 , 2742
test data length: 686 , 686


In [113]:
#將所有資料向量化，轉成 tfidf vector
vectorizer = TfidfVectorizer(stop_words=stopwords)
X_train = vectorizer.fit_transform(X_train)
X_train = pd.DataFrame(X_train.toarray(), columns = vectorizer.get_feature_names_out())
display(X_train)



Unnamed: 0,一一七五,一一九九七,一一五,一一八五,一一六,一一六八三兆,一一齊,一七,一七二億,一七五,...,龍頭輝,龍頭高通,龍鳳,龐佩奧,龐培,龐大,龔培元,龔明鑫,龜牙,龜速
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.024171,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2738,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2740,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [114]:
chi2_selector = SelectKBest(chi2, k = 100)
chi2_selector.fit(X_train, Y_train)
kbest_vocabs = X_train.columns[chi2_selector.get_support()]
X_train = X_train[kbest_vocabs]
X_train

Unnamed: 0,上機,中信,中砂,亞利桑,人才,使用者,俄羅斯,停辦,共建,創意,...,采鈺,陞達,除息,陳俊聖,陳良基,陳超乾,預付,魏哲家,龍潭,龔明鑫
0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.209456,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057012,0.0,0.0
4,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2737,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2738,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2739,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2740,0.0,0.0,0.0,0.0,0.049992,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


In [115]:
# 計算 test 個別的 tfidf，再將結果透過 df.reindex 這個方法映射到訓練集的向量空間中
X_test = vectorizer.fit_transform(X_test)
X_test = pd.DataFrame(X_test.toarray(), columns=vectorizer.get_feature_names_out())
X_test = X_test.reindex(kbest_vocabs, axis=1, fill_value=0)
X_test

Unnamed: 0,上機,中信,中砂,亞利桑,人才,使用者,俄羅斯,停辦,共建,創意,...,采鈺,陞達,除息,陳俊聖,陳良基,陳超乾,預付,魏哲家,龍潭,龔明鑫
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0,0,0.0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0,0,0.0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0,0,0.0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0,0,0.0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.038416,0,0,0.0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
681,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0,0,0.0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
682,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0,0,0.0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
683,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0,0,0.0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
684,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0,0,0.0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 預測模型

In [34]:
from sklearn.ensemble import GradientBoostingClassifier

In [118]:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=7, random_state=1, max_features='sqrt')
clf.fit(X_train, Y_train)
clf.score(X_train, Y_train)

0.8785557986870897

In [119]:
test_label = Y_test

test_data = X_test
predict_label = clf.predict(X_test)
# predict_label = pd.merge(
#     ase[ase['年月日-1'].between(test_startDate, test_endDate)], 
#     test_data.groupby(['post_time', 'predict_label']).count().sort_values('label', ascending = False).sort_index(level=[0], sort_remaining=False).groupby(level=0).head(1).reset_index(), 
#     left_on='年月日-1', right_on='post_time', how='left').fillna(method='ffill').fillna(method='bfill')['predict_label']

from sklearn.metrics import accuracy_score
print('預測準確率:', accuracy_score(test_label, predict_label))

預測準確率: 0.565597667638484


In [47]:
#建立預測模型（NB）
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
NB_model = BernoulliNB()

In [48]:
NB_model.fit(X_train,Y_train)

In [49]:
predicted_results = []
expected_results = []
expected_results.extend(Y_test)
predicted_results.extend(NB_model.predict(X_test))
print(predicted_results)

['跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '漲', '跌', '漲', '漲', '跌', '跌', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '漲', '跌', '跌', '漲', '跌', '漲', '漲', '跌', '漲', '漲', '跌', '漲', '跌', '漲', '跌', '跌', '漲', '漲', '跌', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '跌', '跌', '漲', '漲', '漲', '漲', '漲', '漲', '跌', '跌', '跌', '跌', '漲', '漲', '漲', '漲', '跌', '跌', '漲', '漲', '跌', '跌', '漲', '跌', '漲', '跌', '漲', '漲', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '漲', '跌', '漲', '漲', '跌', '漲', '漲', '漲', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '漲', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '跌', '漲', '漲', '漲', '跌', '跌', '跌', '跌', '漲', '漲', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '漲', '漲', '漲', '漲', '跌', '跌', '漲', '漲', '漲', '跌', '漲',

In [50]:
report = metrics.classification_report(expected_results, predicted_results)
print(report)

              precision    recall  f1-score   support

           漲       0.52      0.50      0.51       349
           跌       0.52      0.54      0.53       354

    accuracy                           0.52       703
   macro avg       0.52      0.52      0.52       703
weighted avg       0.52      0.52      0.52       703



In [56]:
#建立預測模型（SVM Linear）
from sklearn.svm import SVC
SVM_model = SVC(kernel = 'linear', C = 1.0, probability = True)
SVM_model.fit(X_train, Y_train)

In [57]:
predicted_results = []
expected_results = []
expected_results.extend(Y_test)
predicted_results.extend(SVM_model.predict(X_test))
print(predicted_results)

['跌', '跌', '漲', '跌', '跌', '漲', '跌', '漲', '漲', '跌', '漲', '漲', '漲', '跌', '漲', '漲', '跌', '跌', '跌', '漲', '漲', '漲', '漲', '跌', '漲', '漲', '跌', '漲', '漲', '跌', '跌', '漲', '跌', '跌', '漲', '漲', '漲', '漲', '漲', '漲', '跌', '跌', '漲', '跌', '跌', '漲', '跌', '跌', '漲', '漲', '跌', '漲', '漲', '漲', '跌', '漲', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '漲', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '漲', '跌', '漲', '漲', '漲', '漲', '跌', '漲', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '漲', '漲', '漲', '跌', '跌', '漲', '漲', '漲', '漲', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '漲', '漲', '跌', '漲', '漲', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '跌', '漲', '漲', '漲', '漲', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '漲', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '漲', '漲', '漲', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '漲', '漲', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '跌', '漲', '跌', '漲', '漲', '跌', '漲', '跌', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '跌', '漲', '漲', '漲', '漲', '漲', '漲', '跌', '漲', '漲', '漲',

In [58]:
report = metrics.classification_report(expected_results, predicted_results)
print(report)

              precision    recall  f1-score   support

           漲       0.56      0.55      0.56       349
           跌       0.57      0.58      0.57       354

    accuracy                           0.56       703
   macro avg       0.56      0.56      0.56       703
weighted avg       0.56      0.56      0.56       703



In [121]:
from sklearn.model_selection import GridSearchCV

In [122]:
param_grid = {'C': [1, 100], 'gamma': [1, 0.1],'kernel': ['rbf','linear']}
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)
grid.fit(X_train,Y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.699 total time=   5.5s
[CV 2/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.669 total time=   6.5s
[CV 3/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.647 total time=   6.1s
[CV 4/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.693 total time=   6.2s
[CV 5/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.631 total time=   5.5s
[CV 1/5] END .......C=1, gamma=1, kernel=linear;, score=0.654 total time=   5.1s
[CV 2/5] END .......C=1, gamma=1, kernel=linear;, score=0.634 total time=   5.2s
[CV 3/5] END .......C=1, gamma=1, kernel=linear;, score=0.610 total time=   3.9s
[CV 4/5] END .......C=1, gamma=1, kernel=linear;, score=0.652 total time=   6.7s
[CV 5/5] END .......C=1, gamma=1, kernel=linear;, score=0.627 total time=   6.1s
[CV 1/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.535 total time=   7.0s
[CV 2/5] END ........C=1, gamma=0.1, kernel=rbf;,

In [123]:
print(grid.best_params_)

{'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}


In [120]:
#建立預測模型（SVM Rbf）
SVM_model = SVC(kernel = 'rbf', C = 100.0, gamma = 0.1, probability=True)
SVM_model.fit(X_train,Y_train)

In [121]:
predicted_results = []
expected_results = []
expected_results.extend(Y_test)
predicted_results.extend(SVM_model.predict(X_test))
print(predicted_results)

['跌', '跌', '跌', '漲', '跌', '跌', '漲', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '漲', '漲', '跌', '跌', '漲', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '跌', '漲', '跌', '跌', '跌', '漲', '跌', '跌', '漲', '跌', '漲', '跌', '漲', '跌', '跌', '跌', '漲', '漲', '跌', '漲', '漲', '跌', '跌', '跌', '漲',

In [122]:
report = metrics.classification_report(expected_results, predicted_results)
print(report)
accuracy = metrics.accuracy_score(expected_results, predicted_results)
print(accuracy)

              precision    recall  f1-score   support

           漲       0.58      0.37      0.45       333
           跌       0.56      0.75      0.64       353

    accuracy                           0.56       686
   macro avg       0.57      0.56      0.54       686
weighted avg       0.57      0.56      0.55       686

0.5626822157434402


In [425]:
from sklearn import ensemble
forest = ensemble.RandomForestClassifier(n_estimators = 100)
forest_fit = forest.fit(X_train, Y_train)
# 預測
test_y_predicted = forest.predict(X_test)

# 績效
accuracy = metrics.accuracy_score(Y_test, test_y_predicted)
print(accuracy)

0.6073968705547653


#### example 6 

In [83]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB() # naive bayes classifier

In [84]:
# 以下將自身資料切成 train 及 test 兩組，重新訓練一次，測試模型準確率
classifier = MultinomialNB()
classifier.fit(X_train, Y_train) #訓練

In [85]:
from sklearn import metrics 
from sklearn.metrics import accuracy_score
Y_pred = classifier.predict(X_test) #用測試資料預測
print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred)) #比對答案，計算準確率

Accuracy: 0.6131301289566237


In [86]:
from sklearn.metrics import classification_report  
print(classification_report(Y_test, Y_pred)) #印出分類報告

              precision    recall  f1-score   support

           漲       0.72      0.28      0.40       396
           跌       0.59      0.91      0.72       457

    accuracy                           0.61       853
   macro avg       0.65      0.59      0.56       853
weighted avg       0.65      0.61      0.57       853



In [87]:
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(Y_test, Y_pred, labels = ['漲', '跌'])) # 印出混淆矩陣
# test/predicted 看漲   看跌   
#    看漲         TN     FP
#    看跌         FN     TP

[[109 287]
 [ 43 414]]


In [88]:
Y_test

3758    跌
2090    跌
3174    跌
1553    漲
1428    跌
       ..
2267    跌
3708    跌
2303    漲
3020    跌
2807    跌
Name: label, Length: 853, dtype: object

#### Example 7

In [194]:
X_data = df_news_list

In [195]:
# 將所有資料(X_data)向量化，轉成 tfidf vector
vectorizer = TfidfVectorizer(stop_words=stopwords)
X_data = vectorizer.fit_transform(X_data)
X_data = pd.DataFrame(X_data.toarray(), columns = vectorizer.get_feature_names_out())

chi2_selector = SelectKBest(chi2, k = 5000)
chi2_selector.fit(X_data, Y_data)
kbest_vocabs = X_data.columns[chi2_selector.get_support()]
X_data = X_data[kbest_vocabs]
X_data

display(X_data)



Unnamed: 0,一世代,一億,一千萬,一口氣,一呂淑美,一手,一月,一朝,一林燦澤,一次性,...,齊降,齊黑,齊鼎,龍晟田寶一,龍漢翔,龍燈庫,龍邦環泰,龍頭公司,龍頭聯發科,龍頭股台積電
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.093896,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2903,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
2904,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
2905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
2906,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0


In [196]:
# DecisionTree
from sklearn.tree import DecisionTreeClassifier                          
classifier = DecisionTreeClassifier(criterion = "entropy")

from sklearn import metrics 
from sklearn.metrics import accuracy_score
classifier.fit(X_train, Y_train) # 訓練
Y_pred = classifier.predict(X_test) # 用測試資料預測
print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred)) # 比對答案，計算準確率

Accuracy: 0.5652920962199313


In [197]:
from sklearn.metrics import classification_report  
print(classification_report(Y_test, Y_pred)) # 印出分類報告

              precision    recall  f1-score   support

           漲       0.52      0.52      0.52       266
           跌       0.60      0.60      0.60       316

    accuracy                           0.57       582
   macro avg       0.56      0.56      0.56       582
weighted avg       0.57      0.57      0.57       582



In [198]:
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(Y_test, Y_pred, labels = ['漲', '跌'])) #印出混淆矩陣
# test/predicted 看漲   看跌
#    看漲         TN     FP
#    看跌         FN     TP

[[139 127]
 [126 190]]


In [199]:
Y_data

0       跌
1       跌
2       跌
3       跌
4       跌
       ..
2903    跌
2904    跌
2905    跌
2906    跌
2907    跌
Name: label, Length: 2908, dtype: object

In [200]:
# KNeighbors
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 7)

from sklearn.model_selection import cross_val_score
scores = cross_val_score(classifier, X_data, Y_data, cv = 5, scoring = 'accuracy') # 交叉驗證，計算準確率
print(scores)
print("Avg. Accuracy:", scores.mean())

Traceback (most recent call last):
  File "d:\New folder\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "d:\New folder\lib\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "d:\New folder\lib\site-packages\sklearn\metrics\_scorer.py", line 261, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "d:\New folder\lib\site-packages\sklearn\metrics\_scorer.py", line 71, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "d:\New folder\lib\site-packages\sklearn\neighbors\_classification.py", line 226, in predict
    neigh_ind = self.kneighbors(X, return_distance=False)
  File "d:\New folder\lib\site-packages\sklearn\neighbors\_base.py", line 763, in kneighbors
    results = PairwiseDistancesArgKmin.compute(
  File "sklearn\metrics\_pairwise_distances_reduction.pyx", line 698, 

[nan nan nan nan nan]
Avg. Accuracy: nan


Traceback (most recent call last):
  File "d:\New folder\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "d:\New folder\lib\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "d:\New folder\lib\site-packages\sklearn\metrics\_scorer.py", line 261, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "d:\New folder\lib\site-packages\sklearn\metrics\_scorer.py", line 71, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "d:\New folder\lib\site-packages\sklearn\neighbors\_classification.py", line 226, in predict
    neigh_ind = self.kneighbors(X, return_distance=False)
  File "d:\New folder\lib\site-packages\sklearn\neighbors\_base.py", line 763, in kneighbors
    results = PairwiseDistancesArgKmin.compute(
  File "sklearn\metrics\_pairwise_distances_reduction.pyx", line 698, 

In [201]:
classifier.fit(X_train, Y_train) # 訓練
Y_pred = classifier.predict(X_test) # 用測試資料預測
print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred)) # 比對答案，計算準確率

AttributeError: 'NoneType' object has no attribute 'split'

In [202]:
print(classification_report(Y_test, Y_pred)) # 印出分類報告

              precision    recall  f1-score   support

           漲       0.52      0.52      0.52       266
           跌       0.60      0.60      0.60       316

    accuracy                           0.57       582
   macro avg       0.56      0.56      0.56       582
weighted avg       0.57      0.57      0.57       582



In [122]:
print(confusion_matrix(Y_test, Y_pred, labels = ['漲', '跌'])) # 印出混淆矩陣
# test/predicted 看漲   看跌
#    看漲         TN     FP
#    看跌         FN     TP

[[ 79 317]
 [ 47 410]]


In [123]:
# SVC
from sklearn.svm import SVC
classifier = SVC(kernel='linear')

classifier.fit(X_train, Y_train) # 訓練
Y_pred = classifier.predict(X_test) # 用測試資料預測
print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred)) # 比對答案，計算準確率

Accuracy: 0.64947245017585


In [124]:
print(classification_report(Y_test, Y_pred)) # 印出分類報告

              precision    recall  f1-score   support

           漲       0.66      0.50      0.57       396
           跌       0.64      0.78      0.70       457

    accuracy                           0.65       853
   macro avg       0.65      0.64      0.64       853
weighted avg       0.65      0.65      0.64       853



In [125]:
print(confusion_matrix(Y_test, Y_pred, labels = ['漲', '跌'])) # 印出混淆矩陣
# test/predicted 看漲   看跌
#    看漲         TN     FP
#    看跌         FN     TP

[[199 197]
 [102 355]]
