In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [2]:
# impoort 斷詞（要放 topwords_zh.txt 進資料夾）
with open('../../bda2023_mid_dataset/stopwords_zh.txt', 'r') as file:
    stopwords = file.read().splitlines() 
file.close()

In [3]:
df = pd.read_excel("../../bda2023_mid_dataset/stock_data_2019-2023.xlsx", sheet_name=["上市2020", "上市2021", "上市2022"], usecols=['證券代碼', '年月日', '收盤價(元)'])

In [4]:
# import stock excel
# 只留台積電的日期跟收盤價
# 按照日期由最早往最晚排
df_STOCK = pd.DataFrame()
for i in range(2020, 2023):
    df_STOCK = pd.concat([df_STOCK, df[(f"上市{i}")].loc[::-1]])

In [5]:
df_STOCK = df_STOCK[df_STOCK["證券代碼"] == "2330 台積電"].reset_index(drop = True)

In [6]:
# 更改年月日的日期格式
df_STOCK['年月日'] = pd.to_datetime(df_STOCK['年月日']).dt.date
df_STOCK.head(5)

Unnamed: 0,證券代碼,年月日,收盤價(元)
0,2330 台積電,2020-01-02,328.2957
1,2330 台積電,2020-01-03,328.7799
2,2330 台積電,2020-01-06,321.5168
3,2330 台積電,2020-01-07,319.0957
4,2330 台積電,2020-01-08,319.0957


In [143]:
# import news excel 
df_NEWS = pd.read_excel("../../TSMC_fly_news_20-22.xlsx")
df_NEWS.head()

Unnamed: 0.1,Unnamed: 0,post_time,title,content,token
0,0,2020-01-02,美股：美中協議月中簽署，四大指數週二齊漲約0.3%封關，費半全年狂升逾六成,美股週二開盤延續前一天拉回格局，主要指數以平低盤開出，，盤中美國總統川普發文宣稱1月15日會...,"['美股美中', '協議', '月中', '簽署', '四', '大', '指數', '週二..."
1,1,2020-01-02,《熱門族群》5G新機連發，帶旺璟德、聯發科,美國消費性電子展（CES）將於元月7日∼10日接續舉辦，法人預期5G仍將是本次展期焦點，加上...,"['熱門', '族群', '新機', '連', '發帶', '旺璟德', '聯發科', '熱..."
2,2,2020-01-02,《熱門族群》台積電走高，半導體設備股跟漲,台積電 (2330) 股價今早開高走高，帶領台積電供應鏈及半導體設備股再度轉強。隨著美中貿易...,"['熱門', '族群', '台積電', '走高', '半導體', '設備股', '跟', '..."
3,3,2020-01-02,台灣科技能打趴歐洲大廠？專家分析「還太早」,近年來台灣不論科技、經濟等發展，都大幅的成長，也逐漸讓國際看到台灣。不過在科技的成長方面，就...,"['台灣', '科技', '能', '打', '趴', '歐洲', '大廠', '專家', ..."
4,4,2020-01-02,京元電Q1營收估創同期高；全年看增雙位數,MoneyDJ新聞 2020-01-02 14:23:09 記者 王怡茹 報導<BR>京元電...,"['京元電', '營收', '估創', '同', '期', '高', '全', '年', '..."


In [144]:
# 更改年月日的日期格式
df_NEWS['post_time'] = pd.to_datetime(df_NEWS['post_time']).dt.date

In [145]:
df_stock = df_STOCK.copy()
df_news = df_NEWS.copy()

In [None]:
# for i, date in enumerate(df_news['post_time']):
#     if(date.isoweekday() == 6):
#         df_news['post_time'][i] += datetime.timedelta(days = -1)
#     elif(date.isoweekday() == 7):
#         df_news['post_time'][i] += datetime.timedelta(days = -2)

# Requirement 1

In [146]:
# Requirement 1 的第二點

day_n = 1 # 以 day_n 天後來看是漲還是跌
sigma = 0.01 # 以 sigma 決定漲（跌）幅超過幾 % 是漲（跌）
label = []
for i in range(len(df_stock) - day_n):
    rate = (df_stock['收盤價(元)'][i + day_n] - df_stock['收盤價(元)'][i]) / df_stock['收盤價(元)'][i]
    if rate > sigma:
       label.append('漲')
    elif rate < -sigma:
        label.append('跌')
    else:
        label.append('持平')
for i in range(day_n):
    label.append(0)
# label.extend([0, 0, 0])

df_stock['label'] = label
df_stock.head(5)

Unnamed: 0,證券代碼,年月日,收盤價(元),label
0,2330 台積電,2020-01-02,328.2957,持平
1,2330 台積電,2020-01-03,328.7799,跌
2,2330 台積電,2020-01-06,321.5168,持平
3,2330 台積電,2020-01-07,319.0957,持平
4,2330 台積電,2020-01-08,319.0957,漲


In [147]:
upCnt = 0
downCnt = 0
flatCnt = 0
for i in range(len(df_stock)):
    if df_stock['label'][i] == '漲':
        upCnt += 1
    elif df_stock['label'][i] == '跌':
        downCnt += 1 
    else :
        flatCnt += 1
print('預估漲的天數：', upCnt)
print('預估跌的天數：', downCnt)
print('預估持平的天數：', flatCnt)

預估漲的天數： 198
預估跌的天數： 186
預估持平的天數： 351


In [148]:
# 丟掉持平的資料
df_stock = df_stock[df_stock.label != '持平']
df_stock = df_stock[df_stock.label != 0]
df_stock = df_stock.reset_index(drop = True)
df_stock

Unnamed: 0,證券代碼,年月日,收盤價(元),label
0,2330 台積電,2020-01-03,328.7799,跌
1,2330 台積電,2020-01-08,319.0957,漲
2,2330 台積電,2020-01-13,330.7168,漲
3,2330 台積電,2020-01-14,335.0747,跌
4,2330 台積電,2020-01-15,329.2642,跌
...,...,...,...,...
379,2330 台積電,2022-12-19,463.9893,跌
380,2330 台積電,2022-12-21,456.5297,漲
381,2330 台積電,2022-12-22,465.4812,跌
382,2330 台積電,2022-12-27,454.5404,跌


In [149]:
from collections import defaultdict
dic=defaultdict(int)
for i in range(len(df_stock)):
    dic[(df_stock['年月日'][i])] = (df_stock['label'][i])

In [150]:
label_news = []
for i in range(len(df_news)):
    label_news.append(dic[df_news["post_time"][i]])
df_news['label'] = label_news
df_news.head(10)

Unnamed: 0.1,Unnamed: 0,post_time,title,content,token,label
0,0,2020-01-02,美股：美中協議月中簽署，四大指數週二齊漲約0.3%封關，費半全年狂升逾六成,美股週二開盤延續前一天拉回格局，主要指數以平低盤開出，，盤中美國總統川普發文宣稱1月15日會...,"['美股美中', '協議', '月中', '簽署', '四', '大', '指數', '週二...",0
1,1,2020-01-02,《熱門族群》5G新機連發，帶旺璟德、聯發科,美國消費性電子展（CES）將於元月7日∼10日接續舉辦，法人預期5G仍將是本次展期焦點，加上...,"['熱門', '族群', '新機', '連', '發帶', '旺璟德', '聯發科', '熱...",0
2,2,2020-01-02,《熱門族群》台積電走高，半導體設備股跟漲,台積電 (2330) 股價今早開高走高，帶領台積電供應鏈及半導體設備股再度轉強。隨著美中貿易...,"['熱門', '族群', '台積電', '走高', '半導體', '設備股', '跟', '...",0
3,3,2020-01-02,台灣科技能打趴歐洲大廠？專家分析「還太早」,近年來台灣不論科技、經濟等發展，都大幅的成長，也逐漸讓國際看到台灣。不過在科技的成長方面，就...,"['台灣', '科技', '能', '打', '趴', '歐洲', '大廠', '專家', ...",0
4,4,2020-01-02,京元電Q1營收估創同期高；全年看增雙位數,MoneyDJ新聞 2020-01-02 14:23:09 記者 王怡茹 報導<BR>京元電...,"['京元電', '營收', '估創', '同', '期', '高', '全', '年', '...",0
5,5,2020-01-02,【Y晚報】元旦假後開市 台積電領軍大漲百點,（開盤日15:30出刊）美股在前日(12/31)封關日，四大指數全數收漲，漲幅介於0.27%...,"['晚報', '元旦', '假', '後', '開市', '台積電', '領軍', '大漲'...",0
6,6,2020-01-03,《各報要聞》2020報喜，歐美股聯袂走揚,2020年第一個交易日，歐美股市齊聲歡唱。泛歐STOXX 600指數早盤勁升0.9％，美股三...,"['各', '報', '要聞', '報喜', '歐美', '股', '聯袂', '走揚', ...",跌
7,7,2020-01-03,美股：四大指數週四開春首日齊攻頂，道瓊大漲330點，FAANG及中概股表現亮眼,美股送走六年來漲勢最凌厲的2019年後，2020年新年第一個交易日，迎來人行降準，美中簽署第...,"['美股', '四', '大', '指數', '週四', '開春', '首日', '齊', ...",跌
8,8,2020-01-03,【Y早報】進入5G元年 佈局CES商機 族群行情再發威,（開盤日09:00出刊）MLCC供貨吃緊恐漲價，國巨營收看俏；iPad降價搶大陸市場，帶旺主...,"['早報', '進入', '元年', '佈局', '商機', '族群', '行情', '再'...",跌
9,9,2020-01-03,【日盛金控晨訊】結構有利盤勢 支撐看月線,日期：2020年 1月 3日<BR>※盤勢分析<BR>1.隨著中國新經濟刺激措施增加、以及美...,"['日盛金控', '晨訊', '結構', '有利', '盤勢', '支撐', '看', '月...",跌


In [151]:
# 刪掉 label 是 0 的資料
df_news_no_zero = df_news[df_news.label != 0]
df_news_no_zero = df_news_no_zero.reset_index(drop = True)

In [152]:
# token 型態轉換，fit in 套件
df_news_list = []
for i in range(len(df_news_no_zero)):
    df_news_list_tmp = []
    df_news_str = ''
    df_news_list_tmp = eval(df_news_no_zero.token[i])
    df_news_str = ' '.join(df_news_list_tmp)
    df_news_list.append(df_news_str)
len(df_news_list)

3518

In [153]:
X_data = df_news_list
Y_data = df_news_no_zero['label']

In [160]:
# 隨機切分資料
# text_size = 0.2 --> train 80%、test 20%
# X_data = SVD_vectors
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size = 0.2, random_state = 2, stratify = Y_data)
print("train data length:", len(X_train), ",", len(Y_train))
print("test data length:",len(X_test), ",", len(Y_test))

train data length: 2814 , 2814
test data length: 704 , 704


In [161]:
#將所有資料向量化，轉成 tfidf vector
vectorizer = TfidfVectorizer(stop_words=stopwords)
X_train = vectorizer.fit_transform(X_train)
X_train = pd.DataFrame(X_train.toarray(), columns = vectorizer.get_feature_names_out())
display(X_train)



Unnamed: 0,一一七五,一一九九七,一一五,一一八五,一一六八三兆,一一齊,一七,一七二億,一七五,一七億,...,龐培,龐培歐,龐大,龐大利益,龐雜,龔培元,龔明鑫,龔明鑫日,龜山,龜速
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2809,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2810,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2811,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [162]:
X_test = vectorizer.fit_transform(X_test)
X_test = pd.DataFrame(X_test.toarray(), columns=vectorizer.get_feature_names_out())
display(X_test)

Unnamed: 0,一一六,一七三,一七五,一三七,一三二六五二億,一三兆,一三八一,一下子,一世代,一二,...,龍邦,龍頭,龍頭公司,龍頭巨大集團,龍頭廠,龍頭聯發科,龍頭股,龍頭股台積電,龐大,龔明鑫
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.027640,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.026386,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.024141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
700,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.027169,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [163]:
X_test = X_test.reindex(X_train.columns, axis=1, fill_value=0)


In [116]:
# from sklearn.decomposition import LatentDirichletAllocation as LDA
# lda_model = LDA(n_components=80)
# lda_model.fit(X_train, Y_train)
# X_train = lda_model.transform(X_train)
# X_test = lda_model.transform(X_test)

In [164]:
from sklearn.decomposition import PCA
pca_model = PCA(n_components=500)
pca_model.fit(X_train, Y_train)
X_train = pca_model.transform(X_train)
X_test = pca_model.transform(X_test)

In [135]:
# from sklearn.decomposition import TruncatedSVD
# svd_model = TruncatedSVD(n_components=1000)
# svd_model.fit(X_train, Y_train)
# X_train = svd_model.transform(X_train)
# X_test = svd_model.transform(X_test)

In [20]:
chi2_selector = SelectKBest(chi2, k = 5000)
chi2_selector.fit(X_train, Y_train)
kbest_vocabs = X_train.columns[chi2_selector.get_support()]
X_train = X_train[kbest_vocabs]
X_train

Unnamed: 0,一兩,一劑,一千,一城,一成一,一林燦澤,一柯宗沅,一百四,一聯發科,一角,...,齊黑,龍潭,龍潭廠,龍燈,龍科,龍邦,龍頭特斯拉,龍頭聯發科,龐佩奧,龔明鑫
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.059991,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2806,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2807,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2808,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2809,0.0,0.0,0.070164,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# 計算 test 個別的 tfidf，再將結果透過 df.reindex 這個方法映射到訓練集的向量空間中
X_test = vectorizer.fit_transform(X_test)
X_test = pd.DataFrame(X_test.toarray(), columns=vectorizer.get_feature_names_out())
X_test = X_test.reindex(X_train.columns, axis=1, fill_value=0)
X_test

Unnamed: 0,一兩,一劑,一千,一城,一成一,一林燦澤,一柯宗沅,一百四,一聯發科,一角,...,齊黑,龍潭,龍潭廠,龍燈,龍科,龍邦,龍頭特斯拉,龍頭聯發科,龐佩奧,龔明鑫
0,0.0,0,0.0,0,0,0.0,0.0,0,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
1,0.0,0,0.0,0,0,0.0,0.0,0,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2,0.0,0,0.0,0,0,0.0,0.0,0,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
3,0.0,0,0.0,0,0,0.0,0.0,0,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
4,0.0,0,0.0,0,0,0.0,0.0,0,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
698,0.0,0,0.0,0,0,0.0,0.0,0,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
699,0.0,0,0.0,0,0,0.0,0.0,0,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
700,0.0,0,0.0,0,0,0.0,0.0,0,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
701,0.0,0,0.0,0,0,0.0,0.0,0,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


### 預測模型

In [72]:
def vote(X_train, Y_train, X_test, Y_test):
    result = []

    from sklearn import metrics
    # Gradient Boosting Classifier
    from sklearn.ensemble import GradientBoostingClassifier
    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=7, random_state=1, max_features='sqrt')
    clf.fit(X_train, Y_train)
    GBC_predicted_y = clf.predict(X_test)
    print('GBC Accuracy:', metrics.accuracy_score(Y_test, GBC_predicted_y))
    result.append(GBC_predicted_y)
    
    # NB
    from sklearn.naive_bayes import BernoulliNB
    NB_model = BernoulliNB()
    NB_model.fit(X_train,Y_train)
    NB_predicted_y = NB_model.predict(X_test)
    print('NB Accuracy:', metrics.accuracy_score(Y_test, NB_predicted_y))
    result.append(NB_predicted_y)


    # SVM
    from sklearn.svm import SVC
    SVM_model = SVC(kernel = 'rbf', C = 100.0, gamma = 0.1, probability=True)
    SVM_model.fit(X_train,Y_train)
    SVM_predicted_y = SVM_model.predict(X_test)
    print('SVM Accuracy:', metrics.accuracy_score(Y_test, SVM_predicted_y))
    result.append(SVM_predicted_y)


    # DT
    from sklearn.tree import DecisionTreeClassifier                          
    classifier = DecisionTreeClassifier(criterion = "entropy")
    classifier.fit(X_train, Y_train)
    DT_predicted_y = classifier.predict(X_test)
    print("DT Accuracy:", metrics.accuracy_score(Y_test, DT_predicted_y)) # 比對答案，計算準確率
    result.append(DT_predicted_y)


    # RM
    from sklearn.ensemble import RandomForestClassifier
    forest = RandomForestClassifier(n_estimators = 100)
    forest_fit = forest.fit(X_train, Y_train)
    RM_predicted_y = forest.predict(X_test)
    print("RM Accuracy:", metrics.accuracy_score(Y_test, RM_predicted_y))
    result.append(RM_predicted_y)

    # voting
    res = pd.DataFrame(result, index=["GBC", "NB", "SVM", "DT", "RF"])
    pred = [(max(list(res[i]), key=list(res[i]).count)) for i in range(len(res.columns))]
    print("Accuracy after voting:", metrics.accuracy_score(Y_test, pred)) # 比對答案，計算準確率



In [165]:
vote(X_train, Y_train, X_test, Y_test)

GBC Accuracy: 0.5539772727272727
NB Accuracy: 0.546875
SVM Accuracy: 0.6136363636363636
DT Accuracy: 0.5340909090909091
RM Accuracy: 0.5980113636363636
Accuracy after voting: 0.6051136363636364


In [25]:
!pip install thundersvm

Collecting thundersvm
  Downloading thundersvm-0.3.12-py3-none-any.whl (507 kB)
Installing collected packages: thundersvm
Successfully installed thundersvm-0.3.12


In [28]:
from thundersvm import *
SVM_model = SVC(kernel = 'rbf', C = 100.0, gamma = 0.1, probability=True)
SVM_model.fit(X_train,Y_train)
SVM_predicted_y = SVM_model.predict(X_test)
print('SVM Accuracy:', metrics.accuracy_score(Y_test, SVM_predicted_y))
result.append(SVM_predicted_y)

FileNotFoundError: Please build the library first!

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = {'C': [1, 100], 'gamma': [1, 0.1],'kernel': ['rbf','linear']}
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)
grid.fit(X_train,Y_train)

In [None]:
print(grid.best_params_)

#### example 6 

In [None]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB() # naive bayes classifier

In [None]:
# 以下將自身資料切成 train 及 test 兩組，重新訓練一次，測試模型準確率
classifier = MultinomialNB()
classifier.fit(X_train, Y_train) #訓練

In [None]:
from sklearn import metrics 
from sklearn.metrics import accuracy_score
Y_pred = classifier.predict(X_test) #用測試資料預測
print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred)) #比對答案，計算準確率

In [None]:
from sklearn.metrics import classification_report  
print(classification_report(Y_test, Y_pred)) #印出分類報告

In [None]:
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(Y_test, Y_pred, labels = ['漲', '跌'])) # 印出混淆矩陣
# test/predicted 看漲   看跌   
#    看漲         TN     FP
#    看跌         FN     TP

In [376]:
res = pd.DataFrame(outcomes, index=["RF", "GBC", "NB", "SVM", "DT"])
res

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1286,1287,1288,1289,1290,1291,1292,1293,1294,1295
RF,跌,跌,跌,漲,漲,跌,跌,跌,跌,跌,...,跌,漲,跌,跌,跌,跌,漲,漲,跌,漲
GBC,漲,跌,跌,漲,漲,漲,跌,跌,跌,漲,...,跌,漲,漲,跌,跌,跌,漲,漲,跌,跌
NB,跌,跌,跌,漲,漲,跌,跌,跌,跌,漲,...,跌,漲,漲,跌,漲,跌,漲,漲,跌,跌
SVM,跌,跌,跌,跌,漲,漲,跌,跌,跌,漲,...,跌,漲,漲,跌,漲,跌,跌,漲,跌,漲
DT,跌,跌,跌,漲,漲,跌,跌,漲,漲,跌,...,跌,漲,跌,跌,跌,漲,漲,漲,跌,漲


In [382]:
list(res[0])

['跌', '漲', '跌', '跌', '跌']

In [384]:
pred = []
for i in range(len(res.columns)):
    pred.append(max(list(res[i]), key=list(res[i]).count))

In [385]:
print("Accuracy:", metrics.accuracy_score(Y_test, pred)) # 比對答案，計算準確率

Accuracy: 0.5763888888888888


In [None]:
# KNeighbors
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 7)

from sklearn.model_selection import cross_val_score
scores = cross_val_score(classifier, X_data, Y_data, cv = 5, scoring = 'accuracy') # 交叉驗證，計算準確率
print(scores)
print("Avg. Accuracy:", scores.mean())

In [None]:
classifier.fit(X_train, Y_train) # 訓練
Y_pred = classifier.predict(X_test) # 用測試資料預測
print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred)) # 比對答案，計算準確率

In [None]:
print(classification_report(Y_test, Y_pred)) # 印出分類報告

In [None]:
print(confusion_matrix(Y_test, Y_pred, labels = ['漲', '跌'])) # 印出混淆矩陣
# test/predicted 看漲   看跌
#    看漲         TN     FP
#    看跌         FN     TP