In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [3]:
# impoort 斷詞（要放 topwords_zh.txt 進資料夾）
with open('./stopwords_zh.txt', 'r') as file:
    stopwords = file.read().splitlines() 
file.close()

In [4]:
df = pd.read_excel("../../bda2023_mid_dataset/stock_data_2019-2023.xlsx", sheet_name=["上市2020", "上市2021", "上市2022"], usecols=['證券代碼', '年月日', '收盤價(元)'])

In [5]:
# import stock excel
# 只留台積電的日期跟收盤價
# 按照日期由最早往最晚排
df_STOCK = pd.DataFrame()
for i in range(2020, 2023):
    df_STOCK = pd.concat([df_STOCK, df[(f"上市{i}")].loc[::-1]])

In [6]:
df_STOCK = df_STOCK[df_STOCK["證券代碼"] == "2330 台積電"].reset_index(drop = True)

In [7]:
# 更改年月日的日期格式
df_STOCK['年月日'] = pd.to_datetime(df_STOCK['年月日']).dt.date
df_STOCK.head(5)

Unnamed: 0,證券代碼,年月日,收盤價(元)
0,2330 台積電,2020-01-02,328.2957
1,2330 台積電,2020-01-03,328.7799
2,2330 台積電,2020-01-06,321.5168
3,2330 台積電,2020-01-07,319.0957
4,2330 台積電,2020-01-08,319.0957


In [8]:
# import news excel 
df_NEWS = pd.read_excel("../../TSMC_fly_news_20-22.xlsx")
df_NEWS.head()

Unnamed: 0.1,Unnamed: 0,post_time,title,content,token
0,0,2020-01-02,美股：美中協議月中簽署，四大指數週二齊漲約0.3%封關，費半全年狂升逾六成,美股週二開盤延續前一天拉回格局，主要指數以平低盤開出，，盤中美國總統川普發文宣稱1月15日會...,"['美股美中', '協議', '月中', '簽署', '四', '大', '指數', '週二..."
1,1,2020-01-02,《熱門族群》5G新機連發，帶旺璟德、聯發科,美國消費性電子展（CES）將於元月7日∼10日接續舉辦，法人預期5G仍將是本次展期焦點，加上...,"['熱門', '族群', '新機', '連', '發帶', '旺璟德', '聯發科', '熱..."
2,2,2020-01-02,《熱門族群》台積電走高，半導體設備股跟漲,台積電 (2330) 股價今早開高走高，帶領台積電供應鏈及半導體設備股再度轉強。隨著美中貿易...,"['熱門', '族群', '台積電', '走高', '半導體', '設備股', '跟', '..."
3,3,2020-01-02,台灣科技能打趴歐洲大廠？專家分析「還太早」,近年來台灣不論科技、經濟等發展，都大幅的成長，也逐漸讓國際看到台灣。不過在科技的成長方面，就...,"['台灣', '科技', '能', '打', '趴', '歐洲', '大廠', '專家', ..."
4,4,2020-01-02,京元電Q1營收估創同期高；全年看增雙位數,MoneyDJ新聞 2020-01-02 14:23:09 記者 王怡茹 報導<BR>京元電...,"['京元電', '營收', '估創', '同', '期', '高', '全', '年', '..."


In [267]:
# 更改年月日的日期格式
df_NEWS['post_time'] = pd.to_datetime(df_NEWS['post_time']).dt.date

# Requirement 1

In [372]:
# 換新的 n 跟 sigma 要記得重跑這裡
df_stock = df_STOCK.copy()
df_news = df_NEWS.copy()

In [373]:
# for i, date in enumerate(df_news['post_time']):
#     if(date.isoweekday() == 6):
#         df_news['post_time'][i] += datetime.timedelta(days = -1)
#     elif(date.isoweekday() == 7):
#         df_news['post_time'][i] += datetime.timedelta(days = -2)

In [374]:
df_stock

Unnamed: 0,證券代碼,年月日,收盤價(元)
0,2330 台積電,2020-01-02,328.2957
1,2330 台積電,2020-01-03,328.7799
2,2330 台積電,2020-01-06,321.5168
3,2330 台積電,2020-01-07,319.0957
4,2330 台積電,2020-01-08,319.0957
...,...,...,...
730,2330 台積電,2022-12-26,454.0431
731,2330 台積電,2022-12-27,454.5404
732,2330 台積電,2022-12-28,448.5727
733,2330 台積電,2022-12-29,443.5996


In [375]:
# Requirement 1 的第二點

# 如果調整這裡的參數，記得重新拿 df_stock（因為後面會 drop 持平）
day_n = 3 # 以 day_n 天後來看是漲還是跌
sigma = 0.01 # 以 sigma 決定漲（跌）幅超過幾 % 是漲（跌）


label = []
for i in range(len(df_stock) - day_n):
    rate = (df_stock['收盤價(元)'][i + day_n] - df_stock['收盤價(元)'][i]) / df_stock['收盤價(元)'][i]
    if rate > sigma:
       label.append('漲')
    elif rate < -sigma:
        label.append('跌')
    else:
        label.append('持平')
for i in range(day_n):
    label.append(0)
# label.extend([0, 0, 0])

df_stock['label'] = label
df_stock

Unnamed: 0,證券代碼,年月日,收盤價(元),label
0,2330 台積電,2020-01-02,328.2957,跌
1,2330 台積電,2020-01-03,328.7799,跌
2,2330 台積電,2020-01-06,321.5168,漲
3,2330 台積電,2020-01-07,319.0957,漲
4,2330 台積電,2020-01-08,319.0957,漲
...,...,...,...,...
730,2330 台積電,2022-12-26,454.0431,跌
731,2330 台積電,2022-12-27,454.5404,跌
732,2330 台積電,2022-12-28,448.5727,0
733,2330 台積電,2022-12-29,443.5996,0


In [376]:
upCnt = 0
downCnt = 0
flatCnt = 0
for i in range(len(df_stock)):
    if df_stock['label'][i] == '漲':
        upCnt += 1
    elif df_stock['label'][i] == '跌':
        downCnt += 1 
    else :
        flatCnt += 1
print(len(df_stock))
print('預估漲的天數：', upCnt)
print('預估跌的天數：', downCnt)
print('預估持平的天數：', flatCnt)

735
預估漲的天數： 261
預估跌的天數： 261
預估持平的天數： 213


In [377]:
# 丟掉持平的資料
df_stock = df_stock[df_stock.label != '持平']
df_stock = df_stock[df_stock.label != 0]
df_stock = df_stock.reset_index(drop = True)
# df_stock

In [378]:
from collections import defaultdict
dic=defaultdict(int)
for i in range(len(df_stock)):
    dic[(df_stock['年月日'][i])] = (df_stock['label'][i])

In [379]:
label_news = []
for i in range(len(df_news)):
    label_news.append(dic[df_news["post_time"][i]])
df_news['label'] = label_news
df_news.head(10)

Unnamed: 0.1,Unnamed: 0,post_time,title,content,token,label
0,0,2020-01-02,美股：美中協議月中簽署，四大指數週二齊漲約0.3%封關，費半全年狂升逾六成,美股週二開盤延續前一天拉回格局，主要指數以平低盤開出，，盤中美國總統川普發文宣稱1月15日會...,"['美股美中', '協議', '月中', '簽署', '四', '大', '指數', '週二...",跌
1,1,2020-01-02,《熱門族群》5G新機連發，帶旺璟德、聯發科,美國消費性電子展（CES）將於元月7日∼10日接續舉辦，法人預期5G仍將是本次展期焦點，加上...,"['熱門', '族群', '新機', '連', '發帶', '旺璟德', '聯發科', '熱...",跌
2,2,2020-01-02,《熱門族群》台積電走高，半導體設備股跟漲,台積電 (2330) 股價今早開高走高，帶領台積電供應鏈及半導體設備股再度轉強。隨著美中貿易...,"['熱門', '族群', '台積電', '走高', '半導體', '設備股', '跟', '...",跌
3,3,2020-01-02,台灣科技能打趴歐洲大廠？專家分析「還太早」,近年來台灣不論科技、經濟等發展，都大幅的成長，也逐漸讓國際看到台灣。不過在科技的成長方面，就...,"['台灣', '科技', '能', '打', '趴', '歐洲', '大廠', '專家', ...",跌
4,4,2020-01-02,京元電Q1營收估創同期高；全年看增雙位數,MoneyDJ新聞 2020-01-02 14:23:09 記者 王怡茹 報導<BR>京元電...,"['京元電', '營收', '估創', '同', '期', '高', '全', '年', '...",跌
5,5,2020-01-02,【Y晚報】元旦假後開市 台積電領軍大漲百點,（開盤日15:30出刊）美股在前日(12/31)封關日，四大指數全數收漲，漲幅介於0.27%...,"['晚報', '元旦', '假', '後', '開市', '台積電', '領軍', '大漲'...",跌
6,6,2020-01-03,《各報要聞》2020報喜，歐美股聯袂走揚,2020年第一個交易日，歐美股市齊聲歡唱。泛歐STOXX 600指數早盤勁升0.9％，美股三...,"['各', '報', '要聞', '報喜', '歐美', '股', '聯袂', '走揚', ...",跌
7,7,2020-01-03,美股：四大指數週四開春首日齊攻頂，道瓊大漲330點，FAANG及中概股表現亮眼,美股送走六年來漲勢最凌厲的2019年後，2020年新年第一個交易日，迎來人行降準，美中簽署第...,"['美股', '四', '大', '指數', '週四', '開春', '首日', '齊', ...",跌
8,8,2020-01-03,【Y早報】進入5G元年 佈局CES商機 族群行情再發威,（開盤日09:00出刊）MLCC供貨吃緊恐漲價，國巨營收看俏；iPad降價搶大陸市場，帶旺主...,"['早報', '進入', '元年', '佈局', '商機', '族群', '行情', '再'...",跌
9,9,2020-01-03,【日盛金控晨訊】結構有利盤勢 支撐看月線,日期：2020年 1月 3日<BR>※盤勢分析<BR>1.隨著中國新經濟刺激措施增加、以及美...,"['日盛金控', '晨訊', '結構', '有利', '盤勢', '支撐', '看', '月...",跌


In [380]:
# 刪掉 label 是 0 的資料
df_news_no_zero = df_news[df_news.label != 0]
df_news_no_zero = df_news_no_zero.reset_index(drop = True)

In [381]:
# token 型態轉換，fit in 套件
df_news_list = []
for i in range(len(df_news_no_zero)):
    df_news_list_tmp = []
    df_news_str = ''
    df_news_list_tmp = eval(df_news_no_zero.token[i])
    df_news_str = ' '.join(df_news_list_tmp)
    df_news_list.append(df_news_str)
len(df_news_list)

4863

In [382]:
df_news_no_zero['new_token'] = [news.split(' ') for news in df_news_list]

In [383]:
df_news_up = df_news_no_zero[df_news_no_zero['label'] == '漲'].reset_index(drop = True)
df_news_down = df_news_no_zero[df_news_no_zero['label'] == '跌'].reset_index(drop = True)

In [384]:
from collections import Counter

In [385]:
dict_stopwords = defaultdict(int)
for word in stopwords:
    dict_stopwords[word] += 1

In [386]:
tf_counter_up = Counter() # 預備統計tf用
df_counter_up = Counter() # 預備統計df用

upstr_list = []

for i in range(len(df_news_up)):
    df_tmp_up=Counter() # 暫存本篇df用
    for term in df_news_up['new_token'][i]:
        if(dict_stopwords[term]):
            continue
        if(len(term) > 1): # 若詞長>1
            tf_counter_up[term] += 1 # tf加1
            upstr_list.append(term)
        if(df_tmp_up[term] == 0): # 若本篇之前不曾出現
            df_tmp_up[term] = 1 # df標為1
    df_counter_up += df_tmp_up # 累加多篇df

In [387]:
import math
tfidf_up = Counter()
for i in range(len(upstr_list)):
    item  = upstr_list[i]
    pts = (1 + math.log(tf_counter_up[item]) * math.log(len(df_news_no_zero)/df_counter_up[item]))
    tfidf_up[item] = pts
pts

17.754234628215773

In [388]:
tf_counter_down = Counter() #預備統計tf用
df_counter_down = Counter() #預備統計df用

downstr_list = []

for i in range(len(df_news_down)):
    df_tmp_down=Counter() #暫存本篇df用
    for term in df_news_down['new_token'][i]:
        if(dict_stopwords[term]):
            continue
        if(len(term) > 1): #若詞長>1
            tf_counter_down[term] += 1 #tf加1
            downstr_list.append(term)
        if(df_tmp_down[term] == 0): #若本篇之前不曾出現
            df_tmp_down[term] = 1 #df標為1
    df_counter_down += df_tmp_down # 累加多篇df

In [389]:
tfidf_down = Counter()
for i in range(len(downstr_list)):
    item  = downstr_list[i]
    pts = (1 + math.log(tf_counter_down[item]) * math.log(len(df_news_no_zero)/df_counter_down[item]))
    tfidf_down[item] = pts
pts

16.210982207708

In [390]:
allstr_list = upstr_list + downstr_list
tf_counter_all = Counter()
df_counter_all = Counter()
for item in allstr_list:
    tf_counter_all[item] = tf_counter_up[item] + tf_counter_down[item]
    df_counter_all[item] = df_counter_up[item] + df_counter_down[item]

In [391]:
#tfidf_chi_up
chi_tfidf_up = Counter()
for item in upstr_list:
    expected_tf = tf_counter_all[item] / (len(df_news_up.index) + len(df_news_down.index)) * len(df_news_up)
    expected_df = df_counter_all[item] / (len(df_news_up.index) + len(df_news_down.index)) * len(df_news_up)
    
    if tf_counter_up[item] < expected_tf:
        n1 = -1
    else:
        n1 = 1
    tf_pts = (tf_counter_all[item] - expected_tf)**2 / expected_tf
    if df_counter_up[item] < expected_df:
        n2 = -1
    else:
        n2 = 1
    df_pts = (df_counter_all[item] - expected_df)**2 / expected_df
    chi_pts = tfidf_up[item] * math.sqrt(tf_pts) * math.sqrt(df_pts) * n1 * n2
    chi_tfidf_up[item] = chi_pts

In [392]:
for term in chi_tfidf_up.most_common(100): #印出看漲 tfidf 前100名
  print(term[0],term[1])

半導體 38488.8904679966
美國 36055.004901808745
指數 35739.14344467522
晶片 35232.14399035886
美元 33531.380350758256
台股 32993.936537597816
市場 31884.978331869825
營收 31614.46642246532
台灣 29255.35413329279
股價 28488.51388638449
奈米 26744.801709716092
產業 26073.079094243392
全球 25684.512525658964
晶圓 25339.97537535546
成長 24995.39888223516
外資 23669.71490321117
持續 23465.931822466955
需求 23449.95456565214
公司 22946.48393870916
產能 21774.23657452215
技術 21590.619385897684
表現 21369.034012417193
聯發科 20692.25318249242
客戶 20305.241891525464
代工 20146.702180096
產品 19197.05072920714
影響 18997.09944297719
新高 18995.497487221233
上漲 18834.046443814146
法人 18709.857671156646
報導 18412.434676966815
去年 18023.19082851452
生產 17620.798012642674
手機 17606.320801932758
電子 17554.76744660847
英特爾 17368.887043866336
下跌 17119.280213166305
科技 17090.78682099224
疫情 16962.268007163926
未來 16791.68906683983
國際 15447.29285539111
相關 15257.077286463355
營運 14957.71588635619
訂單 14951.50485250245
帶動 14768.982005218286
可望 14600.575307228084
企業 14599.34

In [393]:
#tfidf_chi_down
chi_tfidf_down = Counter()
for item in downstr_list:
    expected_tf = tf_counter_all[item] / (len(df_news_up.index) + len(df_news_down.index)) * len(df_news_down)
    expected_df = df_counter_all[item] / (len(df_news_up.index) + len(df_news_down.index)) * len(df_news_down)

    if tf_counter_down[item] < expected_tf:
        n1 = -1
    else:
        n1 = 1
    tf_pts = (tf_counter_all[item] - expected_tf)**2 / expected_tf
    if df_counter_down[item] < expected_df:
        n2 = -1
    else:
        n2 = 1
    df_pts = (df_counter_all[item] - expected_df)**2 / expected_df
    chi_pts = tfidf_down[item] * math.sqrt(tf_pts) * math.sqrt(df_pts) * n1 * n2
    chi_tfidf_down[item] = chi_pts

In [394]:
for term in chi_tfidf_down.most_common(100): #印出看跌 tfidf 前100名
  print(term[0],term[1])

半導體 30695.068240563865
美國 28886.88620279682
指數 28793.237902998917
台股 27825.396910747284
美元 27512.75476466757
晶片 27103.635269531398
營收 26073.89335698234
市場 26036.451338780607
台灣 24316.04966144
股價 24204.58024731117
奈米 21971.306743490964
全球 21103.58028785071
產業 20913.583871353323
晶圓 20892.113194798065
外資 20519.35615810211
成長 20445.97719079766
需求 19378.846438130702
持續 19230.729231878275
公司 18682.221831496503
產能 18137.671577535326
技術 17498.09481302781
表現 17452.530727239457
聯發科 17052.927259476484
代工 16806.143204480464
客戶 16585.174837152994
影響 16112.989338611407
新高 15734.739546195571
產品 15727.495336883352
法人 15610.242637452422
報導 15507.711273007255
上漲 15495.378413983624
電子 14553.877660346992
生產 14448.836586714788
去年 14419.080662700908
手機 14393.023495522477
疫情 14365.958961889366
下跌 14218.976710383862
英特爾 14194.753164777027
科技 14015.937070531325
未來 13960.6788770358
訂單 12845.673308278096
相關 12636.886328048757
台北 12524.752617860364
營運 12456.266404764378
可望 12312.918474130207
國際 12263.512926493822

In [411]:
# 取前 n 名關鍵字的差集

# n -> most_common(n)
list_up = [word[0] for word in chi_tfidf_up.most_common(10000)]
list_down = [word[0] for word in chi_tfidf_down.most_common(10000)]

key_diff = list(set(list_up).difference(set(list_down)))
len(key_diff)

1444

In [412]:
X_data = df_news_list
Y_data = df_news_no_zero['label']

In [413]:
# 將所有資料向量化，轉成 tfidf vector
vectorizer = TfidfVectorizer(stop_words=stopwords)
TFIDF_vectors = vectorizer.fit_transform(X_data)
TFIDF_vectors = pd.DataFrame(TFIDF_vectors.toarray(), columns = vectorizer.get_feature_names_out())
display(TFIDF_vectors)



Unnamed: 0,一一一億,一一七五,一一七六億,一一九九七,一一五,一一八五,一一六,一一六億,一一六八三兆,一一出爐,...,龐培歐,龐大,龐雜,龔培元,龔明鑫,龔明鑫日,龜尾,龜山,龜山廠,龜速
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4859,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4860,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4861,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [414]:
diff_vectors = TFIDF_vectors[key_diff]

In [415]:
# 隨機切分資料
# text_size = 0.2 --> train 80%、test 20%
# X_data = SVD_vectors
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(diff_vectors, Y_data, test_size = 0.2, random_state = 2, stratify = Y_data)
print("train data length:", len(X_train), ",", len(Y_train))
print("test data length:",len(X_test), ",", len(Y_test))

train data length: 3890 , 3890
test data length: 973 , 973


In [417]:
# PCA
from sklearn.decomposition import PCA
pca_model = PCA(n_components = 1000) # 取多少重要關鍵字調：n_components
pca_model.fit(X_train, Y_train)
X_train = pca_model.transform(X_train)
X_test = pca_model.transform(X_test)

### 預測模型

In [194]:
def vote(X_train, Y_train, X_test, Y_test):
    result = []

    from sklearn import metrics
    # Gradient Boosting Classifier
    from sklearn.ensemble import GradientBoostingClassifier
    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=7, random_state=1, max_features='sqrt')
    clf.fit(X_train, Y_train)
    GBC_predicted_y = clf.predict(X_test)
    print('GBC Accuracy:', metrics.accuracy_score(Y_test, GBC_predicted_y))
    result.append(GBC_predicted_y)
    
    # NB
    from sklearn.naive_bayes import BernoulliNB
    NB_model = BernoulliNB()
    NB_model.fit(X_train,Y_train)
    NB_predicted_y = NB_model.predict(X_test)
    print('NB Accuracy:', metrics.accuracy_score(Y_test, NB_predicted_y))
    result.append(NB_predicted_y)


    # SVM
    from sklearn.svm import SVC
    SVM_model = SVC(kernel = 'rbf', C = 100.0, gamma = 1, probability=True)
    SVM_model.fit(X_train,Y_train)
    SVM_predicted_y = SVM_model.predict(X_test)
    print('SVM Accuracy:', metrics.accuracy_score(Y_test, SVM_predicted_y))
    result.append(SVM_predicted_y)


    # DT
    from sklearn.tree import DecisionTreeClassifier                          
    classifier = DecisionTreeClassifier(criterion = "entropy")
    classifier.fit(X_train, Y_train)
    DT_predicted_y = classifier.predict(X_test)
    print("DT Accuracy:", metrics.accuracy_score(Y_test, DT_predicted_y)) # 比對答案，計算準確率
    result.append(DT_predicted_y)


    # RF
    from sklearn.ensemble import RandomForestClassifier
    forest = RandomForestClassifier(n_estimators = 100, max_features="sqrt")
    forest_fit = forest.fit(X_train, Y_train)
    RM_predicted_y = forest.predict(X_test)
    print("RM Accuracy:", metrics.accuracy_score(Y_test, RM_predicted_y))
    result.append(RM_predicted_y)

    # KNN
    # from sklearn.neighbors import KNeighborsClassifier
    # classifie = KNeighborsClassifier(n_neighbors = 7)
    # classifie.fit(X_train, Y_train)
    # KNN_predicted_y = classifie.predict(X_test)
    # print("KNN Accuracy:", metrics.accuracy_score(Y_test, KNN_predicted_y)) # 比對答案，計算準確率
    # result.append(KNN_predicted_y)

    # voting
    res = pd.DataFrame(result, index=["GBC", "NB", "SVM", "DT", "RF"])
    pred = [(max(list(res[i]), key=list(res[i]).count)) for i in range(len(res.columns))]
    print("Accuracy after voting:", metrics.accuracy_score(Y_test, pred)) # 比對答案，計算準確率



In [416]:
# n = 3, sigma = 0.01
# no PCA, keyword most common: 10000 (1444 difference)
vote(X_train, Y_train, X_test, Y_test)

GBC Accuracy: 0.6474820143884892
NB Accuracy: 0.6341212744090442
SVM Accuracy: 0.7029804727646455
DT Accuracy: 0.656731757451182
RM Accuracy: 0.6515930113052415
Accuracy after voting: 0.6618705035971223


In [418]:
# n = 3, sigma = 0.01
# PCA: 1000, keyword most common: 10000 (1444 difference)
vote(X_train, Y_train, X_test, Y_test)

GBC Accuracy: 0.7040082219938335
NB Accuracy: 0.697841726618705
SVM Accuracy: 0.6824254881808839
DT Accuracy: 0.6557040082219938
RM Accuracy: 0.7060637204522097
Accuracy after voting: 0.7173689619732785


In [410]:
# n = 3, sigma = 0.01
# PCA: 1500, keyword most common: 12000 (2385 difference)
vote(X_train, Y_train, X_test, Y_test)

GBC Accuracy: 0.736896197327852
NB Accuracy: 0.7122302158273381
SVM Accuracy: 0.710174717368962
DT Accuracy: 0.6752312435765673
RM Accuracy: 0.7225077081192189
Accuracy after voting: 0.7595066803699897


In [406]:
# n = 3, sigma = 0.01
# PCA: 1000, keyword most common: 12000 (2385 difference)
vote(X_train, Y_train, X_test, Y_test)

GBC Accuracy: 0.7492291880781089
NB Accuracy: 0.7163412127440905
SVM Accuracy: 0.6690647482014388
DT Accuracy: 0.7019527235354573
RM Accuracy: 0.7574511819116135
Accuracy after voting: 0.7553956834532374


In [363]:
# n = 1, sigma = 0.01
# PCA: 1000, keyword most common: 12000 (3348 difference)
vote(X_train, Y_train, X_test, Y_test)

GBC Accuracy: 0.7542613636363636
NB Accuracy: 0.7613636363636364
SVM Accuracy: 0.6576704545454546
DT Accuracy: 0.7130681818181818
RM Accuracy: 0.7627840909090909
Accuracy after voting: 0.7627840909090909


In [228]:
# n = 3, sigma = 0.03
# PCA: 1000, keyword most common: 12000
vote(X_train, Y_train, X_test, Y_test)

GBC Accuracy: 0.9327731092436975
NB Accuracy: 0.9159663865546218
SVM Accuracy: 0.7619047619047619
DT Accuracy: 0.8543417366946778
RM Accuracy: 0.9159663865546218
Accuracy after voting: 0.9215686274509803


In [134]:
# PCA: 1800, keyword most common: 12000
vote(X_train, Y_train, X_test, Y_test)

GBC Accuracy: 0.6619318181818182
NB Accuracy: 0.7514204545454546
SVM Accuracy: 0.6818181818181818
DT Accuracy: 0.7144886363636364
RM Accuracy: 0.6676136363636364
Accuracy after voting: 0.7329545454545454


In [131]:
# PCA: 1500, keyword most common: 12000
vote(X_train, Y_train, X_test, Y_test)

GBC Accuracy: 0.6917613636363636
NB Accuracy: 0.7514204545454546
SVM Accuracy: 0.6704545454545454
DT Accuracy: 0.6974431818181818
RM Accuracy: 0.7017045454545454
Accuracy after voting: 0.734375


In [128]:
# PCA: 2000, keyword most common: 12000
vote(X_train, Y_train, X_test, Y_test)

GBC Accuracy: 0.6690340909090909
NB Accuracy: 0.75
SVM Accuracy: 0.6818181818181818
DT Accuracy: 0.6889204545454546
RM Accuracy: 0.6321022727272727
Accuracy after voting: 0.7144886363636364


In [119]:
# PCA: 500, keyword most common: 10000
vote(X_train, Y_train, X_test, Y_test)

GBC Accuracy: 0.7443181818181818
NB Accuracy: 0.7485795454545454
SVM Accuracy: 0.6178977272727273
DT Accuracy: 0.6931818181818182
RM Accuracy: 0.7514204545454546
Accuracy after voting: 0.7286931818181818


In [181]:
# n = 3, sigma = 0.01
# PCA: 800, keyword most common: 10000
vote(X_train, Y_train, X_test, Y_test)

GBC Accuracy: 0.7440273037542662
NB Accuracy: 0.7406143344709898
SVM Accuracy: 0.6416382252559727
DT Accuracy: 0.7235494880546075
RM Accuracy: 0.764505119453925
Accuracy after voting: 0.764505119453925


In [111]:
# PCA: 900, keyword most common: 10000
vote(X_train, Y_train, X_test, Y_test)

GBC Accuracy: 0.7372159090909091
NB Accuracy: 0.7457386363636364
SVM Accuracy: 0.6931818181818182
DT Accuracy: 0.7159090909090909
RM Accuracy: 0.7542613636363636
Accuracy after voting: 0.7428977272727273


In [173]:
# n = 3, sigma = 0.01
# PCA: 1000, keyword most common: 10000
vote(X_train, Y_train, X_test, Y_test)

GBC Accuracy: 0.7440273037542662
NB Accuracy: 0.735494880546075
SVM Accuracy: 0.674061433447099
DT Accuracy: 0.7372013651877133
RM Accuracy: 0.7610921501706485
Accuracy after voting: 0.7593856655290102


In [103]:
# PCA: 1200, keyword most common: 10000
vote(X_train, Y_train, X_test, Y_test)

GBC Accuracy: 0.703125
NB Accuracy: 0.7457386363636364
SVM Accuracy: 0.7130681818181818
DT Accuracy: 0.6633522727272727
RM Accuracy: 0.7088068181818182
Accuracy after voting: 0.7386363636363636


In [97]:
# PCA: 1250, keyword most common: 10000
vote(X_train, Y_train, X_test, Y_test)

GBC Accuracy: 0.6818181818181818
NB Accuracy: 0.7428977272727273
SVM Accuracy: 0.7159090909090909
DT Accuracy: 0.6931818181818182
RM Accuracy: 0.6548295454545454
KNN Accuracy: 0.5213068181818182
Accuracy after voting: 0.7002840909090909


In [179]:
# n = 3, sigma = 0.01
# no PCA, keyword most common: 10000
vote(X_train, Y_train, X_test, Y_test)

GBC Accuracy: 0.6245733788395904
NB Accuracy: 0.5819112627986348
SVM Accuracy: 0.8054607508532423
DT Accuracy: 0.6348122866894198
RM Accuracy: 0.621160409556314
Accuracy after voting: 0.6484641638225256


In [77]:
from sklearn.model_selection import GridSearchCV

In [78]:
from sklearn.svm import SVC
param_grid = {'C': [1, 100], 'gamma': [1, 0.1],'kernel': ['rbf','linear']}
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)
grid.fit(X_train,Y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.689 total time=   4.5s
[CV 2/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.670 total time=   4.3s
[CV 3/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.712 total time=   4.8s
[CV 4/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.684 total time=   4.7s
[CV 5/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.683 total time=   4.4s
[CV 1/5] END .......C=1, gamma=1, kernel=linear;, score=0.504 total time=   5.1s
[CV 2/5] END .......C=1, gamma=1, kernel=linear;, score=0.504 total time=   5.0s
[CV 3/5] END .......C=1, gamma=1, kernel=linear;, score=0.503 total time=   5.0s
[CV 4/5] END .......C=1, gamma=1, kernel=linear;, score=0.503 total time=   5.0s
[CV 5/5] END .......C=1, gamma=1, kernel=linear;, score=0.504 total time=   5.0s
[CV 1/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.504 total time=   5.9s
[CV 2/5] END ........C=1, gamma=0.1, kernel=rbf;,

KeyboardInterrupt: 

In [None]:
print(grid.best_params_)

In [None]:
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(Y_test, Y_pred, labels = ['漲', '跌'])) # 印出混淆矩陣
# test/predicted 看漲   看跌   
#    看漲         TN     FP
#    看跌         FN     TP