In [166]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [2]:
# impoort 斷詞（要放 topwords_zh.txt 進資料夾）
with open('../../bda2023_mid_dataset/stopwords_zh.txt', 'r') as file:
    stopwords = file.read().splitlines() 
file.close()

In [3]:
df = pd.read_excel("../../bda2023_mid_dataset/stock_data_2019-2023.xlsx", sheet_name=["上市2020", "上市2021", "上市2022"], usecols=['證券代碼', '年月日', '收盤價(元)'])

In [4]:
# import stock excel
# 只留台積電的日期跟收盤價
# 按照日期由最早往最晚排
df_STOCK = pd.DataFrame()
for i in range(2020, 2023):
    df_STOCK = pd.concat([df_STOCK, df[(f"上市{i}")].loc[::-1]])

In [5]:
df_STOCK = df_STOCK[df_STOCK["證券代碼"] == "2330 台積電"].reset_index(drop = True)

In [6]:
# 更改年月日的日期格式
df_STOCK['年月日'] = pd.to_datetime(df_STOCK['年月日']).dt.date
df_STOCK.head(5)

Unnamed: 0,證券代碼,年月日,收盤價(元)
0,2330 台積電,2020-01-02,328.2957
1,2330 台積電,2020-01-03,328.7799
2,2330 台積電,2020-01-06,321.5168
3,2330 台積電,2020-01-07,319.0957
4,2330 台積電,2020-01-08,319.0957


In [143]:
# import news excel 
df_NEWS = pd.read_excel("../../TSMC_fly_news_20-22.xlsx")
df_NEWS.head()

Unnamed: 0.1,Unnamed: 0,post_time,title,content,token
0,0,2020-01-02,美股：美中協議月中簽署，四大指數週二齊漲約0.3%封關，費半全年狂升逾六成,美股週二開盤延續前一天拉回格局，主要指數以平低盤開出，，盤中美國總統川普發文宣稱1月15日會...,"['美股美中', '協議', '月中', '簽署', '四', '大', '指數', '週二..."
1,1,2020-01-02,《熱門族群》5G新機連發，帶旺璟德、聯發科,美國消費性電子展（CES）將於元月7日∼10日接續舉辦，法人預期5G仍將是本次展期焦點，加上...,"['熱門', '族群', '新機', '連', '發帶', '旺璟德', '聯發科', '熱..."
2,2,2020-01-02,《熱門族群》台積電走高，半導體設備股跟漲,台積電 (2330) 股價今早開高走高，帶領台積電供應鏈及半導體設備股再度轉強。隨著美中貿易...,"['熱門', '族群', '台積電', '走高', '半導體', '設備股', '跟', '..."
3,3,2020-01-02,台灣科技能打趴歐洲大廠？專家分析「還太早」,近年來台灣不論科技、經濟等發展，都大幅的成長，也逐漸讓國際看到台灣。不過在科技的成長方面，就...,"['台灣', '科技', '能', '打', '趴', '歐洲', '大廠', '專家', ..."
4,4,2020-01-02,京元電Q1營收估創同期高；全年看增雙位數,MoneyDJ新聞 2020-01-02 14:23:09 記者 王怡茹 報導<BR>京元電...,"['京元電', '營收', '估創', '同', '期', '高', '全', '年', '..."


In [144]:
# 更改年月日的日期格式
df_NEWS['post_time'] = pd.to_datetime(df_NEWS['post_time']).dt.date

In [174]:
df_stock = df_STOCK.copy()
df_news = df_NEWS.copy()

In [None]:
# for i, date in enumerate(df_news['post_time']):
#     if(date.isoweekday() == 6):
#         df_news['post_time'][i] += datetime.timedelta(days = -1)
#     elif(date.isoweekday() == 7):
#         df_news['post_time'][i] += datetime.timedelta(days = -2)

# Requirement 1

In [175]:
# Requirement 1 的第二點

day_n = 1 # 以 day_n 天後來看是漲還是跌
sigma = 0.01 # 以 sigma 決定漲（跌）幅超過幾 % 是漲（跌）
label = []
for i in range(len(df_stock) - day_n):
    rate = (df_stock['收盤價(元)'][i + day_n] - df_stock['收盤價(元)'][i]) / df_stock['收盤價(元)'][i]
    if rate > sigma:
       label.append('漲')
    elif rate < -sigma:
        label.append('跌')
    else:
        label.append('持平')
for i in range(day_n):
    label.append(0)
# label.extend([0, 0, 0])

df_stock['label'] = label
df_stock.head(5)

Unnamed: 0,證券代碼,年月日,收盤價(元),label
0,2330 台積電,2020-01-02,328.2957,持平
1,2330 台積電,2020-01-03,328.7799,跌
2,2330 台積電,2020-01-06,321.5168,持平
3,2330 台積電,2020-01-07,319.0957,持平
4,2330 台積電,2020-01-08,319.0957,漲


In [176]:
upCnt = 0
downCnt = 0
flatCnt = 0
for i in range(len(df_stock)):
    if df_stock['label'][i] == '漲':
        upCnt += 1
    elif df_stock['label'][i] == '跌':
        downCnt += 1 
    else :
        flatCnt += 1
print('預估漲的天數：', upCnt)
print('預估跌的天數：', downCnt)
print('預估持平的天數：', flatCnt)

預估漲的天數： 198
預估跌的天數： 186
預估持平的天數： 351


In [177]:
# 丟掉持平的資料
df_stock = df_stock[df_stock.label != '持平']
df_stock = df_stock[df_stock.label != 0]
df_stock = df_stock.reset_index(drop = True)
df_stock

Unnamed: 0,證券代碼,年月日,收盤價(元),label
0,2330 台積電,2020-01-03,328.7799,跌
1,2330 台積電,2020-01-08,319.0957,漲
2,2330 台積電,2020-01-13,330.7168,漲
3,2330 台積電,2020-01-14,335.0747,跌
4,2330 台積電,2020-01-15,329.2642,跌
...,...,...,...,...
379,2330 台積電,2022-12-19,463.9893,跌
380,2330 台積電,2022-12-21,456.5297,漲
381,2330 台積電,2022-12-22,465.4812,跌
382,2330 台積電,2022-12-27,454.5404,跌


In [178]:
from collections import defaultdict
dic=defaultdict(int)
for i in range(len(df_stock)):
    dic[(df_stock['年月日'][i])] = (df_stock['label'][i])

In [179]:
label_news = []
for i in range(len(df_news)):
    label_news.append(dic[df_news["post_time"][i]])
df_news['label'] = label_news
df_news.head(10)

Unnamed: 0.1,Unnamed: 0,post_time,title,content,token,label
0,0,2020-01-02,美股：美中協議月中簽署，四大指數週二齊漲約0.3%封關，費半全年狂升逾六成,美股週二開盤延續前一天拉回格局，主要指數以平低盤開出，，盤中美國總統川普發文宣稱1月15日會...,"['美股美中', '協議', '月中', '簽署', '四', '大', '指數', '週二...",0
1,1,2020-01-02,《熱門族群》5G新機連發，帶旺璟德、聯發科,美國消費性電子展（CES）將於元月7日∼10日接續舉辦，法人預期5G仍將是本次展期焦點，加上...,"['熱門', '族群', '新機', '連', '發帶', '旺璟德', '聯發科', '熱...",0
2,2,2020-01-02,《熱門族群》台積電走高，半導體設備股跟漲,台積電 (2330) 股價今早開高走高，帶領台積電供應鏈及半導體設備股再度轉強。隨著美中貿易...,"['熱門', '族群', '台積電', '走高', '半導體', '設備股', '跟', '...",0
3,3,2020-01-02,台灣科技能打趴歐洲大廠？專家分析「還太早」,近年來台灣不論科技、經濟等發展，都大幅的成長，也逐漸讓國際看到台灣。不過在科技的成長方面，就...,"['台灣', '科技', '能', '打', '趴', '歐洲', '大廠', '專家', ...",0
4,4,2020-01-02,京元電Q1營收估創同期高；全年看增雙位數,MoneyDJ新聞 2020-01-02 14:23:09 記者 王怡茹 報導<BR>京元電...,"['京元電', '營收', '估創', '同', '期', '高', '全', '年', '...",0
5,5,2020-01-02,【Y晚報】元旦假後開市 台積電領軍大漲百點,（開盤日15:30出刊）美股在前日(12/31)封關日，四大指數全數收漲，漲幅介於0.27%...,"['晚報', '元旦', '假', '後', '開市', '台積電', '領軍', '大漲'...",0
6,6,2020-01-03,《各報要聞》2020報喜，歐美股聯袂走揚,2020年第一個交易日，歐美股市齊聲歡唱。泛歐STOXX 600指數早盤勁升0.9％，美股三...,"['各', '報', '要聞', '報喜', '歐美', '股', '聯袂', '走揚', ...",跌
7,7,2020-01-03,美股：四大指數週四開春首日齊攻頂，道瓊大漲330點，FAANG及中概股表現亮眼,美股送走六年來漲勢最凌厲的2019年後，2020年新年第一個交易日，迎來人行降準，美中簽署第...,"['美股', '四', '大', '指數', '週四', '開春', '首日', '齊', ...",跌
8,8,2020-01-03,【Y早報】進入5G元年 佈局CES商機 族群行情再發威,（開盤日09:00出刊）MLCC供貨吃緊恐漲價，國巨營收看俏；iPad降價搶大陸市場，帶旺主...,"['早報', '進入', '元年', '佈局', '商機', '族群', '行情', '再'...",跌
9,9,2020-01-03,【日盛金控晨訊】結構有利盤勢 支撐看月線,日期：2020年 1月 3日<BR>※盤勢分析<BR>1.隨著中國新經濟刺激措施增加、以及美...,"['日盛金控', '晨訊', '結構', '有利', '盤勢', '支撐', '看', '月...",跌


In [180]:
# 刪掉 label 是 0 的資料
df_news_no_zero = df_news[df_news.label != 0]
df_news_no_zero = df_news_no_zero.reset_index(drop = True)

In [181]:
# token 型態轉換，fit in 套件
df_news_list = []
for i in range(len(df_news_no_zero)):
    df_news_list_tmp = []
    df_news_str = ''
    df_news_list_tmp = eval(df_news_no_zero.token[i])
    df_news_str = ' '.join(df_news_list_tmp)
    df_news_list.append(df_news_str)
len(df_news_list)

3518

In [187]:
df_news_no_zero['new_token'] = [news.split(' ') for news in df_news_list]

In [257]:
df_news_up = df_news_no_zero[df_news_no_zero['label'] == '漲'].reset_index(drop = True)
df_news_down = df_news_no_zero[df_news_no_zero['label'] == '跌'].reset_index(drop = True)

In [192]:
from collections import Counter

In [258]:
dict_stopwords = defaultdict(int)
for word in stopwords:
    dict_stopwords[word] += 1

In [259]:
tf_counter_up = Counter() #預備統計tf用
df_counter_up = Counter() #預備統計df用

upstr_list = []

for i in range(len(df_news_up)):
    df_tmp_up=Counter() #暫存本篇df用
    for term in df_news_up['new_token'][i]:
        if(dict_stopwords[term]):
            continue
        if(len(term) > 1): #若詞長>1
            tf_counter_up[term] += 1 #tf加1
            upstr_list.append(term)
        if(df_tmp_up[term] == 0): #若本篇之前不曾出現
            df_tmp_up[term] = 1 #df標為1
    df_counter_up += df_tmp_up # 累加多篇df

In [260]:
import math
tfidf_up = Counter()
for i in range(len(upstr_list)):
    item  = upstr_list[i]
    pts = (1 + math.log(tf_counter_up[item]) * math.log(len(df_news_no_zero)/df_counter_up[item]))
    tfidf_up[item] = pts
pts

18.329110329702395

In [261]:
tf_counter_down = Counter() #預備統計tf用
df_counter_down = Counter() #預備統計df用

downstr_list = []

for i in range(len(df_news_down)):
    df_tmp_down=Counter() #暫存本篇df用
    for term in df_news_down['new_token'][i]:
        if(dict_stopwords[term]):
            continue
        if(len(term) > 1): #若詞長>1
            tf_counter_down[term] += 1 #tf加1
            downstr_list.append(term)
        if(df_tmp_down[term] == 0): #若本篇之前不曾出現
            df_tmp_down[term] = 1 #df標為1
    df_counter_down += df_tmp_down # 累加多篇df

In [262]:
tfidf_down = Counter()
for i in range(len(downstr_list)):
    item  = downstr_list[i]
    pts = (1 + math.log(tf_counter_down[item]) * math.log(len(df_news_no_zero)/df_counter_down[item]))
    tfidf_down[item] = pts
pts

19.74504221038599

In [263]:
allstr_list = upstr_list + downstr_list
tf_counter_all = Counter()
df_counter_all = Counter()
for item in allstr_list:
    tf_counter_all[item] = tf_counter_up[item] + tf_counter_down[item]
    df_counter_all[item] = df_counter_up[item] + df_counter_down[item]

In [264]:
#tfidf_chi_up
chi_tfidf_up = Counter()
for item in upstr_list:
    expected_tf = tf_counter_all[item] / (len(df_news_up.index) + len(df_news_down.index)) * len(df_news_up)
    expected_df = df_counter_all[item] / (len(df_news_up.index) + len(df_news_down.index)) * len(df_news_up)
    
    if tf_counter_up[item] < expected_tf:
        n1 = -1
    else:
        n1 = 1
    tf_pts = (tf_counter_all[item] - expected_tf)**2 / expected_tf
    if df_counter_up[item] < expected_df:
        n2 = -1
    else:
        n2 = 1
    df_pts = (df_counter_all[item] - expected_df)**2 / expected_df
    chi_pts = tfidf_up[item] * math.sqrt(tf_pts) * math.sqrt(df_pts) * n1 * n2
    chi_tfidf_up[item] = chi_pts

In [265]:
for term in chi_tfidf_up.most_common(100): #印出看漲 tfidf 前100名
  print(term[0],term[1])

半導體 24978.307003758036
美國 23844.2606366147
指數 23368.438824411995
晶片 22845.03151675571
美元 21712.575139071083
台股 21078.398304062594
營收 20540.73648950766
台灣 19171.518985771734
奈米 17730.5900134381
產業 17309.57262492988
製程 17237.654315250136
全球 17032.27230421254
晶圓 16734.7219752696
成長 16145.339027481972
預期 15743.816074631935
持續 15433.272393902718
外資 14973.970645378979
公司 14696.278204411597
產能 14297.09102965508
美股 13598.980128908548
代工 13553.414851997648
技術 13384.25943041685
上漲 12812.984442752419
中國 12742.828928681216
經濟 12654.276167340684
產品 12605.166981625685
影響 12566.389616888355
新高 12272.318887788064
英特爾 12264.329128921017
報導 11994.637602417588
去年 11962.48813323895
法人 11683.648018103671
手機 11654.35466268128
科技 11580.620928565912
電子 11563.539419049459
指出 11499.60112882578
生產 11377.456507106519
疫情 11259.784508687811
未來 10996.143195800792
類股 10063.22535237411
股市 9840.435647116616
三星 9814.23725522923
國際 9740.456154674863
訂單 9661.895464231888
價格 9601.014117572893
族群 9593.324697175402
設計 9591.3

In [266]:
#tfidf_chi_down
chi_tfidf_down = Counter()
for item in downstr_list:
    expected_tf = tf_counter_all[item] / (len(df_news_up.index) + len(df_news_down.index)) * len(df_news_down)
    expected_df = df_counter_all[item] / (len(df_news_up.index) + len(df_news_down.index)) * len(df_news_down)

    if tf_counter_down[item] < expected_tf:
        n1 = -1
    else:
        n1 = 1
    tf_pts = (tf_counter_all[item] - expected_tf)**2 / expected_tf
    if df_counter_down[item] < expected_df:
        n2 = -1
    else:
        n2 = 1
    df_pts = (df_counter_all[item] - expected_df)**2 / expected_df
    chi_pts = tfidf_down[item] * math.sqrt(tf_pts) * math.sqrt(df_pts) * n1 * n2
    chi_tfidf_down[item] = chi_pts

In [267]:
for term in chi_tfidf_down.most_common(100): #印出看跌 tfidf 前100名
  print(term[0],term[1])

半導體 23446.91571995296
美國 22395.341031656306
指數 22085.913934874872
晶片 21080.507903265836
台股 20740.43789377485
美元 20541.104389010445
營收 20072.372390542405
台灣 18156.788791876468
奈米 16749.74750336153
製程 16542.104186344557
全球 15773.374471853711
晶圓 15724.357812682203
產業 15673.88560781929
成長 15375.50858633374
預期 15168.598630073064
外資 14881.742790128956
持續 14656.217621890331
產能 14203.864254482558
公司 13731.203051390363
美股 13274.04038932965
代工 12918.328732068854
技術 12576.950598915566
新高 12094.801958775923
中國 12093.881390923707
影響 12055.371414262292
上漲 11936.173923431255
經濟 11885.459616746593
法人 11593.747256977971
產品 11593.642315567397
英特爾 11530.537273170889
報導 11377.68739923052
手機 11247.114791421893
電子 10953.18845285671
疫情 10877.262408119886
去年 10847.83285330153
科技 10758.762670210108
生產 10750.309148843493
指出 10745.121730906401
未來 10350.144426341973
類股 9655.060016404112
訂單 9557.045942248282
營運 9367.05428983602
三星 9270.09108913702
股市 9142.367161383067
價格 9126.474966067384
設計 9121.625971886653
國際 9

In [268]:
a = [word[0] for word in chi_tfidf_up.most_common(2000)] + [word[0] for word in chi_tfidf_down.most_common(2000)]
keywords = (list(set(a)))

In [291]:
X_data = df_news_list
Y_data = df_news_no_zero['label']

In [300]:
# 隨機切分資料
# text_size = 0.2 --> train 80%、test 20%
# X_data = SVD_vectors
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size = 0.2, random_state = 2, stratify = Y_data)
print("train data length:", len(X_train), ",", len(Y_train))
print("test data length:",len(X_test), ",", len(Y_test))

train data length: 2814 , 2814
test data length: 704 , 704


In [301]:
#將所有資料向量化，轉成 tfidf vector
vectorizer = TfidfVectorizer(stop_words=stopwords)
X_train = vectorizer.fit_transform(X_train)
X_train = pd.DataFrame(X_train.toarray(), columns = vectorizer.get_feature_names_out())
display(X_train)



Unnamed: 0,一一七五,一一九九七,一一五,一一八五,一一六八三兆,一一齊,一七,一七二億,一七五,一七億,...,龐培,龐培歐,龐大,龐大利益,龐雜,龔培元,龔明鑫,龔明鑫日,龜山,龜速
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2809,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2810,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2811,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [302]:
X_train = X_train[keywords]

In [303]:
X_test = vectorizer.fit_transform(X_test)
X_test = pd.DataFrame(X_test.toarray(), columns=vectorizer.get_feature_names_out())
display(X_test)

Unnamed: 0,一一六,一七三,一七五,一三七,一三二六五二億,一三兆,一三八一,一下子,一世代,一二,...,龍邦,龍頭,龍頭公司,龍頭巨大集團,龍頭廠,龍頭聯發科,龍頭股,龍頭股台積電,龐大,龔明鑫
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.027640,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.026386,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.024141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
700,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.027169,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [304]:
X_test = X_test.reindex(X_train.columns, axis=1, fill_value=0)


In [116]:
# from sklearn.decomposition import LatentDirichletAllocation as LDA
# lda_model = LDA(n_components=80)
# lda_model.fit(X_train, Y_train)
# X_train = lda_model.transform(X_train)
# X_test = lda_model.transform(X_test)

In [298]:
from sklearn.decomposition import PCA
pca_model = PCA(n_components=1000)
pca_model.fit(X_train, Y_train)
X_train = pca_model.transform(X_train)
X_test = pca_model.transform(X_test)

In [135]:
# from sklearn.decomposition import TruncatedSVD
# svd_model = TruncatedSVD(n_components=1000)
# svd_model.fit(X_train, Y_train)
# X_train = svd_model.transform(X_train)
# X_test = svd_model.transform(X_test)

In [20]:
chi2_selector = SelectKBest(chi2, k = 5000)
chi2_selector.fit(X_train, Y_train)
kbest_vocabs = X_train.columns[chi2_selector.get_support()]
X_train = X_train[kbest_vocabs]
X_train

Unnamed: 0,一兩,一劑,一千,一城,一成一,一林燦澤,一柯宗沅,一百四,一聯發科,一角,...,齊黑,龍潭,龍潭廠,龍燈,龍科,龍邦,龍頭特斯拉,龍頭聯發科,龐佩奧,龔明鑫
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.059991,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2806,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2807,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2808,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2809,0.0,0.0,0.070164,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# 計算 test 個別的 tfidf，再將結果透過 df.reindex 這個方法映射到訓練集的向量空間中
X_test = vectorizer.fit_transform(X_test)
X_test = pd.DataFrame(X_test.toarray(), columns=vectorizer.get_feature_names_out())
X_test = X_test.reindex(X_train.columns, axis=1, fill_value=0)
X_test

Unnamed: 0,一兩,一劑,一千,一城,一成一,一林燦澤,一柯宗沅,一百四,一聯發科,一角,...,齊黑,龍潭,龍潭廠,龍燈,龍科,龍邦,龍頭特斯拉,龍頭聯發科,龐佩奧,龔明鑫
0,0.0,0,0.0,0,0,0.0,0.0,0,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
1,0.0,0,0.0,0,0,0.0,0.0,0,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2,0.0,0,0.0,0,0,0.0,0.0,0,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
3,0.0,0,0.0,0,0,0.0,0.0,0,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
4,0.0,0,0.0,0,0,0.0,0.0,0,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
698,0.0,0,0.0,0,0,0.0,0.0,0,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
699,0.0,0,0.0,0,0,0.0,0.0,0,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
700,0.0,0,0.0,0,0,0.0,0.0,0,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
701,0.0,0,0.0,0,0,0.0,0.0,0,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


### 預測模型

In [337]:
def vote(X_train, Y_train, X_test, Y_test):
    result = []

    from sklearn import metrics
    # Gradient Boosting Classifier
    from sklearn.ensemble import GradientBoostingClassifier
    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=7, random_state=1, max_features='sqrt')
    clf.fit(X_train, Y_train)
    GBC_predicted_y = clf.predict(X_test)
    print('GBC Accuracy:', metrics.accuracy_score(Y_test, GBC_predicted_y))
    result.append(GBC_predicted_y)
    
    # NB
    # from sklearn.naive_bayes import BernoulliNB
    # NB_model = BernoulliNB()
    # NB_model.fit(X_train,Y_train)
    # NB_predicted_y = NB_model.predict(X_test)
    # print('NB Accuracy:', metrics.accuracy_score(Y_test, NB_predicted_y))
    # result.append(NB_predicted_y)


    # SVM
    from sklearn.svm import SVC
    SVM_model = SVC(kernel = 'rbf', C = 100.0, gamma = 0.1, probability=True)
    SVM_model.fit(X_train,Y_train)
    SVM_predicted_y = SVM_model.predict(X_test)
    print('SVM Accuracy:', metrics.accuracy_score(Y_test, SVM_predicted_y))
    result.append(SVM_predicted_y)


    # DT
    # from sklearn.tree import DecisionTreeClassifier                          
    # classifier = DecisionTreeClassifier(criterion = "entropy")
    # classifier.fit(X_train, Y_train)
    # DT_predicted_y = classifier.predict(X_test)
    # print("DT Accuracy:", metrics.accuracy_score(Y_test, DT_predicted_y)) # 比對答案，計算準確率
    # result.append(DT_predicted_y)


    # RM
    from sklearn.ensemble import RandomForestClassifier
    forest = RandomForestClassifier(n_estimators = 100, max_features="sqrt", max_depth=7)
    forest_fit = forest.fit(X_train, Y_train)
    RM_predicted_y = forest.predict(X_test)
    print("RM Accuracy:", metrics.accuracy_score(Y_test, RM_predicted_y))
    result.append(RM_predicted_y)

    # KNN
    # from sklearn.neighbors import KNeighborsClassifier
    # classifie = KNeighborsClassifier(n_neighbors = 7)
    # classifie.fit(X_train, Y_train)
    # KNN_predicted_y = classifie.predict(X_test)
    # print("KNN Accuracy:", metrics.accuracy_score(Y_test, KNN_predicted_y)) # 比對答案，計算準確率
    # result.append(KNN_predicted_y)

    # voting
    res = pd.DataFrame(result, index=["GBC", "SVM", "RF"])
    pred = [(max(list(res[i]), key=list(res[i]).count)) for i in range(len(res.columns))]
    print("Accuracy after voting:", metrics.accuracy_score(Y_test, pred)) # 比對答案，計算準確率



In [342]:
from sklearn.model_selection import RandomizedSearchCV

# 建立參數的各自區間
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=5)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num=5)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators, 'max_features': max_features,
               'max_depth': max_depth, 'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf, 'bootstrap': bootstrap}
random_grid

{'n_estimators': [200, 650, 1100, 1550, 2000],
 'max_features': ['auto', 'sqrt'],
 'max_depth': [10, 35, 60, 85, 110, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

In [343]:
from sklearn.ensemble import RandomForestClassifier
forest2 = RandomForestClassifier(random_state=42)
rf_random = RandomizedSearchCV(estimator = forest2, param_distributions=random_grid,
                              n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

rf_random.fit(X_train,Y_train)
rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [280]:
vote(X_train, Y_train, X_test, Y_test)

GBC Accuracy: 0.5823863636363636
NB Accuracy: 0.5767045454545454
SVM Accuracy: 0.6036931818181818
DT Accuracy: 0.5681818181818182
RM Accuracy: 0.6178977272727273
Accuracy after voting: 0.6193181818181818


In [338]:
vote(X_train, Y_train, X_test, Y_test)


GBC Accuracy: 0.5823863636363636
SVM Accuracy: 0.6036931818181818
RM Accuracy: 0.609375
Accuracy after voting: 0.6079545454545454


In [327]:
from sklearn.model_selection import GridSearchCV

In [332]:
from sklearn.svm import SVC
param_grid = {'C': [1000], 'gamma': [1],'kernel': ['rbf','linear']}
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)
grid.fit(X_train,Y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.542 total time=  11.1s
[CV 2/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.570 total time=  11.0s


KeyboardInterrupt: 

In [None]:
print(grid.best_params_)

In [None]:
from sklearn.metrics import confusion_matrix  
print(confusion_matrix(Y_test, Y_pred, labels = ['漲', '跌'])) # 印出混淆矩陣
# test/predicted 看漲   看跌   
#    看漲         TN     FP
#    看跌         FN     TP