
# 透過機器學習預測股市漲跌-模型建模
## 作者：蔡尚宏（臺灣行銷研究特邀作者）、劉睿哲（臺灣行銷研究特邀作者）、鄭晴文（臺灣行銷研究特邀作者）、鍾皓軒(臺灣行銷研究有限公司創辦人）
## 縮寫還原完的資料請見[本連結](https://drive.google.com/file/d/1HfZvdy0nJYbPN_tB9cBAotdAeGWkm9Re/view?usp=sharing)，下載下來後與本ipynb檔案放於同一個工作目錄中，再執行下方程式即可

In [1]:
import os
import re
import nltk
import string
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from datetime import timedelta, datetime


In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
data = pd.read_csv("after_Combined_News_DJIA.csv")

# 系列一_透過機器學習預測股市漲跌-基本資料處理

In [4]:
def preprocess(processdata):
    # 轉小寫
    headlines = []
    for i in range(1, 26):
      headlines.append('Top'+str(i))
    processdata[headlines] = processdata[headlines].astype(str)
    processdata[headlines] = processdata[headlines].applymap(str.lower)
    
    # 組成以天為單位的data
    processdata_headlines = []
    for row in range(0,len(processdata.index)):
      processdata_headlines.append(' '.join(str(x) for x in processdata.iloc[row,2:27]))

    # remove punctuation characters
    for line in range(len(processdata_headlines)):
      processdata_headlines[line] = re.sub(r'[^A-Za-z]'," ", processdata_headlines[line])

    # 切字
    for sentence in range(len(processdata_headlines)):
      processdata_headlines[sentence] = word_tokenize(processdata_headlines[sentence]) 

    # 去除停用詞
    alpha = []
    for abc in string.ascii_lowercase :
      alpha.append(abc)      
    en_stops = stopwords.words('english')
    en_stops.extend(alpha)
    for sentence in range(len(processdata_headlines)):
      processdata_headlines[sentence] = [w for w in processdata_headlines[sentence] if w not in en_stops] 
    
    # 單字變回原形
    for sentence in range(len(processdata_headlines)):
      processdata_headlines[sentence] = [WordNetLemmatizer().lemmatize(w) for w in processdata_headlines[sentence]]
      processdata_headlines[sentence] = [WordNetLemmatizer().lemmatize(w, pos='v') for w in processdata_headlines[sentence]]   

    # 組回標題
    final_processdata_headlines = []
    for words in processdata_headlines :
      filter_words = ""
      for i in range(len(words)) :
        filter_words = filter_words + words[i] + " "
      final_processdata_headlines.append(filter_words)  

    return final_processdata_headlines  

## 依照時間切分訓練集、測試集

In [5]:
train = data[data['Date'] < '2015-01-01']
test = data[data['Date'] > '2014-12-31']
final_traindata = preprocess(train)
final_testdata = preprocess(test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


# 系列二_透過機器學習預測股市漲跌-進階資料處理

In [6]:
tfidf_vector = TfidfVectorizer(min_df=0.01, max_df=0.99, max_features=160, ngram_range=(2, 2))
final_traindata_tfidf = tfidf_vector.fit_transform(final_traindata)
final_testdata_tfidf = tfidf_vector.transform(final_testdata)

In [7]:
word = tfidf_vector.get_feature_names()
df = pd.DataFrame(final_traindata_tfidf.T.todense().transpose(), columns=word).sum(axis=0)
df.head()

air force     15.463068
air strike    22.806328
al jazeera    37.810893
al qaeda      40.214684
al qaida      19.244321
dtype: float64

# 系列三_透過機器學習預測股市漲跌-模型建模

## 查看模型輸入

In [8]:
word = tfidf_vector.get_feature_names()
input_df = pd.DataFrame(final_traindata_tfidf.todense(), columns=word)
input_df.head(10)

Unnamed: 0,air force,air strike,al jazeera,al qaeda,al qaida,anti gay,around world,australian government,barack obama,bbc news,...,world biggest,world cup,world first,world largest,world news,world war,year ago,year jail,year old,year prison
0,0.0,0.0,0.0,0.361791,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.400727,0.0,0.0,0.228241,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.671348,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.79022,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.337538,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.375588,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.29868,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.606469,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.391499,0.0,0.233103,0.0
8,0.0,0.0,0.0,0.375292,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Logistic Regression 模型

### Random Search

In [None]:
lr_random_grid ={'solver' : ["newton-cg", "lbfgs", "liblinear", "sag", "saga"], 
                   'penalty' : ["l1", "l2"], 
                   "C" : [x for x in np.arange(0.0001, 1000, 10)] ,  
                   "max_iter" : [int(x) for x in range(1,500,10)],
                   "class_weight" : ['balanced']}

lr_random_model = LogisticRegression()
lr_random_search = RandomizedSearchCV(estimator=lr_random_model, param_distributions=lr_random_grid, n_iter = 100, 
                                        scoring='accuracy', cv = 3, verbose=2, random_state=42, n_jobs=-1)

# Fit the random search model
lr_random_search.fit(final_traindata_tfidf, train["Label"])
random_lr_model = lr_random_search.best_estimator_
predictions = random_lr_model.predict(final_testdata_tfidf)

print("Score of train set: % .10f" % (random_lr_model.score(final_traindata_tfidf, train["Label"])))
print("Score of test set: % .10f" % (random_lr_model.score(final_testdata_tfidf, test["Label"])))
print("Best score:{}".format(lr_random_search.best_score_))
print("Best parameters:{}".format(lr_random_search.best_params_))

### Grid Search

In [None]:
lr_grid ={'solver' : ["newton-cg", "liblinear", "sag"], 
          'penalty' : ["l2"], 
          "C" : [x for x in np.arange(100, 200, 10)] ,  
          "max_iter" : [int(x) for x in range(200, 400, 20)],
          "class_weight" : ['balanced']}

lr_model = LogisticRegression()
lr_grid_search = GridSearchCV(lr_model, lr_grid, scoring='accuracy')
lr_grid_search.fit(final_traindata_tfidf, train["Label"])

grid_lr_model = lr_grid_search.best_estimator_
print("Score of train set: % .10f" % (grid_lr_model.score(final_traindata_tfidf, train["Label"])))
print("Score of test set: % .10f" % (grid_lr_model.score(final_testdata_tfidf, test["Label"])))
print("Best score:{}".format(lr_grid_search.best_score_))
print("Best parameters:{}".format(lr_grid_search.best_params_))

### 最終結果

In [9]:
lr_model = LogisticRegression()
lr_model.fit(final_traindata_tfidf, train["Label"])

train_pred = lr_model.predict(final_traindata_tfidf)
test_pred = lr_model.predict(final_testdata_tfidf)

train_accuracy = accuracy_score(train['Label'], train_pred)
test_accuracy = accuracy_score(test['Label'], test_pred)
print("Accuracy of train set ：{:.4f}".format(train_accuracy))
print("Accuracy of test set：{:.4f}".format(test_accuracy))

Accuracy of train set ：0.6394
Accuracy of test set：0.5423


## Random Forest 模型

### Random Search

In [None]:
n_estimators = [int(x) for x in np.linspace(60, 160, num = 20)]
max_features = ['auto', 'sqrt'] 
max_depth = [int(x) for x in np.linspace(10, 100, num = 10)]
max_depth.append(None)
min_samples_split = [2, 4, 5, 7, 8, 10]
min_samples_leaf = [1, 2, 3, 4, 5, 6]
bootstrap = [True, False]
criterion = ['entropy']
random_state = [0]

rfc_random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'random_state':random_state,
               'criterion':criterion}

rfc = RandomForestClassifier()
rfc_random_search = RandomizedSearchCV(estimator=rfc, param_distributions=rfc_random_grid, n_iter = 100, scoring='accuracy', 
                               cv = 3, verbose=2, random_state=42, n_jobs=-1)
rfc_random_search.fit(final_traindata_tfidf, train["Label"])

rfc_random_model = rfc_random_search.best_estimator_
print("Score of train set: % .10f" % (rfc_random_model.score(final_traindata_tfidf, train["Label"])))
print("Score of test set: % .10f" % (rfc_random_model.score(final_testdata_tfidf, test["Label"])))
print("Best score:{}".format(rfc_random_search.best_score_))
print("Best parameters:{}".format(rfc_random_search.best_params_))

### Grid Search

In [None]:
n_estimators = [130,138] 
max_features = ['auto'] 
max_depth = [5,8,10]
max_depth.append(None)
min_samples_split = [2,3]
min_samples_leaf = [None,2,6]
bootstrap = [True]
criterion = ['entropy']
random_state = [0]

rfc_param_grid = {"random_state":random_state,
                  "max_features":max_features,
                  "n_estimators":n_estimators,
                  "max_depth":max_depth,
                  "min_samples_leaf":min_samples_leaf,
                  "min_samples_split":min_samples_split,
                  "criterion":criterion}

rfc = RandomForestClassifier()
rfc_grid_search = GridSearchCV(rfc, rfc_param_grid, scoring='accuracy')
rfc_grid_search.fit(final_traindata_tfidf, train["Label"])

rfc_grid_model = rfc_grid_search.best_estimator_
print("Score of train set: % .10f" % (rfc_grid_model.score(final_traindata_tfidf, train["Label"])))
print("Score of test set: % .10f" % (rfc_grid_model.score(final_testdata_tfidf, test["Label"])))
print("Best score:{}".format(rfc_grid_search.best_score_))
print("Best parameters:{}".format(rfc_grid_search.best_params_))

### 最終結果

In [19]:
rfc = RandomForestClassifier(n_estimators = 138 ,criterion = 'gini' ,min_samples_split = 7, max_depth = 10, random_state=0)
rfc.fit(final_traindata_tfidf, train["Label"])

train_pred = rfc.predict(final_traindata_tfidf)
test_pred = rfc.predict(final_testdata_tfidf)

train_accuracy = accuracy_score(train['Label'], train_pred)
test_accuracy = accuracy_score(test['Label'], test_pred)
print("Accuracy of train set ：{:.4f}".format(train_accuracy))
print("Accuracy of test set：{:.4f}".format(test_accuracy))

Accuracy of train set ：0.7511
Accuracy of test set：0.5423


## Naive Bayes 模型

### Random Search

In [None]:
nb_random_grid = {'alpha': [x for x in np.arange(0.001,100,0.01)]}

nb_model = MultinomialNB()
nb_random_search = RandomizedSearchCV(estimator=nb_model, param_distributions=nb_random_grid,
                              n_iter = 100, scoring='accuracy', 
                              cv = 3, verbose=2, random_state=42, n_jobs=-1)

nb_random_search.fit(final_traindata_tfidf, train["Label"])

nb_result = nb_random_search.fit(final_traindata_tfidf, train["Label"])

nb_random_model = nb_random_search.best_estimator_

print("Scroe of train set: % .10f" % (nb_random_model.score(final_traindata_tfidf, train["Label"])))
print("Scroe of test set: % .10f" % (nb_random_model.score(final_testdata_tfidf, test["Label"])))
print("Best score:{}".format(nb_random_search.best_score_))
print("Best parameters:{}".format(nb_random_search.best_params_))

### Grid Search

In [None]:
# 調參
nb_param_grid= {'alpha': [x for x in np.arange(0.1, 80, 0.1)]}
nb_model = MultinomialNB()
nb_grid_search = GridSearchCV(nb_model, nb_param_grid, scoring='accuracy', cv=5)
nb_result = nb_grid_search.fit(final_traindata_tfidf, train["Label"])

nb_grid_model = nb_grid_search.best_estimator_

print("Scroe of train set: % .10f" % (nb_grid_model.score(final_traindata_tfidf, train["Label"])))
print("Scroe of test set: % .10f" % (nb_grid_model.score(final_testdata_tfidf, test["Label"])))
print("Best score:{}".format(nb_grid_search.best_score_))
print("Best parameters:{}".format(nb_grid_search.best_params_))

### 最終結果

In [20]:
nb_model = MultinomialNB(alpha=1.8)
nb_model.fit(final_traindata_tfidf, train["Label"])

train_pred = nb_model.predict(final_traindata_tfidf)
test_pred = nb_model.predict(final_testdata_tfidf)

train_accuracy = accuracy_score(train['Label'], train_pred)
test_accuracy = accuracy_score(test['Label'], test_pred)
print("Accuracy of train set ：{:.4f}".format(train_accuracy))
print("Accuracy of test set：{:.4f}".format(test_accuracy))

Accuracy of train set ：0.6077
Accuracy of test set：0.5317
