In [1]:
# 基本パッケージ（numpy,Pandas,matplotlib）
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# 線形サポートベクターマシーン
from sklearn.svm import LinearSVC
# ランダムフォレスト
from sklearn.ensemble import RandomForestClassifier
# LightGBM
import lightgbm as lgb
# XGboost
import xgboost as xgb
# train_test_split（データを分割出してくれる）
from sklearn.model_selection import train_test_split
# accuracy_score（正解率を測れる）
from sklearn.metrics import accuracy_score
# グリッドサーチ（ハイパーパラメータを自動的に最適化してくれる）
from sklearn.model_selection import GridSearchCV
# 正規化
from sklearn.preprocessing import MinMaxScaler
# 交差検証
from sklearn.model_selection import cross_val_score
# 特徴量選択用
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
# warningの抑制
import warnings
# モデルの保存
from sklearn.externals import joblib

  from numpy.core.umath_tests import inner1d


In [2]:
# 表示関連
## DataFrameの列数設定
pd.set_option('display.max_columns', 500)
## warningの抑制
warnings.filterwarnings('ignore')

In [3]:
# 株価データの読み込み
stock_data = pd.read_csv("stock_Price_Prediction_v3.3.csv", encoding="shift-jis")

# 読み込みデータの表示
stock_data.head()

Unnamed: 0,GBP,CAD,CHF,SEK,AUD,ZAR,IDR(100),INR,PHP,SGD,KRW(100),THB,KWD,TWD,Open000001.SS,High000001.SS,Low000001.SS,Adj Close000001.SS,Open^AORD,High^AORD,Low^AORD,Adj Close^AORD,Open^AXJO,High^AXJO,Low^AXJO,Adj Close^AXJO,Open^BFX,High^BFX,Low^BFX,Adj Close^BFX,Open^BSESN,High^BSESN,Low^BSESN,Adj Close^BSESN,Open^BVSP,High^BVSP,Low^BVSP,Adj Close^BVSP,Open^DJI,High^DJI,Low^DJI,Adj Close^DJI,Open^FCHI,High^FCHI,Low^FCHI,Adj Close^FCHI,Open^GDAXI,High^GDAXI,Low^GDAXI,Adj Close^GDAXI,Open^GSPC,High^GSPC,Low^GSPC,Open^GSPTSE,High^GSPTSE,Low^GSPTSE,Adj Close^GSPTSE,Open^HSI,High^HSI,Low^HSI,Adj Close^HSI,Open^IPSA,High^IPSA,Low^IPSA,Adj Close^IPSA,Open^IXIC,High^IXIC,Low^IXIC,Adj Close^IXIC,Open^JKSE,High^JKSE,Low^JKSE,Adj Close^JKSE,Open^KS11,High^KS11,Low^KS11,Adj Close^KS11,Open^MERV,High^MERV,Adj Close^MERV,Open^MXX,High^MXX,Low^MXX,Adj Close^MXX,Open^N100,High^N100,Low^N100,Adj Close^N100,Open^NYA,High^NYA,Low^NYA,Open^RUT,High^RUT,Low^RUT,Adj Close^RUT,High^TWII,Low^TWII,Adj Close^TWII,Open^VIX,High^VIX,Adj Close^VIX,Open^XAX,High^XAX,Low^XAX,Adj Close^XAX,Open,High,Low,Volume,Adj Close,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,answer,Low^VIX,Low^MERV,Open^TWII,Adj Close^NYA,Adj Close^GSPC,Low^VIX^2,Low^VIX Low^MERV,Low^VIX Open^TWII,Low^VIX Adj Close^NYA,Low^VIX Adj Close^GSPC,Low^MERV^2,Low^MERV Open^TWII,Low^MERV Adj Close^NYA,Low^MERV Adj Close^GSPC,Open^TWII^2,Open^TWII Adj Close^NYA,Open^TWII Adj Close^GSPC,Adj Close^NYA^2,Adj Close^NYA Adj Close^GSPC,Adj Close^GSPC^2
0,-0.002711,0.003238,-0.001622,-0.003846,0.00169,0.009338,-0.007246,0.003663,0.0,-0.000831,-0.008893,-0.006536,0.003609,0.0,-0.016024,-0.004302,0.004251,0.007344,-0.007255,-0.006006,-0.00102,-0.002846,-0.008053,-0.007226,-0.00071,-0.002805,-0.004067,0.006438,0.002904,0.010276,-0.001324,-0.010327,-0.009173,-0.012208,-0.014983,-0.014038,-0.013892,-0.01208,-0.003927,-0.001217,-0.012214,-0.011191,-0.011637,-0.009375,-0.009456,0.001065,-0.016036,-0.010612,-0.004488,-0.005506,-0.00853,-0.006707,-0.014114,-0.003407,-0.004167,-0.006936,-0.006474,-0.018231,-0.01828,-0.003118,-0.004052,-0.009256,-0.003805,-0.000105,0.005605,-0.014401,-0.014141,-0.018726,-0.011112,0.003838,0.00059,0.004085,-0.001906,0.023984,0.016071,0.030788,0.014635,-0.00259,-0.027515,-0.008481,-0.007267,-0.009166,-0.017076,-0.01705,-0.006375,-0.007674,-0.00725,-0.00155,-0.009459,-0.009459,-0.009459,-0.007948,-0.005887,-0.009231,-0.007772,-0.003112,-0.003727,0.008201,0.015167,0.054611,0.05428,0.001202,-0.003871,-0.007277,-0.008593,-0.00625,0.024889,-0.003604,2.643314,0.0187,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.034943,-0.044454,0.0,-0.000422,-0.003205,0.001221,0.001553,-0.0,1.5e-05,0.000112,0.001976,-0.0,1.9e-05,0.000142,0.0,-0.0,-0.0,1.784871e-07,1e-06,1e-05
1,-0.00068,0.000837,0.002,-0.001544,-0.004639,0.010934,0.0,-0.00365,-0.003817,0.001941,-0.000997,0.003289,-0.003367,0.0,0.007755,0.02478,0.000608,0.029596,-0.002965,-0.004128,-0.003605,-0.001832,-0.002805,-0.004479,-0.003582,-0.001539,0.007597,0.008347,0.00903,0.007857,-0.010703,0.008073,0.001284,0.014311,-0.014307,0.001873,-0.001531,0.021016,-0.010887,-0.007542,0.001753,0.003616,-0.002981,-0.004875,-0.010073,-0.020282,-0.001732,-0.006093,-0.010315,-0.005091,-0.009993,-0.007376,0.000339,-0.004867,-0.003182,-0.007444,-0.00737,-0.005971,-0.001443,-0.003247,-0.000239,0.006228,0.006706,0.00644,0.003402,-0.018269,-0.00691,-0.000254,0.003026,0.00156,0.021175,0.003847,0.029382,0.0144,0.001065,0.011734,-0.000631,-0.004941,0.004482,-0.037923,-0.017218,-0.008022,-0.003079,0.00856,-0.005135,-0.005065,-0.012784,-0.016449,0.000559,0.000559,0.000559,-0.007772,-0.005882,-0.002279,0.003564,0.00014,-0.006939,-0.013912,0.025245,-0.018564,-0.020792,-0.008593,-0.008558,-0.006889,-0.004813,0.026954,0.004337,0.028933,0.208818,0.000874,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.042066,-0.0323,-0.015727,0.001489,0.00228,0.00177,-0.001359,-0.000662,6.3e-05,9.6e-05,0.001043,0.000508,-4.8e-05,-7.4e-05,0.000247,-2.3e-05,-3.6e-05,2.216505e-06,3e-06,5e-06
2,-0.007273,-0.009078,-0.008855,-0.00464,-0.006779,-0.015807,0.0,-0.007326,-0.003831,-0.005674,0.000998,-0.003279,-0.006665,-0.005236,0.034787,0.008247,0.02836,-0.004298,-0.001832,0.004866,0.001598,0.005206,-0.001539,0.00524,0.001783,0.005159,0.008674,-0.000677,-0.00243,-0.012138,0.015344,0.002683,0.010328,-0.003411,0.020779,0.00561,0.014723,-0.006287,0.003555,0.007155,0.00592,0.003563,-0.014871,-0.014456,-0.006542,-0.004982,-0.006381,0.000597,0.004482,0.001062,0.000835,0.00253,-0.000509,-0.002822,-0.006792,0.000129,-0.000308,0.0,0.0,0.0,0.0,0.002781,0.000104,0.001543,0.001339,0.011477,0.001322,-0.000119,-0.011018,0.022725,0.014726,0.022725,0.012923,0.0,0.0,0.0,0.0,-0.041345,-0.035672,-0.033684,0.008841,0.013239,0.011796,0.011341,-0.012056,-0.013,-0.004384,-0.004898,-0.000422,-0.000422,-0.000422,0.003564,0.005155,0.005599,-0.001224,0.0,0.0,0.0,-0.028643,-0.034345,-0.032862,0.0,-0.000231,-0.004151,-0.005533,0.0,-0.011226,-0.002636,-0.317789,-0.003493,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,-0.013797,0.001199,0.002925,-0.001352,-0.006656,0.00019,-1.7e-05,-4e-05,1.9e-05,9.2e-05,1e-06,4e-06,-2e-06,-8e-06,9e-06,-4e-06,-1.9e-05,1.828797e-06,9e-06,4.4e-05
3,-0.006324,-0.001929,-0.005914,-0.003885,-0.008105,0.007608,0.007299,-0.00369,-0.007692,-0.001253,-0.012961,-0.003289,-0.005992,-0.007895,-0.010102,-0.008097,0.001548,0.003675,0.005206,0.002003,0.003281,3e-05,0.005159,0.001914,0.002996,-0.000295,-0.008389,-0.01018,-0.008107,-0.006618,-0.004078,-0.004827,-0.005753,-0.005828,-0.008532,-0.014652,-0.008993,-0.008738,0.002258,-0.007391,-0.009447,-0.002196,-0.007102,-0.009416,-0.020218,-0.018562,0.002074,-0.006028,-0.020454,-0.015246,-0.003205,-0.006971,-0.006878,-0.009221,-0.005789,-0.0071,-0.004433,0.006186,0.0017,-0.004238,-0.009942,0.001235,-0.002148,-0.003708,-0.004723,-0.031064,-0.009322,-0.020402,0.008949,0.016687,0.01607,0.016363,0.011957,0.008642,0.005821,-0.009441,-0.018867,-0.039105,-0.05194,-0.039321,0.009772,-0.004341,0.002261,-0.008723,-0.007304,-0.007888,-0.017864,-0.015799,0.001489,0.001489,0.001489,-0.001224,0.003731,-0.007538,0.010547,-0.009671,0.002065,-0.00262,0.052768,0.054639,0.025092,-0.01032,-0.003931,0.001493,0.006499,-0.004374,0.001747,-0.003524,-0.435145,-0.002629,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.058549,0.006819,-0.029993,0.010968,0.011335,0.003428,-0.000399,0.001756,-0.000642,-0.000664,4.6e-05,-0.000205,7.5e-05,7.7e-05,0.0009,-0.000329,-0.00034,0.0001202927,0.000124,0.000128
4,-0.00122,-0.004227,-0.006835,-0.01014,-0.003727,-0.010906,0.007246,0.0,0.0,-0.001254,0.00101,-0.0033,0.003608,0.002653,0.004906,0.017149,0.005901,0.016379,3e-05,-0.003341,-0.00627,-0.007544,-0.000295,-0.003733,-0.006446,-0.007554,-0.00614,-0.004728,-0.001337,-0.004236,-0.005591,-0.008465,-0.008954,-0.004839,-0.006265,-0.000377,0.002516,0.00266,-0.000884,0.004492,0.00793,-0.003943,-0.007926,-0.005144,0.009511,0.007227,-0.012788,-0.006301,0.007453,-0.001946,0.00228,0.002559,0.004443,0.002533,-0.000927,-0.002345,-0.003291,-0.011269,-0.006038,-0.001285,0.000934,-0.004619,0.003897,0.000725,0.007223,0.027592,0.005161,0.004937,-0.024246,0.012158,0.004893,0.009956,0.009554,-0.018505,-0.02264,-0.017562,-0.013345,-0.03197,-0.010411,0.002076,-0.007231,-0.004142,-0.004802,-7.6e-05,-0.005894,-0.00207,0.008837,0.005615,-0.001352,-0.001352,-0.001352,0.010547,0.004811,0.018106,0.0,-0.001657,-0.017512,-0.019542,-0.051106,-0.035679,-0.007139,0.006499,-1.1e-05,-0.001625,-0.006656,0.0,-0.004359,-0.012378,1.40135,-0.017575,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.021464,-0.023549,0.010348,-0.021329,-0.023689,0.000461,-0.000505,0.000222,-0.000458,-0.000508,0.000555,-0.000244,0.000502,0.000558,0.000107,-0.000221,-0.000245,0.0004549136,0.000505,0.000561


In [4]:
# DataFrameをNumPy配列へ変換
# 正解ラベルの変換
answers = stock_data.answer.values
# 説明変数の変換
stock_data.drop(columns=['answer'], inplace=True)
explanatory_variable = stock_data.values

In [5]:
# スケール変換        
ms = MinMaxScaler()
ms.fit(explanatory_variable)
explanatory_variable = ms.transform(explanatory_variable)

In [6]:
# データの分割（データの80%を訓練用に、20％をテスト用に分割する）
X_train, X_test, y_train, y_test = train_test_split(explanatory_variable , answers, test_size=0.2, random_state=1,shuffle = False)

In [7]:
# グリッドサーチするパラメータを設定
parameters = {'C':[0.01,0.1,1,10,100],'loss':['hinge', 'squared_hinge']}
# グリッドサーチを実行
lsvc =  LinearSVC(random_state=1)
grid_search = GridSearchCV(lsvc, param_grid=parameters, cv=5)
grid_search = grid_search.fit(X_train , y_train)

# グリッドサーチ結果(最適パラメータ)を取得
GS_C, GS_loss = grid_search.best_params_.values()
print ("最適パラメータ：{}".format(grid_search.best_params_))

最適パラメータ：{'C': 1, 'loss': 'squared_hinge'}


In [8]:
# 最適パラメーターを指定して学習
clf = LinearSVC(loss=GS_loss, C=GS_C, random_state=1)
clf.fit(X_train , y_train)

LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=1, tol=0.0001,
     verbose=0)

In [9]:
# 学習後のモデルによるテスト
# トレーニングデータを用いた予測
y_train_pred = clf.predict(X_train)
# テストデータを用いた予測
y_val_pred = clf.predict(X_test)

In [10]:
# 正解率の計算
train_score = accuracy_score(y_train, y_train_pred)
test_score = accuracy_score(y_test, y_val_pred)
# 正解率を表示
print("トレーニングデータに対する正解率：" + str(train_score * 100) + "%")
print("テストデータに対する正解率：" + str(test_score * 100) + "%")

トレーニングデータに対する正解率：59.54601990049751%
テストデータに対する正解率：52.670807453416145%


In [11]:
#  交差検証
## 5分割し交差検証
scores = cross_val_score(clf, explanatory_variable, answers, cv=5)
## 各分割におけるスコア
print('Cross-Validation scores: {}'.format(scores))
## スコアの平均値
print('Average score: {}'.format(np.mean(scores)))

Cross-Validation scores: [0.53167702 0.56273292 0.53043478 0.51930262 0.52303861]
Average score: 0.5334371881840613


In [12]:
# モデルベース特徴量選択による特徴量選択
# estimatorとしてランダムフォレストを使用。重要度がmedian（中央値）以上のものを選択
selector_RF = SelectFromModel(RandomForestClassifier(random_state=1), threshold="median")
selector_RF.fit(X_train , y_train)

# 選択された特徴量による学習：モデルベース特徴量選択
# 選択された特徴量のみにデータを変換
X_train_RF = selector_RF.transform(X_train)
X_test_RF = selector_RF.transform(X_test)

# 学習
clf.fit(X_train_RF, y_train)

# 正解率の算出
y_train_pred_RF = clf.predict(X_train_RF)
y_val_pred_RF = clf.predict(X_test_RF)
train_score = accuracy_score(y_train, y_train_pred_RF)
test_score = accuracy_score(y_test, y_val_pred_RF)

print("モデルベース特徴量選択(LightGBM)：トレーニングデータに対する正解率：" + str(train_score * 100) + "%")
print("モデルベース特徴量選択(LightGBM)：テストデータに対する正解率：" + str(test_score * 100) + "%")

#  交差検証
## 選択された特徴量のみにデータを変換
explanatory_variable_RF = selector_RF.transform(explanatory_variable)
## 5分割し交差検証
scores = cross_val_score(clf, explanatory_variable_RF, answers, cv=5)
## 各分割におけるスコア
print('Cross-Validation scores: {}'.format(scores))
## スコアの平均値
print('Average score: {}'.format(np.mean(scores)))

モデルベース特徴量選択(LightGBM)：トレーニングデータに対する正解率：57.77363184079603%
モデルベース特徴量選択(LightGBM)：テストデータに対する正解率：54.78260869565217%
Cross-Validation scores: [0.55031056 0.54409938 0.56770186 0.53798257 0.54669988]
Average score: 0.549358848417812


In [13]:
# モデルベース特徴量選択による特徴量選択
# estimatorとしてLightGBMを使用。重要度がmedian（中央値）以上のものを選択
selector_LGBM = SelectFromModel(lgb.LGBMClassifier(random_state=1), threshold="median")
selector_LGBM.fit(X_train , y_train)

# 選択された特徴量による学習：モデルベース特徴量選択
# 選択された特徴量のみにデータを変換
X_train_LGBM = selector_LGBM.transform(X_train)
X_test_LGBM = selector_LGBM.transform(X_test)

# 学習
clf.fit(X_train_LGBM, y_train)

# 正解率の算出
y_train_pred_LGBM = clf.predict(X_train_LGBM)
y_val_pred_LGBM = clf.predict(X_test_LGBM)
train_score = accuracy_score(y_train, y_train_pred_LGBM)
test_score = accuracy_score(y_test, y_val_pred_LGBM)

print("モデルベース特徴量選択(LightGBM)：トレーニングデータに対する正解率：" + str(train_score * 100) + "%")
print("モデルベース特徴量選択(LightGBM)：テストデータに対する正解率：" + str(test_score * 100) + "%")

#  交差検証
## 選択された特徴量のみにデータを変換
explanatory_variable_LGBM = selector_LGBM.transform(explanatory_variable)
## 5分割し交差検証
scores = cross_val_score(clf, explanatory_variable_LGBM, answers, cv=5)
## 各分割におけるスコア
print('Cross-Validation scores: {}'.format(scores))
## スコアの平均値
print('Average score: {}'.format(np.mean(scores)))

モデルベース特徴量選択(LightGBM)：トレーニングデータに対する正解率：58.333333333333336%
モデルベース特徴量選択(LightGBM)：テストデータに対する正解率：53.16770186335403%
Cross-Validation scores: [0.55031056 0.56770186 0.56273292 0.5267746  0.53424658]
Average score: 0.5483533024450237


In [14]:
# モデルベース特徴量選択による特徴量選択
# estimatorとしてXGboostを使用。重要度がmedian（中央値）以上のものを選択
selector_XGB = SelectFromModel(xgb.XGBClassifier(random_state=1), threshold="median")
selector_XGB.fit(X_train , y_train)

# 選択された特徴量による学習：モデルベース特徴量選択
# 選択された特徴量のみにデータを変換
X_train_XGB = selector_XGB.transform(X_train)
X_test_XGB = selector_XGB.transform(X_test)

# 学習
clf.fit(X_train_XGB, y_train)

# 正解率の算出
y_train_pred_XGB = clf.predict(X_train_XGB)
y_val_pred_XGB = clf.predict(X_test_XGB)
train_score = accuracy_score(y_train, y_train_pred_XGB)
test_score = accuracy_score(y_test, y_val_pred_XGB)

print("モデルベース特徴量選択(XGboost)：トレーニングデータに対する正解率：" + str(train_score * 100) + "%")
print("モデルベース特徴量選択(XGboost)：テストデータに対する正解率：" + str(test_score * 100) + "%")

#  交差検証
## 選択された特徴量のみにデータを変換
explanatory_variable_XGB = selector_XGB.transform(explanatory_variable)
## 5分割し交差検証
scores = cross_val_score(clf, explanatory_variable_XGB, answers, cv=5)
## 各分割におけるスコア
print('Cross-Validation scores: {}'.format(scores))
## スコアの平均値
print('Average score: {}'.format(np.mean(scores)))

モデルベース特徴量選択(XGboost)：トレーニングデータに対する正解率：57.86691542288557%
モデルベース特徴量選択(XGboost)：テストデータに対する正解率：54.409937888198755%
Cross-Validation scores: [0.5378882  0.5552795  0.55900621 0.52428394 0.54669988]
Average score: 0.5446315447506632


In [15]:
# RFEによる特徴量選択
# estimatorとしてLightGBMを使用。上位50個の特徴量を抽出する
selector_RFE = RFE(lgb.LGBMClassifier(random_state=1), n_features_to_select=50)
selector_RFE.fit(X_train , y_train)

# 選択された特徴量のみにデータを変換
X_train_RFE = selector_RFE.transform(X_train)
X_test_RFE = selector_RFE.transform(X_test)

# 学習
clf.fit(X_train_RFE, y_train)

# 正解率の算出
y_train_pred_RFE = clf.predict(X_train_RFE)
y_val_pred_RFE = clf.predict(X_test_RFE)
train_score = accuracy_score(y_train, y_train_pred_RFE)
test_score = accuracy_score(y_test, y_val_pred_RFE)

print("RFEによる特徴量選択：トレーニングデータに対する正解率：" + str(train_score * 100) + "%")
print("RFEによる特徴量選択：テストデータに対する正解率：" + str(test_score * 100) + "%")

#  交差検証
## 選択された特徴量のみにデータを変換
explanatory_variable_RFE = selector_RFE.transform(explanatory_variable)
## 5分割し交差検証
scores = cross_val_score(clf, explanatory_variable_RFE, answers, cv=5)
## 各分割におけるスコア
print('Cross-Validation scores: {}'.format(scores))
## スコアの平均値
print('Average score: {}'.format(np.mean(scores)))

RFEによる特徴量選択：トレーニングデータに対する正解率：57.46268656716418%
RFEによる特徴量選択：テストデータに対する正解率：55.15527950310559%
Cross-Validation scores: [0.5552795  0.55776398 0.57515528 0.52303861 0.54919054]
Average score: 0.5520855796972534
