In [1]:
# 基本パッケージ（numpy,Pandas,matplotlib）
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# サポートベクターマシーン
from sklearn import svm
# train_test_split（データを分割出してくれる）
from sklearn.model_selection import train_test_split
# accuracy_score（正解率を測れる）
from sklearn.metrics import accuracy_score

# グリッドサーチ（ハイパーパラメータを自動的に最適化してくれる）
from sklearn.model_selection import GridSearchCV
# 正規化、標準化用
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# 特徴量選択用
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
# 交差検証用
from sklearn.model_selection import cross_val_score
# 多項式特徴量生成用
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# Display settings.
# Allow pandas to render up to 500 columns when showing a DataFrame.
pd.options.display.max_columns = 500

In [3]:
# Load the stock price dataset; the file is decoded as Shift-JIS.
stock_data = pd.read_csv(
    "stock_Price_Prediction_v3.csv",
    encoding="shift-jis",
)

# Preview the first rows of the loaded frame.
stock_data.head()

Unnamed: 0,始値,高値,安値,出来高,終値調整値,米ドル,英ポンド,ユーロ,カナダドル,スイスフラン,スウェーデンクローネ,デンマーククローネ,ノルウェークローネ,オーストラリアドル,ニュージーランドドル,南アフリカランド,バーレーンディナール,インドネシアルピア,中国人民元,香港ドル,インドルピー,フィリピンペソ,シンガポールドル,韓国ウォン,タイバーツ,クウェートディナール,サウジアラビアリアル,UAEディルハム,メキシコペソ,台湾ドル,Open000001.SS,High000001.SS,Low000001.SS,Adj Close000001.SS,Open^AORD,High^AORD,Low^AORD,Adj Close^AORD,Open^AXJO,High^AXJO,Low^AXJO,Adj Close^AXJO,Open^BFX,High^BFX,Low^BFX,Adj Close^BFX,Open^BSESN,High^BSESN,Low^BSESN,Adj Close^BSESN,Open^BVSP,High^BVSP,Low^BVSP,Adj Close^BVSP,Open^DJI,High^DJI,Low^DJI,Adj Close^DJI,Open^FCHI,High^FCHI,Low^FCHI,Adj Close^FCHI,Open^GDAXI,High^GDAXI,Low^GDAXI,Adj Close^GDAXI,Open^GSPC,High^GSPC,Low^GSPC,Adj Close^GSPC,Open^GSPTSE,High^GSPTSE,Low^GSPTSE,Adj Close^GSPTSE,Open^HSI,High^HSI,Low^HSI,Adj Close^HSI,Open^IPSA,High^IPSA,Low^IPSA,Adj Close^IPSA,Open^IXIC,High^IXIC,Low^IXIC,Adj Close^IXIC,Open^JKSE,High^JKSE,Low^JKSE,Adj Close^JKSE,Open^KS11,High^KS11,Low^KS11,Adj Close^KS11,Open^MERV,High^MERV,Low^MERV,Adj Close^MERV,Open^MXX,High^MXX,Low^MXX,Adj Close^MXX,Open^N100,High^N100,Low^N100,Adj Close^N100,Open^NYA,High^NYA,Low^NYA,Adj Close^NYA,Open^RUT,High^RUT,Low^RUT,Adj Close^RUT,Open^STOXX50E,High^STOXX50E,Low^STOXX50E,Adj Close^STOXX50E,Open^TA125.TA,High^TA125.TA,Low^TA125.TA,Adj Close^TA125.TA,Open^TWII,High^TWII,Low^TWII,Adj Close^TWII,Open^VIX,High^VIX,Low^VIX,Adj Close^VIX,Open^XAX,High^XAX,Low^XAX,Adj Close^XAX,answer
0,-0.00625,0.024889,-0.003604,2.643314,0.0187,0.0,-0.002711,-0.001877,0.003238,-0.001622,-0.003846,-0.001268,-0.000656,0.00169,-0.000849,0.009338,0.0,-0.007246,0.0,0.0,0.003663,0.0,-0.000831,-0.008893,-0.006536,0.003609,0.0,0.0,-0.002027,0.0,-0.016024,-0.004302,0.004251,0.007344,-0.007255,-0.006006,-0.00102,-0.002846,-0.008053,-0.007226,-0.00071,-0.002805,-0.004067,0.006438,0.002904,0.010276,-0.001324,-0.010327,-0.009173,-0.012208,-0.014983,-0.014038,-0.013892,-0.01208,-0.003927,-0.001217,-0.012214,-0.011191,-0.011637,-0.009375,-0.009456,0.001065,-0.016036,-0.010612,-0.004488,-0.005506,-0.00853,-0.006707,-0.014114,-0.009993,-0.003407,-0.004167,-0.006936,-0.006474,-0.018231,-0.01828,-0.003118,-0.004052,-0.009256,-0.003805,-0.000105,0.005605,-0.014401,-0.014141,-0.018726,-0.011112,0.003838,0.00059,0.004085,-0.001906,0.023984,0.016071,0.030788,0.014635,-0.00259,-0.027515,-0.014697,-0.008481,-0.007267,-0.009166,-0.017076,-0.01705,-0.006375,-0.007674,-0.00725,-0.00155,-0.009459,-0.009459,-0.009459,-0.009459,-0.007948,-0.005887,-0.009231,-0.007772,0.001197,0.001197,0.001197,0.001197,0.0,0.0,0.0,0.0,-0.004409,-0.003112,-0.003727,0.008201,0.015167,0.054611,0.014644,0.05428,0.001202,-0.003871,-0.007277,-0.008593,1.0
1,0.026954,0.004337,0.028933,0.208818,0.000874,-0.000751,-0.00068,0.001624,0.000837,0.002,-0.001544,0.00127,0.006566,-0.004639,-0.004931,0.010934,-0.000735,0.0,-0.000622,-0.001171,-0.00365,-0.003817,0.001941,-0.000997,0.003289,-0.003367,-0.000844,-0.000827,-0.001354,0.0,0.007755,0.02478,0.000608,0.029596,-0.002965,-0.004128,-0.003605,-0.001832,-0.002805,-0.004479,-0.003582,-0.001539,0.007597,0.008347,0.00903,0.007857,-0.010703,0.008073,0.001284,0.014311,-0.014307,0.001873,-0.001531,0.021016,-0.010887,-0.007542,0.001753,0.003616,-0.002981,-0.004875,-0.010073,-0.020282,-0.001732,-0.006093,-0.010315,-0.005091,-0.009993,-0.007376,0.000339,0.000835,-0.004867,-0.003182,-0.007444,-0.00737,-0.005971,-0.001443,-0.003247,-0.000239,0.006228,0.006706,0.00644,0.003402,-0.018269,-0.00691,-0.000254,0.003026,0.00156,0.021175,0.003847,0.029382,0.0144,0.001065,0.011734,-0.000631,-0.004941,0.004482,-0.026619,-0.037923,-0.017218,-0.008022,-0.003079,0.00856,-0.005135,-0.005065,-0.012784,-0.016449,0.000559,0.000559,0.000559,0.000559,-0.007772,-0.005882,-0.002279,0.003564,-0.015297,-0.015297,-0.015297,-0.015297,-0.005912,-0.005912,-0.01763,-0.007909,0.013378,0.00014,-0.006939,-0.013912,0.025245,-0.018564,0.003093,-0.020792,-0.008593,-0.008558,-0.006889,-0.004813,1.0
2,-0.004374,-0.009499,-0.006151,-0.614649,-0.006114,-0.011645,-0.013551,-0.012463,-0.010989,-0.014717,-0.008507,-0.012682,-0.013046,-0.014828,-0.018626,-0.008319,-0.011654,0.007299,-0.011816,-0.011137,-0.010989,-0.011494,-0.006919,-0.011976,-0.006557,-0.012617,-0.011549,-0.011865,-0.008814,-0.013089,0.024334,8.3e-05,0.029952,-0.000639,0.003364,0.006878,0.004884,0.005236,0.003612,0.007164,0.004784,0.004863,0.000212,-0.01085,-0.010517,-0.018676,0.011203,-0.002156,0.004516,-0.009218,0.01207,-0.009125,0.005598,-0.01497,0.005821,-0.000289,-0.003583,0.001359,-0.021868,-0.023736,-0.026628,-0.023451,-0.00432,-0.005435,-0.016064,-0.0142,-0.002373,-0.004458,-0.007384,-0.000932,-0.012017,-0.012541,-0.006972,-0.00474,0.006186,0.0017,-0.004238,-0.009942,0.004019,-0.002044,-0.002171,-0.003391,-0.019943,-0.008013,-0.020518,-0.002168,0.039791,0.031033,0.03946,0.025035,0.008642,0.005821,-0.009441,-0.018867,-0.078833,-0.085759,-0.075318,-0.07168,0.018699,0.008841,0.014083,0.002519,-0.019272,-0.020785,-0.022169,-0.02062,0.001066,0.001066,0.001066,0.001066,0.002336,0.008905,-0.001981,0.00931,-0.0234,-0.0234,-0.0234,-0.0234,0.01953,0.01953,0.010708,0.005261,-0.015727,-0.009671,0.002065,-0.00262,0.022613,0.018417,0.005653,-0.008595,-0.01032,-0.004162,-0.002664,0.00093,0.0
3,0.0,-0.004359,-0.012378,1.40135,-0.017575,0.0,-0.00122,-0.005532,-0.004227,-0.006835,-0.01014,-0.00578,-0.003305,-0.003727,-0.003831,-0.010906,0.0,0.007246,0.0,0.0,0.0,0.0,-0.001254,0.00101,-0.0033,0.003608,0.0,0.000559,-0.002052,0.002653,0.004906,0.017149,0.005901,0.016379,3e-05,-0.003341,-0.00627,-0.007544,-0.000295,-0.003733,-0.006446,-0.007554,-0.00614,-0.004728,-0.001337,-0.004236,-0.005591,-0.008465,-0.008954,-0.004839,-0.006265,-0.000377,0.002516,0.00266,-0.000884,0.004492,0.00793,-0.003943,-0.007926,-0.005144,0.009511,0.007227,-0.012788,-0.006301,0.007453,-0.001946,0.00228,0.002559,0.004443,-0.006656,0.002533,-0.000927,-0.002345,-0.003291,-0.011269,-0.006038,-0.001285,0.000934,-0.004619,0.003897,0.000725,0.007223,0.027592,0.005161,0.004937,-0.024246,0.012158,0.004893,0.009956,0.009554,-0.018505,-0.02264,-0.017562,-0.013345,-0.03197,-0.010411,0.001199,0.002076,-0.007231,-0.004142,-0.004802,-7.6e-05,-0.005894,-0.00207,0.008837,0.005615,-0.001352,-0.001352,-0.001352,-0.001352,0.010547,0.004811,0.018106,0.0,0.005068,0.005068,0.005068,0.005068,-0.015976,-0.00247,-0.005606,-0.006435,0.002925,-0.001657,-0.017512,-0.019542,-0.051106,-0.035679,-0.013797,-0.007139,0.006499,-1.1e-05,-0.001625,-0.006656,1.0
4,-0.021968,-0.007881,-0.007162,-0.424734,0.006261,-0.006461,-0.002496,0.000956,-0.006307,0.000127,-0.004728,0.000646,-0.001326,-0.001295,-0.002972,-0.00933,-0.006468,-0.014388,-0.006293,-0.00652,-0.007407,-0.003876,-0.006976,-0.007064,-0.003311,-0.00712,-0.006555,-0.006698,-0.006854,-0.007937,0.017636,0.003634,0.013885,0.006078,-0.007544,-0.003262,-0.000876,0.00365,-0.007554,-0.002744,-0.000625,0.004162,-0.005157,0.000181,-0.003328,0.008132,-0.004916,0.000742,0.003553,0.004695,0.001367,0.015253,0.002662,0.016749,-0.003848,0.008703,0.000909,0.016952,-0.007434,0.011369,-0.007457,0.012975,-0.00464,0.00881,-0.005271,0.018396,-0.006656,0.003075,0.000958,0.011335,-0.002643,0.00566,0.004454,0.010101,-0.001693,-0.009095,-0.007291,-0.008436,0.007223,0.004609,0.006504,0.00123,-0.020999,-0.013154,-0.004999,0.01406,0.012237,0.025314,0.013234,0.026552,-0.024021,-0.025606,-0.032744,-0.036729,0.002072,0.007663,0.006819,0.012847,0.000796,0.030617,0.005835,0.033895,-0.006119,0.007575,-0.006073,0.010621,0.010968,0.010968,0.010968,0.010968,0.0,0.011594,0.000617,0.016481,0.010351,0.010351,0.010351,0.010351,-0.006435,-0.003108,-0.006339,0.015024,-0.029993,-0.02454,-0.008664,-0.001753,-0.004661,-0.022808,-0.058549,-0.065742,-0.006656,0.006081,0.000959,0.012834,0.0


In [4]:
# Convert the DataFrame into NumPy arrays.
# Target labels.
answers = stock_data["answer"].values
# Features: every column except the label.
# "Unnamed: 0" is the row counter stored in the CSV (0, 1, 2, ... in the preview),
# not a market signal, so it is excluded as well; errors="ignore" keeps this
# working when that column is absent.
# drop() returns a new frame instead of mutating stock_data, so this cell can be
# re-run without failing on an already-removed "answer" column.
feature_frame = stock_data.drop(columns=["answer", "Unnamed: 0"], errors="ignore")
explanatory_variable = feature_frame.values

In [5]:
# Rescale every feature with min-max scaling.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so the test rows contribute to the learned min/max. Consider
# fitting on the training rows only.
ms = MinMaxScaler()
explanatory_variable = ms.fit_transform(explanatory_variable)

In [6]:
# Add degree-2 polynomial features (no constant bias column).
poly = PolynomialFeatures(degree=2, include_bias=False)
explanatory_variable_poly = poly.fit_transform(explanatory_variable)

In [7]:
# Split the data: 80% for training, 20% for testing.
# shuffle=False keeps the original row order, so the test set is the trailing
# 20% of the rows.
X_train, X_test, y_train, y_test = train_test_split(
    explanatory_variable_poly,
    answers,
    test_size=0.2,
    random_state=1,
    shuffle=False,
)

In [None]:
# Hyperparameter grid to search over.
parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'loss': ['hinge', 'squared_hinge'],
}

# Run the grid search with 10-fold cross-validation on the training data.
clf = GridSearchCV(svm.LinearSVC(random_state=1), parameters, cv=10)
clf.fit(X_train, y_train)

# Fetch the best hyperparameters by key rather than unpacking .values(),
# so the result does not depend on the ordering of the best_params_ dict.
best_params = clf.best_params_
GS_C = best_params['C']
GS_loss = best_params['loss']
print ("最適パラメータ：{}".format(best_params))

In [None]:
# Train a fresh LinearSVC with the hyperparameters chosen by the grid search.
best_svc_settings = {"loss": GS_loss, "C": GS_C, "random_state": 1}
clf = svm.LinearSVC(**best_svc_settings)
clf.fit(X_train, y_train)

In [None]:
# Evaluate the trained model.
# Predict on the training rows first, then on the held-out test rows.
y_train_pred, y_val_pred = (clf.predict(features) for features in (X_train, X_test))

In [None]:
# Compute accuracy on the training and test predictions.
train_score = accuracy_score(y_train, y_train_pred)
test_score = accuracy_score(y_test, y_val_pred)
# Report both accuracies as percentages.
print("トレーニングデータに対する正解率：", train_score * 100, "%", sep="")
print("テストデータに対する正解率：", test_score * 100, "%", sep="")

In [None]:
# Cross-validation.
## 10-fold cross-validation over the full polynomial feature matrix and labels.
scores = cross_val_score(estimator=clf, X=explanatory_variable_poly, y=answers, cv=10)
## Score of each fold.
print('Cross-Validation scores: {}'.format(scores))
## Mean of the fold scores.
print('Average score: {}'.format(scores.mean()))

In [None]:
# Model-based feature selection.
# A RandomForestRegressor supplies the feature importances; features whose
# importance is at or above the median are kept.
forest = RandomForestRegressor(n_estimators=100, random_state=1)
selector_SFM = SelectFromModel(forest, threshold="median")
selector_SFM.fit(X_train, y_train)

# Reduce both splits to the selected features only.
X_train_SFM = selector_SFM.transform(X_train)
X_test_SFM = selector_SFM.transform(X_test)

# Train on the selected features.
# NOTE(review): this re-fits the existing clf in place on the reduced feature
# set, and train_score / test_score below overwrite the earlier values.
clf.fit(X_train_SFM, y_train)

# Compute accuracy on both splits.
y_train_pred_SFM = clf.predict(X_train_SFM)
y_val_pred_SFM = clf.predict(X_test_SFM)
train_score = accuracy_score(y_train, y_train_pred_SFM)
test_score = accuracy_score(y_test, y_val_pred_SFM)

print("モデルベース特徴量選択：トレーニングデータに対する正解率：", train_score * 100, "%", sep="")
print("モデルベース特徴量選択：テストデータに対する正解率：", test_score * 100, "%", sep="")