In [1]:
# 基本パッケージ（numpy,Pandas,matplotlib）
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# XGboost
import xgboost as xgb
# train_test_split（データを分割出してくれる）
from sklearn.model_selection import train_test_split
# accuracy_score（正解率を測れる）

from sklearn.metrics import accuracy_score
# グリッドサーチ
from sklearn.model_selection import GridSearchCV
# 正規化、標準化用
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# 特徴量選択用
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
# 交差検証用
from sklearn.model_selection import cross_val_score
# 多項式特徴量生成用
from sklearn.preprocessing import PolynomialFeatures
# warningの抑制
import warnings
# モデルの保存
from sklearn.externals import joblib

  from numpy.core.umath_tests import inner1d


In [2]:
# 表示関連
# DataFrameの列数設定
pd.set_option('display.max_columns', 500)
warnings.filterwarnings('ignore')

In [3]:
# 株価データの読み込み
stock_data = pd.read_csv("stock_Price_Prediction_v3.1.csv", encoding="shift-jis")

# 読み込みデータの表示
stock_data.head()

Unnamed: 0,Open,High,Low,Volume,Adj Close,Open000001.SS,High000001.SS,Low000001.SS,Adj Close000001.SS,Open^AXJO,High^AXJO,Low^AXJO,Adj Close^AXJO,Open^BSESN,High^BSESN,Low^BSESN,Adj Close^BSESN,Open^BVSP,High^BVSP,Low^BVSP,Adj Close^BVSP,Open^GDAXI,High^GDAXI,Low^GDAXI,Adj Close^GDAXI,Open^GSPTSE,High^GSPTSE,Low^GSPTSE,Adj Close^GSPTSE,Open^HSI,High^HSI,Low^HSI,Adj Close^HSI,Open^IPSA,High^IPSA,Low^IPSA,Adj Close^IPSA,Open^IXIC,High^IXIC,Low^IXIC,Adj Close^IXIC,Open^JKSE,High^JKSE,Low^JKSE,Adj Close^JKSE,Open^KS11,High^KS11,Low^KS11,Adj Close^KS11,Open^MERV,High^MERV,Low^MERV,Adj Close^MERV,Open^MXX,High^MXX,Low^MXX,Adj Close^MXX,Open^TWII,High^TWII,Low^TWII,Adj Close^TWII,Open^VIX,High^VIX,Low^VIX,Adj Close^VIX,Open^XAX,High^XAX,Low^XAX,Adj Close^XAX,GBP,EUR,CAD,CHF,NZD,ZAR,BHD,IDR(100),INR,KRW(100),THB,MXN,answer
0,-0.00625,0.024889,-0.003604,2.643314,0.0187,-0.016024,-0.004302,0.004251,0.007344,-0.008053,-0.007226,-0.00071,-0.002805,-0.001324,-0.010327,-0.009173,-0.012208,-0.014983,-0.014038,-0.013892,-0.01208,-0.016036,-0.010612,-0.004488,-0.005506,-0.003407,-0.004167,-0.006936,-0.006474,-0.018231,-0.01828,-0.003118,-0.004052,-0.009256,-0.003805,-0.000105,0.005605,-0.014401,-0.014141,-0.018726,-0.011112,0.003838,0.00059,0.004085,-0.001906,0.023984,0.016071,0.030788,0.014635,-0.00259,-0.027515,-0.014697,-0.008481,-0.007267,-0.009166,-0.017076,-0.01705,-0.004409,-0.003112,-0.003727,0.008201,0.015167,0.054611,0.014644,0.05428,0.001202,-0.003871,-0.007277,-0.008593,-0.002711,-0.001877,0.003238,-0.001622,-0.000849,0.009338,0.0,-0.007246,0.003663,-0.008893,-0.006536,-0.002027,1.0
1,0.026954,0.004337,0.028933,0.208818,0.000874,0.007755,0.02478,0.000608,0.029596,-0.002805,-0.004479,-0.003582,-0.001539,-0.010703,0.008073,0.001284,0.014311,-0.014307,0.001873,-0.001531,0.021016,-0.001732,-0.006093,-0.010315,-0.005091,-0.004867,-0.003182,-0.007444,-0.00737,-0.005971,-0.001443,-0.003247,-0.000239,0.006228,0.006706,0.00644,0.003402,-0.018269,-0.00691,-0.000254,0.003026,0.00156,0.021175,0.003847,0.029382,0.0144,0.001065,0.011734,-0.000631,-0.004941,0.004482,-0.026619,-0.037923,-0.017218,-0.008022,-0.003079,0.00856,0.013378,0.00014,-0.006939,-0.013912,0.025245,-0.018564,0.003093,-0.020792,-0.008593,-0.008558,-0.006889,-0.004813,-0.00068,0.001624,0.000837,0.002,-0.004931,0.010934,-0.000735,0.0,-0.00365,-0.000997,0.003289,-0.001354,0.0
2,0.0,-0.011226,-0.002636,-0.317789,-0.003493,0.034787,0.008247,0.02836,-0.004298,-0.001539,0.00524,0.001783,0.005159,0.015344,0.002683,0.010328,-0.003411,0.020779,0.00561,0.014723,-0.006287,-0.006381,0.000597,0.004482,0.001062,-0.002822,-0.006792,0.000129,-0.000308,0.0,0.0,0.0,0.0,0.002781,0.000104,0.001543,0.001339,0.011477,0.001322,-0.000119,-0.011018,0.022725,0.014726,0.022725,0.012923,0.0,0.0,0.0,0.0,-0.041345,-0.035672,-0.044454,-0.033684,0.008841,0.013239,0.011796,0.011341,0.0,0.0,0.0,0.0,-0.028643,-0.034345,-0.034943,-0.032862,0.0,-0.000231,-0.004151,-0.005533,-0.007273,-0.007085,-0.009078,-0.008855,-0.002734,-0.015807,-0.006025,0.0,-0.007326,0.000998,-0.003279,-0.006102,1.0
3,-0.004374,0.001747,-0.003524,-0.435145,-0.002629,-0.010102,-0.008097,0.001548,0.003675,0.005159,0.001914,0.002996,-0.000295,-0.004078,-0.004827,-0.005753,-0.005828,-0.008532,-0.014652,-0.008993,-0.008738,0.002074,-0.006028,-0.020454,-0.015246,-0.009221,-0.005789,-0.0071,-0.004433,0.006186,0.0017,-0.004238,-0.009942,0.001235,-0.002148,-0.003708,-0.004723,-0.031064,-0.009322,-0.020402,0.008949,0.016687,0.01607,0.016363,0.011957,0.008642,0.005821,-0.009441,-0.018867,-0.039105,-0.05194,-0.0323,-0.039321,0.009772,-0.004341,0.002261,-0.008723,-0.015727,-0.009671,0.002065,-0.00262,0.052768,0.054639,0.042066,0.025092,-0.01032,-0.003931,0.001493,0.006499,-0.006324,-0.005416,-0.001929,-0.005914,-0.015936,0.007608,-0.005663,0.007299,-0.00369,-0.012961,-0.003289,-0.002729,0.0
4,0.0,-0.004359,-0.012378,1.40135,-0.017575,0.004906,0.017149,0.005901,0.016379,-0.000295,-0.003733,-0.006446,-0.007554,-0.005591,-0.008465,-0.008954,-0.004839,-0.006265,-0.000377,0.002516,0.00266,-0.012788,-0.006301,0.007453,-0.001946,0.002533,-0.000927,-0.002345,-0.003291,-0.011269,-0.006038,-0.001285,0.000934,-0.004619,0.003897,0.000725,0.007223,0.027592,0.005161,0.004937,-0.024246,0.012158,0.004893,0.009956,0.009554,-0.018505,-0.02264,-0.017562,-0.013345,-0.03197,-0.010411,0.001199,0.002076,-0.007231,-0.004142,-0.004802,-7.6e-05,0.002925,-0.001657,-0.017512,-0.019542,-0.051106,-0.035679,-0.013797,-0.007139,0.006499,-1.1e-05,-0.001625,-0.006656,-0.00122,-0.005532,-0.004227,-0.006835,-0.003831,-0.010906,0.0,0.007246,0.0,0.00101,-0.0033,-0.002052,1.0


In [4]:
# DataFrameをNumPy配列へ変換
# 正解ラベルの変換
answers = stock_data.answer.values
# 説明変数の変換
stock_data.drop(columns=['answer'], inplace=True)
explanatory_variable = stock_data.values

In [5]:
# スケール変換       
ms = MinMaxScaler()
ms.fit(explanatory_variable)
explanatory_variable = ms.transform(explanatory_variable)

In [6]:
# データの分割（データの80%を訓練用に、20％をテスト用に分割する）
X_train, X_test, y_train, y_test = train_test_split(explanatory_variable , answers, test_size=0.2, random_state=1,shuffle = False)

In [7]:
# グリッドサーチするパラメータを設定
parameters = {"learning_rate":[0.01,0.05,0.1,0.15,0.2],
              "max_depth": [3,4,5,6,7,8,9,10],
              "subsample":[0.5,0.6,0.7,0.8,0.9,1],
              "colsample_bytree": [0.5,0.6,0.7,0.8,0.9,1],
              }

# グリッドサーチを実行
xg = joblib.load('xgb.pkl');
grid_search = GridSearchCV(xg, param_grid=parameters, cv=10)
grid_search = grid_search.fit(X_train , y_train)
 
# グリッドサーチ結果(最適パラメータ)を取得
GS_colsample_bytree, GS_learning_rate, GS_max_depth, GS_subsample = grid_search.best_params_.values()
print ("最適パラメータ：{}".format(grid_search.best_params_))

最適パラメータ：{'colsample_bytree': 0.7, 'learning_rate': 0.15, 'max_depth': 3, 'subsample': 0.7}


In [8]:
# 最適パラメーターを指定して学習
clf = xgb.XGBClassifier(colsample_bytree=GS_colsample_bytree, learning_rate=GS_learning_rate, 
                        max_depth=GS_max_depth, subsample=GS_subsample, random_state=1)
clf.fit(X_train , y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, learning_rate=0.15, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=1,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.7)

In [9]:
# 学習後のモデルによるテスト
# トレーニングデータを用いた予測
y_train_pred = clf.predict(X_train)
# テストデータを用いた予測
y_val_pred = clf.predict(X_test)

In [10]:
# 正解率の計算
train_score = clf.score(X_train , y_train)
test_score = clf.score(X_test, y_test)
# 正解率を表示
print("トレーニングデータに対する正解率：" + str(train_score * 100) + "%")
print("テストデータに対する正解率：" + str(test_score * 100) + "%")

トレーニングデータに対する正解率：83.88362209872507%
テストデータに対する正解率：52.54901960784314%


In [11]:
#  交差検証
## 10分割し交差検証
scores = cross_val_score(clf, explanatory_variable, answers, cv=10)
## 各分割におけるスコア
print('Cross-Validation scores: {}'.format(scores))
## スコアの平均値
print('Average score: {}'.format(np.mean(scores)))

Cross-Validation scores: [0.48041775 0.56657963 0.54830287 0.49869452 0.54308094 0.54308094
 0.5078534  0.53664921 0.5144357  0.55380577]
Average score: 0.5292900745580768


In [12]:
# 学習結果の保存
joblib.dump(clf, 'xgb.pkl')

['xgb.pkl']