In [1]:
# 基本パッケージ（numpy,Pandas,matplotlib）
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# ランダムフォレスト
from sklearn.ensemble import RandomForestClassifier
# train_test_split（データを分割出してくれる）
from sklearn.model_selection import train_test_split
# accuracy_score（正解率を測れる）
from sklearn.metrics import accuracy_score
# グリッドサーチ（ハイパーパラメータを自動的に最適化してくれる）
from sklearn.model_selection import GridSearchCV
# 正規化
from sklearn.preprocessing import MinMaxScaler
# 交差検証用
from sklearn.model_selection import cross_val_score
# 交互作用特徴量、多項式特徴量生成
from sklearn.preprocessing import PolynomialFeatures

  from numpy.core.umath_tests import inner1d


In [2]:
# 表示関連
# DataFrameの列数設定
pd.set_option('display.max_columns', 500)

In [3]:
# 株価データの読み込み
stock_data = pd.read_csv("stock_Price_Prediction_v2.3.csv", encoding="shift-jis")

# 読み込みデータの表示
stock_data.head()

Unnamed: 0,High,Low,Adj Close,GBP,CAD,CHF,AUD,IDR(100),INR,PHP,KRW(100),THB,KWD,TWD,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,day_1,day_2,day_3,day_4,day_5,day_6,day_7,day_8,day_9,day_10,day_11,day_12,day_13,day_14,day_15,day_16,day_17,day_18,day_19,day_20,day_21,day_22,day_23,day_24,day_25,day_26,day_27,day_28,day_29,day_30,day_31,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,answer,"SEK_bin_(-0.068, -0.0105]","SEK_bin_(-0.0105, -0.00609]","SEK_bin_(-0.00609, -0.00346]","SEK_bin_(-0.00346, -0.00154]","SEK_bin_(-0.00154, 0.000598]","SEK_bin_(0.000598, 0.00221]","SEK_bin_(0.00221, 0.00413]","SEK_bin_(0.00413, 0.00655]","SEK_bin_(0.00655, 0.01]","SEK_bin_(0.01, 0.0825]","SGD_bin_(-0.0473, -0.00699]","SGD_bin_(-0.00699, -0.00403]","SGD_bin_(-0.00403, -0.00225]","SGD_bin_(-0.00225, -0.000919]","SGD_bin_(-0.000919, 0.000282]","SGD_bin_(0.000282, 0.00138]","SGD_bin_(0.00138, 0.00265]","SGD_bin_(0.00265, 0.00424]","SGD_bin_(0.00424, 0.00671]","SGD_bin_(0.00671, 0.0634]","Volume_bin_(-0.927, -0.512]","Volume_bin_(-0.512, -0.388]","Volume_bin_(-0.388, -0.276]","Volume_bin_(-0.276, -0.155]","Volume_bin_(-0.155, -0.0187]","Volume_bin_(-0.0187, 0.133]","Volume_bin_(0.133, 0.32]","Volume_bin_(0.32, 0.626]","Volume_bin_(0.626, 1.115]","Volume_bin_(1.115, 11.805]","Open_bin_(-0.09620000000000001, -0.0162]","Open_bin_(-0.0162, -0.00927]","Open_bin_(-0.00927, -0.00528]","Open_bin_(-0.00528, -0.00222]","Open_bin_(-0.00222, 0.000726]","Open_bin_(0.000726, 0.00342]","Open_bin_(0.00342, 0.00639]","Open_bin_(0.00639, 0.0102]","Open_bin_(0.0102, 0.0159]","Open_bin_(0.0159, 0.16]","ZAR_bin_(-0.155, -0.0148]","ZAR_bin_(-0.0148, -0.00856]","ZAR_bin_(-0.00856, -0.00495]","ZAR_bin_(-0.00495, -0.00209]","ZAR_bin_(-0.00209, 0.000597]","ZAR_bin_(0.000597, 0.00299]","ZAR_bin_(0.00299, 0.00571]","ZAR_bin_(0.00571, 0.00916]","ZAR_bin_(0.00916, 0.0142]","ZAR_bin_(0.0142, 0.129]",SEK,SGD,Volume,Open,ZAR,SEK^2,SEK SGD,SEK Volume,SEK Open,SEK ZAR,SGD^2,SGD Volume,SGD Open,SGD ZAR,Volume^2,Volume Open,Volume ZAR,Open^2,Open ZAR,ZAR^2
0,0.0,0.004525,0.010801,0.010485,-0.001198,0.010974,-0.001687,0.007299,0.0,0.003831,0.0,-0.003257,0.002004,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.010101,-0.000415,-0.514297,-0.004444,0.001701,0.000102,-4e-06,-0.005195,-4.5e-05,1.7e-05,1.726028e-07,0.000214,2e-06,-7.06556e-07,0.264501,0.002286,-0.000875,2e-05,-8e-06,3e-06
1,0.024889,-0.003604,0.0187,-0.002711,0.003238,-0.001622,0.00169,-0.007246,0.003663,0.0,-0.008893,-0.006536,0.003609,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,-0.003846,-0.000831,2.643314,-0.00625,0.009338,1.5e-05,3e-06,-0.010167,2.4e-05,-3.6e-05,6.909852e-07,-0.002197,5e-06,-7.762145e-06,6.987108,-0.016521,0.024683,3.9e-05,-5.8e-05,8.7e-05
2,0.004337,0.028933,0.000874,-0.00068,0.000837,0.002,-0.004639,0.0,-0.00365,-0.003817,-0.000997,0.003289,-0.003367,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,-0.001544,0.001941,0.208818,0.026954,0.010934,2e-06,-3e-06,-0.000322,-4.2e-05,-1.7e-05,3.768293e-06,0.000405,5.2e-05,2.122432e-05,0.043605,0.005629,0.002283,0.000727,0.000295,0.00012
3,-0.011226,-0.002636,-0.003493,-0.007273,-0.009078,-0.008855,-0.006779,0.0,-0.007326,-0.003831,0.000998,-0.003279,-0.006665,-0.005236,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,-0.00464,-0.005674,-0.317789,0.0,-0.015807,2.2e-05,2.6e-05,0.001475,-0.0,7.3e-05,3.219377e-05,0.001803,-0.0,8.968814e-05,0.10099,-0.0,0.005023,0.0,-0.0,0.00025
4,0.001747,-0.003524,-0.002629,-0.006324,-0.001929,-0.005914,-0.008105,0.007299,-0.00369,-0.007692,-0.012961,-0.003289,-0.005992,-0.007895,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,-0.003885,-0.001253,-0.435145,-0.004374,0.007608,1.5e-05,5e-06,0.001691,1.7e-05,-3e-05,1.569031e-06,0.000545,5e-06,-9.529574e-06,0.189351,0.001904,-0.00331,1.9e-05,-3.3e-05,5.8e-05


In [4]:
# DataFrameをNumPy配列へ変換
# 正解ラベルの変換
answers = stock_data.answer.values
# 説明変数の変換
stock_data.drop(columns=['answer'], inplace=True)
explanatory_variable = stock_data.values

In [5]:
# データの正規化        
ms = MinMaxScaler()
ms.fit(explanatory_variable)
explanatory_variable = ms.transform(explanatory_variable)

In [6]:
# データの分割（データの80%を訓練用に、20％をテスト用に分割する）
X_train, X_test, y_train, y_test = train_test_split(explanatory_variable, answers, test_size=0.2, random_state=1,shuffle = False)

In [7]:
# グリッドサーチするパラメータを設定
parameters = {'n_estimators':[1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}

# グリッドサーチを実行
rf = RandomForestClassifier(random_state=1);
grid_search = GridSearchCV(rf, param_grid=parameters, cv=10)
grid_search = grid_search.fit(X_train , y_train)
 
# グリッドサーチ結果(最適パラメータ)を取得
best_params = grid_search.best_params_
print ("最適パラメータ：{}".format(grid_search.best_params_))

最適パラメータ：{'n_estimators': 100}


In [8]:
# 最適パラメーターを指定して学習
#clf = RandomForestClassifier(n_estimators=best_params['n_estimators'], random_state=1)
clf = RandomForestClassifier(random_state=1)
clf.fit(X_train , y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [9]:
# 正解率の計算
train_score = clf.score(X_train , y_train)
test_score = clf.score(X_test, y_test)
# 正解率を表示
print("トレーニングデータに対する正解率：" + str(train_score * 100) + "%")
print("テストデータに対する正解率：" + str(test_score * 100) + "%")

トレーニングデータに対する正解率：97.97444686818324%
テストデータに対する正解率：49.19053549190536%


In [10]:
#  交差検証
## 5分割し交差検証
scores = cross_val_score(clf, explanatory_variable, answers, cv = 5)
## 各分割におけるスコア
print('Cross-Validation scores: {}'.format(scores))
## スコアの平均値
print('Average score: {}'.format(np.mean(scores)))

Cross-Validation scores: [0.5143213  0.50809465 0.49937733 0.49376559 0.52059925]
Average score: 0.507231622437835
