In [1]:
# 基本パッケージ（numpy,Pandas,matplotlib）
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# 線形サポートベクターマシーン
from sklearn.svm import LinearSVC
# ランダムフォレスト
from sklearn.ensemble import RandomForestClassifier
# train_test_split（データを分割してくれる）
from sklearn.model_selection import train_test_split
# accuracy_score（正解率を測れる）
from sklearn.metrics import accuracy_score
# グリッドサーチ（ハイパーパラメータを自動的に最適化してくれる）
from sklearn.model_selection import GridSearchCV
# 正規化
from sklearn.preprocessing import MinMaxScaler
# 特徴量選択用
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
# 交差検証
from sklearn.model_selection import cross_val_score

  from numpy.core.umath_tests import inner1d


In [2]:
# Display settings
# Allow pandas to render up to 500 columns of a DataFrame before truncating.
pd.options.display.max_columns = 500

In [3]:
# Load the stock price dataset; the CSV file is decoded as Shift-JIS.
stock_data = pd.read_csv(
    filepath_or_buffer="stock_Price_Prediction_v2.2.csv",
    encoding="shift-jis",
)

# Preview the first five rows of the loaded data.
stock_data.head(n=5)

Unnamed: 0,Open,High,Low,Volume,Adj Close,GBP,CAD,CHF,SEK,AUD,ZAR,IDR(100),INR,PHP,SGD,KRW(100),THB,KWD,TWD,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,day_1,day_2,day_3,day_4,day_5,day_6,day_7,day_8,day_9,day_10,day_11,day_12,day_13,day_14,day_15,day_16,day_17,day_18,day_19,day_20,day_21,day_22,day_23,day_24,day_25,day_26,day_27,day_28,day_29,day_30,day_31,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,answer,"SEK_bin_(-0.068, -0.0105]","SEK_bin_(-0.0105, -0.0061]","SEK_bin_(-0.0061, -0.00346]","SEK_bin_(-0.00346, -0.00154]","SEK_bin_(-0.00154, 0.000598]","SEK_bin_(0.000598, 0.00221]","SEK_bin_(0.00221, 0.00413]","SEK_bin_(0.00413, 0.00656]","SEK_bin_(0.00656, 0.01]","SEK_bin_(0.01, 0.0825]","SGD_bin_(-0.0473, -0.00699]","SGD_bin_(-0.00699, -0.00403]","SGD_bin_(-0.00403, -0.00225]","SGD_bin_(-0.00225, -0.000917]","SGD_bin_(-0.000917, 0.000283]","SGD_bin_(0.000283, 0.00139]","SGD_bin_(0.00139, 0.00265]","SGD_bin_(0.00265, 0.00424]","SGD_bin_(0.00424, 0.00671]","SGD_bin_(0.00671, 0.0634]","Volume_bin_(-0.927, -0.512]","Volume_bin_(-0.512, -0.388]","Volume_bin_(-0.388, -0.276]","Volume_bin_(-0.276, -0.155]","Volume_bin_(-0.155, -0.0192]","Volume_bin_(-0.0192, 0.133]","Volume_bin_(0.133, 0.319]","Volume_bin_(0.319, 0.625]","Volume_bin_(0.625, 1.106]","Volume_bin_(1.106, 11.805]","Open_bin_(-0.09620000000000001, -0.0162]","Open_bin_(-0.0162, -0.00927]","Open_bin_(-0.00927, -0.00528]","Open_bin_(-0.00528, -0.00222]","Open_bin_(-0.00222, 0.000725]","Open_bin_(0.000725, 0.00342]","Open_bin_(0.00342, 0.00639]","Open_bin_(0.00639, 0.0102]","Open_bin_(0.0102, 0.0159]","Open_bin_(0.0159, 0.16]","ZAR_bin_(-0.155, -0.0148]","ZAR_bin_(-0.0148, -0.00855]","ZAR_bin_(-0.00855, -0.00494]","ZAR_bin_(-0.00494, -0.00209]","ZAR_bin_(-0.00209, 0.000603]","ZAR_bin_(0.000603, 0.00299]","ZAR_bin_(0.00299, 0.00571]","ZAR_bin_(0.00571, 0.00916]","ZAR_bin_(0.00916, 0.0142]","ZAR_bin_(0.0142, 0.129]"
0,-0.004444,0.0,0.004525,-0.514297,0.010801,0.010485,-0.001198,0.010974,0.010101,-0.001687,0.001701,0.007299,0.0,0.003831,-0.000415,0.0,-0.003257,0.002004,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,-0.00625,0.024889,-0.003604,2.643314,0.0187,-0.002711,0.003238,-0.001622,-0.003846,0.00169,0.009338,-0.007246,0.003663,0.0,-0.000831,-0.008893,-0.006536,0.003609,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,0.026954,0.004337,0.028933,0.208818,0.000874,-0.00068,0.000837,0.002,-0.001544,-0.004639,0.010934,0.0,-0.00365,-0.003817,0.001941,-0.000997,0.003289,-0.003367,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
3,0.0,-0.011226,-0.002636,-0.317789,-0.003493,-0.007273,-0.009078,-0.008855,-0.00464,-0.006779,-0.015807,0.0,-0.007326,-0.003831,-0.005674,0.000998,-0.003279,-0.006665,-0.005236,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,-0.004374,0.001747,-0.003524,-0.435145,-0.002629,-0.006324,-0.001929,-0.005914,-0.003885,-0.008105,0.007608,0.007299,-0.00369,-0.007692,-0.001253,-0.012961,-0.003289,-0.005992,-0.007895,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [4]:
# Convert the DataFrame into NumPy arrays.
# Target labels: pop() removes the 'answer' column from stock_data in place
# and returns it, so only the explanatory columns remain afterwards.
answers = stock_data.pop('answer').values
# Explanatory variables: every column that is left in stock_data.
explanatory_variable = stock_data.values

In [5]:
# Normalise the explanatory variables with a MinMaxScaler (default settings).
# NOTE(review): the scaler is fit on every row, including the rows that the
# train/test split further down assigns to the test set, so the test data
# influences the scaling parameters. Confirm whether that is acceptable.
ms = MinMaxScaler()
explanatory_variable = ms.fit_transform(explanatory_variable)

In [6]:
# Split the data: 80% for training and 20% for testing.
# shuffle=False keeps the original row order, so the test set is the final
# 20% of the rows.
X_train, X_test, y_train, y_test = train_test_split(
    explanatory_variable,
    answers,
    test_size=0.2,
    random_state=1,
    shuffle=False,
)

In [7]:
# Hyper-parameter grid to search: the regularisation parameter C and the loss.
parameters = {'C': [0.01, 0.1, 1, 10, 100], 'loss': ['hinge', 'squared_hinge']}
# Run a 5-fold cross-validated grid search on the training split.
lsvc = LinearSVC(random_state=1)
grid_search = GridSearchCV(lsvc, param_grid=parameters, cv=5)
grid_search = grid_search.fit(X_train, y_train)

# Fetch the best parameters found by the grid search.
# They are looked up by key instead of unpacking .values() positionally:
# positional unpacking depends on the dict's iteration order and would
# silently swap or break the assignments if the grid gained or reordered keys.
GS_C = grid_search.best_params_['C']
GS_loss = grid_search.best_params_['loss']
print ("最適パラメータ：{}".format(grid_search.best_params_))

最適パラメータ：{'C': 0.01, 'loss': 'hinge'}


In [8]:
# Train a LinearSVC configured with the best parameters from the grid search.
clf = LinearSVC(C=GS_C, loss=GS_loss, random_state=1)
clf.fit(X=X_train, y=y_train)

LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=1, tol=0.0001, verbose=0)

In [9]:
# Evaluate the trained model: predict labels for the training split first,
# then for the test split.
y_train_pred, y_val_pred = (clf.predict(split) for split in (X_train, X_test))

In [10]:
# Compute the accuracy on each split.
train_score = accuracy_score(y_true=y_train, y_pred=y_train_pred)
test_score = accuracy_score(y_true=y_test, y_pred=y_val_pred)
# Report both accuracies as percentages.
print("".join(["トレーニングデータに対する正解率：", str(train_score * 100), "%"]))
print("".join(["テストデータに対する正解率：", str(test_score * 100), "%"]))

トレーニングデータに対する正解率：53.069492053599255%
テストデータに対する正解率：51.43212951432129%


In [11]:
# Cross-validation
# Score the classifier with 5-fold cross-validation over all rows.
scores = cross_val_score(estimator=clf, X=explanatory_variable, y=answers, cv=5)
# Score obtained on each fold.
print('Cross-Validation scores: {}'.format(scores))
# Mean of the fold scores.
print('Average score: {}'.format(scores.mean()))

Cross-Validation scores: [0.51930262 0.51681196 0.50435866 0.5        0.51310861]
Average score: 0.5107163679273884


In [12]:
# Model-based feature selection.
# A RandomForestClassifier is used as the estimator; features whose importance
# is at or above the median importance are selected.
selector_SFM = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=1), threshold="median")
selector_SFM.fit(X_train, y_train)

# Training with the selected features.
# Reduce both splits to the selected features only.
X_train_SFM = selector_SFM.transform(X_train)
X_test_SFM = selector_SFM.transform(X_test)

# Fit a separate, fresh estimator that carries the same hyper-parameters as
# clf. Refitting clf itself would leave it expecting the reduced feature
# count, so the earlier cells that call clf.predict(X_train) and
# clf.predict(X_test) could no longer be re-run.
clf_SFM = LinearSVC(**clf.get_params())
clf_SFM.fit(X_train_SFM, y_train)

# Accuracy on each split using the reduced feature set.
y_train_pred_SFM = clf_SFM.predict(X_train_SFM)
y_val_pred_SFM = clf_SFM.predict(X_test_SFM)
train_score = accuracy_score(y_train, y_train_pred_SFM)
test_score = accuracy_score(y_test, y_val_pred_SFM)

print("モデルベース特徴量選択：トレーニングデータに対する正解率：" + str(train_score * 100) + "%")
print("モデルベース特徴量選択：テストデータに対する正解率：" + str(test_score * 100) + "%")

# Cross-validation
# Reduce the full dataset to the selected features only.
# NOTE(review): selector_SFM was fit on X_train, so the folds below are scored
# on rows that also influenced which features were selected. Confirm whether
# the selection step should instead be refit inside each fold.
explanatory_variable_SFM = selector_SFM.transform(explanatory_variable)
# 5-fold cross-validation.
scores = cross_val_score(clf_SFM, explanatory_variable_SFM, answers, cv=5)
# Score obtained on each fold.
print('Cross-Validation scores: {}'.format(scores))
# Mean of the fold scores.
print('Average score: {}'.format(np.mean(scores)))

モデルベース特徴量選択：トレーニングデータに対する正解率：51.76067310688688%
モデルベース特徴量選択：テストデータに対する正解率：51.18306351183064%
Cross-Validation scores: [0.5143213  0.51556663 0.51556663 0.51620948 0.51435705]
Average score: 0.5152042150893338
