In [1]:
# 基本パッケージ（numpy,Pandas,matplotlib）
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# サポートベクターマシーン
from sklearn import svm
# train_test_split（データを分割出してくれる）
from sklearn.model_selection import train_test_split
# accuracy_score（正解率を測れる）
from sklearn.metrics import accuracy_score
# グリッドサーチ（ハイパーパラメータを自動的に最適化してくれる）
from sklearn.model_selection import GridSearchCV
# 正規化、標準化用
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# 特徴量選択用
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
# 交差検証用
from sklearn.model_selection import cross_val_score
# 多項式特徴量生成用
from sklearn.preprocessing import PolynomialFeatures

  from numpy.core.umath_tests import inner1d


In [2]:
# 表示関連
# DataFrameの列数設定
pd.set_option('display.max_columns', 500)

In [3]:
# 株価データの読み込み
stock_data = pd.read_csv("stock_Price_Prediction.csv", encoding="shift-jis")

# 読み込みデータの表示
stock_data.head()

Unnamed: 0,始値,高値,安値,出来高,終値調整値,米ドル,英ポンド,ユーロ,カナダドル,スイスフラン,スウェーデンクローネ,デンマーククローネ,ノルウェークローネ,オーストラリアドル,ニュージーランドドル,南アフリカランド,バーレーンディナール,インドネシアルピア,中国人民元,香港ドル,インドルピー,フィリピンペソ,シンガポールドル,韓国ウォン,タイバーツ,クウェートディナール,サウジアラビアリアル,UAEディルハム,メキシコペソ,台湾ドル,answer
0,-0.004444,0.0,0.004525,-0.514297,0.010801,0.000376,0.010485,0.009128,-0.001198,0.010974,0.010101,0.008957,0.01061,-0.001687,0.00102,0.001701,0.000368,0.007299,0.0,0.000586,0.0,0.003831,-0.000415,0.0,-0.003257,0.002004,0.000282,0.000276,-0.000675,0.0,1.0
1,-0.00625,0.024889,-0.003604,2.643314,0.0187,0.0,-0.002711,-0.001877,0.003238,-0.001622,-0.003846,-0.001268,-0.000656,0.00169,-0.000849,0.009338,0.0,-0.007246,0.0,0.0,0.003663,0.0,-0.000831,-0.008893,-0.006536,0.003609,0.0,0.0,-0.002027,0.0,1.0
2,0.026954,0.004337,0.028933,0.208818,0.000874,-0.000751,-0.00068,0.001624,0.000837,0.002,-0.001544,0.00127,0.006566,-0.004639,-0.004931,0.010934,-0.000735,0.0,-0.000622,-0.001171,-0.00365,-0.003817,0.001941,-0.000997,0.003289,-0.003367,-0.000844,-0.000827,-0.001354,0.0,0.0
3,0.0,-0.011226,-0.002636,-0.317789,-0.003493,-0.006011,-0.007273,-0.007085,-0.009078,-0.008855,-0.00464,-0.007609,-0.007175,-0.006779,-0.002734,-0.015807,-0.006025,0.0,-0.006219,-0.005862,-0.007326,-0.003831,-0.005674,0.000998,-0.003279,-0.006665,-0.005915,-0.005795,-0.006102,-0.005236,1.0
4,-0.004374,0.001747,-0.003524,-0.435145,-0.002629,-0.005669,-0.006324,-0.005416,-0.001929,-0.005914,-0.003885,-0.005112,-0.005913,-0.008105,-0.015936,0.007608,-0.005663,0.007299,-0.005632,-0.005307,-0.00369,-0.007692,-0.001253,-0.012961,-0.003289,-0.005992,-0.005667,-0.006106,-0.002729,-0.007895,0.0


In [4]:
# DataFrameをNumPy配列へ変換
# 正解ラベルの変換
answers = stock_data.answer.values
# 説明変数の変換
stock_data.drop(columns=['answer'], inplace=True)
explanatory_variable = stock_data.values

In [5]:
# データの正規化        
ms = MinMaxScaler()
ms.fit(explanatory_variable)
explanatory_variable = ms.transform(explanatory_variable)

In [6]:
# 多項式特徴量を追加
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(explanatory_variable)
explanatory_variable_poly = poly.transform(explanatory_variable)

In [7]:
# データの分割（データの80%を訓練用に、20％をテスト用に分割する）
X_train, X_test, y_train, y_test = train_test_split(explanatory_variable_poly, answers, test_size=0.2, random_state=1,shuffle = False)

In [8]:
# グリッドサーチするパラメータを設定
parameters = {'C':[0.01,0.1,1,10,100],'loss':['hinge', 'squared_hinge']}
 
# グリッドサーチを実行
clf = GridSearchCV(svm.LinearSVC(), parameters)
clf.fit(X_train, y_train) 
 
# グリッドサーチ結果(最適パラメータ)を取得
GS_C, GS_loss = clf.best_params_.values()
print ("最適パラメータ：{}".format(clf.best_params_))

最適パラメータ：{'C': 0.01, 'loss': 'hinge'}


In [9]:
# 最適パラメーターを指定して学習
clf = svm.LinearSVC(loss=GS_loss, C=GS_C, random_state=1)
clf.fit(X_train , y_train)

LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=1, tol=0.0001, verbose=0)

In [10]:
# 学習後のモデルによるテスト
# トレーニングデータを用いた予測
y_train_pred = clf.predict(X_train)
# テストデータを用いた予測
y_val_pred = clf.predict(X_test)

In [11]:
# 正解率の計算
train_score = accuracy_score(y_train, y_train_pred)
test_score = accuracy_score(y_test, y_val_pred)
# 正解率を表示
print("トレーニングデータに対する正解率：" + str(train_score * 100) + "%")
print("テストデータに対する正解率：" + str(test_score * 100) + "%")

トレーニングデータに対する正解率：51.667186039264564%
テストデータに対する正解率：51.18306351183064%


In [12]:
#  交差検証
## 10分割し交差検証
scores = cross_val_score(clf, explanatory_variable_poly, answers, cv = 10)
## 各分割におけるスコア
print('Cross-Validation scores: {}'.format(scores))
## スコアの平均値
print('Average score: {}'.format(np.mean(scores)))

Cross-Validation scores: [0.51492537 0.51492537 0.51492537 0.51620948 0.51620948 0.51620948
 0.51620948 0.51620948 0.51620948 0.515     ]
Average score: 0.5157032977258347
