In [1]:
# 基本パッケージ（numpy,Pandas,matplotlib）
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# globのimport
import glob
# 交互作用特徴量、多項式特徴量生成
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# 表示関連
# DataFrameの列数設定
pd.set_option('display.max_columns', 500)

In [3]:
# 2001～2018年の株価データをマージする
## globでファイル名の一覧を取得
stock_price_files = glob.glob('stockPrice/*.csv')
stock_price_list = []

# ファイルを読み込み、DataFrameでlistに格納する。
for f in stock_price_files:
    stock_price_list.append(pd.read_csv(f, header=1, encoding="shift-jis", parse_dates = [0]))
    
# Listに格納されたデータを全てconcat関数で連結    
stock_price_all = pd.concat(stock_price_list)

# 列名の変更
stock_price_all.rename(columns = {'日付':'Date','始値':'Open','高値':'High','安値':'Low','終値':'Close','終値調整値':'Adj Close',
                                  '出来高':'Volume'}, inplace = True)

# 欠損値のチェック
print(stock_price_all.isnull().sum())

# 読み込みデータの表示
stock_price_all.head()

Date         0
Open         0
High         0
Low          0
Close        0
Volume       0
Adj Close    0
dtype: int64


Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,2001-07-13,12490,12490,12300,12310,259430,12310
1,2001-07-16,12400,12400,12250,12330,99740,12330
2,2001-07-17,12170,12170,12100,12120,93150,12120
3,2001-07-18,12150,12150,11830,11840,165330,11840
4,2001-07-19,11990,11990,11860,11920,620220,11920


In [4]:
# 2002～2018年の為替レートを取得
market_Data = pd.read_csv('market/market.csv', header=0, encoding="shift-jis", parse_dates = [0])

# 不要な通貨を削除
market_Data.drop(columns=['USD','EUR','DKK','NOK','NZD','BHD','CNY','HKD','SAR','AED','MXN'], inplace=True)

# 欠損値のチェック
print(market_Data.isnull().sum())

# 読み込みデータの表示
market_Data.head()

Date        0
GBP         0
CAD         0
CHF         0
SEK         0
AUD         0
ZAR         0
IDR(100)    0
INR         0
PHP         0
SGD         0
KRW(100)    0
THB         0
KWD         0
TWD         0
dtype: int64


Unnamed: 0,Date,GBP,CAD,CHF,SEK,AUD,ZAR,IDR(100),INR,PHP,SGD,KRW(100),THB,KWD,TWD
0,2002-04-01,189.79,83.48,79.28,12.87,71.14,11.76,1.37,2.73,2.61,72.21,10.12,3.07,434.14,3.82
1,2002-04-02,191.78,83.38,80.15,13.0,71.02,11.78,1.38,2.73,2.62,72.18,10.12,3.06,435.01,3.82
2,2002-04-03,191.26,83.65,80.02,12.95,71.14,11.89,1.37,2.74,2.62,72.12,10.03,3.04,436.58,3.82
3,2002-04-04,191.13,83.72,80.18,12.93,70.81,12.02,1.37,2.73,2.61,72.26,10.02,3.05,435.11,3.82
4,2002-04-05,189.74,82.96,79.47,12.87,70.33,11.83,1.37,2.71,2.6,71.85,10.03,3.04,432.21,3.8


In [5]:
# 内部結合で結合する
merge_Data = pd.merge(stock_price_all, market_Data, on='Date', how='inner')
merge_Data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close,GBP,CAD,CHF,SEK,AUD,ZAR,IDR(100),INR,PHP,SGD,KRW(100),THB,KWD,TWD
0,2002-04-01,11250,11250,11050,11110,77290,11110,189.79,83.48,79.28,12.87,71.14,11.76,1.37,2.73,2.61,72.21,10.12,3.07,434.14,3.82
1,2002-04-02,11200,11250,11100,11230,37540,11230,191.78,83.38,80.15,13.0,71.02,11.78,1.38,2.73,2.62,72.18,10.12,3.06,435.01,3.82
2,2002-04-03,11130,11530,11060,11440,136770,11440,191.26,83.65,80.02,12.95,71.14,11.89,1.37,2.74,2.62,72.12,10.03,3.04,436.58,3.82
3,2002-04-04,11430,11580,11380,11450,165330,11450,191.13,83.72,80.18,12.93,70.81,12.02,1.37,2.73,2.61,72.26,10.02,3.05,435.11,3.82
4,2002-04-05,11430,11450,11350,11410,112790,11410,189.74,82.96,79.47,12.87,70.33,11.83,1.37,2.71,2.6,71.85,10.03,3.04,432.21,3.8


In [6]:
#ワンホットエンコーディング
## 月（1～12）、日（1～31）、曜日（月曜が0, 日曜が6）情報の抽出
dummyData = pd.DataFrame({
                          'month' : merge_Data['Date'].dt.month,
                          'day'  : merge_Data['Date'].dt.day,
                          'weekday' : merge_Data['Date'].dt.dayofweek
                         })

## 月、曜日情報をダミー変数へ変換
dummyData = pd.get_dummies(dummyData, columns=['month','day','weekday'])

## 最初の5行を表示
dummyData.head()

Unnamed: 0,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,day_1,day_2,day_3,day_4,day_5,day_6,day_7,day_8,day_9,day_10,day_11,day_12,day_13,day_14,day_15,day_16,day_17,day_18,day_19,day_20,day_21,day_22,day_23,day_24,day_25,day_26,day_27,day_28,day_29,day_30,day_31,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4
0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [7]:
# 目的変数の作成
merge_Data['diff'] = merge_Data['Close'] - merge_Data['Open']
dummyData['answer'] = pd.DataFrame({'answer' : merge_Data['diff'].apply( lambda x: 0 if x < 0 else 1 )})

# 不要な列を削除
merge_Data.drop(columns=['Date','Close','diff'], inplace=True)

# 各列を変化率へ一括変換
merge_Data_change = merge_Data.pct_change()

# 目的変数とダミー変数の付与
merge_Data_change = pd.concat([merge_Data_change, dummyData], axis=1)

# 目的変数をずらし、説明変数から見て未来の値とする
merge_Data_change.answer = merge_Data_change.answer.shift(-1)

# 最初と最後の行（NaNがある）を削除
merge_Data_change = merge_Data_change.replace([np.inf, -np.inf], np.nan)
merge_Data_change = merge_Data_change.dropna()

In [8]:
# ビニング
## SEK 10分割
merge_Data_change['SEK_bin'] = pd.qcut(merge_Data_change['SEK'], q=10)

## SGD 10分割
merge_Data_change['SGD_bin'] = pd.qcut(merge_Data_change['SGD'], q=10)

## Volume 10分割
merge_Data_change['Volume_bin'] = pd.qcut(merge_Data_change['Volume'], q=10)

## Open 10分割
merge_Data_change['Open_bin'] = pd.qcut(merge_Data_change['Open'], q=10)

## ZAR 10分割
merge_Data_change['ZAR_bin'] = pd.qcut(merge_Data_change['ZAR'], q=10)

# ダミー変数へ変換
merge_Data_change = pd.get_dummies(merge_Data_change, columns=['SEK_bin','SGD_bin','Volume_bin','Open_bin','ZAR_bin'])

In [9]:
# 交互作用特徴量、多項式特徴量を追加
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_feature = pd.DataFrame({'SEK' : merge_Data_change['SEK'],
                             'SGD' : merge_Data_change['SGD'],
                             'Volume' : merge_Data_change['Volume'],
                             'Open' : merge_Data_change['Open'],
                             'ZAR' : merge_Data_change['ZAR'],
                             'High' : merge_Data_change['High'],
                             'GBP' : merge_Data_change['GBP']})
poly.fit(poly_feature)
poly_feature = poly.transform(poly_feature)
poly_df = pd.DataFrame(data=poly_feature, columns=poly.get_feature_names(), dtype='float')
poly_df = poly_df.rename(columns=lambda x: x.replace('x0','SEK'))
poly_df = poly_df.rename(columns=lambda x: x.replace('x1','SGD'))
poly_df = poly_df.rename(columns=lambda x: x.replace('x2','Volume'))
poly_df = poly_df.rename(columns=lambda x: x.replace('x3','Open'))
poly_df = poly_df.rename(columns=lambda x: x.replace('x4','ZAR'))
poly_df = poly_df.rename(columns=lambda x: x.replace('x5','High'))
poly_df = poly_df.rename(columns=lambda x: x.replace('x6','GBP'))
merge_Data_change.drop(columns=['SEK','SGD','Volume','Open','ZAR','High','GBP'], inplace=True)
merge_Data_change = pd.concat([merge_Data_change, poly_df], join='inner', axis=1)

In [10]:
# CSVへ変換
merge_Data_change.to_csv("stock_Price_Prediction_v2.3.csv",index = False, encoding="shift-jis")
# データの確認
merge_Data_change.head()

Unnamed: 0,Low,Adj Close,CAD,CHF,AUD,IDR(100),INR,PHP,KRW(100),THB,KWD,TWD,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,day_1,day_2,day_3,day_4,day_5,day_6,day_7,day_8,day_9,day_10,day_11,day_12,day_13,day_14,day_15,day_16,day_17,day_18,day_19,day_20,day_21,day_22,day_23,day_24,day_25,day_26,day_27,day_28,day_29,day_30,day_31,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,answer,"SEK_bin_(-0.068, -0.0105]","SEK_bin_(-0.0105, -0.0061]","SEK_bin_(-0.0061, -0.00346]","SEK_bin_(-0.00346, -0.00154]","SEK_bin_(-0.00154, 0.000598]","SEK_bin_(0.000598, 0.00221]","SEK_bin_(0.00221, 0.00413]","SEK_bin_(0.00413, 0.00656]","SEK_bin_(0.00656, 0.01]","SEK_bin_(0.01, 0.0825]","SGD_bin_(-0.0473, -0.00699]","SGD_bin_(-0.00699, -0.00403]","SGD_bin_(-0.00403, -0.00225]","SGD_bin_(-0.00225, -0.000917]","SGD_bin_(-0.000917, 0.000283]","SGD_bin_(0.000283, 0.00139]","SGD_bin_(0.00139, 0.00265]","SGD_bin_(0.00265, 0.00424]","SGD_bin_(0.00424, 0.00671]","SGD_bin_(0.00671, 0.0634]","Volume_bin_(-0.927, -0.512]","Volume_bin_(-0.512, -0.388]","Volume_bin_(-0.388, -0.276]","Volume_bin_(-0.276, -0.155]","Volume_bin_(-0.155, -0.0192]","Volume_bin_(-0.0192, 0.133]","Volume_bin_(0.133, 0.319]","Volume_bin_(0.319, 0.625]","Volume_bin_(0.625, 1.106]","Volume_bin_(1.106, 11.805]","Open_bin_(-0.09620000000000001, -0.0162]","Open_bin_(-0.0162, -0.00927]","Open_bin_(-0.00927, -0.00528]","Open_bin_(-0.00528, -0.00222]","Open_bin_(-0.00222, 0.000725]","Open_bin_(0.000725, 0.00342]","Open_bin_(0.00342, 0.00639]","Open_bin_(0.00639, 0.0102]","Open_bin_(0.0102, 0.0159]","Open_bin_(0.0159, 0.16]","ZAR_bin_(-0.155, -0.0148]","ZAR_bin_(-0.0148, -0.00855]","ZAR_bin_(-0.00855, -0.00494]","ZAR_bin_(-0.00494, -0.00209]","ZAR_bin_(-0.00209, 0.000603]","ZAR_bin_(0.000603, 0.00299]","ZAR_bin_(0.00299, 0.00571]","ZAR_bin_(0.00571, 0.00916]","ZAR_bin_(0.00916, 0.0142]","ZAR_bin_(0.0142, 0.129]",SEK,SGD,Volume,Open,ZAR,High,GBP,SEK^2,SEK SGD,SEK Volume,SEK Open,SEK ZAR,SEK High,SEK GBP,SGD^2,SGD Volume,SGD Open,SGD ZAR,SGD High,SGD GBP,Volume^2,Volume Open,Volume ZAR,Volume High,Volume GBP,Open^2,Open ZAR,Open High,Open GBP,ZAR^2,ZAR High,ZAR GBP,High^2,High GBP,GBP^2
1,0.004525,0.010801,-0.001198,0.010974,-0.001687,0.007299,0.0,0.003831,0.0,-0.003257,0.002004,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,-0.003846,-0.000831,2.643314,-0.00625,0.009338,0.024889,-0.002711,1.5e-05,3e-06,-0.010167,2.4e-05,-3.6e-05,-9.6e-05,1e-05,6.909852e-07,-0.002197,5e-06,-8e-06,-2.1e-05,2e-06,6.987108,-0.016521,0.024683,0.065789,-0.007167,3.9e-05,-5.8e-05,-0.000156,1.7e-05,8.7e-05,0.000232,-2.5e-05,0.000619,-6.7e-05,7.351908e-06
2,-0.003604,0.0187,0.003238,-0.001622,0.00169,-0.007246,0.003663,0.0,-0.008893,-0.006536,0.003609,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,-0.001544,0.001941,0.208818,0.026954,0.010934,0.004337,-0.00068,2e-06,-3e-06,-0.000322,-4.2e-05,-1.7e-05,-7e-06,1e-06,3.768293e-06,0.000405,5.2e-05,2.1e-05,8e-06,-1e-06,0.043605,0.005629,0.002283,0.000906,-0.000142,0.000727,0.000295,0.000117,-1.8e-05,0.00012,4.7e-05,-7e-06,1.9e-05,-3e-06,4.619962e-07
3,0.028933,0.000874,0.000837,0.002,-0.004639,0.0,-0.00365,-0.003817,-0.000997,0.003289,-0.003367,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,-0.00464,-0.005674,-0.317789,0.0,-0.015807,-0.011226,-0.007273,2.2e-05,2.6e-05,0.001475,-0.0,7.3e-05,5.2e-05,3.4e-05,3.219377e-05,0.001803,-0.0,9e-05,6.4e-05,4.1e-05,0.10099,-0.0,0.005023,0.003568,0.002311,0.0,-0.0,-0.0,-0.0,0.00025,0.000177,0.000115,0.000126,8.2e-05,5.288979e-05
4,-0.002636,-0.003493,-0.009078,-0.008855,-0.006779,0.0,-0.007326,-0.003831,0.000998,-0.003279,-0.006665,-0.005236,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,-0.003885,-0.001253,-0.435145,-0.004374,0.007608,0.001747,-0.006324,1.5e-05,5e-06,0.001691,1.7e-05,-3e-05,-7e-06,2.5e-05,1.569031e-06,0.000545,5e-06,-1e-05,-2e-06,8e-06,0.189351,0.001904,-0.00331,-0.00076,0.002752,1.9e-05,-3.3e-05,-8e-06,2.8e-05,5.8e-05,1.3e-05,-4.8e-05,3e-06,-1.1e-05,3.999859e-05
5,-0.003524,-0.002629,-0.001929,-0.005914,-0.008105,0.007299,-0.00369,-0.007692,-0.012961,-0.003289,-0.005992,-0.007895,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,-0.01014,-0.001254,1.40135,0.0,-0.010906,-0.004359,-0.00122,0.000103,1.3e-05,-0.01421,-0.0,0.000111,4.4e-05,1.2e-05,1.572969e-06,-0.001758,-0.0,1.4e-05,5e-06,2e-06,1.963781,0.0,-0.015283,-0.006109,-0.00171,0.0,-0.0,-0.0,-0.0,0.000119,4.8e-05,1.3e-05,1.9e-05,5e-06,1.488157e-06
