In [1]:
# Pandas、globのimport
import pandas as pd
import glob

In [2]:
# Display settings
# Let pandas render up to 100 columns before it truncates a DataFrame's output.
pd.options.display.max_columns = 100

In [3]:
# Merge the per-file stock price CSVs (2001-2018) into a single DataFrame.
# glob.glob() returns paths in arbitrary, filesystem-dependent order, so the
# list is sorted to make the load order reproducible across machines.
stock_price_files = sorted(glob.glob('stockPrice/*.csv'))
if not stock_price_files:
    # Fail with a clear message instead of pd.concat's opaque
    # "No objects to concatenate" error.
    raise FileNotFoundError("No CSV files matched 'stockPrice/*.csv'")

# Read every file into its own DataFrame. header=1 uses the second line of
# each file as the header row; the first column is parsed as dates.
stock_price_list = [
    pd.read_csv(f, header=1, encoding="shift-jis", parse_dates=[0])
    for f in stock_price_files
]

# Concatenate all frames. ignore_index=True avoids the repeated 0..n labels
# that each individual file would otherwise contribute to the index.
stock_price_all = pd.concat(stock_price_list, ignore_index=True)

# Rename the Japanese column headers to English names.
stock_price_all = stock_price_all.rename(
    columns={
        '日付': 'Date',
        '始値': 'Open',
        '高値': 'High',
        '安値': 'Low',
        '終値': 'Close',
        '終値調整値': 'Adj Close',
        '出来高': 'Volume',
    }
)

# Put the rows in chronological order. Later cells compute row-over-row
# changes and shift the label by one row, which is only meaningful when
# consecutive rows are consecutive dates. mergesort is stable, so rows that
# share a date keep their load order.
stock_price_all = (
    stock_price_all
    .sort_values('Date', kind='mergesort')
    .reset_index(drop=True)
)

# Check for missing values (per-column null counts).
print(stock_price_all.isnull().sum())

# Preview the loaded data.
stock_price_all.head()

Date         0
Open         0
High         0
Low          0
Close        0
Volume       0
Adj Close    0
dtype: int64


Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,2001-07-13,12490,12490,12300,12310,259430,12310
1,2001-07-16,12400,12400,12250,12330,99740,12330
2,2001-07-17,12170,12170,12100,12120,93150,12120
3,2001-07-18,12150,12150,11830,11840,165330,11840
4,2001-07-19,11990,11990,11860,11920,620220,11920


In [4]:
# Load the exchange-rate table (2002-2018), parsing the first column as dates,
# and discard the currencies that are not needed in the same expression.
market_Data = pd.read_csv(
    'market/market.csv',
    header=0,
    encoding="shift-jis",
    parse_dates=[0],
).drop(
    columns=['USD', 'EUR', 'DKK', 'NOK', 'NZD', 'BHD', 'CNY', 'HKD', 'SAR', 'AED', 'MXN']
)

# Check for missing values (per-column null counts).
print(market_Data.isnull().sum())

# Preview the loaded data.
market_Data.head()

Date        0
GBP         0
CAD         0
CHF         0
SEK         0
AUD         0
ZAR         0
IDR(100)    0
INR         0
PHP         0
SGD         0
KRW(100)    0
THB         0
KWD         0
TWD         0
dtype: int64


Unnamed: 0,Date,GBP,CAD,CHF,SEK,AUD,ZAR,IDR(100),INR,PHP,SGD,KRW(100),THB,KWD,TWD
0,2002-04-01,189.79,83.48,79.28,12.87,71.14,11.76,1.37,2.73,2.61,72.21,10.12,3.07,434.14,3.82
1,2002-04-02,191.78,83.38,80.15,13.0,71.02,11.78,1.38,2.73,2.62,72.18,10.12,3.06,435.01,3.82
2,2002-04-03,191.26,83.65,80.02,12.95,71.14,11.89,1.37,2.74,2.62,72.12,10.03,3.04,436.58,3.82
3,2002-04-04,191.13,83.72,80.18,12.93,70.81,12.02,1.37,2.73,2.61,72.26,10.02,3.05,435.11,3.82
4,2002-04-05,189.74,82.96,79.47,12.87,70.33,11.83,1.37,2.71,2.6,71.85,10.03,3.04,432.21,3.8


In [5]:
# Inner-join the stock prices with the exchange rates on their shared Date
# column, keeping only the dates present in both tables.
merge_Data = stock_price_all.merge(market_Data, how='inner', on='Date')

# Build the target variable: 'diff' is the same-row move from Open to Close,
# and 'answer' is 0 when that move is negative and 1 otherwise.
merge_Data['diff'] = merge_Data['Close'] - merge_Data['Open']
merge_Data['answer'] = merge_Data['diff'].apply(lambda delta: int(not delta < 0))


In [6]:
# Remove the columns that are not used as features. Rebinding the name
# (instead of inplace=True) together with errors='ignore' keeps this cell
# re-runnable: a second run no longer raises KeyError for columns that were
# already removed.
merge_Data = merge_Data.drop(columns=['Date', 'Close', 'diff'], errors='ignore')

# Convert every feature column to its row-over-row rate of change. The 0/1
# label column is excluded here: taking its percentage change would only
# produce inf/NaN values that are thrown away immediately afterwards.
merge_Data_change = merge_Data.drop(columns=['answer']).pct_change()

# Attach the label shifted up by one row, so each row's features are paired
# with the label of the following row (the "future" value to predict).
merge_Data_change['answer'] = merge_Data['answer'].shift(-1)

# Trim the first row (pct_change has no previous row, so it is all NaN) and
# the last row (shift(-1) leaves it without a label). Positional slicing does
# not depend on the index labels matching the frame length.
merge_Data_change = merge_Data_change.iloc[1:-1].copy()

# Persist the prepared dataset.
merge_Data_change.to_csv("stock_Price_Prediction_v2.1.csv", index=False, encoding="shift-jis")

# Preview the prepared data.
merge_Data_change.head()

Unnamed: 0,Open,High,Low,Volume,Adj Close,GBP,CAD,CHF,SEK,AUD,ZAR,IDR(100),INR,PHP,SGD,KRW(100),THB,KWD,TWD,answer
1,-0.004444,0.0,0.004525,-0.514297,0.010801,0.010485,-0.001198,0.010974,0.010101,-0.001687,0.001701,0.007299,0.0,0.003831,-0.000415,0.0,-0.003257,0.002004,0.0,1.0
2,-0.00625,0.024889,-0.003604,2.643314,0.0187,-0.002711,0.003238,-0.001622,-0.003846,0.00169,0.009338,-0.007246,0.003663,0.0,-0.000831,-0.008893,-0.006536,0.003609,0.0,1.0
3,0.026954,0.004337,0.028933,0.208818,0.000874,-0.00068,0.000837,0.002,-0.001544,-0.004639,0.010934,0.0,-0.00365,-0.003817,0.001941,-0.000997,0.003289,-0.003367,0.0,0.0
4,0.0,-0.011226,-0.002636,-0.317789,-0.003493,-0.007273,-0.009078,-0.008855,-0.00464,-0.006779,-0.015807,0.0,-0.007326,-0.003831,-0.005674,0.000998,-0.003279,-0.006665,-0.005236,1.0
5,-0.004374,0.001747,-0.003524,-0.435145,-0.002629,-0.006324,-0.001929,-0.005914,-0.003885,-0.008105,0.007608,0.007299,-0.00369,-0.007692,-0.001253,-0.012961,-0.003289,-0.005992,-0.007895,0.0
