# 必要なパッケージのインポート

In [20]:
import pickle
import os
import pandas as pd
import time, datetime
import numpy as np

# マージするデータの読み込み

## Kaggleから取得したデータファイル（〜2018年）

In [2]:
# Load the Kaggle price history (data up to 2018) from the zipped CSV file.
df_kaggle_stock_price_history_data = pd.read_csv(
    "./resource_files/historical_stock_prices.csv.zip",
    sep=",",
    header=0,
)

  """Entry point for launching an IPython kernel.


In [4]:
# Preview the first rows of the Kaggle data to see which columns were read.
df_kaggle_stock_price_history_data.head()

Unnamed: 0,ticker,open,close,adj_close,low,high,volume,date
0,AHH,11.5,11.58,8.493155,11.25,11.68,4633900,2013-05-08
1,AHH,11.66,11.55,8.471151,11.5,11.66,275800,2013-05-09
2,AHH,11.55,11.6,8.507822,11.5,11.6,277100,2013-05-10
3,AHH,11.63,11.65,8.544494,11.55,11.65,147400,2013-05-13
4,AHH,11.6,11.53,8.456484,11.5,11.6,184100,2013-05-14


In [5]:
# Size of the Kaggle data as (rows, columns).
df_kaggle_stock_price_history_data.shape

(20973889, 8)

### データの整形

In [10]:
# Keep only the price/volume columns that are used for the merge (drops adj_close).
kaggle_columns = ["ticker", "open", "close", "low", "high", "volume", "date"]
df_kaggle_stock_price_history_data = df_kaggle_stock_price_history_data.loc[:, kaggle_columns]

In [11]:
# Confirm that only the selected columns remain.
df_kaggle_stock_price_history_data.head()

Unnamed: 0,ticker,open,close,low,high,volume,date
0,AHH,11.5,11.58,11.25,11.68,4633900,2013-05-08
1,AHH,11.66,11.55,11.5,11.66,275800,2013-05-09
2,AHH,11.55,11.6,11.5,11.6,277100,2013-05-10
3,AHH,11.63,11.65,11.55,11.65,147400,2013-05-13
4,AHH,11.6,11.53,11.5,11.6,184100,2013-05-14


## investing.comからスクレイピングで取得したデータファイル（2018年〜2020年9月30日）

In [3]:
# Load the price history scraped from investing.com (2018 to 2020-09-30).
df_investingcom_stock_price_history_data = pd.read_csv(
    "./output/s_and_p_stock_price_history_from_investing_com.csv",
    sep=",",
    header=0,
)

  """Entry point for launching an IPython kernel.
  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Preview the first rows of the investing.com data to see which columns were read.
df_investingcom_stock_price_history_data.head()

Unnamed: 0.1,Unnamed: 0,ticker,open,close,low,high,volume,date
0,0,PEP,137.96,138.6,136.93,139.19,7060000,2020-09-30
1,1,PEP,138.72,137.16,137.15,139.54,4330000,2020-09-29
2,2,PEP,135.96,137.97,135.75,138.58,7490000,2020-09-28
3,3,PEP,130.89,133.55,130.51,133.83,4580000,2020-09-25
4,4,PEP,131.27,131.58,129.94,132.31,4320000,2020-09-24


In [7]:
# Size of the investing.com data as (rows, columns).
df_investingcom_stock_price_history_data.shape

(344311, 8)

### データの整形

In [8]:
# Keep the same seven columns as the Kaggle data; this drops the stray
# "Unnamed: 0" column that was read from the CSV.
investingcom_columns = ["ticker", "open", "close", "low", "high", "volume", "date"]
df_investingcom_stock_price_history_data = df_investingcom_stock_price_history_data.loc[:, investingcom_columns]

In [12]:
# Confirm that only the selected columns remain.
df_investingcom_stock_price_history_data.head()

Unnamed: 0,ticker,open,close,low,high,volume,date
0,PEP,137.96,138.6,136.93,139.19,7060000,2020-09-30
1,PEP,138.72,137.16,137.15,139.54,4330000,2020-09-29
2,PEP,135.96,137.97,135.75,138.58,7490000,2020-09-28
3,PEP,130.89,133.55,130.51,133.83,4580000,2020-09-25
4,PEP,131.27,131.58,129.94,132.31,4320000,2020-09-24


# データのマージ

## データの確認

In [14]:
# Distinct tickers that appear in the Kaggle data.
df_kaggle_stock_list = df_kaggle_stock_price_history_data["ticker"].unique()

In [15]:
# Number of distinct tickers in the Kaggle data.
len(df_kaggle_stock_list)

5685

In [13]:
# Distinct tickers that appear in the investing.com data.
df_investingcom_stock_list = df_investingcom_stock_price_history_data["ticker"].unique()

In [16]:
# Number of distinct tickers in the investing.com data.
len(df_investingcom_stock_list)

500

## 共通して存在する銘柄の確認

In [21]:
# Count the tickers that are present in both sources.
len(np.intersect1d(df_kaggle_stock_list, df_investingcom_stock_list))

480

In [22]:
# Tickers present in both sources; the merge below is restricted to these.
intersect_tickers = np.intersect1d(df_kaggle_stock_list, df_investingcom_stock_list)

## データのマージ

In [73]:
# Start the merged frame from the Kaggle rows whose ticker exists in both sources.
# Inside the query string, `@intersect_tickers` refers to the Python variable of that name.
df_merged = df_kaggle_stock_price_history_data.query("ticker in @intersect_tickers")

In [74]:
# Stack the investing.com rows for the common tickers under the Kaggle rows.
# pd.concat is used because DataFrame.append is deprecated and was removed in
# pandas 2.0. As with append, each frame keeps its own index labels, so the
# index of df_merged is not unique after this step.
df_merged = pd.concat(
    [
        df_merged,
        df_investingcom_stock_price_history_data.query("ticker in @intersect_tickers"),
    ]
)

In [75]:
# Size of the stacked data as (rows, columns).
df_merged.shape

(3742499, 7)

In [76]:
# Preview the first rows of the stacked data.
df_merged.head()

Unnamed: 0,ticker,open,close,low,high,volume,date
948,AAPL,0.513393,0.513393,0.513393,0.515625,117258400,1980-12-12
960,AAPL,0.488839,0.486607,0.486607,0.488839,43971200,1980-12-15
968,AAPL,0.453125,0.450893,0.450893,0.453125,26432000,1980-12-16
980,AAPL,0.462054,0.462054,0.462054,0.464286,21610400,1980-12-17
988,AAPL,0.475446,0.475446,0.475446,0.477679,18362400,1980-12-18


In [77]:
# Number of distinct tickers in the stacked data.
df_merged["ticker"].nunique()

480

In [78]:
# Earliest and latest date in the stacked data.
# The aggregations are given by name ("min"/"max") instead of the Python
# builtins; recent pandas versions warn when builtins are passed to agg.
# "date" is still a string column here, so the comparison is lexicographic.
df_merged.agg({"date": ["min", "max"]})

Unnamed: 0,date
min,1970-01-02
max,2020-09-30


## 重複行の確認

In [60]:
# List every (ticker, date) pair that occurs more than once.
# The group sizes are computed once and reused for both the table and the mask.
pair_counts = df_merged.groupby(["ticker", "date"]).size()
pair_counts.reset_index().loc[(pair_counts > 1).to_list(), :]

Unnamed: 0,ticker,date,0
4558,A,2018-01-02,2
4559,A,2018-01-03,2
4560,A,2018-01-04,2
4561,A,2018-01-05,2
4562,A,2018-01-08,2
4563,A,2018-01-09,2
4564,A,2018-01-10,2
4565,A,2018-01-11,2
4566,A,2018-01-12,2
4567,A,2018-01-16,2


In [61]:
# Look at one duplicated (ticker, date) pair to compare the rows from the two sources.
df_merged.query("ticker=='ZTS' & date=='2018-07-16'")

Unnamed: 0,ticker,open,close,low,high,volume,date
2549405,ZTS,86.31,84.58,84.5,86.31,2473300,2018-07-16
8861,ZTS,86.31,84.58,84.5,86.31,2470000,2018-07-16


→ 重複行の値は、investing.com 側で丸められている volume 以外は同じ様子

## 集約のためにカラムの型を変換

In [91]:
# Convert "open" to float: render every value as text, remove commas
# (presumably thousands separators in the scraped data; confirm), then cast.
open_as_text = df_merged["open"].map(str)
df_merged["open"] = open_as_text.str.replace(",", "", regex=False).astype(float)

In [92]:
# Convert the remaining price columns to float: render every value as text,
# remove commas, then cast.
for price_column in ("low", "high", "close"):
    column_as_text = df_merged[price_column].map(str)
    df_merged[price_column] = column_as_text.str.replace(",", "", regex=False).astype(float)

In [94]:
# Convert "volume" to float.
# Commas are removed first. A value that consists only of "-" is mapped to 0
# (NOTE(review): presumably the scraped placeholder for a missing volume; confirm).
# Only a lone "-" is replaced: replacing every hyphen would corrupt values such
# as "-5" (would become "05") or "1e-05" (would become "1e005").
volume_as_text = df_merged["volume"].map(str).str.replace(",", "", regex=False).str.strip()
volume_as_text = volume_as_text.where(volume_as_text != "-", "0")
df_merged["volume"] = volume_as_text.astype(float)

In [95]:
# Verify the column dtypes after the conversions.
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3742499 entries, 948 to 344310
Data columns (total 7 columns):
ticker    object
open      float64
close     float64
low       float64
high      float64
volume    float64
date      object
dtypes: float64(5), object(2)
memory usage: 228.4+ MB


In [96]:
# Preview the first rows after the type conversion.
df_merged.head()

Unnamed: 0,ticker,open,close,low,high,volume,date
948,AAPL,0.513393,0.513393,0.513393,0.515625,117258400.0,1980-12-12
960,AAPL,0.488839,0.486607,0.486607,0.488839,43971200.0,1980-12-15
968,AAPL,0.453125,0.450893,0.450893,0.453125,26432000.0,1980-12-16
980,AAPL,0.462054,0.462054,0.462054,0.464286,21610400.0,1980-12-17
988,AAPL,0.475446,0.475446,0.475446,0.477679,18362400.0,1980-12-18


In [97]:
# Preview the last rows after the type conversion.
df_merged.tail()

Unnamed: 0,ticker,open,close,low,high,volume,date
344306,NOC,308.95,308.26,307.56,309.34,660790.0,2018-01-08
344307,NOC,309.09,308.22,307.21,310.24,842800.0,2018-01-05
344308,NOC,306.58,308.55,305.61,308.71,582400.0,2018-01-04
344309,NOC,304.74,306.65,304.61,307.25,722050.0,2018-01-03
344310,NOC,307.35,305.24,303.21,307.72,715000.0,2018-01-02


In [98]:
# Collapse duplicated (ticker, date) rows into one row each by taking the
# maximum of every numeric column within the pair.
# NOTE(review): "low" also takes the maximum; this only matters if duplicated
# rows ever disagree on prices. Confirm that they do not.
numeric_columns = ["open", "close", "low", "high", "volume"]
df_merged_unique = (
    df_merged
    .groupby(["ticker", "date"], as_index=False)
    .agg(dict.fromkeys(numeric_columns, "max"))
)

In [99]:
# Size of the de-duplicated data as (rows, columns).
df_merged_unique.shape

(3664240, 7)

In [100]:
# Re-run the duplicate check on the aggregated data; an empty result means
# every (ticker, date) pair now occurs exactly once.
unique_pair_counts = df_merged_unique.groupby(["ticker", "date"]).size()
unique_pair_counts.reset_index().loc[(unique_pair_counts > 1).to_list(), :]

Unnamed: 0,ticker,date,0


In [102]:
# Number of distinct tickers after de-duplication.
df_merged_unique["ticker"].nunique()

480

In [103]:
# Earliest and latest date after de-duplication.
# The aggregations are given by name ("min"/"max") instead of the Python
# builtins; recent pandas versions warn when builtins are passed to agg.
# "date" is a string column, so the comparison is lexicographic.
df_merged_unique.agg({"date": ["min", "max"]})

Unnamed: 0,date
min,1970-01-02
max,2020-09-30


In [104]:
# Number of rows per ticker, largest first.
df_merged_unique["ticker"].value_counts()

DIS     12802
CAT     12802
MMM     12802
CVX     12802
IP      12802
GE      12802
KO      12802
AEP     12802
HPQ     12802
MRK     12802
CNP     12802
FL      12802
XOM     12802
DTE     12802
PG      12802
MCD     12802
ARNC    12802
MRO     12802
ED      12802
BA      12802
JNJ     12802
MO      12801
HON     12800
IBM     12799
DE      12190
F       12190
PFE     12189
ETR     12189
ETN     12189
BMY     12189
        ...  
APTV     2231
TRIP     2218
PSX      2132
FB       2106
FANG     2004
ABBV     1951
NCLH     1939
ZTS      1930
IQV      1863
COTY     1839
NWSA     1835
NWS      1835
TWTR     1736
ALLE     1729
HLT      1712
ANET     1592
INFO     1583
SYF      1554
CFG      1516
KEYS     1498
QRVO     1447
WRK      1328
PYPL     1321
KHC      1321
UA       1315
HPE      1247
FTV      1069
LW        978
RSG       692
CBRE      692
Name: ticker, Length: 480, dtype: int64

In [106]:
# Inspect the merged rows for CBRE in date order.
df_merged_unique.query("ticker=='CBRE'").sort_values("date")

Unnamed: 0,ticker,date,open,close,low,high,volume
618286,CBRE,2018-01-02,43.680000,43.820000,43.240000,43.830000,2390000.0
618287,CBRE,2018-01-03,43.750000,44.190000,43.520000,44.230000,2210000.0
618288,CBRE,2018-01-04,44.340000,44.050000,44.010000,44.790000,1770000.0
618289,CBRE,2018-01-05,44.160000,44.270000,44.060000,44.380000,2080000.0
618290,CBRE,2018-01-08,44.270000,44.840000,44.220000,44.910000,2150000.0
618291,CBRE,2018-01-09,44.910000,45.150000,44.850000,45.500000,1760000.0
618292,CBRE,2018-01-10,44.950000,44.270000,44.080000,44.950000,2230000.0
618293,CBRE,2018-01-11,44.410000,45.120000,44.250000,45.170000,1090000.0
618294,CBRE,2018-01-12,45.250000,44.890000,44.770000,45.250000,1180000.0
618295,CBRE,2018-01-16,45.180000,44.650002,44.590000,45.520000,1160000.0


In [107]:
# Compare with the CBRE rows that come from the Kaggle data alone.
df_kaggle_stock_price_history_data.query("ticker=='CBRE'").sort_values("date")

Unnamed: 0,ticker,open,close,low,high,volume,date
13294736,CBRE,45.180000,44.650002,44.590000,45.520000,1157000,2018-01-16
13294742,CBRE,44.919998,44.990002,44.570000,45.259998,1640500,2018-01-17
13294754,CBRE,45.000000,45.189999,44.619999,45.299999,1460700,2018-01-18
13294761,CBRE,45.259998,45.880001,45.099998,45.910000,1509900,2018-01-19
13294762,CBRE,45.880001,45.730000,45.230000,45.980000,1299100,2018-01-22
13294774,CBRE,45.680000,45.950001,45.529999,46.029999,1087200,2018-01-23
13294781,CBRE,45.250000,45.209999,44.580002,45.590000,1823200,2018-01-24
13294794,CBRE,45.389999,45.810001,45.389999,45.950001,1478400,2018-01-25
13294802,CBRE,46.009998,46.209999,45.660000,46.240002,1119100,2018-01-26
13294816,CBRE,46.209999,46.340000,45.790001,46.599998,1430800,2018-01-29


In [108]:
# Inspect the Kaggle rows for UA in date order.
df_kaggle_stock_price_history_data.query("ticker=='UA'").sort_values("date")

Unnamed: 0,ticker,open,close,low,high,volume,date
3362705,UA,84.260002,84.260002,83.709999,84.739998,1607600,2015-07-01
3362711,UA,84.489998,84.589996,83.900002,84.779999,1051600,2015-07-02
3362728,UA,83.989998,84.580002,83.709999,85.000000,888600,2015-07-06
3362734,UA,84.800003,84.940002,83.309998,84.949997,1398700,2015-07-07
3362751,UA,84.370003,84.389999,83.870003,85.129997,1733100,2015-07-08
3362757,UA,85.389999,84.489998,84.480003,85.800003,1920000,2015-07-09
3362773,UA,85.449997,86.239998,84.769997,86.400002,1684200,2015-07-10
3362779,UA,87.160004,89.349998,87.089996,89.459999,2721800,2015-07-13
3362795,UA,89.279999,88.800003,88.269997,89.300003,1857400,2015-07-14
3362801,UA,88.800003,88.279999,88.099998,89.000000,1373400,2015-07-15


In [110]:
# Inspect the merged rows for UAL in date order.
df_merged_unique.query("ticker=='UAL'").sort_values("date")

Unnamed: 0,ticker,date,open,close,low,high,volume
3291349,UAL,2006-02-06,34.820000,33.900002,33.700001,34.939999,1940300.0
3291350,UAL,2006-02-07,33.900002,32.990002,31.730000,33.900002,3224600.0
3291351,UAL,2006-02-08,31.459999,32.000000,30.879999,32.400002,8960500.0
3291352,UAL,2006-02-09,29.799999,34.500000,29.510000,34.980000,8016600.0
3291353,UAL,2006-02-10,33.919998,34.240002,33.599998,35.570000,2229100.0
3291354,UAL,2006-02-13,36.009998,35.040001,34.430000,36.220001,4804900.0
3291355,UAL,2006-02-14,34.040001,35.779999,34.020000,36.250000,3630300.0
3291356,UAL,2006-02-15,35.200001,36.529999,35.200001,36.619999,3260600.0
3291357,UAL,2006-02-16,36.040001,35.700001,35.450001,36.290001,2042100.0
3291358,UAL,2006-02-17,36.110001,36.930000,35.070000,37.000000,2265000.0


In [111]:
# Write the de-duplicated price history to disk.
# With the default index=True the row index is written as an unnamed first
# column, which pandas shows as "Unnamed: 0" when the file is read back.
df_merged_unique.to_csv(path_or_buf="./output/s_and_p_stock_price_history.csv")