# Set Up

## Import

In [1]:
import pandas as pd
import datetime as dt
# Let wide frames render (up to 999 columns) instead of being truncated.
pd.set_option("display.max_columns", 999)

## Read Data

In [2]:
# Load the raw daily price file; the `date` column is parsed into datetimes on read.
price = pd.read_csv(
    filepath_or_buffer='us_equity_daily_price_201308_201908.csv',
    parse_dates=['date'],
)

In [3]:
# Preview the first five raw rows.
price.head(n=5)

Unnamed: 0,date,entity_figi,ticker,open,high,low,close,volume
0,2013-08-01,BBG000C2M952,AAON,14.48,15.1,14.36,15.006667,353700.0
1,2013-08-02,BBG000C2M952,AAON,14.966666,15.32,14.966666,15.26,235800.0
2,2013-08-05,BBG000C2M952,AAON,15.28,15.473333,15.033334,15.446667,90300.0
3,2013-08-06,BBG000C2M952,AAON,15.446667,15.466666,15.32,15.333333,304800.0
4,2013-08-07,BBG000C2M952,AAON,15.333333,15.52,15.3,15.466666,128400.0


# Data Quality Check

## Check NaN Values

In [4]:
# Work on an independent copy ordered by ticker, then by date, with a fresh
# 0..n-1 row index.
new_price = (
    price
    .sort_values(by=["ticker", "date"])
    .reset_index(drop=True)
    .copy()
)
# Number of missing values in each column
new_price.isna().sum()

date             0
entity_figi      0
ticker           0
open           488
high           488
low            488
close          488
volume         488
dtype: int64

In [5]:
# For every row, store the date of the same ticker's following row
# (missing on each ticker's last row, which has no successor).
new_price["next_date"] = new_price.groupby(by="ticker")["date"].shift(periods=-1)
new_price.head(n=10)

Unnamed: 0,date,entity_figi,ticker,open,high,low,close,volume,next_date
0,2013-08-01,BBG000C2V541,A,32.55365,33.047211,32.41774,32.932762,3827000.0,2013-08-02
1,2013-08-02,BBG000C2V541,A,32.947067,33.268955,32.896996,33.175964,3354000.0,2013-08-05
2,2013-08-05,BBG000C2V541,A,33.09013,33.190273,32.761086,32.861229,2621800.0,2013-08-06
3,2013-08-06,BBG000C2V541,A,32.854076,33.061516,32.625179,32.961372,3188600.0,2013-08-07
4,2013-08-07,BBG000C2V541,A,32.818314,33.233189,32.632332,33.104435,2379500.0,2013-08-08
5,2013-08-08,BBG000C2V541,A,33.261803,33.297569,32.646637,32.796852,2379900.0,2013-08-09
6,2013-08-09,BBG000C2V541,A,32.796852,33.233189,32.768242,33.061516,2737800.0,2013-08-12
7,2013-08-12,BBG000C2V541,A,32.825466,33.268955,32.703861,33.226036,1743500.0,2013-08-13
8,2013-08-13,BBG000C2V541,A,33.226036,33.605148,33.075821,33.562233,2905300.0,2013-08-14
9,2013-08-14,BBG000C2V541,A,33.454937,33.619457,33.168812,33.268955,2778500.0,2013-08-15


In [6]:
# Distinct dates on which the open price is missing
new_price.loc[new_price["open"].isna(), "date"].unique()

array(['2019-08-16T00:00:00.000000000', '2018-11-06T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [7]:
# Row label(s) whose open price is missing on 2018-11-06
new_price.loc[
    new_price["open"].isna()
    & (new_price["date"] == '2018-11-06T00:00:00.000000000')
].index

Int64Index([4237695], dtype='int64')

In [8]:
# Non-null count per column among rows whose open price is missing on 2019-08-16
new_price.loc[
    new_price["open"].isna()
    & (new_price["date"] == '2019-08-16T00:00:00.000000000')
].count()

date           487
entity_figi    487
ticker         487
open             0
high             0
low              0
close            0
volume           0
next_date        0
dtype: int64

In [9]:
# Missing prices occur on two dates only:
#   * 2019-08-16 - 487 rows. Their next_date is also missing, i.e. each is the
#     last row of its ticker; the notebook treats them as expected and leaves
#     them as NaN.
#   * 2018-11-06 - a single row, which is filled in below.
#
# The row is located by condition (missing open on 2018-11-06) rather than by a
# hard-coded row label: a label depends on the current sort order and file
# contents and would silently point at a different row if either changed.
# NOTE(review): the source of these replacement values is not recorded in this
# notebook - confirm before relying on them.
fill_values = {
    "open": 3.31,
    "high": 3.46,
    "low": 3.30,
    "close": 3.39,
    "volume": 1486000,
}
fill_mask = new_price["open"].isna() & (new_price["date"] == pd.Timestamp("2018-11-06"))
# Zero matches is fine (the cell was already run); more than one means the data
# no longer matches the assumption these values were chosen under.
assert fill_mask.sum() <= 1, "expected at most one row with missing prices on 2018-11-06"
for column, value in fill_values.items():
    new_price.loc[fill_mask, column] = value

In [10]:
# Re-count missing values per column after the manual fill.
new_price.isna().sum()

date              0
entity_figi       0
ticker            0
open            487
high            487
low             487
close           487
volume          487
next_date      4950
dtype: int64

## Check Date Gaps

In [11]:
# Calendar days between a row and the same ticker's next row
# (NaN where next_date is missing).
new_price['gap'] = (new_price['next_date'] - new_price['date']).dt.days
# Peek at rows followed by a gap of more than 3 days
new_price.loc[new_price['gap'] > 3].head(n=5)

Unnamed: 0,date,entity_figi,ticker,open,high,low,close,volume,next_date,gap
21,2013-08-30,BBG000C2V541,A,33.540771,33.605148,33.168812,33.361946,1899300.0,2013-09-03,4.0
117,2014-01-17,BBG000C2V541,A,43.512161,43.512161,43.154507,43.426323,2375900.0,2014-01-21,4.0
136,2014-02-14,BBG000C2V541,A,40.879829,41.323318,39.248928,39.520744,24946800.0,2014-02-18,4.0
179,2014-04-17,BBG000C2V541,A,38.440628,39.263233,38.347641,39.09156,2882600.0,2014-04-21,4.0
204,2014-05-23,BBG000C2V541,A,39.806866,40.214592,39.470673,40.185982,4907800.0,2014-05-27,4.0


In [12]:
# All rows whose gap to the ticker's next row exceeds 3 days: renumber them,
# then discard any row that still contains a missing value.
new_price_gap_larger_3 = (
    new_price.loc[new_price['gap'] > 3]
    .reset_index(drop=True)
    .dropna()
)
new_price_gap_larger_3.head(n=5)

Unnamed: 0,date,entity_figi,ticker,open,high,low,close,volume,next_date,gap
0,2013-08-30,BBG000C2V541,A,33.540771,33.605148,33.168812,33.361946,1899300.0,2013-09-03,4.0
1,2014-01-17,BBG000C2V541,A,43.512161,43.512161,43.154507,43.426323,2375900.0,2014-01-21,4.0
2,2014-02-14,BBG000C2V541,A,40.879829,41.323318,39.248928,39.520744,24946800.0,2014-02-18,4.0
3,2014-04-17,BBG000C2V541,A,38.440628,39.263233,38.347641,39.09156,2882600.0,2014-04-21,4.0
4,2014-05-23,BBG000C2V541,A,39.806866,40.214592,39.470673,40.185982,4907800.0,2014-05-27,4.0


In [14]:
# Largest gap (in days) per ticker among the rows with a gap above 3.
# The aggregation is named by string: passing the builtin `max` makes recent
# pandas emit a FutureWarning, while "max" selects the same groupby reduction
# on old and new versions alike.
max_gap = new_price_gap_larger_3.pivot_table(index='ticker', values='gap', aggfunc='max').reset_index()
max_gap.head()

Unnamed: 0,ticker,gap
0,A,4.0
1,AAAP,7.0
2,AAC,17.0
3,AAL,4.0
4,AAME,4.0


In [19]:
# Summary statistics of the gaps above 3 days; the `max` row is the largest gap.
new_price_gap_larger_3.loc[:, "gap"].describe()

count    158098.000000
mean          4.236031
std           7.495400
min           4.000000
25%           4.000000
50%           4.000000
75%           4.000000
max         783.000000
Name: gap, dtype: float64

# Data Transformation

In [14]:
# NOTE(review): disabled experiment - would have built a (ticker, ID) MultiIndex
# for new_price using a hard-coded row count. Nothing in the cells visible here
# uses it; consider deleting this cell.
#arrays=[list(new_price.ticker),list(range(5755898))]
#tuples=list(zip(*arrays))
#index = pd.MultiIndex.from_tuples(tuples, names=['ticker', 'ID'])
#index
#new_price_new=new_price.set_index(index)

In [23]:
# Add lagged ("prior") and lead ("future") copies of every OHLCV field for
# offsets 1..5, computed within each ticker so values never leak across tickers.
# shift() moves by rows, so "{i}d" means i rows away for that ticker - not
# i calendar days.
price_fields = ["open", "high", "low", "close", "volume"]

# The grouping does not change between iterations, so build it once and shift
# all five fields together instead of re-grouping for every new column.
by_ticker = new_price.groupby("ticker")[price_fields]

for i in range(1, 6):
    # Positive periods look back (prior rows); negative periods look ahead.
    for label, periods in (("prior", i), ("future", -i)):
        shifted = by_ticker.shift(periods)
        for field in price_fields:
            new_price[f'{i}d_{label}_{field}'] = shifted[field]

In [24]:
# Inspect the first ten rows, including the new prior/future columns.
new_price.head(n=10)

Unnamed: 0,date,entity_figi,ticker,open,high,low,close,volume,next_date,gap,1d_prior_open,1d_prior_high,1d_prior_low,1d_prior_close,1d_prior_volume,1d_future_open,1d_future_high,1d_future_low,1d_future_close,1d_future_volume,2d_prior_open,2d_prior_high,2d_prior_low,2d_prior_close,2d_prior_volume,2d_future_open,2d_future_high,2d_future_low,2d_future_close,2d_future_volume,3d_prior_open,3d_prior_high,3d_prior_low,3d_prior_close,3d_prior_volume,3d_future_open,3d_future_high,3d_future_low,3d_future_close,3d_future_volume,4d_prior_open,4d_prior_high,4d_prior_low,4d_prior_close,4d_prior_volume,4d_future_open,4d_future_high,4d_future_low,4d_future_close,4d_future_volume,5d_prior_open,5d_prior_high,5d_prior_low,5d_prior_close,5d_prior_volume,5d_future_open,5d_future_high,5d_future_low,5d_future_close,5d_future_volume
0,2013-08-01,BBG000C2V541,A,32.55365,33.047211,32.41774,32.932762,3827000.0,2013-08-02,1.0,,,,,,32.947067,33.268955,32.896996,33.175964,3354000.0,,,,,,33.09013,33.190273,32.761086,32.861229,2621800.0,,,,,,32.854076,33.061516,32.625179,32.961372,3188600.0,,,,,,32.818314,33.233189,32.632332,33.104435,2379500.0,,,,,,33.261803,33.297569,32.646637,32.796852,2379900.0
1,2013-08-02,BBG000C2V541,A,32.947067,33.268955,32.896996,33.175964,3354000.0,2013-08-05,3.0,32.55365,33.047211,32.41774,32.932762,3827000.0,33.09013,33.190273,32.761086,32.861229,2621800.0,,,,,,32.854076,33.061516,32.625179,32.961372,3188600.0,,,,,,32.818314,33.233189,32.632332,33.104435,2379500.0,,,,,,33.261803,33.297569,32.646637,32.796852,2379900.0,,,,,,32.796852,33.233189,32.768242,33.061516,2737800.0
2,2013-08-05,BBG000C2V541,A,33.09013,33.190273,32.761086,32.861229,2621800.0,2013-08-06,1.0,32.947067,33.268955,32.896996,33.175964,3354000.0,32.854076,33.061516,32.625179,32.961372,3188600.0,32.55365,33.047211,32.41774,32.932762,3827000.0,32.818314,33.233189,32.632332,33.104435,2379500.0,,,,,,33.261803,33.297569,32.646637,32.796852,2379900.0,,,,,,32.796852,33.233189,32.768242,33.061516,2737800.0,,,,,,32.825466,33.268955,32.703861,33.226036,1743500.0
3,2013-08-06,BBG000C2V541,A,32.854076,33.061516,32.625179,32.961372,3188600.0,2013-08-07,1.0,33.09013,33.190273,32.761086,32.861229,2621800.0,32.818314,33.233189,32.632332,33.104435,2379500.0,32.947067,33.268955,32.896996,33.175964,3354000.0,33.261803,33.297569,32.646637,32.796852,2379900.0,32.55365,33.047211,32.41774,32.932762,3827000.0,32.796852,33.233189,32.768242,33.061516,2737800.0,,,,,,32.825466,33.268955,32.703861,33.226036,1743500.0,,,,,,33.226036,33.605148,33.075821,33.562233,2905300.0
4,2013-08-07,BBG000C2V541,A,32.818314,33.233189,32.632332,33.104435,2379500.0,2013-08-08,1.0,32.854076,33.061516,32.625179,32.961372,3188600.0,33.261803,33.297569,32.646637,32.796852,2379900.0,33.09013,33.190273,32.761086,32.861229,2621800.0,32.796852,33.233189,32.768242,33.061516,2737800.0,32.947067,33.268955,32.896996,33.175964,3354000.0,32.825466,33.268955,32.703861,33.226036,1743500.0,32.55365,33.047211,32.41774,32.932762,3827000.0,33.226036,33.605148,33.075821,33.562233,2905300.0,,,,,,33.454937,33.619457,33.168812,33.268955,2778500.0
5,2013-08-08,BBG000C2V541,A,33.261803,33.297569,32.646637,32.796852,2379900.0,2013-08-09,1.0,32.818314,33.233189,32.632332,33.104435,2379500.0,32.796852,33.233189,32.768242,33.061516,2737800.0,32.854076,33.061516,32.625179,32.961372,3188600.0,32.825466,33.268955,32.703861,33.226036,1743500.0,33.09013,33.190273,32.761086,32.861229,2621800.0,33.226036,33.605148,33.075821,33.562233,2905300.0,32.947067,33.268955,32.896996,33.175964,3354000.0,33.454937,33.619457,33.168812,33.268955,2778500.0,32.55365,33.047211,32.41774,32.932762,3827000.0,34.041489,34.456367,33.512161,33.590843,9378200.0
6,2013-08-09,BBG000C2V541,A,32.796852,33.233189,32.768242,33.061516,2737800.0,2013-08-12,3.0,33.261803,33.297569,32.646637,32.796852,2379900.0,32.825466,33.268955,32.703861,33.226036,1743500.0,32.818314,33.233189,32.632332,33.104435,2379500.0,33.226036,33.605148,33.075821,33.562233,2905300.0,32.854076,33.061516,32.625179,32.961372,3188600.0,33.454937,33.619457,33.168812,33.268955,2778500.0,33.09013,33.190273,32.761086,32.861229,2621800.0,34.041489,34.456367,33.512161,33.590843,9378200.0,32.947067,33.268955,32.896996,33.175964,3354000.0,33.483547,33.776825,33.276108,33.469242,5067300.0
7,2013-08-12,BBG000C2V541,A,32.825466,33.268955,32.703861,33.226036,1743500.0,2013-08-13,1.0,32.796852,33.233189,32.768242,33.061516,2737800.0,33.226036,33.605148,33.075821,33.562233,2905300.0,33.261803,33.297569,32.646637,32.796852,2379900.0,33.454937,33.619457,33.168812,33.268955,2778500.0,32.818314,33.233189,32.632332,33.104435,2379500.0,34.041489,34.456367,33.512161,33.590843,9378200.0,32.854076,33.061516,32.625179,32.961372,3188600.0,33.483547,33.776825,33.276108,33.469242,5067300.0,33.09013,33.190273,32.761086,32.861229,2621800.0,33.354794,33.576538,33.204578,33.218884,2415100.0
8,2013-08-13,BBG000C2V541,A,33.226036,33.605148,33.075821,33.562233,2905300.0,2013-08-14,1.0,32.825466,33.268955,32.703861,33.226036,1743500.0,33.454937,33.619457,33.168812,33.268955,2778500.0,32.796852,33.233189,32.768242,33.061516,2737800.0,34.041489,34.456367,33.512161,33.590843,9378200.0,33.261803,33.297569,32.646637,32.796852,2379900.0,33.483547,33.776825,33.276108,33.469242,5067300.0,32.818314,33.233189,32.632332,33.104435,2379500.0,33.354794,33.576538,33.204578,33.218884,2415100.0,32.854076,33.061516,32.625179,32.961372,3188600.0,33.211731,33.812588,33.025749,33.640915,3207400.0
9,2013-08-14,BBG000C2V541,A,33.454937,33.619457,33.168812,33.268955,2778500.0,2013-08-15,1.0,33.226036,33.605148,33.075821,33.562233,2905300.0,34.041489,34.456367,33.512161,33.590843,9378200.0,32.825466,33.268955,32.703861,33.226036,1743500.0,33.483547,33.776825,33.276108,33.469242,5067300.0,32.796852,33.233189,32.768242,33.061516,2737800.0,33.354794,33.576538,33.204578,33.218884,2415100.0,33.261803,33.297569,32.646637,32.796852,2379900.0,33.211731,33.812588,33.025749,33.640915,3207400.0,32.818314,33.233189,32.632332,33.104435,2379500.0,33.569386,33.79113,33.361946,33.540771,2724800.0


In [None]:
# Write the enriched frame to disk, leaving out the row index.
new_price.to_csv(
    path_or_buf="finished_us_equity_daily_price_201308_201908.csv",
    index=False,
)