In [2]:
import pandas as pd
import numpy as np
import datetime as dt
# Let wide frames show up to 999 columns before pandas truncates the display.
pd.set_option("display.max_columns", 999)

# 1.Read Data

In [3]:
# Load the daily price file; the 'date' column is parsed into datetimes.
df = pd.read_csv(
    'us_equity_daily_price_201308_201908.csv',
    parse_dates=['date'],
)
print(df.head())
df.columns

        date   entity_figi ticker       open       high        low      close  \
0 2013-08-01  BBG000C2M952   AAON  14.480000  15.100000  14.360000  15.006667   
1 2013-08-02  BBG000C2M952   AAON  14.966666  15.320000  14.966666  15.260000   
2 2013-08-05  BBG000C2M952   AAON  15.280000  15.473333  15.033334  15.446667   
3 2013-08-06  BBG000C2M952   AAON  15.446667  15.466666  15.320000  15.333333   
4 2013-08-07  BBG000C2M952   AAON  15.333333  15.520000  15.300000  15.466666   

     volume  
0  353700.0  
1  235800.0  
2   90300.0  
3  304800.0  
4  128400.0  


Index(['date', 'entity_figi', 'ticker', 'open', 'high', 'low', 'close',
       'volume'],
      dtype='object')

# 2. Check for NaN values

In [4]:
# Order rows by ticker, then by date, and renumber the index from 0.
test = df.sort_values(by=["ticker", "date"]).reset_index(drop=True)
# For every row, record the date of the same ticker's following row
# (NaT on each ticker's last row).
test["Next_date"] = test.groupby("ticker")["date"].shift(-1)
# Per-column tally of missing values in the loaded frame.
count = df.isna().sum()
print(f'The number of nan is \n {count}')
print(test.head(10))

The number of nan is 
 date             0
entity_figi      0
ticker           0
open           488
high           488
low            488
close          488
volume         488
dtype: int64
        date   entity_figi ticker       open       high        low      close  \
0 2013-08-01  BBG000C2V541      A  32.553650  33.047211  32.417740  32.932762   
1 2013-08-02  BBG000C2V541      A  32.947067  33.268955  32.896996  33.175964   
2 2013-08-05  BBG000C2V541      A  33.090130  33.190273  32.761086  32.861229   
3 2013-08-06  BBG000C2V541      A  32.854076  33.061516  32.625179  32.961372   
4 2013-08-07  BBG000C2V541      A  32.818314  33.233189  32.632332  33.104435   
5 2013-08-08  BBG000C2V541      A  33.261803  33.297569  32.646637  32.796852   
6 2013-08-09  BBG000C2V541      A  32.796852  33.233189  32.768242  33.061516   
7 2013-08-12  BBG000C2V541      A  32.825466  33.268955  32.703861  33.226036   
8 2013-08-13  BBG000C2V541      A  33.226036  33.605148  33.075821  33.562233   
9 

In [5]:
# Distinct dates on which the open price is missing.
test.loc[test['open'].isna(), 'date'].unique()

array(['2019-08-16T00:00:00.000000000', '2018-11-06T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [6]:
# Look up the row label of the record with a missing open price on 2018-11-06.
open_missing = test['open'].isna()
on_20181106 = test['date'] == '2018-11-06T00:00:00.000000000'
test[open_missing & on_20181106].index

Int64Index([4237695], dtype='int64')

In [7]:
# Per-column non-null counts among the rows lacking an open price on 2019-08-16.
open_missing = test['open'].isna()
on_20190816 = test['date'] == '2019-08-16T00:00:00.000000000'
test.loc[open_missing & on_20190816].count()

date           487
entity_figi    487
ticker         487
open             0
high             0
low              0
close            0
volume           0
Next_date        0
dtype: int64

# 487 rows dated 2019-08-16 have missing data, which is reasonable. All we need to do is fill in the missing data on 2018-11-06.

In [8]:
# Fill in the record that lacks prices on 2018-11-06.
# The row is located by condition instead of a hard-coded row label, so the
# fill keeps targeting the right record if the input file or sort order changes,
# and re-running the cell is a harmless no-op once the values are present.
# NOTE(review): the replacement values are hard-coded; their source is not
# recorded in this notebook -- TODO document where they came from.
fill_values = {
    'open': 3.31,
    'high': 3.46,
    'low': 3.30,
    'close': 3.39,
    'volume': 1486000,
}
rows_to_fill = test['open'].isna() & (test['date'] == '2018-11-06')
for column, value in fill_values.items():
    test.loc[rows_to_fill, column] = value

# Recount missing values per column after the fill.
count_new = test.isnull().sum()
count_new

date              0
entity_figi       0
ticker            0
open            487
high            487
low             487
close           487
volume          487
Next_date      4950
dtype: int64

# 3.Gap

In [9]:
# Time from each row's date to the date of the same ticker's next row.
test['gap'] = test['Next_date'].sub(test['date'])
print(test.head())
# Reduce the timedelta to a plain number of days.
test['gap'] = test['gap'].dt.days
print(test.head())
# Count the rows whose gap exceeds 3 days.
over_three_days = test['gap'].gt(3)
gap_count = test.loc[over_three_days, 'date'].count()
print(f'The total number of gap which larger than 3 is {gap_count}')

        date   entity_figi ticker       open       high        low      close  \
0 2013-08-01  BBG000C2V541      A  32.553650  33.047211  32.417740  32.932762   
1 2013-08-02  BBG000C2V541      A  32.947067  33.268955  32.896996  33.175964   
2 2013-08-05  BBG000C2V541      A  33.090130  33.190273  32.761086  32.861229   
3 2013-08-06  BBG000C2V541      A  32.854076  33.061516  32.625179  32.961372   
4 2013-08-07  BBG000C2V541      A  32.818314  33.233189  32.632332  33.104435   

      volume  Next_date    gap  
0  3827000.0 2013-08-02 1 days  
1  3354000.0 2013-08-05 3 days  
2  2621800.0 2013-08-06 1 days  
3  3188600.0 2013-08-07 1 days  
4  2379500.0 2013-08-08 1 days  
        date   entity_figi ticker       open       high        low      close  \
0 2013-08-01  BBG000C2V541      A  32.553650  33.047211  32.417740  32.932762   
1 2013-08-02  BBG000C2V541      A  32.947067  33.268955  32.896996  33.175964   
2 2013-08-05  BBG000C2V541      A  33.090130  33.190273  32.761086  32.8

In [10]:
# Peek at the last five rows of the sorted frame.
test.tail(5)

Unnamed: 0,date,entity_figi,ticker,open,high,low,close,volume,Next_date,gap
5755894,2018-05-16,BBG009J96R40,ZYNE,9.47,9.53,9.07,9.4,154600.0,2018-05-17,1.0
5755895,2018-05-17,BBG009J96R40,ZYNE,9.47,9.97,9.35,9.81,247900.0,2018-05-18,1.0
5755896,2018-05-18,BBG009J96R40,ZYNE,9.88,10.47,9.87,10.28,285600.0,2018-05-21,3.0
5755897,2018-05-21,BBG009J96R40,ZYNE,10.36,10.54,9.621,9.98,292200.0,2018-05-22,1.0
5755898,2018-05-22,BBG009J96R40,ZYNE,10.06,10.64,9.901,10.34,261000.0,NaT,


In [11]:
# Rows whose gap to the same ticker's next row is larger than 3 days.
# 'gap' holds whole days, so `gap > 3` selects exactly the rows that are
# neither below 4 nor missing. Filtering on 'gap' alone (rather than dropping
# rows and then calling a blanket dropna) avoids discarding rows that merely
# have NaN in some other column, so the result does not depend on which extra
# columns `test` carries when this cell is run.
test_gap_larger_3 = test[test['gap'] > 3].copy()
test_gap_larger_3

Unnamed: 0,date,entity_figi,ticker,open,high,low,close,volume,Next_date,gap
21,2013-08-30,BBG000C2V541,A,33.540771,33.605148,33.168812,33.361946,1899300.0,2013-09-03,4.0
117,2014-01-17,BBG000C2V541,A,43.512161,43.512161,43.154507,43.426323,2375900.0,2014-01-21,4.0
136,2014-02-14,BBG000C2V541,A,40.879829,41.323318,39.248928,39.520744,24946800.0,2014-02-18,4.0
179,2014-04-17,BBG000C2V541,A,38.440628,39.263233,38.347641,39.091560,2882600.0,2014-04-21,4.0
204,2014-05-23,BBG000C2V541,A,39.806866,40.214592,39.470673,40.185982,4907800.0,2014-05-27,4.0
232,2014-07-03,BBG000C2V541,A,41.623749,41.874107,41.502148,41.816883,839700.0,2014-07-07,4.0
272,2014-08-29,BBG000C2V541,A,41.008583,41.130184,40.829758,40.886982,2767900.0,2014-09-02,4.0
368,2015-01-16,BBG000C2V541,A,37.830002,38.459999,37.759998,38.250000,3004000.0,2015-01-20,4.0
387,2015-02-13,BBG000C2V541,A,39.970001,40.310001,39.910000,40.150002,2211900.0,2015-02-17,4.0
420,2015-04-02,BBG000C2V541,A,41.470001,42.130001,41.330002,42.049999,2135900.0,2015-04-06,4.0


In [12]:
# Largest gap per ticker. The aggregation is named by string because passing
# the builtin `max` as aggfunc triggers a FutureWarning in recent pandas.
max_gap = test_gap_larger_3.pivot_table(index='ticker', values='gap', aggfunc='max')
print(f'max_gap is: \n {max_gap}')
# Largest gap across all tickers.
max_gap_value = test_gap_larger_3['gap'].max()
print(f'max gap value is: \n {max_gap_value}')

max_gap is: 
          gap
ticker      
A        4.0
AAAP     7.0
AAC     17.0
AAL      4.0
AAME     4.0
AAN      4.0
AAOI     4.0
AAON     4.0
AAP      4.0
AAPL     4.0
AAT      4.0
AAV      4.0
AAWW     4.0
AAXJ     4.0
AB       4.0
ABAC     4.0
ABAX     4.0
ABB      4.0
ABBV     4.0
ABC      4.0
ABCB     4.0
ABCD     4.0
ABDC     4.0
ABEO     4.0
ABEV     4.0
ABG      4.0
ABIO     4.0
ABM      4.0
ABMD     4.0
ABR      4.0
...      ...
YY       4.0
Z        4.0
ZAGG     4.0
ZAIS     4.0
ZAYO     4.0
ZAZA     4.0
ZBH     57.0
ZBIO     4.0
ZBRA     4.0
ZEL     82.0
ZEN      4.0
ZEUS     4.0
ZF       4.0
ZFGN     4.0
ZG       4.0
ZGNX     4.0
ZION     4.0
ZIOP     4.0
ZIV      4.0
ZIXI     4.0
ZN       4.0
ZNGA     4.0
ZNH      4.0
ZOES     4.0
ZSAN     4.0
ZTR      4.0
ZTS      4.0
ZUMZ     4.0
ZX       4.0
ZYNE     4.0

[4904 rows x 1 columns]
max gap value is: 
 783.0


In [13]:
# Gap values for a single ticker ('A'), indexed by date.
gaps_by_ticker_and_date = test_gap_larger_3.set_index(['ticker', 'date'])
gaps_by_ticker_and_date.xs('A', level='ticker')['gap']

date
2013-08-30    4.0
2014-01-17    4.0
2014-02-14    4.0
2014-04-17    4.0
2014-05-23    4.0
2014-07-03    4.0
2014-08-29    4.0
2015-01-16    4.0
2015-02-13    4.0
2015-04-02    4.0
2015-05-22    4.0
2015-07-02    4.0
2015-09-04    4.0
2015-12-24    4.0
2015-12-31    4.0
2016-01-15    4.0
2016-02-12    4.0
2016-03-24    4.0
2016-05-27    4.0
2016-07-01    4.0
2016-09-02    4.0
2016-12-23    4.0
2016-12-30    4.0
2017-01-13    4.0
2017-02-17    4.0
2017-04-13    4.0
2017-05-26    4.0
2017-09-01    4.0
2017-12-22    4.0
2017-12-29    4.0
2018-01-12    4.0
2018-02-16    4.0
2018-03-29    4.0
2018-05-25    4.0
2018-08-31    4.0
2019-01-18    4.0
2019-02-15    4.0
2019-04-18    4.0
2019-05-24    4.0
Name: gap, dtype: float64

# 4. Add data of prior and subsequent dates

In [14]:
#arrays=[list(test.ticker),list(range(5755898))]
#tuples=list(zip(*arrays))
#index = pd.MultiIndex.from_tuples(tuples, names=['ticker', 'ID'])
#index
#test_new=test.set_index(index)

In [15]:
# Add, for every row, the price/volume fields of the same ticker's neighbouring
# rows. Shifts are counted in rows within a ticker, not in calendar days.
#   '{i}d_prior_<field>'  -> value i rows earlier  (i = 1..5)
#   '{-i}d_back_<field>'  -> value i rows later    (columns '-1d_back_*' .. '-5d_back_*')
# All five fields are shifted with one groupby per offset instead of one per
# column, and columns are created in the same order as before.
price_columns = ["open", "high", "low", "close", "volume"]

for i in range(1, 6):
    shifted = test.groupby("ticker")[price_columns].shift(i)
    for column in price_columns:
        test[f'{i}d_prior_{column}'] = shifted[column]

for i in range(-1, -6, -1):
    shifted = test.groupby("ticker")[price_columns].shift(i)
    for column in price_columns:
        test[f'{i}d_back_{column}'] = shifted[column]

In [16]:
# Inspect the first ten rows, including the newly added shifted columns.
test.head(n=10)

Unnamed: 0,date,entity_figi,ticker,open,high,low,close,volume,Next_date,gap,1d_prior_open,1d_prior_high,1d_prior_low,1d_prior_close,1d_prior_volume,2d_prior_open,2d_prior_high,2d_prior_low,2d_prior_close,2d_prior_volume,3d_prior_open,3d_prior_high,3d_prior_low,3d_prior_close,3d_prior_volume,4d_prior_open,4d_prior_high,4d_prior_low,4d_prior_close,4d_prior_volume,5d_prior_open,5d_prior_high,5d_prior_low,5d_prior_close,5d_prior_volume,-1d_back_open,-1d_back_high,-1d_back_low,-1d_back_close,-1d_back_volume,-2d_back_open,-2d_back_high,-2d_back_low,-2d_back_close,-2d_back_volume,-3d_back_open,-3d_back_high,-3d_back_low,-3d_back_close,-3d_back_volume,-4d_back_open,-4d_back_high,-4d_back_low,-4d_back_close,-4d_back_volume,-5d_back_open,-5d_back_high,-5d_back_low,-5d_back_close,-5d_back_volume
0,2013-08-01,BBG000C2V541,A,32.55365,33.047211,32.41774,32.932762,3827000.0,2013-08-02,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,32.947067,33.268955,32.896996,33.175964,3354000.0,33.09013,33.190273,32.761086,32.861229,2621800.0,32.854076,33.061516,32.625179,32.961372,3188600.0,32.818314,33.233189,32.632332,33.104435,2379500.0,33.261803,33.297569,32.646637,32.796852,2379900.0
1,2013-08-02,BBG000C2V541,A,32.947067,33.268955,32.896996,33.175964,3354000.0,2013-08-05,3.0,32.55365,33.047211,32.41774,32.932762,3827000.0,,,,,,,,,,,,,,,,,,,,,33.09013,33.190273,32.761086,32.861229,2621800.0,32.854076,33.061516,32.625179,32.961372,3188600.0,32.818314,33.233189,32.632332,33.104435,2379500.0,33.261803,33.297569,32.646637,32.796852,2379900.0,32.796852,33.233189,32.768242,33.061516,2737800.0
2,2013-08-05,BBG000C2V541,A,33.09013,33.190273,32.761086,32.861229,2621800.0,2013-08-06,1.0,32.947067,33.268955,32.896996,33.175964,3354000.0,32.55365,33.047211,32.41774,32.932762,3827000.0,,,,,,,,,,,,,,,,32.854076,33.061516,32.625179,32.961372,3188600.0,32.818314,33.233189,32.632332,33.104435,2379500.0,33.261803,33.297569,32.646637,32.796852,2379900.0,32.796852,33.233189,32.768242,33.061516,2737800.0,32.825466,33.268955,32.703861,33.226036,1743500.0
3,2013-08-06,BBG000C2V541,A,32.854076,33.061516,32.625179,32.961372,3188600.0,2013-08-07,1.0,33.09013,33.190273,32.761086,32.861229,2621800.0,32.947067,33.268955,32.896996,33.175964,3354000.0,32.55365,33.047211,32.41774,32.932762,3827000.0,,,,,,,,,,,32.818314,33.233189,32.632332,33.104435,2379500.0,33.261803,33.297569,32.646637,32.796852,2379900.0,32.796852,33.233189,32.768242,33.061516,2737800.0,32.825466,33.268955,32.703861,33.226036,1743500.0,33.226036,33.605148,33.075821,33.562233,2905300.0
4,2013-08-07,BBG000C2V541,A,32.818314,33.233189,32.632332,33.104435,2379500.0,2013-08-08,1.0,32.854076,33.061516,32.625179,32.961372,3188600.0,33.09013,33.190273,32.761086,32.861229,2621800.0,32.947067,33.268955,32.896996,33.175964,3354000.0,32.55365,33.047211,32.41774,32.932762,3827000.0,,,,,,33.261803,33.297569,32.646637,32.796852,2379900.0,32.796852,33.233189,32.768242,33.061516,2737800.0,32.825466,33.268955,32.703861,33.226036,1743500.0,33.226036,33.605148,33.075821,33.562233,2905300.0,33.454937,33.619457,33.168812,33.268955,2778500.0
5,2013-08-08,BBG000C2V541,A,33.261803,33.297569,32.646637,32.796852,2379900.0,2013-08-09,1.0,32.818314,33.233189,32.632332,33.104435,2379500.0,32.854076,33.061516,32.625179,32.961372,3188600.0,33.09013,33.190273,32.761086,32.861229,2621800.0,32.947067,33.268955,32.896996,33.175964,3354000.0,32.55365,33.047211,32.41774,32.932762,3827000.0,32.796852,33.233189,32.768242,33.061516,2737800.0,32.825466,33.268955,32.703861,33.226036,1743500.0,33.226036,33.605148,33.075821,33.562233,2905300.0,33.454937,33.619457,33.168812,33.268955,2778500.0,34.041489,34.456367,33.512161,33.590843,9378200.0
6,2013-08-09,BBG000C2V541,A,32.796852,33.233189,32.768242,33.061516,2737800.0,2013-08-12,3.0,33.261803,33.297569,32.646637,32.796852,2379900.0,32.818314,33.233189,32.632332,33.104435,2379500.0,32.854076,33.061516,32.625179,32.961372,3188600.0,33.09013,33.190273,32.761086,32.861229,2621800.0,32.947067,33.268955,32.896996,33.175964,3354000.0,32.825466,33.268955,32.703861,33.226036,1743500.0,33.226036,33.605148,33.075821,33.562233,2905300.0,33.454937,33.619457,33.168812,33.268955,2778500.0,34.041489,34.456367,33.512161,33.590843,9378200.0,33.483547,33.776825,33.276108,33.469242,5067300.0
7,2013-08-12,BBG000C2V541,A,32.825466,33.268955,32.703861,33.226036,1743500.0,2013-08-13,1.0,32.796852,33.233189,32.768242,33.061516,2737800.0,33.261803,33.297569,32.646637,32.796852,2379900.0,32.818314,33.233189,32.632332,33.104435,2379500.0,32.854076,33.061516,32.625179,32.961372,3188600.0,33.09013,33.190273,32.761086,32.861229,2621800.0,33.226036,33.605148,33.075821,33.562233,2905300.0,33.454937,33.619457,33.168812,33.268955,2778500.0,34.041489,34.456367,33.512161,33.590843,9378200.0,33.483547,33.776825,33.276108,33.469242,5067300.0,33.354794,33.576538,33.204578,33.218884,2415100.0
8,2013-08-13,BBG000C2V541,A,33.226036,33.605148,33.075821,33.562233,2905300.0,2013-08-14,1.0,32.825466,33.268955,32.703861,33.226036,1743500.0,32.796852,33.233189,32.768242,33.061516,2737800.0,33.261803,33.297569,32.646637,32.796852,2379900.0,32.818314,33.233189,32.632332,33.104435,2379500.0,32.854076,33.061516,32.625179,32.961372,3188600.0,33.454937,33.619457,33.168812,33.268955,2778500.0,34.041489,34.456367,33.512161,33.590843,9378200.0,33.483547,33.776825,33.276108,33.469242,5067300.0,33.354794,33.576538,33.204578,33.218884,2415100.0,33.211731,33.812588,33.025749,33.640915,3207400.0
9,2013-08-14,BBG000C2V541,A,33.454937,33.619457,33.168812,33.268955,2778500.0,2013-08-15,1.0,33.226036,33.605148,33.075821,33.562233,2905300.0,32.825466,33.268955,32.703861,33.226036,1743500.0,32.796852,33.233189,32.768242,33.061516,2737800.0,33.261803,33.297569,32.646637,32.796852,2379900.0,32.818314,33.233189,32.632332,33.104435,2379500.0,34.041489,34.456367,33.512161,33.590843,9378200.0,33.483547,33.776825,33.276108,33.469242,5067300.0,33.354794,33.576538,33.204578,33.218884,2415100.0,33.211731,33.812588,33.025749,33.640915,3207400.0,33.569386,33.79113,33.361946,33.540771,2724800.0


In [19]:
# Write the enriched frame to CSV, leaving out the integer row index.
test.to_csv(
    "finished_us_equity_daily_price_201308_201908.csv",
    index=False,
)