# Set Up

## Import 

In [1]:
import pandas as pd

## Read In Price Data

In [2]:
# Load the raw daily prices, parse `date` as datetime, then order the rows by
# ticker and date and renumber the index from 0.
daily_price = (
    pd.read_csv('us_equity_daily_price_201308_201908.csv', parse_dates=['date'])
    .sort_values(by=['ticker', 'date'])
    .reset_index(drop=True)
)

In [3]:
# Preview the first five rows of the loaded frame.
daily_price.head(n=5)

Unnamed: 0,date,entity_figi,ticker,open,high,low,close,volume
0,2013-08-01,BBG000C2V541,A,32.55365,33.047211,32.41774,32.932762,3827000.0
1,2013-08-02,BBG000C2V541,A,32.947067,33.268955,32.896996,33.175964,3354000.0
2,2013-08-05,BBG000C2V541,A,33.09013,33.190273,32.761086,32.861229,2621800.0
3,2013-08-06,BBG000C2V541,A,32.854076,33.061516,32.625179,32.961372,3188600.0
4,2013-08-07,BBG000C2V541,A,32.818314,33.233189,32.632332,33.104435,2379500.0


## Data Quality Check
- There is no price data for the current date (2019-08-16). **So we removed all the records for that date.**
- The price data for ticker `PX` on `2018-11-06` is missing. **So we manually filled in the values in the loaded data.**
- Some tickers have 2 `entity_figi` values; however, the price data under the two `entity_figi` of a given ticker are the same. **So for each ticker and date we kept only the first `entity_figi` record.**
- 10 tickers have only one record in total. **So we removed those 10 tickers.**
- 197 tickers have a `date_gap` of more than 4 days. **So we removed these tickers.**

In [4]:
# Count the missing values in each column.
daily_price.isna().sum()

date             0
entity_figi      0
ticker           0
open           488
high           488
low            488
close          488
volume         488
dtype: int64

In [5]:
# List the distinct dates on which `open` is missing.
# In the recorded run these were today (2019-08-16) and 2018-11-06.
"""
The null value of today is reasonable because it may haven't generate today's price data
But the null value of 2018-11-06 is unreasonable so we need to do the check in the next step
"""
daily_price.loc[daily_price['open'].isnull(), 'date'].unique()

array(['2019-08-16T00:00:00.000000000', '2018-11-06T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [6]:
# Show the rows dated 2018-11-06 whose `open` is missing.
daily_price.loc[
    (daily_price['date'] == '2018-11-06T00:00:00.000000000')
    & daily_price['open'].isnull()
]

Unnamed: 0,date,entity_figi,ticker,open,high,low,close,volume
4237695,2018-11-06,BBG000CGQ975,PX,,,,,


In [7]:
# Fill in the missing price data for PX on 2018-11-06.
# The row is selected by ticker, date and null-ness rather than by the
# hard-coded row label 4237695: that label is only correct for one exact file
# and sort order, and would silently overwrite an unrelated row otherwise.
px_missing = (
    (daily_price["ticker"] == "PX")
    & (daily_price["date"] == "2018-11-06")
    & daily_price["open"].isnull()
)
# Manually entered replacement values, one per price/volume column.
px_fill = {"open": 3.31, "high": 3.46, "low": 3.30, "close": 3.39, "volume": 1486000}
# The mask is computed once up front, so filling `open` first does not
# deselect the row; re-running the cell after the fill is a no-op.
for column, value in px_fill.items():
    daily_price.loc[px_missing, column] = value

In [8]:
# Discard the remaining rows in which every price/volume field is missing
# (how='all' keeps any row that has at least one of the five values).
daily_price = daily_price.dropna(
    subset=['open', 'high', 'low', 'close', 'volume'],
    how='all',
)

In [12]:
# Check whether each ticker corresponds to exactly one entity_figi.
# Select `entity_figi` before calling nunique so only that column is counted,
# instead of computing distinct counts for every column and discarding the rest.
# Result: one row per ticker with its number of distinct entity_figi values,
# largest counts first.
ticker_figi = (
    daily_price.groupby("ticker")["entity_figi"]
    .nunique()
    .reset_index()
    .sort_values("entity_figi", ascending=False)
)

In [10]:
# Tickers mapped to more than one entity_figi (19 in total); show the first ten.
ticker_figi.loc[ticker_figi['entity_figi'] != 1].head(10)

Unnamed: 0,ticker,entity_figi
2122,HRZN,2
1502,ESRT,2
1135,CXSE,2
2102,HOV,2
4379,TMUS,2
1764,FSAM,2
1099,CUBI,2
350,ATO,2
3452,PFG,2
420,BAC,2


In [18]:
# Keep only the first row for each (ticker, date) pair. This removes rows where
# two entity_figi values overlap on the same date; a ticker whose entity_figi
# differs between dates still has more than one entity_figi afterwards, which
# is what the listing below shows.
price = daily_price.drop_duplicates(subset=['ticker', 'date'], keep='first')
# Count distinct entity_figi per ticker on that column only, rather than
# running nunique over every column and keeping a single one.
ticker_figi = (
    price.groupby("ticker")["entity_figi"]
    .nunique()
    .reset_index()
    .sort_values("entity_figi", ascending=False)
)
ticker_figi[ticker_figi.entity_figi != 1].head(10)

Unnamed: 0,ticker,entity_figi
1009,CPTA,2
350,ATO,2
1764,FSAM,2
3452,PFG,2
2501,KCAP,2
3937,SCS,2
1502,ESRT,2
4095,SOCL,2
2122,HRZN,2
60,ACWI,2


In [30]:
# Inspect CPTA, one of the tickers that still has two entity_figi values after
# the (ticker, date) de-duplication.
df = price[price.ticker == 'CPTA']
# `DataFrame.set_index()` requires a `keys` argument, so the bare call raised
# TypeError on a fresh run. Preview the first ten rows instead, matching the
# ten-row output saved with this cell.
df.head(10)

Unnamed: 0,date,entity_figi,ticker,open,high,low,close,volume
1185590,2013-09-25,BBG004MYLJ43,CPTA,19.150000,19.309999,18.410000,18.799999,2729500.0
1185592,2013-09-26,BBG004MYLJ43,CPTA,19.000000,19.200001,18.860001,19.150000,146300.0
1185594,2013-09-27,BBG004MYLJ43,CPTA,19.000000,19.200001,18.799999,19.049999,163800.0
1185596,2013-09-30,BBG004MYLJ43,CPTA,19.320000,19.320000,18.750000,19.200001,68400.0
1185598,2013-10-01,BBG004MYLJ43,CPTA,18.850000,19.190001,18.750000,18.950001,36800.0
1185600,2013-10-02,BBG004MYLJ43,CPTA,18.850000,18.950001,18.540001,18.719999,67400.0
1185602,2013-10-03,BBG004MYLJ43,CPTA,18.809999,18.879999,17.910000,18.250000,153300.0
1185604,2013-10-04,BBG004MYLJ43,CPTA,18.750000,18.750000,17.950001,17.950001,49900.0
1185606,2013-10-07,BBG004MYLJ43,CPTA,18.059999,18.540001,18.010000,18.309999,30300.0
1185608,2013-10-08,BBG004MYLJ43,CPTA,18.750000,18.750000,18.280001,18.280001,18200.0


In [26]:
# Rows of `df` whose date repeats an earlier row's date (none expected after de-duplication).
df.loc[df['date'].duplicated()]

Unnamed: 0,date,entity_figi,ticker,open,high,low,close,volume


In [20]:
# Number of tickers whose distinct entity_figi count is not exactly one.
ticker_figi.loc[ticker_figi['entity_figi'] != 1].shape[0]

10

In [12]:
# Check the date gap between consecutive records of each ticker.
# sort_values already returns a new frame, so `price` is left untouched without
# an extra explicit copy.
price_test = price.sort_values(['ticker', 'date']).reset_index(drop=True)
# Date of the following record for the same ticker (NaT on a ticker's last row).
price_test['next_date'] = price_test.groupby('ticker')['date'].shift(-1)
# Gap in calendar days, computed as one vectorised subtraction instead of a
# Python-level row-by-row apply; rows with no next record get NaN.
price_test['date_gap'] = (price_test['next_date'] - price_test['date']).dt.days

In [13]:
# Quick look at the largest gaps: rows with a known date_gap above 3 days,
# ordered from the longest gap down (top five shown).
price_test.loc[
    price_test['date_gap'].notnull() & (price_test['date_gap'] > 3)
].sort_values(by='date_gap', ascending=False).head()

Unnamed: 0,date,entity_figi,ticker,open,high,low,close,volume,next_date,date_gap
2956102,2014-09-23,UNASSIGNED,KORZ,23.190001,23.41,23.139999,23.41,3100.0,2016-11-14,783.0
724651,2014-09-23,UNASSIGNED,BRZS,31.620001,33.09,30.25,32.02,34500.0,2016-11-14,783.0
5249778,2016-03-10,BBG008G2LBZ4,UK,20.5,20.5,20.5,20.5,0.0,2017-10-26,595.0
1974025,2014-02-03,BBG007FL7ZK4,FLOW,4.04,4.04,4.04,4.04,0.0,2015-09-17,591.0
556000,2015-03-24,BBG000BM06Y7,BDI,0.001,0.001,0.001,0.001,0.0,2016-08-22,517.0


In [14]:
# Number of distinct tickers having at least one date_gap greater than 4 days.
price_test.loc[price_test['date_gap'] > 4, 'ticker'].nunique()

197

In [16]:
# Tickers with any date_gap > 4 (197 in the recorded run) are treated as
# abnormal, and all of their rows are excluded.
abnormal_ticker = list(price_test.loc[price_test['date_gap'] > 4, 'ticker'].unique())
normal_ticker_price = price_test.loc[~price_test['ticker'].isin(abnormal_ticker)]

In [17]:
# Distinct tickers left once those with date_gap > 4 are removed (4753 in the recorded run).
normal_ticker_price['ticker'].nunique()

4753

In [18]:
# Distinct tickers with at least one date_gap == 1 (4743 in the recorded run);
# the 10 tickers not counted here are examined next.
normal_ticker_price.loc[normal_ticker_price['date_gap'] == 1, 'ticker'].nunique()

4743

In [19]:
# Tickers in normal_ticker_price that never have date_gap == 1.
# On inspection (cells below) each of them has a single record in total.
set(normal_ticker_price['ticker'].unique()).difference(
    normal_ticker_price.loc[normal_ticker_price['date_gap'] == 1, 'ticker'].unique()
)

{'CBG', 'DXM', 'GGP', 'GSIG', 'HCN', 'ITC', 'PCLN', 'SNI', 'STRN', 'TTF'}

In [20]:
# All rows recorded for ticker CBG.
price_test.loc[price_test['ticker'] == 'CBG']

Unnamed: 0,date,entity_figi,ticker,open,high,low,close,volume,next_date,date_gap
848675,2018-03-19,BBG000C043Q6,CBG,47.450001,47.5,46.779999,47.310001,1398603.0,NaT,


In [21]:
# All rows recorded for ticker DXM.
price_test.loc[price_test['ticker'] == 'DXM']

Unnamed: 0,date,entity_figi,ticker,open,high,low,close,volume,next_date,date_gap
1551157,2018-04-25,BBG000BCKW07,DXM,10923.900391,10923.900391,10923.900391,10923.900391,0.0,NaT,


In [22]:
# All rows recorded for ticker GGP.
price_test.loc[price_test['ticker'] == 'GGP']

Unnamed: 0,date,entity_figi,ticker,open,high,low,close,volume,next_date,date_gap
2153121,2018-08-27,BBG000BG3K58,GGP,22.030001,22.290001,21.83,22.059999,181077630.0,NaT,


In [23]:
# All rows recorded for ticker GSIG.
price_test.loc[price_test['ticker'] == 'GSIG']

Unnamed: 0,date,entity_figi,ticker,open,high,low,close,volume,next_date,date_gap
2268439,2018-05-22,UNASSIGNED,GSIG,64.150002,64.400002,62.400002,62.549999,156200.0,NaT,


In [24]:
# All rows recorded for ticker HCN.
price_test.loc[price_test['ticker'] == 'HCN']

Unnamed: 0,date,entity_figi,ticker,open,high,low,close,volume,next_date,date_gap
2339062,2018-02-27,BBG000BKY3S8,HCN,54.349998,54.709999,52.220001,52.220001,3190327.0,NaT,


In [25]:
# All rows recorded for ticker ITC.
price_test.loc[price_test['ticker'] == 'ITC']

Unnamed: 0,date,entity_figi,ticker,open,high,low,close,volume,next_date,date_gap
2712999,2018-05-22,BBG000QDSVC0,ITC,4.12,4.12,4.1,4.1,0.0,NaT,


In [26]:
# All rows recorded for ticker PCLN.
price_test.loc[price_test['ticker'] == 'PCLN']

Unnamed: 0,date,entity_figi,ticker,open,high,low,close,volume,next_date,date_gap
3934990,2018-02-26,BBG000BLC4L5,PCLN,1905.949951,1923.219971,1897.660034,1905.640015,570074.0,NaT,


In [27]:
# All rows recorded for ticker SNI.
price_test.loc[price_test['ticker'] == 'SNI']

Unnamed: 0,date,entity_figi,ticker,open,high,low,close,volume,next_date,date_gap
4724848,2018-03-06,BBG000RQWCN5,SNI,90.099998,93.580002,89.769997,90.040001,11132892.0,NaT,


In [28]:
# All rows recorded for ticker STRN.
price_test.loc[price_test['ticker'] == 'STRN']

Unnamed: 0,date,entity_figi,ticker,open,high,low,close,volume,next_date,date_gap
4871240,2018-04-24,BBG000BR8N17,STRN,5000.0,5000.0,5000.0,5000.0,0.0,NaT,


In [29]:
# All rows recorded for ticker TTF.
price_test.loc[price_test['ticker'] == 'TTF']

Unnamed: 0,date,entity_figi,ticker,open,high,low,close,volume,next_date,date_gap
5159401,2018-04-25,BBG000BVRHP4,TTF,5300.0,5300.0,5300.0,5300.0,0.0,NaT,


In [46]:
# Remove the tickers that never have date_gap == 1 (the 10 single-record
# tickers inspected above) together with the tickers flagged for long gaps.
useless_ticker = list(
    set(normal_ticker_price['ticker'].unique()).difference(
        normal_ticker_price.loc[normal_ticker_price['date_gap'] == 1, 'ticker'].unique()
    )
)
remove_ticker = [*useless_ticker, *abnormal_ticker]
# Filter the de-duplicated `price` frame, which does not carry the helper
# next_date / date_gap columns that price_test has.
left_ticker_price = price.loc[~price['ticker'].isin(remove_ticker)]

## Data Transformation

In [47]:
# Sort by ticker then date so that shifting within a ticker steps through its
# records in date order.
left_ticker_price = left_ticker_price.sort_values(["ticker", "date"]).reset_index(drop=True)

price_cols = ["open", "high", "low", "close", "volume"]
# shift(+n) reads the value n records earlier ("prior"); shift(-n) reads the
# value n records later ("future"). The offset is counted in records of the
# same ticker, not in calendar days.
shift_sign = (("prior", 1), ("future", -1))
# Build the per-ticker grouping once instead of once per generated column.
by_ticker = left_ticker_price.groupby("ticker")[price_cols]

for n in range(1, 6):
    for period, sign in shift_sign:
        # One grouped shift covers all five columns for this (n, period) pair.
        shifted = by_ticker.shift(sign * n)
        for col in price_cols:
            left_ticker_price[f'{n}d_{period}_{col}'] = shifted[col]

In [48]:
# Preview the last five rows, including the generated prior/future columns.
left_ticker_price.tail(n=5)

Unnamed: 0,date,entity_figi,ticker,open,high,low,close,volume,1d_prior_open,1d_prior_high,1d_prior_low,1d_prior_close,1d_prior_volume,1d_future_open,1d_future_high,1d_future_low,1d_future_close,1d_future_volume,2d_prior_open,2d_prior_high,2d_prior_low,2d_prior_close,2d_prior_volume,2d_future_open,2d_future_high,2d_future_low,2d_future_close,2d_future_volume,3d_prior_open,3d_prior_high,3d_prior_low,3d_prior_close,3d_prior_volume,3d_future_open,3d_future_high,3d_future_low,3d_future_close,3d_future_volume,4d_prior_open,4d_prior_high,4d_prior_low,4d_prior_close,4d_prior_volume,4d_future_open,4d_future_high,4d_future_low,4d_future_close,4d_future_volume,5d_prior_open,5d_prior_high,5d_prior_low,5d_prior_close,5d_prior_volume,5d_future_open,5d_future_high,5d_future_low,5d_future_close,5d_future_volume
5591938,2018-05-16,BBG009J96R40,ZYNE,9.47,9.53,9.07,9.4,154600.0,9.37,9.73,9.36,9.47,170500.0,9.47,9.97,9.35,9.81,247900.0,9.22,9.54,9.05,9.44,165900.0,9.88,10.47,9.87,10.28,285600.0,8.77,9.24,8.751,9.23,206100.0,10.36,10.54,9.621,9.98,292200.0,8.62,9.12,8.62,8.75,281500.0,10.06,10.64,9.901,10.34,261000.0,9.0,9.01,8.43,8.65,458600.0,,,,,
5591939,2018-05-17,BBG009J96R40,ZYNE,9.47,9.97,9.35,9.81,247900.0,9.47,9.53,9.07,9.4,154600.0,9.88,10.47,9.87,10.28,285600.0,9.37,9.73,9.36,9.47,170500.0,10.36,10.54,9.621,9.98,292200.0,9.22,9.54,9.05,9.44,165900.0,10.06,10.64,9.901,10.34,261000.0,8.77,9.24,8.751,9.23,206100.0,,,,,,8.62,9.12,8.62,8.75,281500.0,,,,,
5591940,2018-05-18,BBG009J96R40,ZYNE,9.88,10.47,9.87,10.28,285600.0,9.47,9.97,9.35,9.81,247900.0,10.36,10.54,9.621,9.98,292200.0,9.47,9.53,9.07,9.4,154600.0,10.06,10.64,9.901,10.34,261000.0,9.37,9.73,9.36,9.47,170500.0,,,,,,9.22,9.54,9.05,9.44,165900.0,,,,,,8.77,9.24,8.751,9.23,206100.0,,,,,
5591941,2018-05-21,BBG009J96R40,ZYNE,10.36,10.54,9.621,9.98,292200.0,9.88,10.47,9.87,10.28,285600.0,10.06,10.64,9.901,10.34,261000.0,9.47,9.97,9.35,9.81,247900.0,,,,,,9.47,9.53,9.07,9.4,154600.0,,,,,,9.37,9.73,9.36,9.47,170500.0,,,,,,9.22,9.54,9.05,9.44,165900.0,,,,,
5591942,2018-05-22,BBG009J96R40,ZYNE,10.06,10.64,9.901,10.34,261000.0,10.36,10.54,9.621,9.98,292200.0,,,,,,9.88,10.47,9.87,10.28,285600.0,,,,,,9.47,9.97,9.35,9.81,247900.0,,,,,,9.47,9.53,9.07,9.4,154600.0,,,,,,9.37,9.73,9.36,9.47,170500.0,,,,,


In [49]:
# Write the transformed frame to CSV without the row index.
left_ticker_price.to_csv(
    path_or_buf='us_equity_daily_price_201308_201908_transformed.csv',
    index=False,
)