# Set Up

## Import

In [1]:
import pandas as pd
import numpy as np

## Read Sales Data

In [2]:
# Load the three price-to-sales extracts. Each file is read with the
# `revision` column parsed as a datetime.
sales_2013, sales_2016, sales_2017 = (
    pd.read_csv(csv_name, parse_dates=['revision'])
    for csv_name in (
        'p_sales pre 2013.csv',
        'p_sales 2013-2016.csv',
        'p_sales post 2017.csv',
    )
)

In [3]:
# Preview the first five rows of the pre-2013 sales extract.
sales_2013.head(n=5)

Unnamed: 0,equity_id,identifier,metric,revision,next_twelve_months
0,526,XEL,p_sales,2012-12-31,1.17564
1,528,XLNX,p_sales,2012-12-31,4.19856
2,235,XOM,p_sales,2012-12-31,0.877024
3,1360,XPO,p_sales,2012-12-31,0.504743
4,198,XRAY,p_sales,2012-12-31,1.87242


In [4]:
# Preview the first five rows of the 2013-2016 sales extract.
sales_2016.head(n=5)

Unnamed: 0,equity_id,identifier,metric,revision,next_twelve_months
0,1360,XPO,p_sales,2016-12-30,0.3224
1,198,XRAY,p_sales,2016-12-30,3.2407
2,527,XRX,p_sales,2016-12-30,0.516028
3,530,XYL,p_sales,2016-12-30,1.95314
4,595,Y,p_sales,2016-12-30,1.5989


In [5]:
# Preview the first five rows of the post-2017 sales extract.
sales_2017.head(n=5)

Unnamed: 0,equity_id,identifier,metric,revision,next_twelve_months
0,534,ZTS,p_sales,2019-07-26,8.77503
1,3314,ZUMZ,p_sales,2019-07-26,0.658078
2,3296,XENT,p_sales,2019-07-26,5.27974
3,3297,XHR,p_sales,2019-07-26,2.1332
4,528,XLNX,p_sales,2019-07-26,8.65486


In [6]:
# Stack the three sales extracts into a single frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
# The `metric` column is dropped from the combined frame (every previewed row
# above shows 'p_sales' in it); drop() returns a new frame instead of mutating.
sales = pd.concat(
    [sales_2013, sales_2016, sales_2017],
    ignore_index=True,
).drop(columns='metric')

In [7]:
# Sanity check: the combined frame must hold exactly the rows of its three parts.
sum(len(part) for part in (sales_2013, sales_2016, sales_2017)) == len(sales)

True

In [8]:
# Order rows by ticker and then by revision date, and renumber the index.
# The per-identifier shift(-1) used for the date-gap check relies on this order.
sales = (
    sales
    .sort_values(by=['identifier', 'revision'])
    .reset_index(drop=True)
)

In [9]:
# Preview the first five rows of the combined, sorted sales frame.
sales.head(n=5)

Unnamed: 0,equity_id,identifier,revision,next_twelve_months
0,22,700,2005-08-29,8.23332
1,22,700,2005-09-12,9.14781
2,22,700,2007-02-26,14.2869
3,22,700,2007-03-05,13.397
4,22,700,2007-03-12,15.5226


In [10]:
# Number of distinct identifiers present in the sales data.
sales['identifier'].unique().size

2938

## Read Earnings Data

In [11]:
# Load the two non-GAAP P/E extracts. Each file is read with the
# `revision` column parsed as a datetime.
eps_2013, eps_2016 = (
    pd.read_csv(csv_name, parse_dates=['revision'])
    for csv_name in (
        'p_non_gaap_eps pre 2013.csv',
        'p_non_gaap_eps 2013-2016.csv',
    )
)

In [12]:
# Preview the first five rows of the pre-2013 eps extract.
eps_2013.head(n=5)

Unnamed: 0,equity_id,identifier,metric,revision,next_twelve_months
0,1014,WTW,p_non_gaap_eps,2012-12-31,11.6353
1,517,WU,p_non_gaap_eps,2012-12-31,9.24896
2,1145,WWD,p_non_gaap_eps,2012-12-31,17.0223
3,1485,WWW,p_non_gaap_eps,2012-12-31,15.5751
4,519,WY,p_non_gaap_eps,2012-12-31,33.1777


In [13]:
# Preview the first five rows of the 2013-2016 eps extract.
eps_2016.head(n=5)

Unnamed: 0,equity_id,identifier,metric,revision,next_twelve_months
0,198,XRAY,p_non_gaap_eps,2016-12-30,19.6903
1,527,XRX,p_non_gaap_eps,2016-12-30,5.12998
2,530,XYL,p_non_gaap_eps,2016-12-30,21.7169
3,595,Y,p_non_gaap_eps,2016-12-30,21.5913
4,3306,YRCW,p_non_gaap_eps,2016-12-30,12.8005


In [14]:
# Stack the two eps extracts into a single frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
# The `metric` column is dropped from the combined frame (every previewed row
# above shows 'p_non_gaap_eps' in it); drop() returns a new frame instead of
# mutating.
eps = pd.concat(
    [eps_2013, eps_2016],
    ignore_index=True,
).drop(columns='metric')

In [15]:
# Sanity check: the combined frame must hold exactly the rows of its two parts.
sum(len(part) for part in (eps_2013, eps_2016)) == len(eps)

True

In [16]:
# Order rows by ticker and then by revision date, and renumber the index.
# The per-identifier shift(-1) used for the date-gap check relies on this order.
eps = (
    eps
    .sort_values(by=['identifier', 'revision'])
    .reset_index(drop=True)
)

In [17]:
# Show the five rows with the latest revision dates.
# In the recorded output the newest eps revision is 2016-12-30, i.e. the eps
# files contain nothing after that date.
eps.sort_values(by='revision').tail(n=5)

Unnamed: 0,equity_id,identifier,revision,next_twelve_months
257463,263,GM,2016-12-30,6.1105
257069,49,GLW,2016-12-30,14.7193
256578,1579,GLT,2016-12-30,15.4564
261099,1402,GPI,2016-12-30,10.4044
608076,3314,ZUMZ,2016-12-30,20.4769


In [18]:
# Number of distinct identifiers present in the eps data.
eps['identifier'].unique().size

1845

# Data Quality Check

## Check `equity_id` and `identifier` Relationship

In [19]:
# List every equity_id in sales that maps to a number of distinct identifiers
# other than one; an empty result means each equity_id has exactly one ticker.
# The column is selected before nunique() so distinct counts are computed for
# `identifier` only rather than for every column of the frame.
(
    sales
    .groupby('equity_id')['identifier']
    .nunique()
    .reset_index()
    .query("identifier != 1")
)

Unnamed: 0,equity_id,identifier


In [20]:
# List every identifier in sales that maps to a number of distinct equity_ids
# other than one; an empty result means each ticker has exactly one equity_id.
# The column is selected before nunique() so distinct counts are computed for
# `equity_id` only rather than for every column of the frame.
(
    sales
    .groupby('identifier')['equity_id']
    .nunique()
    .reset_index()
    .query("equity_id != 1")
)

Unnamed: 0,identifier,equity_id


In [21]:
# List every equity_id in eps that maps to a number of distinct identifiers
# other than one; an empty result means each equity_id has exactly one ticker.
# The column is selected before nunique() so distinct counts are computed for
# `identifier` only rather than for every column of the frame.
(
    eps
    .groupby('equity_id')['identifier']
    .nunique()
    .reset_index()
    .query("identifier != 1")
)

Unnamed: 0,equity_id,identifier


In [22]:
# List every identifier in eps that maps to a number of distinct equity_ids
# other than one; an empty result means each ticker has exactly one equity_id.
# The column is selected before nunique() so distinct counts are computed for
# `equity_id` only rather than for every column of the frame.
(
    eps
    .groupby('identifier')['equity_id']
    .nunique()
    .reset_index()
    .query("equity_id != 1")
)

Unnamed: 0,identifier,equity_id


## Check Missing Data
- Based on the previous step, `equity_id` and `identifier` have a one-to-one relationship.
- Either `equity_id` or `identifier` could be used as the merge key.

In [23]:
# Preview the first five rows of the combined, sorted eps frame.
eps.head(n=5)

Unnamed: 0,equity_id,identifier,revision,next_twelve_months
0,39,A,2009-02-23,12.799
1,39,A,2009-03-02,11.3649
2,39,A,2009-03-09,11.1498
3,39,A,2009-03-16,12.3329
4,39,A,2009-03-23,14.242


In [24]:
# Preview the first five rows of the combined, sorted sales frame.
sales.head(n=5)

Unnamed: 0,equity_id,identifier,revision,next_twelve_months
0,22,700,2005-08-29,8.23332
1,22,700,2005-09-12,9.14781
2,22,700,2007-02-26,14.2869
3,22,700,2007-03-05,13.397
4,22,700,2007-03-12,15.5226


In [25]:
# Attach the eps value to every sales row that shares the same
# (identifier, equity_id, revision). Because this is a left merge, all sales
# rows are kept and eps rows without a matching sales row are dropped.
merge_keys = ['identifier', 'equity_id', 'revision']
all_data = sales.merge(
    eps,
    how='left',
    on=merge_keys,
    suffixes=["_sales", "_eps"],
)

# Keep only the keys and the two value columns, in a fixed order.
all_data = all_data.loc[:, [
    "equity_id",
    "identifier",
    "revision",
    "next_twelve_months_sales",
    "next_twelve_months_eps",
]]

In [26]:
# Preview the first five rows of the left-merged frame.
all_data.head(n=5)

Unnamed: 0,equity_id,identifier,revision,next_twelve_months_sales,next_twelve_months_eps
0,22,700,2005-08-29,8.23332,
1,22,700,2005-09-12,9.14781,
2,22,700,2007-02-26,14.2869,
3,22,700,2007-03-05,13.397,
4,22,700,2007-03-12,15.5226,


In [27]:
# Count the rows in which both the sales value and the eps value are missing.
sales_absent = all_data['next_twelve_months_sales'].isnull()
eps_absent = all_data['next_twelve_months_eps'].isnull()
len(all_data[sales_absent & eps_absent])

248

In [28]:
# Preview rows in which both the sales value and the eps value are missing.
sales_absent = all_data['next_twelve_months_sales'].isnull()
eps_absent = all_data['next_twelve_months_eps'].isnull()
all_data[sales_absent & eps_absent].head(n=5)

Unnamed: 0,equity_id,identifier,revision,next_twelve_months_sales,next_twelve_months_eps
29155,1666,ACTG,2019-05-12,,
29156,1666,ACTG,2019-05-17,,
56011,1685,AHC,2019-05-10,,
56012,1685,AHC,2019-05-17,,
80602,1709,ALSK,2019-05-10,,


In [30]:
# Count the rows that have a sales value but no eps value
# (eps is missing while sales is present).
eps_absent = all_data['next_twelve_months_eps'].isnull()
sales_present = all_data['next_twelve_months_sales'].notnull()
len(all_data[eps_absent & sales_present])

1181904

In [31]:
# Preview rows that have a sales value but no eps value.
sales_present = all_data['next_twelve_months_sales'].notnull()
eps_absent = all_data['next_twelve_months_eps'].isnull()
all_data[sales_present & eps_absent].head(n=5)

Unnamed: 0,equity_id,identifier,revision,next_twelve_months_sales,next_twelve_months_eps
0,22,700,2005-08-29,8.23332,
1,22,700,2005-09-12,9.14781,
2,22,700,2007-02-26,14.2869,
3,22,700,2007-03-05,13.397,
4,22,700,2007-03-12,15.5226,


In [31]:
# Count the rows that have an eps value but a missing sales value.
# NOTE: all_data comes from a left merge on the sales rows, so eps rows with no
# matching sales row are not in all_data and cannot be counted by this check.
eps_present = all_data['next_twelve_months_eps'].notnull()
sales_absent = all_data['next_twelve_months_sales'].isnull()
len(all_data[eps_present & sales_absent])

0

In [34]:
# Collect the rows that have a sales value but no eps value, then list their
# keys ordered by revision date and equity_id (first 15 rows).
missing_data = all_data[
    all_data.next_twelve_months_sales.notnull()
    & all_data.next_twelve_months_eps.isnull()
]
(
    missing_data[["equity_id", 'identifier', 'revision']]
    .sort_values(['revision', "equity_id"])
    .reset_index(drop=True)
    .head(15)
)

Unnamed: 0,equity_id,identifier,revision
0,1,AMZN,2004-05-17
1,9,NFLX,2004-05-17
2,28,ABT,2004-05-17
3,30,ACN,2004-05-17
4,31,ATVI,2004-05-17
5,32,AYI,2004-05-17
6,33,ADBE,2004-05-17
7,34,AAP,2004-05-17
8,40,AAPL,2004-05-17
9,41,AMAT,2004-05-17


In [33]:
# Distinct tickers present in each data set.
sales_ticker = list(sales.identifier.unique())
eps_ticker = list(eps.identifier.unique())

# Tickers that appear in the sales data but never in the eps data.
eps_missing_ticker = set(sales_ticker).difference(eps_ticker)
len(eps_missing_ticker)

1094

In [34]:
# Tickers that appear in the eps data but never in the sales data.
sales_missing_ticker = set(eps_ticker).difference(sales_ticker)
len(sales_missing_ticker)

1

## Remove the Missing Value and Conduct Further Check

In [35]:
# Keep only the (equity_id, identifier, revision) rows present in both sales
# and eps, and expose the revision timestamp under the name `date`.
useful_data = (
    sales
    .merge(
        eps,
        how='inner',
        on=['equity_id', 'identifier', 'revision'],
        suffixes=["_sales", "_eps"],
    )
    .rename(columns={'revision': 'date'})
)

In [36]:
# Preview the first five rows of the inner-merged frame.
useful_data.head(n=5)

Unnamed: 0,equity_id,identifier,date,next_twelve_months_sales,next_twelve_months_eps
0,39,A,2009-02-23,1.08558,12.799
1,39,A,2009-03-02,0.963946,11.3649
2,39,A,2009-03-09,0.9457,11.1498
3,39,A,2009-03-16,1.02602,12.3329
4,39,A,2009-03-23,1.18484,14.242


In [40]:
# Count the missing values in each column of the inner-merged frame
# (the recorded output shows zero for every column).
useful_data.isna().sum()

equity_id                   0
identifier                  0
date                        0
next_twelve_months_sales    0
next_twelve_months_eps      0
dtype: int64

In [41]:
# List every (identifier, date) pair whose number of distinct
# next_twelve_months_sales values is not exactly one; an empty result means no
# pair carries conflicting sales values.
# The column is selected before nunique() so distinct counts are computed for
# that column only rather than for every column of the frame.
(
    useful_data
    .groupby(['identifier', 'date'])['next_twelve_months_sales']
    .nunique()
    .reset_index()
    .query("next_twelve_months_sales != 1")
)

Unnamed: 0,identifier,date,next_twelve_months_sales


In [42]:
# List every (identifier, date) pair whose number of distinct
# next_twelve_months_eps values is not exactly one; an empty result means no
# pair carries conflicting eps values.
# The column is selected before nunique() so distinct counts are computed for
# that column only rather than for every column of the frame.
(
    useful_data
    .groupby(['identifier', 'date'])['next_twelve_months_eps']
    .nunique()
    .reset_index()
    .query("next_twelve_months_eps != 1")
)

Unnamed: 0,identifier,date,next_twelve_months_eps


In [43]:
# Check for fully duplicated rows: sort by ticker and date, drop exact
# duplicates, and compare the row count with the original frame.
# True means nothing was dropped, i.e. there are no duplicated rows.
useful = (
    useful_data
    .sort_values(['identifier', 'date'])
    .drop_duplicates()
)
len(useful) == len(useful_data)

True

## Check Date Gap

In [44]:
# Measure, for every sales row, the number of days until the same identifier's
# next revision. This relies on `sales` already being sorted by identifier and
# revision, so shift(-1) within each group yields the following revision date.
sales['next_date'] = sales.groupby('identifier')['revision'].shift(-1)

# Vectorised timedelta arithmetic replaces the row-wise apply(); the last row
# of each identifier has no next revision, so its gap is NaN.
sales['date_gap'] = (sales['next_date'] - sales['revision']).dt.days

# Show the ten largest gaps.
sales[sales['date_gap'].notnull()].sort_values('date_gap', ascending=False).head(10)

Unnamed: 0,equity_id,identifier,revision,next_twelve_months,next_date,date_gap
926021,2462,JBSS,2006-08-14,0.219472,2018-05-04,4281.0
988967,1387,LEE,2008-11-10,0.064286,2019-05-10,3833.0
789819,2332,HBP,2008-11-10,0.033976,2019-05-10,3833.0
191241,1808,BBGI,2008-11-24,0.185117,2019-04-29,3808.0
193646,1812,BBX,2008-11-10,0.034708,2018-08-10,3560.0
533792,2093,DVD,2010-04-26,1.09744,2019-05-10,3301.0
974832,329,L,2010-04-26,1.16583,2019-05-10,3301.0
476556,2048,CYBE,2009-02-09,1.46959,2017-04-28,3000.0
717590,2268,GDEN,2009-03-02,1.36532,2017-03-17,2937.0
321147,1917,CDZI,2011-05-02,169.138,2019-05-10,2930.0


In [46]:
# Number of sales rows whose gap to the next revision exceeds 7 days.
sales.query("date_gap > 7").shape[0]

62559

In [48]:
# Number of distinct tickers that have at least one sales gap above 7 days.
sales.loc[sales['date_gap'] > 7, 'identifier'].nunique()

2561

In [45]:
# Measure, for every eps row, the number of days until the same identifier's
# next revision. This relies on `eps` already being sorted by identifier and
# revision, so shift(-1) within each group yields the following revision date.
eps['next_date'] = eps.groupby('identifier')['revision'].shift(-1)

# Vectorised timedelta arithmetic replaces the row-wise apply(); the last row
# of each identifier has no next revision, so its gap is NaN.
eps['date_gap'] = (eps['next_date'] - eps['revision']).dt.days

# Show the ten largest gaps.
eps[eps['date_gap'].notnull()].sort_values('date_gap', ascending=False).head(10)

Unnamed: 0,equity_id,identifier,revision,next_twelve_months,next_date,date_gap
371284,547,MDCA,2010-02-22,121.621,2016-05-06,2265.0
309100,2421,INFN,2008-06-09,321.25,2014-07-28,2240.0
509636,945,SNPS,2010-11-29,20.1562,2016-10-21,2153.0
465137,1078,PTC,2010-04-12,23.8405,2015-11-13,2041.0
177786,718,DLB,2010-10-25,25.032,2015-10-23,1824.0
78432,1840,BKS,2010-06-28,28.7922,2015-06-26,1824.0
197407,2128,ELY,2009-06-01,28.4308,2014-04-28,1792.0
501555,21,SINA,2012-05-14,9065.78,2016-11-21,1652.0
110818,1919,CECO,2012-06-11,3734.75,2016-11-04,1607.0
443243,14,PCLN,2012-04-02,26.8145,2016-08-05,1586.0


In [47]:
# Number of eps rows whose gap to the next revision exceeds 7 days.
eps.query("date_gap > 7").shape[0]

26605

In [49]:
# Number of distinct tickers that have at least one eps gap above 7 days.
eps.loc[eps['date_gap'] > 7, 'identifier'].nunique()

1714