# Set Up

## Import

In [1]:
import pandas as pd
import numpy as np

## Read Sales Data

In [2]:
# Load the three p_sales CSV extracts, parsing the `revision` column as datetimes.
sales_2013, sales_2016, sales_2017 = [
    pd.read_csv(csv_name, parse_dates=['revision'])
    for csv_name in ('p_sales pre 2013.csv', 'p_sales 2013-2016.csv', 'p_sales post 2017.csv')
]

In [3]:
# Preview the first rows of the 'p_sales pre 2013.csv' extract.
sales_2013.head()

Unnamed: 0,equity_id,identifier,metric,revision,next_twelve_months
0,526,XEL,p_sales,2012-12-31,1.17564
1,528,XLNX,p_sales,2012-12-31,4.19856
2,235,XOM,p_sales,2012-12-31,0.877024
3,1360,XPO,p_sales,2012-12-31,0.504743
4,198,XRAY,p_sales,2012-12-31,1.87242


In [4]:
# Preview the first rows of the 'p_sales 2013-2016.csv' extract.
sales_2016.head()

Unnamed: 0,equity_id,identifier,metric,revision,next_twelve_months
0,1360,XPO,p_sales,2016-12-30,0.3224
1,198,XRAY,p_sales,2016-12-30,3.2407
2,527,XRX,p_sales,2016-12-30,0.516028
3,530,XYL,p_sales,2016-12-30,1.95314
4,595,Y,p_sales,2016-12-30,1.5989


In [5]:
# Preview the first rows of the 'p_sales post 2017.csv' extract.
sales_2017.head()

Unnamed: 0,equity_id,identifier,metric,revision,next_twelve_months
0,534,ZTS,p_sales,2019-07-26,8.77503
1,3314,ZUMZ,p_sales,2019-07-26,0.658078
2,3296,XENT,p_sales,2019-07-26,5.27974
3,3297,XHR,p_sales,2019-07-26,2.1332
4,528,XLNX,p_sales,2019-07-26,8.65486


In [6]:
# Stack the three sales extracts into a single frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; the result is the same on older versions.
sales = pd.concat([sales_2013, sales_2016, sales_2017], ignore_index=True)
# Drop the `metric` column (the previews of each extract show only 'p_sales' in it).
sales = sales.drop(columns="metric")

In [7]:
# Sanity check: the combined frame must hold exactly the rows of the three extracts.
expected_sales_rows = sum(len(part) for part in (sales_2013, sales_2016, sales_2017))
len(sales) == expected_sales_rows

True

In [8]:
# Order rows by ticker, then by revision date, and renumber the index from 0.
sales = (
    sales
    .sort_values(by=['identifier', 'revision'])
    .reset_index(drop=True)
)

In [9]:
# Preview the combined sales frame after sorting by identifier and revision.
sales.head()

Unnamed: 0,equity_id,identifier,revision,next_twelve_months
0,22,700,2005-08-29,8.23332
1,22,700,2005-09-12,9.14781
2,22,700,2007-02-26,14.2869
3,22,700,2007-03-05,13.397
4,22,700,2007-03-12,15.5226


In [10]:
# Number of distinct identifiers in the sales data (a missing identifier, if any, counts once).
sales["identifier"].nunique(dropna=False)

2938

## Read Earnings Data

In [11]:
# Load the three p_non_gaap_eps CSV extracts, parsing the `revision` column as datetimes.
eps_2013, eps_2016, eps_2017 = [
    pd.read_csv(csv_name, parse_dates=['revision'])
    for csv_name in (
        'p_non_gaap_eps pre 2013.csv',
        'p_non_gaap_eps 2013-2016.csv',
        'p_non_gaap_eps post 2017.csv',
    )
]

In [12]:
# Preview the first rows of the 'p_non_gaap_eps pre 2013.csv' extract.
eps_2013.head()

Unnamed: 0,equity_id,identifier,metric,revision,next_twelve_months
0,1014,WTW,p_non_gaap_eps,2012-12-31,11.6353
1,517,WU,p_non_gaap_eps,2012-12-31,9.24896
2,1145,WWD,p_non_gaap_eps,2012-12-31,17.0223
3,1485,WWW,p_non_gaap_eps,2012-12-31,15.5751
4,519,WY,p_non_gaap_eps,2012-12-31,33.1777


In [13]:
# Preview the first rows of the 'p_non_gaap_eps 2013-2016.csv' extract.
eps_2016.head()

Unnamed: 0,equity_id,identifier,metric,revision,next_twelve_months
0,198,XRAY,p_non_gaap_eps,2016-12-30,19.6903
1,527,XRX,p_non_gaap_eps,2016-12-30,5.12998
2,530,XYL,p_non_gaap_eps,2016-12-30,21.7169
3,595,Y,p_non_gaap_eps,2016-12-30,21.5913
4,3306,YRCW,p_non_gaap_eps,2016-12-30,12.8005


In [14]:
# Preview the first rows of the 'p_non_gaap_eps post 2017.csv' extract.
eps_2017.head()

Unnamed: 0,equity_id,identifier,metric,revision,next_twelve_months
0,3307,ZAGG,p_non_gaap_eps,2019-07-28,3.82712
1,1427,ZAYO,p_non_gaap_eps,2019-07-28,42.976
2,532,ZBH,p_non_gaap_eps,2019-07-28,47.695
3,1000,ZBRA,p_non_gaap_eps,2019-07-28,14.9059
4,2485,ZEN,p_non_gaap_eps,2019-07-28,281.461


In [15]:
# Stack the three eps extracts into a single frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; the result is the same on older versions.
eps = pd.concat([eps_2013, eps_2016, eps_2017], ignore_index=True)
# Drop the `metric` column (the previews of each extract show only 'p_non_gaap_eps' in it).
eps = eps.drop(columns="metric")

In [16]:
# Sanity check: the combined frame must hold exactly the rows of the three extracts.
expected_eps_rows = sum(len(part) for part in (eps_2013, eps_2016, eps_2017))
len(eps) == expected_eps_rows

True

In [17]:
# Order rows by ticker, then by revision date, and renumber the index from 0.
eps = (
    eps
    .sort_values(by=['identifier', 'revision'])
    .reset_index(drop=True)
)

In [18]:
# Number of distinct identifiers in the eps data (a missing identifier, if any, counts once).
eps["identifier"].nunique(dropna=False)

2725

# Data Quality Check

## Check `equity_id` and `identifier` Relationship

In [19]:
# List every equity_id in the sales data that maps to a number of identifiers other than one.
# The column is selected before aggregating so nunique() runs on `identifier` only,
# instead of on every column of the frame; the result is the same.
sales.groupby('equity_id')['identifier'].nunique().reset_index().query("identifier != 1")

Unnamed: 0,equity_id,identifier


In [20]:
# List every identifier in the sales data that maps to a number of equity_ids other than one.
# The column is selected before aggregating so nunique() runs on `equity_id` only,
# instead of on every column of the frame; the result is the same.
sales.groupby('identifier')['equity_id'].nunique().reset_index().query("equity_id != 1")

Unnamed: 0,identifier,equity_id


In [21]:
# List every equity_id in the eps data that maps to a number of identifiers other than one.
# The column is selected before aggregating so nunique() runs on `identifier` only,
# instead of on every column of the frame; the result is the same.
eps.groupby('equity_id')['identifier'].nunique().reset_index().query("identifier != 1")

Unnamed: 0,equity_id,identifier


In [22]:
# List every identifier in the eps data that maps to a number of equity_ids other than one.
# The column is selected before aggregating so nunique() runs on `equity_id` only,
# instead of on every column of the frame; the result is the same.
eps.groupby('identifier')['equity_id'].nunique().reset_index().query("equity_id != 1")

Unnamed: 0,identifier,equity_id


## Check Missing Data
- Based on the previous checks, `equity_id` and `identifier` have a one-to-one relationship.
- Either `equity_id` or `identifier` could be used as the merge key.

In [23]:
# Left-join eps onto sales by (identifier, equity_id, revision): every sales row is kept,
# and the eps value is NaN where no matching eps row exists. The shared value column
# `next_twelve_months` gets the `_sales` / `_eps` suffixes.
all_data = (
    sales
    .merge(eps, how='left', on=['identifier', 'equity_id', 'revision'], suffixes=["_sales", "_eps"])
    .loc[:, ["equity_id", "identifier", "revision", "next_twelve_months_sales", "next_twelve_months_eps"]]
)

In [24]:
# Preview the left-merged sales/eps frame.
all_data.head()

Unnamed: 0,equity_id,identifier,revision,next_twelve_months_sales,next_twelve_months_eps
0,22,700,2005-08-29,8.23332,
1,22,700,2005-09-12,9.14781,
2,22,700,2007-02-26,14.2869,
3,22,700,2007-03-05,13.397,
4,22,700,2007-03-12,15.5226,


In [25]:
# Count the rows where both the sales value and the eps value are missing.
both_missing = (
    all_data["next_twelve_months_sales"].isnull()
    & all_data["next_twelve_months_eps"].isnull()
)
len(all_data[both_missing])

201

In [27]:
# Count the rows where the eps value is missing but the sales value is present.
eps_only_missing = (
    all_data["next_twelve_months_eps"].isnull()
    & all_data["next_twelve_months_sales"].notnull()
)
len(all_data[eps_only_missing])

757995

In [29]:
# Count the rows where the sales value is missing but the eps value is present.
sales_only_missing = (
    all_data["next_twelve_months_eps"].notnull()
    & all_data["next_twelve_months_sales"].isnull()
)
len(all_data[sales_only_missing])

47

In [31]:
# Distinct tickers present in each dataset.
eps_ticker = eps["identifier"].unique().tolist()
sales_ticker = sales["identifier"].unique().tolist()

# Tickers that appear in the sales data but never in the eps data.
eps_missing_ticker = set(sales_ticker).difference(eps_ticker)
len(eps_missing_ticker)

233

In [32]:
# Tickers that appear in the eps data but never in the sales data.
sales_missing_ticker = set(eps_ticker).difference(sales_ticker)
len(sales_missing_ticker)

20

## Remove the Missing Value and Conduct Further Check

In [33]:
# Inner-join sales and eps so only (equity_id, identifier, revision) keys present in
# both datasets survive, then expose the revision timestamp under the name `date`.
useful_data = (
    sales
    .merge(eps, how='inner', on=['equity_id', 'identifier', 'revision'], suffixes=["_sales", "_eps"])
    .rename(columns={'revision': 'date'})
)
# Keep only records dated strictly before 2019-05-10.
useful_data = useful_data[useful_data["date"] < '2019-05-10']

In [34]:
# Per-column count of missing values in the merged frame.
useful_data.isna().sum()

equity_id                   0
identifier                  0
date                        0
next_twelve_months_sales    0
next_twelve_months_eps      0
dtype: int64

In [35]:
# Show any rows where either the eps value or the sales value is missing.
value_is_missing = useful_data[["next_twelve_months_eps", "next_twelve_months_sales"]].isnull().any(axis=1)
useful_data[value_is_missing]

Unnamed: 0,equity_id,identifier,date,next_twelve_months_sales,next_twelve_months_eps


## Check Date Gap

In [53]:
# Compute, for every record, the number of days until the same ticker's next record.
useful_data = useful_data.sort_values(["equity_id", "identifier", "date"]).reset_index(drop=True)
# Within each ticker, look one row ahead; the last record of a ticker gets NaT.
useful_data['next_date'] = useful_data.groupby('identifier')['date'].shift(-1)
# Vectorized datetime subtraction replaces the row-wise apply(): same float result
# (NaN where next_date is NaT), far faster on a large frame, and it also works
# when the frame is empty.
useful_data['date_gap'] = (useful_data['next_date'] - useful_data['date']).dt.days
# Show the ten largest gaps, ignoring rows that have no following record.
useful_data[useful_data['date_gap'].notnull()].sort_values('date_gap', ascending=False).head(10)

Unnamed: 0,equity_id,identifier,date,next_twelve_months_sales,next_twelve_months_eps,next_date,date_gap
752655,2118,EGLE,2011-03-28,0.865268,29.1556,2018-11-09,2783.0
236320,438,CRM,2011-09-26,6.58768,197.041,2018-11-30,2622.0
810602,2358,HMSY,2012-02-13,6.44298,40.5882,2019-02-22,2566.0
758667,2147,ERII,2010-11-01,3.31099,110.526,2017-03-10,2321.0
430543,966,TDS,2012-04-30,0.462974,18.6846,2018-08-03,2286.0
290770,547,MDCA,2010-02-22,0.450094,121.621,2016-05-06,2265.0
524958,1262,EBIX,2013-03-11,2.61219,8.55801,2019-05-09,2250.0
825927,2421,INFN,2008-06-09,2.94958,321.25,2014-07-28,2240.0
423025,945,SNPS,2010-11-29,2.69865,20.1562,2016-10-21,2153.0
464903,1078,PTC,2010-04-12,2.23624,23.8405,2015-11-13,2041.0


In [54]:
# Count the records whose gap to the ticker's next record exceeds 7 days.
has_large_gap = useful_data["date_gap"] > 7
len(useful_data[has_large_gap])

27304

## Remove Tickers with Big Date Gap
- Remove every ticker that has at least one gap of more than 7 days between consecutive records.

In [48]:
# Collect the tickers that have at least one record with a date gap above 7 days,
# then report how many such tickers there are.
remove_tickers = useful_data.loc[useful_data["date_gap"] > 7, "identifier"].unique()
len(remove_tickers)

1811

In [44]:
# Keep only the records whose ticker is not in `remove_tickers`.
ticker_is_removed = useful_data["identifier"].isin(remove_tickers)
final_data = useful_data[~ticker_is_removed]

In [45]:
# Number of records remaining after the gap-based ticker removal.
final_data.shape[0]

63950

In [52]:
# Print the distinct tickers that remain in the final dataset.
print(final_data["identifier"].unique())

['AAC' 'AAL' 'AAON' 'ABTX' 'ACBI' 'ACIA' 'ACLS' 'AFI' 'AG' 'AGM' 'AGX'
 'AI' 'AJRD' 'AJX' 'ALG' 'ALLE' 'ALLY' 'ALRM' 'ALTR' 'AMPH' 'AMSWA' 'ANDE'
 'ANET' 'ANIK' 'ANIP' 'APPF' 'ARA' 'ARCH' 'ARE' 'ARI' 'ARMK' 'ARNC' 'AROW'
 'ASIX' 'ASYS' 'ATKR' 'AXTA' 'AYX' 'BABA' 'BBBY' 'BBX' 'BCBP' 'BCPC'
 'BFIN' 'BKE' 'BKI' 'BL' 'BLBD' 'BLD' 'BNED' 'BOOT' 'BREW' 'BRX' 'BTE:CT'
 'BTU' 'BWEN' 'BWFG' 'CABO' 'CAC' 'CADE' 'CALX' 'CARB' 'CARO' 'CASH' 'CBL'
 'CBPX' 'CC' 'CCS' 'CDEV' 'CDK' 'CFG' 'CHGG' 'CHMG' 'CHMI' 'CHTR' 'CJ'
 'CKH' 'CLFD' 'CMD' 'COMM' 'CONE' 'COR' 'COUP' 'COWN' 'CPA' 'CPS' 'CRCM'
 'CSII' 'CSTR' 'CSWC' 'CSWI' 'CTLT' 'CTS' 'CUBI' 'CVCY' 'CWH' 'CWST'
 'CZNC' 'CZR' 'DFIN' 'DLR' 'DLTH' 'DNOW' 'DOOR' 'DPLO' 'DSKE' 'DXCM'
 'EARN' 'EBSB' 'ECOM' 'EEX' 'EGRX' 'ELF' 'ELVT' 'EMCI' 'ENFC' 'ENPH' 'ENR'
 'ENTA' 'ENVA' 'EQBK' 'EQR' 'ESCA' 'ESTE' 'ESXB' 'ETSY' 'EVA' 'FAST' 'FBK'
 'FBM' 'FBMS' 'FDC' 'FEYE' 'FFWM' 'FHB' 'FIVN' 'FIZZ' 'FLOW' 'FMBH' 'FMNB'
 'FND' 'FRAC' 'FRBK' 'FRT' 'FSB' 'FTAI' 'FTI' 'FTV' 'F