# Set Up

## Import

In [1]:
import pandas as pd

## EPS Data

### Read In Data 

In [2]:
# Load the seven EPS CSV parts. Each file is read with 'date' parsed as a
# datetime, ordered by date and given a fresh 0..n-1 index.
eps1, eps2, eps3, eps4, eps5, eps6, eps7 = [
    pd.read_csv(csv_name, parse_dates=['date']).sort_values("date").reset_index(drop=True)
    for csv_name in (
        'EPS1.csv', 'EPS2.csv', 'EPS3.csv', 'EPS4.csv',
        'EPS5.csv', 'EPS6.csv', 'EPS7.csv',
    )
]

### Merge All Data

In [3]:
# Keep the individual parts in a list for the column-count check further down
eps_dfs = [eps1, eps2, eps3, eps4, eps5, eps6, eps7]

# Join all parts side by side on 'date' (merge's default inner join), then
# order the combined frame by date.
eps = (
    eps1
    .merge(eps2, on="date")
    .merge(eps3, on="date")
    .merge(eps4, on="date")
    .merge(eps5, on="date")
    .merge(eps6, on="date")
    .merge(eps7, on="date")
    .sort_values("date")
    .reset_index(drop=True)
)

In [4]:
# Sanity check: the merged frame should hold exactly as many non-date columns
# as the individual parts combined (the "- 1" discounts each 'date' column).
eps.shape[1] - 1 == sum(part.shape[1] - 1 for part in eps_dfs)

True

In [5]:
# Number of ticker columns in the merged EPS frame (all columns except 'date')
eps.shape[1] - 1

2545

In [6]:
# Remove every column that is null on all dates, i.e. tickers without any
# EPS value. dropna returns a new frame, so `eps` itself is left untouched.
left_eps = eps.dropna(axis='columns', how='all')

### Unstack EPS Data

In [7]:
# Reshape wide (one column per ticker) to long (one row per ticker/date).
# After unstack + reset_index the former column labels land in "level_0" and
# the values in column 0; both get descriptive names before sorting.
trans_eps = (
    left_eps
    .set_index("date")
    .unstack()
    .reset_index()
    .rename(columns={"level_0": "ticker", 0: "next_twelve_months_eps"})
    .sort_values(["ticker", "date"])
    .reset_index(drop=True)
)

### Check Data Quality

In [8]:
# Check the date gap: for every ticker, measure the number of days between
# each record and that ticker's next record.
# sort_values returns a new frame, so trans_eps is not modified.
eps_test = trans_eps.sort_values('date').reset_index(drop=True)
# Rows are in date order, so shift(-1) within each ticker yields the date of
# that ticker's following record (NaT for the ticker's last row).
eps_test['next_date'] = eps_test.groupby('ticker')['date'].shift(-1)
# Vectorised timedelta arithmetic instead of a row-wise apply(); rows whose
# next_date is NaT get NaN.
eps_test['date_gap'] = (eps_test['next_date'] - eps_test['date']).dt.days

In [9]:
# List the rows whose gap to the ticker's next record is known and is not
# exactly 7 days, largest gap first (the recorded output is empty).
eps_test.loc[
    eps_test['date_gap'].notna() & eps_test['date_gap'].ne(7)
].sort_values(by='date_gap', ascending=False).head()

Unnamed: 0,ticker,date,next_twelve_months_eps,next_date,date_gap


In [10]:
# Count the ticker columns that were removed for having no EPS value at all
len(set(eps.columns).difference(left_eps.columns))

74

## REV Data

### Read In REV Data

In [11]:
# Load the seven REV CSV parts. Each file is read with 'date' parsed as a
# datetime, ordered by date and given a fresh 0..n-1 index.
rev1, rev2, rev3, rev4, rev5, rev6, rev7 = [
    pd.read_csv(csv_name, parse_dates=['date']).sort_values("date").reset_index(drop=True)
    for csv_name in (
        'REV1.csv', 'REV2.csv', 'REV3.csv', 'REV4.csv',
        'REV5.csv', 'REV6.csv', 'REV7.csv',
    )
]

### Merge All Data

In [12]:
# Keep the individual parts in a list for the column-count check further down
rev_dfs = [rev1, rev2, rev3, rev4, rev5, rev6, rev7]

# Join all parts side by side on 'date' (merge's default inner join), then
# order the combined frame by date.
rev = (
    rev1
    .merge(rev2, on="date")
    .merge(rev3, on="date")
    .merge(rev4, on="date")
    .merge(rev5, on="date")
    .merge(rev6, on="date")
    .merge(rev7, on="date")
    .sort_values("date")
    .reset_index(drop=True)
)

In [13]:
# Sanity check: the merged frame should hold exactly as many non-date columns
# as the individual parts combined (the "- 1" discounts each 'date' column).
rev.shape[1] - 1 == sum(part.shape[1] - 1 for part in rev_dfs)

True

In [14]:
# Number of ticker columns in the merged REV frame (all columns except 'date')
rev.shape[1] - 1

2545

In [15]:
# Remove every column that is null on all dates, i.e. tickers without any
# REV value. dropna returns a new frame, so `rev` itself is left untouched.
left_rev = rev.dropna(axis='columns', how='all')

### Unstack REV Data

In [16]:
# Reshape wide (one column per ticker) to long (one row per ticker/date).
# After unstack + reset_index the former column labels land in "level_0" and
# the values in column 0; both get descriptive names before sorting.
trans_rev = (
    left_rev
    .set_index("date")
    .unstack()
    .reset_index()
    .rename(columns={"level_0": "ticker", 0: "next_twelve_months_rev"})
    .sort_values(['ticker', 'date'])
    .reset_index(drop=True)
)

### Check Data Quality

In [17]:
# Check the date gap: for every ticker, measure the number of days between
# each record and that ticker's next record.
# sort_values returns a new frame, so trans_rev is not modified.
rev_test = trans_rev.sort_values('date').reset_index(drop=True)
# Rows are in date order, so shift(-1) within each ticker yields the date of
# that ticker's following record (NaT for the ticker's last row).
rev_test['next_date'] = rev_test.groupby('ticker')['date'].shift(-1)
# Vectorised timedelta arithmetic instead of a row-wise apply(); rows whose
# next_date is NaT get NaN.
rev_test['date_gap'] = (rev_test['next_date'] - rev_test['date']).dt.days

In [18]:
# List the rows whose gap to the ticker's next record is known and is not
# exactly 7 days, largest gap first (the recorded output is empty).
rev_test.loc[
    rev_test['date_gap'].notna() & rev_test['date_gap'].ne(7)
].sort_values(by='date_gap', ascending=False).head()

Unnamed: 0,ticker,date,next_twelve_months_rev,next_date,date_gap


In [19]:
# Count the ticker columns that were removed for having no REV value at all
len(set(rev.columns).difference(left_rev.columns))

77

## Market Cap Data 
- In the file `fwdREVdataWithMarketCap.xlsx` that the client provided later, there are **780** tickers.
- Of those 780 tickers, only **773** also exist in `daily_market_cap_2014_2019.csv`, which the client provided earlier.
- After a quality check, we found that 7 of the 780 tickers contain only null values, which means these 7 tickers do not have any market cap records at all.
- **So we will keep those 773 tickers for the future analysis** 

### Read In Data (2004 - 2014)

In [20]:
# Load the four market cap CSV parts. Each file is read with 'date' parsed as
# a datetime, ordered by date and given a fresh 0..n-1 index.
mar1, mar2, mar3, mar4 = [
    pd.read_csv(csv_name, parse_dates=['date']).sort_values("date").reset_index(drop=True)
    for csv_name in ('MAR1.csv', 'MAR2.csv', 'MAR3.csv', 'MAR4.csv')
]

### Merge All Data

In [21]:
# Keep the individual parts in a list for the column-count check further down
mar_dfs = [mar1, mar2, mar3, mar4]

# Join all parts side by side on 'date' (merge's default inner join), then
# order the combined frame by date.
mar_2004 = (
    mar1
    .merge(mar2, on="date")
    .merge(mar3, on="date")
    .merge(mar4, on="date")
    .sort_values("date")
    .reset_index(drop=True)
)

In [22]:
# Preview the first five rows of the merged market cap frame.
# NOTE(review): in the recorded output the first seven ticker columns
# (AAPL ... BABA) show identical values on every row - looks like a data
# issue in the source files; confirm with the client.
mar_2004.head(n=5)

Unnamed: 0,date,AAPL,GOOG,MSFT,AMZN,BRK/A,FB,BABA,JPM,JNJ,...,EXEL,DNKN,WWD,LPLA,NATI,INGR,WPX,FAF,PRAH,BOKF
0,2004-05-07,10135.7281,10135.7281,10135.7281,10135.7281,10135.7281,10135.7281,10135.7281,75940.4142,164163.7474,...,,,,,,,,,,
1,2004-05-14,10283.9446,10283.9446,10283.9446,10283.9446,10283.9446,10283.9446,10283.9446,74259.4862,161848.2371,...,,,,,,,,,,
2,2004-05-21,10302.9467,10302.9467,10302.9467,10302.9467,10302.9467,10302.9467,10302.9467,75134.1072,163095.0503,...,,,,,,,,,,
3,2004-05-28,10663.9869,10663.9869,10663.9869,10663.9869,10663.9869,10663.9869,10663.9869,76716.7547,165380.8747,...,,,,,,,,,,
4,2004-06-04,10937.6173,10937.6173,10937.6173,10937.6173,10937.6173,10937.6173,10937.6173,77633.0243,167369.8387,...,,,,,,,,,,


In [23]:
# Sanity check: the merged frame should hold exactly as many non-date columns
# as the individual parts combined (the "- 1" discounts each 'date' column).
mar_2004.shape[1] - 1 == sum(part.shape[1] - 1 for part in mar_dfs)

True

In [24]:
# Number of ticker columns in mar_2004 (all columns except 'date')
mar_2004.shape[1] - 1

780

In [24]:
# Column labels of mar_2004 with the first column skipped ('date' in the
# preview above). This is a pandas Index and it still includes tickers whose
# values are all null.
useful_tickers = mar_2004.iloc[:, 1:].columns

In [25]:
# Remove every column that is null on all dates, i.e. tickers without any
# market cap value. dropna returns a new frame, so mar_2004 is left untouched
# and no explicit copy is needed.
left_mar_2004 = mar_2004.dropna(axis=1, how='all')

In [27]:
# Number of ticker columns removed because they have no market cap value on
# any date
mar_2004.shape[1] - left_mar_2004.shape[1]

195

### Unstack MAR Data

In [28]:
# Reshape wide (one column per ticker) to long (one row per ticker/date).
# After unstack + reset_index the former column labels land in "level_0" and
# the values in column 0; both get descriptive names before sorting.
trans_mar = (
    left_mar_2004
    .set_index("date")
    .unstack()
    .reset_index()
    .rename(columns={"level_0": "ticker", 0: "marketcap"})
    .sort_values(["ticker", "date"])
    .reset_index(drop=True)
)

In [29]:
# Dates present in the market cap data but absent from the REV data
set(trans_mar['date'].unique()).difference(left_rev['date'].unique())

{numpy.datetime64('2004-05-07T00:00:00.000000000'),
 numpy.datetime64('2004-06-10T00:00:00.000000000'),
 numpy.datetime64('2004-12-23T00:00:00.000000000'),
 numpy.datetime64('2005-03-24T00:00:00.000000000'),
 numpy.datetime64('2006-04-12T00:00:00.000000000'),
 numpy.datetime64('2007-04-05T00:00:00.000000000'),
 numpy.datetime64('2008-03-20T00:00:00.000000000'),
 numpy.datetime64('2008-07-03T00:00:00.000000000'),
 numpy.datetime64('2009-04-09T00:00:00.000000000'),
 numpy.datetime64('2009-07-02T00:00:00.000000000'),
 numpy.datetime64('2009-12-24T00:00:00.000000000'),
 numpy.datetime64('2009-12-31T00:00:00.000000000'),
 numpy.datetime64('2010-04-01T00:00:00.000000000'),
 numpy.datetime64('2010-12-23T00:00:00.000000000'),
 numpy.datetime64('2011-04-21T00:00:00.000000000'),
 numpy.datetime64('2012-04-05T00:00:00.000000000'),
 numpy.datetime64('2013-03-28T00:00:00.000000000'),
 numpy.datetime64('2014-04-17T00:00:00.000000000'),
 numpy.datetime64('2014-07-03T00:00:00.000000000'),
 numpy.datet

In [30]:
# Dates present in the REV data but absent from the market cap data
set(left_rev['date'].unique()).difference(trans_mar['date'].unique())

{numpy.datetime64('2004-06-11T00:00:00.000000000'),
 numpy.datetime64('2004-12-24T00:00:00.000000000'),
 numpy.datetime64('2005-03-25T00:00:00.000000000'),
 numpy.datetime64('2006-04-14T00:00:00.000000000'),
 numpy.datetime64('2007-04-06T00:00:00.000000000'),
 numpy.datetime64('2008-03-21T00:00:00.000000000'),
 numpy.datetime64('2008-07-04T00:00:00.000000000'),
 numpy.datetime64('2009-04-10T00:00:00.000000000'),
 numpy.datetime64('2009-07-03T00:00:00.000000000'),
 numpy.datetime64('2009-12-25T00:00:00.000000000'),
 numpy.datetime64('2010-01-01T00:00:00.000000000'),
 numpy.datetime64('2010-04-02T00:00:00.000000000'),
 numpy.datetime64('2010-12-24T00:00:00.000000000'),
 numpy.datetime64('2011-04-22T00:00:00.000000000'),
 numpy.datetime64('2012-04-06T00:00:00.000000000'),
 numpy.datetime64('2013-03-29T00:00:00.000000000'),
 numpy.datetime64('2014-04-18T00:00:00.000000000'),
 numpy.datetime64('2014-07-04T00:00:00.000000000'),
 numpy.datetime64('2015-04-03T00:00:00.000000000'),
 numpy.datet

### Check Data Quality

In [31]:
# Check the date gap: for every ticker, measure the number of days between
# each record and that ticker's next record.
# sort_values returns a new frame, so trans_mar is not modified.
mar_test = trans_mar.sort_values('date').reset_index(drop=True)
# Rows are in date order, so shift(-1) within each ticker yields the date of
# that ticker's following record (NaT for the ticker's last row).
mar_test['next_date'] = mar_test.groupby('ticker')['date'].shift(-1)
# Vectorised timedelta arithmetic instead of a row-wise apply(); rows whose
# next_date is NaT get NaN.
mar_test['date_gap'] = (mar_test['next_date'] - mar_test['date']).dt.days

In [32]:
# List the rows whose gap to the ticker's next record is known and is not
# exactly 7 days, largest gap first. The recorded output is not empty: it
# shows 9-day gaps starting at 2006-04-12.
mar_test.loc[
    mar_test['date_gap'].notna() & mar_test['date_gap'].ne(7)
].sort_values(by='date_gap', ascending=False).head()

Unnamed: 0,ticker,date,marketcap,next_date,date_gap
59085,LYB,2006-04-12,,2006-04-21,9.0
59486,IPGP,2006-04-12,,2006-04-21,9.0
59470,SWK,2006-04-12,4373.31,2006-04-21,9.0
59471,CHKP,2006-04-12,4500.19,2006-04-21,9.0
59472,CERN,2006-04-12,3503.04,2006-04-21,9.0


### Read In Data (2014 - present)

In [33]:
# Load the long-format market cap file and keep only the three columns used
# downstream, in this order.
mar_2014 = pd.read_csv(
    'daily_market_cap_2014_2019.csv', parse_dates=['date']
).loc[:, ['ticker', 'date', 'marketcap']]
mar_2014.head(n=5)

Unnamed: 0,ticker,date,marketcap
0,A,2018-10-19,20551.1
1,AA,2018-10-19,7197.9
2,AAC,2018-10-19,170.2
3,AAL,2018-10-19,14754.7
4,AAMC,2018-10-19,79.1


### Append the Mar_2004 File to the Mar_2014 File

In [34]:
# Stack the long-format mar_2014 records and the unstacked mar_2004 records
# into one frame. DataFrame.append was deprecated in pandas 1.4 and removed in
# 2.0; pd.concat is the supported equivalent. mar_2014 is listed first so its
# rows come before the trans_mar rows, which the drop_duplicates(keep='first')
# step further down relies on.
mar = pd.concat([mar_2014, trans_mar], ignore_index=True)

# Keep only the tickers that appear as columns in mar_2004
useful_mar = mar[mar.ticker.isin(useful_tickers)]
useful_mar.ticker.nunique()

773

### Data Quality Check

In [35]:
# Tickers that are columns in mar_2004 but have no row at all in the combined
# market cap frame
set(useful_tickers).difference(useful_mar['ticker'].unique())

{'DWDP', 'FQVLF', 'FWONK', 'HRS', 'NWS', 'SBNY', 'ZG'}

In [36]:
# Look up each missing ticker in the unstacked mar_2004 data to see whether
# it has any rows there (the recorded output is empty for this one).
trans_mar.loc[trans_mar['ticker'] == 'DWDP']

Unnamed: 0,ticker,date,marketcap


In [37]:
# Rows for this ticker in the unstacked mar_2004 data (recorded output: none)
trans_mar.loc[trans_mar['ticker'] == 'FQVLF']

Unnamed: 0,ticker,date,marketcap


In [38]:
# Rows for this ticker in the unstacked mar_2004 data (recorded output: none)
trans_mar.loc[trans_mar['ticker'] == 'FWONK']

Unnamed: 0,ticker,date,marketcap


In [39]:
# Rows for this ticker in the unstacked mar_2004 data (recorded output: none)
trans_mar.loc[trans_mar['ticker'] == 'HRS']

Unnamed: 0,ticker,date,marketcap


In [40]:
# Rows for this ticker in the unstacked mar_2004 data (recorded output: none)
trans_mar.loc[trans_mar['ticker'] == 'NWS']

Unnamed: 0,ticker,date,marketcap


In [41]:
# Rows for this ticker in the unstacked mar_2004 data (recorded output: none)
trans_mar.loc[trans_mar['ticker'] == 'SBNY']

Unnamed: 0,ticker,date,marketcap


In [42]:
# Rows for this ticker in the unstacked mar_2004 data (recorded output: none)
trans_mar.loc[trans_mar['ticker'] == 'ZG']

Unnamed: 0,ticker,date,marketcap


In [43]:
# Count the null entries in each column of useful_mar
useful_mar.isna().sum()

ticker            0
date              0
marketcap    164373
dtype: int64

In [44]:
# Discard the rows whose marketcap is null
left_mar = useful_mar.dropna(axis=0, how='any', subset=['marketcap'])

In [45]:
# Distinct tickers remaining after the null drop, to see whether any ticker
# was lost
left_mar['ticker'].nunique()

773

In [46]:
# Number of rows that repeat an earlier (ticker, date) pair
int(left_mar.duplicated(subset=['ticker', 'date'], keep='first').sum())

31876

In [47]:
# Drop duplicates
# For duplicated (ticker, date) market cap values we want to keep the value
# that came from mar_2014. After spot-checking a few randomly picked tickers,
# the first row of each duplicate group was the mar_2014 record, so the first
# occurrence is the one that is kept.
final_mar = left_mar.drop_duplicates(keep='first', subset=['ticker', 'date'])

In [48]:
# Rows removed by the de-duplication; should equal the duplicate count above
left_mar.shape[0] - final_mar.shape[0]

31876

In [49]:
# Distinct tickers remaining in final_mar, as a plain list; these are the
# tickers carried into the rest of the analysis
left_tickers = final_mar['ticker'].unique().tolist()

## Read In Price Data

In [50]:
# Load the price file, order it by date then ticker, and keep only the three
# columns used downstream.
price = (
    pd.read_csv('daily_price.csv', parse_dates=['date'])
    .sort_values(["date", "ticker"])
    .reset_index(drop=True)
    .loc[:, ['ticker', 'date', 'adj_close']]
)
price.head(n=5)

Unnamed: 0,ticker,date,adj_close
0,A,2004-05-07,17.670214
1,AABA,2004-05-07,26.4
2,AAME,2004-05-07,2.642746
3,AAN,2004-05-07,10.597434
4,AAON,2004-05-07,3.215521


## Merge the REV Data and EPS Data

In [51]:
# Combine REV and EPS on (ticker, date); the inner join keeps only the pairs
# present in both frames.
eps_rev = pd.merge(trans_rev, trans_eps, on=['ticker', 'date'], how='inner')
eps_rev.head(n=5)

Unnamed: 0,ticker,date,next_twelve_months_rev,next_twelve_months_eps
0,A,2004-05-14,7317.9,1.28
1,A,2004-05-21,7602.3,1.33
2,A,2004-05-28,7618.1,1.34
3,A,2004-06-04,7633.8,1.35
4,A,2004-06-11,7649.5,1.36


In [52]:
# Based on the client's request: drop the records with null values. A row is
# removed when either the revenue or the EPS value is null.
useful_eps_rev = eps_rev.dropna(
    how='any',
    subset=['next_twelve_months_rev', 'next_twelve_months_eps'],
)

# Distinct tickers remaining after the drop, to see whether any ticker lost
# all of its rows (the original note here says none did)
useful_eps_rev['ticker'].nunique()

2465

In [53]:
# Confirm that no null values remain in any column
useful_eps_rev.isna().sum()

ticker                    0
date                      0
next_twelve_months_rev    0
next_twelve_months_eps    0
dtype: int64

In [54]:
# Check the date gap
eps_rev_test = useful_eps_rev.copy()
eps_rev_test = eps_rev_test.sort_values('date').reset_index(drop=True)
eps_rev_test['next_date'] = eps_rev_test.groupby('ticker')['date'].shift(-1)
eps_rev_test['date_gap'] = eps_rev_test.apply(lambda x:(x['next_date'] - x['date']).days, axis=1)

In [55]:
# List the rows whose gap to the ticker's next record is known and is not
# exactly 7 days, largest gap first (the recorded output is empty).
eps_rev_test.loc[
    eps_rev_test['date_gap'].notna() & eps_rev_test['date_gap'].ne(7)
].sort_values(by='date_gap', ascending=False).head()

Unnamed: 0,ticker,date,next_twelve_months_rev,next_twelve_months_eps,next_date,date_gap


### Only remain the EPS and REV records of the 773 tickers

In [56]:
# Keep only the EPS/REV rows whose ticker is in left_tickers
left_eps_rev = useful_eps_rev.loc[useful_eps_rev['ticker'].isin(left_tickers)]

In [57]:
# Distinct tickers that have both market cap data and EPS/REV rows
left_eps_rev['ticker'].nunique()

751

In [58]:
# Tickers in left_tickers that have no row in left_eps_rev, i.e. no record
# with both EPS and REV present; they drop out of the analysis.
# sorted() makes the printed order deterministic (set iteration order is not).
# NOTE(review): the recorded output includes 'Unnamed: 65' and 'Unnamed: 87',
# which look like header-less columns picked up from the MAR CSV files rather
# than real tickers - confirm against the source files.
print(sorted(set(left_tickers) - set(left_eps_rev.ticker.unique())))

['BRK/A', 'CMCSA', 'CBOE', 'DISCA', 'ULTI', 'LBRDK', 'APTV', 'FWONA', 'WPP LN', 'LEN/B', 'NWSA', 'VIA', 'Unnamed: 87', 'LBTYK', 'LBRDA', 'IBKR', 'CBS/A', 'RYAAY', 'Unnamed: 65', 'UA', 'TRIP', 'LBTYA']


## Merge the EPS, REV Data with the MAR Data

In [81]:
# Attach market cap to every EPS/REV row; the left join keeps all EPS/REV
# rows and leaves marketcap null where no (ticker, date) match exists.
with_mar = pd.merge(left_eps_rev, final_mar, on=['ticker', 'date'], how='left')

In [82]:
# The left join must not add or remove rows
with_mar.shape[0] == left_eps_rev.shape[0]

True

## Merge the EPS, REV, MAR Data with Price Data

In [83]:
# Attach adj_close to every row; the left join keeps all rows and leaves
# adj_close null where no (ticker, date) match exists in the price data.
all_data = pd.merge(with_mar, price, on=['ticker', 'date'], how='left')

In [84]:
# The left join must not add or remove rows
with_mar.shape[0] == all_data.shape[0]

True

In [85]:
# Preview the first five rows of the fully merged data set
all_data.head(n=5)

Unnamed: 0,ticker,date,next_twelve_months_rev,next_twelve_months_eps,marketcap,adj_close
0,A,2004-05-14,7317.9,1.28,12302.5,17.18665
1,A,2004-05-21,7602.3,1.33,11726.8,16.373995
2,A,2004-05-28,7618.1,1.34,12361.7,17.260528
3,A,2004-06-04,7633.8,1.35,12142.2,16.938152
4,A,2004-06-11,7649.5,1.36,12662.2,


In [86]:
# Number of distinct tickers with at least one missing adj_close value. The
# recorded result (751) equals the ticker count of left_eps_rev, so every
# ticker has at least one such row.
# NOTE(review): the preview above shows a missing adj_close on 2004-06-11;
# presumably some EPS/REV dates have no matching row in the price data -
# confirm before relying on adj_close.
all_data.loc[all_data['adj_close'].isna(), 'ticker'].nunique()

751

In [111]:
# Write the merged data set to disk without the row index
all_data.to_csv(path_or_buf="all_data.csv", index=False)