# Set Up

## Import

In [1]:
import pandas as pd

## EPS Data

### Read In Data

In [2]:
# Load the seven EPS extracts. Each file is parsed with 'date' as a datetime
# column, ordered chronologically and given a fresh 0..n-1 index.
eps1, eps2, eps3, eps4, eps5, eps6, eps7 = (
    pd.read_csv(csv_path, parse_dates=['date']).sort_values("date").reset_index(drop=True)
    for csv_path in (
        'EPS1.csv',
        'EPS2.csv',
        'EPS3.csv',
        'EPS4.csv',
        'EPS5.csv',
        'EPS6.csv',
        'EPS7.csv',
    )
)

### Merge All Data

In [3]:
# Keep the individual frames in a list so the column-count check can use them.
eps_dfs = [eps1, eps2, eps3, eps4, eps5, eps6, eps7]

# Join the seven frames side by side on 'date'. DataFrame.merge defaults to an
# inner join, so only dates present in every file are kept.
eps = (
    eps1
    .merge(eps2, on="date")
    .merge(eps3, on="date")
    .merge(eps4, on="date")
    .merge(eps5, on="date")
    .merge(eps6, on="date")
    .merge(eps7, on="date")
    .sort_values("date")
    .reset_index(drop=True)
)

In [4]:
# Sanity check: the merged frame must carry every non-'date' column of the
# seven inputs (the "- 1" removes the shared 'date' column from each count).
sum(df.shape[1] - 1 for df in eps_dfs) == eps.shape[1] - 1

True

### Unstack EPS Data

In [5]:
# Reshape from wide (one column per ticker) to long (one row per ticker/date).
# After unstack().reset_index() the former column labels land in "level_0" and
# the values in column 0; both are renamed to meaningful names.
trans_eps = (
    eps.set_index("date")
    .unstack()
    .reset_index()
    .rename(columns={"level_0": "ticker", 0: "next_twelve_months_eps"})
    .sort_values(["ticker", "date"])
    .reset_index(drop=True)
)

### Check Data Quality

In [6]:
# Check the date gap: for each ticker, compute the number of days between an
# observation date and that ticker's next observation date.
# sort_values already returns a new frame, so trans_eps itself is not modified.
eps_test = trans_eps.sort_values('date').reset_index(drop=True)
# Rows are in date order, so shifting by -1 within each ticker yields the next
# date for that ticker (NaT on the ticker's last row).
eps_test['next_date'] = eps_test.groupby('ticker')['date'].shift(-1)
# Vectorised timedelta arithmetic instead of a row-by-row apply(axis=1);
# NaT differences come out as NaN, exactly as before.
eps_test['date_gap'] = (eps_test['next_date'] - eps_test['date']).dt.days

In [7]:
# List rows whose gap to the next date is known but is not exactly 7 days
# (shorter as well as longer gaps are flagged), largest gaps first.
eps_test.loc[
    eps_test['date_gap'].notna() & eps_test['date_gap'].ne(7)
].sort_values('date_gap', ascending=False).head()

Unnamed: 0,ticker,date,next_twelve_months_eps,next_date,date_gap


In [8]:
# Count the columns (tickers) of the wide EPS frame that contain no data at
# all, i.e. the columns removed by dropna(how='all').
len(eps.columns.difference(eps.dropna(axis='columns', how='all').columns, sort=False))

74

## REV Data

### Read In REV Data

In [9]:
# Load the seven REV extracts. Each file is parsed with 'date' as a datetime
# column, ordered chronologically and given a fresh 0..n-1 index.
rev1, rev2, rev3, rev4, rev5, rev6, rev7 = (
    pd.read_csv(csv_path, parse_dates=['date']).sort_values("date").reset_index(drop=True)
    for csv_path in (
        'REV1.csv',
        'REV2.csv',
        'REV3.csv',
        'REV4.csv',
        'REV5.csv',
        'REV6.csv',
        'REV7.csv',
    )
)

### Merge All Data

In [10]:
# Keep the individual frames in a list so the column-count check can use them.
rev_dfs = [rev1, rev2, rev3, rev4, rev5, rev6, rev7]

# Join the seven frames side by side on 'date'. DataFrame.merge defaults to an
# inner join, so only dates present in every file are kept.
rev = (
    rev1
    .merge(rev2, on="date")
    .merge(rev3, on="date")
    .merge(rev4, on="date")
    .merge(rev5, on="date")
    .merge(rev6, on="date")
    .merge(rev7, on="date")
    .sort_values("date")
    .reset_index(drop=True)
)

In [11]:
# Sanity check: the merged frame must carry every non-'date' column of the
# seven inputs (the "- 1" removes the shared 'date' column from each count).
sum(df.shape[1] - 1 for df in rev_dfs) == rev.shape[1] - 1

True

### Unstack REV Data

In [12]:
# Reshape from wide (one column per ticker) to long (one row per ticker/date).
# After unstack().reset_index() the former column labels land in "level_0" and
# the values in column 0; both are renamed to meaningful names.
trans_rev = (
    rev.set_index("date")
    .unstack()
    .reset_index()
    .rename(columns={"level_0": "ticker", 0: "next_twelve_months_rev"})
    .sort_values(['ticker', 'date'])
    .reset_index(drop=True)
)

### Check Data Quality

In [13]:
# Check the date gap: for each ticker, compute the number of days between an
# observation date and that ticker's next observation date.
# sort_values already returns a new frame, so trans_rev itself is not modified.
rev_test = trans_rev.sort_values('date').reset_index(drop=True)
# Rows are in date order, so shifting by -1 within each ticker yields the next
# date for that ticker (NaT on the ticker's last row).
rev_test['next_date'] = rev_test.groupby('ticker')['date'].shift(-1)
# Vectorised timedelta arithmetic instead of a row-by-row apply(axis=1);
# NaT differences come out as NaN, exactly as before.
rev_test['date_gap'] = (rev_test['next_date'] - rev_test['date']).dt.days

In [14]:
# List rows whose gap to the next date is known but is not exactly 7 days
# (shorter as well as longer gaps are flagged), largest gaps first.
rev_test.loc[
    rev_test['date_gap'].notna() & rev_test['date_gap'].ne(7)
].sort_values('date_gap', ascending=False).head()

Unnamed: 0,ticker,date,next_twelve_months_rev,next_date,date_gap


In [15]:
# Count the columns (tickers) of the wide REV frame that contain no data at
# all, i.e. the columns removed by dropna(how='all').
len(rev.columns.difference(rev.dropna(axis='columns', how='all').columns, sort=False))

77

# Data Transformation

## Merge the REV Data and EPS Data

In [16]:
# Combine the long-format REV and EPS tables; the inner join keeps only the
# (ticker, date) pairs that appear in both.
eps_rev = pd.merge(trans_rev, trans_eps, how='inner', on=['ticker', 'date'])
eps_rev.head()

Unnamed: 0,ticker,date,next_twelve_months_rev,next_twelve_months_eps
0,A,2004-05-14,7317.9,1.28
1,A,2004-05-21,7602.3,1.33
2,A,2004-05-28,7618.1,1.34
3,A,2004-06-04,7633.8,1.35
4,A,2004-06-11,7649.5,1.36


## Data Quality Check

In [17]:
# Number of missing values in each column of the combined table
eps_rev.isna().sum()

ticker                         0
date                           0
next_twelve_months_rev    500503
next_twelve_months_eps    480359
dtype: int64

In [18]:
# Number of tickers with at least one row where revenue is missing
# while EPS is present
eps_rev.loc[
    eps_rev['next_twelve_months_rev'].isna() & eps_rev['next_twelve_months_eps'].notna(),
    'ticker',
].nunique()

192

In [19]:
# Number of tickers with at least one row where EPS is missing
# while revenue is present
eps_rev.loc[
    eps_rev['next_twelve_months_rev'].notna() & eps_rev['next_twelve_months_eps'].isna(),
    'ticker',
].nunique()

50

In [20]:
# Check the date gap: for each ticker, compute the number of days between an
# observation date and that ticker's next observation date.
# sort_values already returns a new frame, so eps_rev itself is not modified.
eps_rev_test = eps_rev.sort_values('date').reset_index(drop=True)
# Rows are in date order, so shifting by -1 within each ticker yields the next
# date for that ticker (NaT on the ticker's last row).
eps_rev_test['next_date'] = eps_rev_test.groupby('ticker')['date'].shift(-1)
# Vectorised timedelta arithmetic instead of a row-by-row apply(axis=1);
# NaT differences come out as NaN, exactly as before.
eps_rev_test['date_gap'] = (eps_rev_test['next_date'] - eps_rev_test['date']).dt.days

In [21]:
# List rows whose gap to the next date is known but is not exactly 7 days,
# largest gaps first.
eps_rev_test.loc[
    eps_rev_test['date_gap'].notna() & eps_rev_test['date_gap'].ne(7)
].sort_values('date_gap', ascending=False).head()

Unnamed: 0,ticker,date,next_twelve_months_rev,next_twelve_months_eps,next_date,date_gap


## Read the Price Data

In [22]:
# Load the daily price file, parse 'date' as datetime and order the rows by
# date, then ticker.
price = (
    pd.read_csv('daily_price.csv', parse_dates=['date'])
    .sort_values(by=["date", "ticker"])
    .reset_index(drop=True)
)
price.head()

Unnamed: 0,ticker,date,open,high,low,close,volume,dividends,closeunadj,lastupdated
0,A,2014-01-02,57.1,57.1,56.15,56.21,1916200.0,0.0,56.21,2017-11-01
1,AAGIY,2014-01-02,20.14,20.25,20.12,20.23,41000.0,0.0,20.23,2018-10-10
2,AAIIQ,2014-01-02,0.03,0.03,0.03,0.03,0.0,0.0,0.03,2019-06-28
3,AAIR,2014-01-02,0.006,0.006,0.006,0.006,2800.0,0.0,0.006,2017-11-01
4,AAL,2014-01-02,25.07,25.82,25.06,25.36,8997900.0,0.0,25.36,2017-11-01


## Read the Share Data

In [23]:
# Load the daily market-cap file, parse 'date' as datetime and order the rows
# by date, then ticker.
share = (
    pd.read_csv('daily_market_cap.csv', parse_dates=['date'])
    .sort_values(by=["date", "ticker"])
    .reset_index(drop=True)
)
share.head()

Unnamed: 0,ticker,date,lastupdated,ev,evebit,evebitda,marketcap,pb,pe,ps
0,A,2014-01-02,2018-10-21,18675.0,19.3,14.0,18651.0,3.5,25.8,2.8
1,AAIIQ,2014-01-02,2019-06-28,-3.1,0.6,0.9,0.1,,,
2,AAIR,2014-01-02,2018-10-21,28.3,-3.0,-7.5,0.2,,,
3,AAL,2014-01-02,2019-04-26,24314.8,38.2,14.7,8510.8,-1.1,19.7,0.3
4,AAMC,2014-01-02,2018-10-21,2404.0,,,2170.4,616.4,,
