# Import

In [1]:
import pandas as pd

# PE Data

## Read In PE Data

In [2]:
# Load the raw forward P/E export.
# Original note: the value columns carry no header names, so low_memory is
# switched off to keep pandas from emitting warnings while parsing.
fwd_pe = pd.read_csv(filepath_or_buffer='FWD_PE.csv', low_memory=False)

In [3]:
# Start from an empty frame; the reshape cell below fills it in long format.
new_fwd_pe = pd.DataFrame(data=None)

## Transform the PE Data

In [4]:
# Reshape the wide export into a long (date, forward_pe, ticker) table.
# Columns are consumed in adjacent pairs: the header of the first column of a
# pair is used as the ticker and its cells as the dates; the second column of
# the pair supplies the forward P/E values.
pe_frames = []
for n in range(0, len(fwd_pe.columns) - 1, 2):
    pe_temp = fwd_pe.iloc[:, [n, n + 1]].copy()
    pe_ticker = pe_temp.columns[0]
    # Rename by position instead of relying on the auto-generated
    # 'Unnamed: k' header of the value column.
    pe_temp.columns = ['date', 'forward_pe']
    pe_temp['ticker'] = pe_ticker
    pe_frames.append(pe_temp)

# Concatenate once: DataFrame.append was removed in pandas 2.0, and re-copying
# the growing frame on every iteration is quadratic in the number of tickers.
if pe_frames:
    new_fwd_pe = pd.concat(pe_frames, ignore_index=True)
else:
    # No column pairs in the input: keep the expected schema, just with no rows.
    new_fwd_pe = pd.DataFrame(columns=['date', 'forward_pe', 'ticker'])

In [5]:
# Sanity check after the reshape: (number of distinct tickers, total row count).
(new_fwd_pe['ticker'].nunique(), new_fwd_pe.shape[0])

(2912, 2285920)

## Formatting

In [6]:
# Parse the date column into timestamps so it can be sorted and subtracted later.
pe_parsed_dates = pd.to_datetime(new_fwd_pe['date'])
new_fwd_pe['date'] = pe_parsed_dates

# Persist the long-format table so a later session can skip the reshape.
new_fwd_pe.to_csv(path_or_buf='forward_pe.csv', index=False)
# To reload the saved file instead of rebuilding it:
# forward_pe_new = pd.read_csv('forward_pe.csv', parse_dates=['date'])

## Data Quality Check

In [7]:
# Work on an independent copy, then count missing values per column.
forward_pe = new_fwd_pe.copy(deep=True)
forward_pe.isna().sum()

date          789274
forward_pe    789274
ticker             0
dtype: int64

In [8]:
# Remove every row that has a missing value in any column
# (dropna defaults: axis=0, how='any').
forward_pe = forward_pe.dropna()

In [9]:
# Count rows that repeat an earlier (ticker, date) pair; 0 means the key is unique.
pe_is_repeat = forward_pe.duplicated(subset=['ticker', 'date'], keep='first')
int(pe_is_repeat.sum())

0

In [10]:
# Measure, per ticker, the number of days between each observation and the next one.
# sort_values/reset_index already return a new frame, so forward_pe is left untouched.
forward_pe_test = forward_pe.sort_values(['ticker', 'date']).reset_index(drop=True)
# Next observation date within the same ticker (NaT on each ticker's last row).
forward_pe_test['next_date'] = forward_pe_test.groupby('ticker')['date'].shift(-1)
# Vectorised timedelta arithmetic replaces the row-wise apply(axis=1); the result is
# the same day count, with NaN where next_date is NaT.
forward_pe_test['date_gap'] = (forward_pe_test['next_date'] - forward_pe_test['date']).dt.days

In [11]:
# Preview the five widest gaps among rows whose next observation is more than 7 days away.
pe_gap_days = forward_pe_test['date_gap']
pe_gap_preview = forward_pe_test.loc[pe_gap_days.notna() & pe_gap_days.gt(7)]
pe_gap_preview.sort_values(by='date_gap', ascending=False).head(5)

Unnamed: 0,date,forward_pe,ticker,next_date,date_gap
592314,2004-12-31,17.58,GNTY,2017-06-09,4543.0
783120,2006-12-29,99.9,LCTX,2016-03-18,3367.0
157398,2008-12-26,99.9,BBX,2017-12-01,3262.0
860085,2009-12-25,13.76,MFNC,2018-04-06,3024.0
1181089,2007-12-28,14.18,SBBX,2015-10-16,2849.0


In [12]:
# Tickers with at least one gap of more than 7 days between consecutive observations.
pe_has_wide_gap = forward_pe_test['date_gap'].notna() & (forward_pe_test['date_gap'] > 7)
pe_problem_tickers = forward_pe_test.loc[pe_has_wide_gap, 'ticker'].unique()

In [13]:
# Per the client's request, drop every ticker that has a date gap larger than 7 days
# (1066 tickers were dropped in the original run).
pe_is_problem_ticker = forward_pe['ticker'].isin(pe_problem_tickers)
forward_pe_final = forward_pe.loc[~pe_is_problem_ticker]

# PS Data

## Read In PS Data

In [14]:
# Load the raw forward P/S export, with low_memory switched off as for the PE file.
fwd_ps = pd.read_csv(filepath_or_buffer='FWD_PS.csv', low_memory=False)

In [15]:
# Start from an empty frame; the reshape cell below fills it in long format.
new_fwd_ps = pd.DataFrame(data=None)

## Transform the PS Data

In [16]:
# Reshape the wide export into a long (date, forward_ps, ticker) table.
# Columns are consumed in adjacent pairs: the header of the first column of a
# pair is used as the ticker and its cells as the dates; the second column of
# the pair supplies the forward P/S values.
ps_frames = []
for n in range(0, len(fwd_ps.columns) - 1, 2):
    ps_temp = fwd_ps.iloc[:, [n, n + 1]].copy()
    ps_ticker = ps_temp.columns[0]
    # Rename by position instead of relying on the auto-generated
    # 'Unnamed: k' header of the value column.
    ps_temp.columns = ['date', 'forward_ps']
    ps_temp['ticker'] = ps_ticker
    ps_frames.append(ps_temp)

# Concatenate once: DataFrame.append was removed in pandas 2.0, and re-copying
# the growing frame on every iteration is quadratic in the number of tickers.
if ps_frames:
    new_fwd_ps = pd.concat(ps_frames, ignore_index=True)
else:
    # No column pairs in the input: keep the expected schema, just with no rows.
    new_fwd_ps = pd.DataFrame(columns=['date', 'forward_ps', 'ticker'])

## Formatting

In [17]:
# Parse the date column into timestamps so it can be sorted and subtracted later.
ps_parsed_dates = pd.to_datetime(new_fwd_ps['date'])
new_fwd_ps['date'] = ps_parsed_dates

# Persist the long-format table so a later session can skip the reshape.
new_fwd_ps.to_csv(path_or_buf='forward_ps.csv', index=False)
# To reload the saved file instead of rebuilding it:
# forward_ps_new = pd.read_csv('forward_ps.csv', parse_dates=['date'])

## Data Quality Check

In [18]:
# Work on an independent copy, then count missing values per column.
forward_ps = new_fwd_ps.copy(deep=True)
forward_ps.isna().sum()

date          813572
forward_ps    813572
ticker             0
dtype: int64

In [19]:
# Remove every row that has a missing value in any column
# (dropna defaults: axis=0, how='any').
forward_ps = forward_ps.dropna()

In [20]:
# Count rows that repeat an earlier (ticker, date) pair; 0 means the key is unique.
ps_is_repeat = forward_ps.duplicated(subset=['ticker', 'date'], keep='first')
int(ps_is_repeat.sum())

0

In [21]:
# Measure, per ticker, the number of days between each observation and the next one.
# sort_values/reset_index already return a new frame, so forward_ps is left untouched.
forward_ps_test = forward_ps.sort_values(['ticker', 'date']).reset_index(drop=True)
# Next observation date within the same ticker (NaT on each ticker's last row).
forward_ps_test['next_date'] = forward_ps_test.groupby('ticker')['date'].shift(-1)
# Vectorised timedelta arithmetic replaces the row-wise apply(axis=1); the result is
# the same day count, with NaN where next_date is NaT.
forward_ps_test['date_gap'] = (forward_ps_test['next_date'] - forward_ps_test['date']).dt.days

In [22]:
# Preview the five widest gaps among rows whose next observation is more than 7 days away.
ps_gap_days = forward_ps_test['date_gap']
ps_gap_preview = forward_ps_test.loc[ps_gap_days.notna() & ps_gap_days.gt(7)]
ps_gap_preview.sort_values(by='date_gap', ascending=False).head(5)

Unnamed: 0,date,forward_ps,ticker,next_date,date_gap
878526,2005-06-24,0.76,MSON,2018-08-31,4816.0
834406,2006-12-29,2.24,MFNC,2018-04-06,4116.0
1147159,2005-12-30,2.81,SBBX,2015-10-16,3577.0
921753,2004-08-13,0.55,NL,2014-02-07,3465.0
759826,2006-12-29,4.5,LCTX,2016-03-18,3367.0


In [23]:
# Tickers with at least one gap of more than 7 days between consecutive observations,
# listed in order of descending gap size.
ps_has_wide_gap = forward_ps_test['date_gap'].notna() & (forward_ps_test['date_gap'] > 7)
ps_rows_by_gap = forward_ps_test.loc[ps_has_wide_gap].sort_values(by='date_gap', ascending=False)
ps_problem_tickers = ps_rows_by_gap['ticker'].unique()

In [24]:
# Per the client's request, drop every ticker that has a date gap larger than 7 days
# (1216 tickers were dropped in the original run).
ps_is_problem_ticker = forward_ps['ticker'].isin(ps_problem_tickers)
forward_ps_final = forward_ps.loc[~ps_is_problem_ticker]

# Final DataFrame

## Merge PS and PE DataFrames

In [25]:
# Author's note: of the 2912 tickers, 53 do not appear in both the PE and the PS file.
# The inner join keeps only (date, ticker) pairs present in both tables, so the
# non-overlapping tickers are dropped here.
forward_ps_pe = pd.merge(
    forward_ps_final,
    forward_pe_final,
    how='inner',
    on=['date', 'ticker'],
)
# Put the key columns first, followed by the two valuation ratios.
forward_ps_pe = forward_ps_pe.loc[:, ['date', 'ticker', 'forward_ps', 'forward_pe']]

In [26]:
# Number of distinct tickers that survive the merge.
forward_ps_pe['ticker'].nunique()

1550

## Data Quality Check

In [27]:
# Work on an independent copy of the merged table, then count missing values per column.
ps_pe = forward_ps_pe.copy(deep=True)
ps_pe.isna().sum()

date          0
ticker        0
forward_ps    0
forward_pe    0
dtype: int64

In [28]:
# Count rows that repeat an earlier (ticker, date) pair; 0 means the key is unique.
ps_pe_is_repeat = ps_pe.duplicated(subset=['ticker', 'date'], keep='first')
int(ps_pe_is_repeat.sum())

0

In [29]:
# Measure, per ticker, the number of days between each observation and the next one.
# sort_values/reset_index already return a new frame, so ps_pe is left untouched.
ps_pe_test = ps_pe.sort_values(['ticker', 'date']).reset_index(drop=True)
# Next observation date within the same ticker (NaT on each ticker's last row).
ps_pe_test['next_date'] = ps_pe_test.groupby('ticker')['date'].shift(-1)
# Vectorised timedelta arithmetic replaces the row-wise apply(axis=1); the result is
# the same day count, with NaN where next_date is NaT.
ps_pe_test['date_gap'] = (ps_pe_test['next_date'] - ps_pe_test['date']).dt.days

In [30]:
# List any remaining rows whose next observation is more than 7 days away, widest gap first.
merged_gap_days = ps_pe_test['date_gap']
merged_wide_gaps = ps_pe_test.loc[merged_gap_days.notna() & merged_gap_days.gt(7)]
merged_wide_gaps.sort_values(by='date_gap', ascending=False)

Unnamed: 0,date,ticker,forward_ps,forward_pe,next_date,date_gap


In [31]:
# Look for gaps that are not exactly 7 days. The only ones found are 5-day gaps
# at the most recent observations.
merged_gap = ps_pe_test['date_gap']
merged_off_cadence = ps_pe_test.loc[merged_gap.notna() & merged_gap.ne(7)]
merged_off_cadence.sort_values(by='date_gap', ascending=False).head(5)

Unnamed: 0,date,ticker,forward_ps,forward_pe,next_date,date_gap
783,2019-08-09,A,3.96,20.68,2019-08-14,5.0
474443,2019-08-09,ONDK,0.56,8.31,2019-08-14,5.0
477600,2019-08-09,OSK,0.62,9.5,2019-08-14,5.0
476734,2019-08-09,ORLY,2.77,20.14,2019-08-14,5.0
475949,2019-08-09,ORGO,0.14,99.9,2019-08-14,5.0


In [32]:
# Drop the most recent snapshot, dated 2019-08-14.
ps_pe_keep_row = ps_pe['date'] != '2019-08-14'
ps_pe_final = ps_pe.loc[ps_pe_keep_row]

In [33]:
# Re-run the date-gap measurement on the final table to confirm the cadence is now uniform.
# sort_values/reset_index already return a new frame, so ps_pe_final is left untouched.
ps_pe_test2 = ps_pe_final.sort_values(['ticker', 'date']).reset_index(drop=True)
# Next observation date within the same ticker (NaT on each ticker's last row).
ps_pe_test2['next_date'] = ps_pe_test2.groupby('ticker')['date'].shift(-1)
# Vectorised timedelta arithmetic replaces the row-wise apply(axis=1); the result is
# the same day count, with NaN where next_date is NaT.
ps_pe_test2['date_gap'] = (ps_pe_test2['next_date'] - ps_pe_test2['date']).dt.days

In [34]:
# List any rows in the final table whose gap to the next observation is not exactly 7 days.
final_gap = ps_pe_test2['date_gap']
final_off_cadence = ps_pe_test2.loc[final_gap.notna() & final_gap.ne(7)]
final_off_cadence.sort_values(by='date_gap', ascending=False)

Unnamed: 0,date,ticker,forward_ps,forward_pe,next_date,date_gap


In [35]:
# Write the cleaned, merged forward P/S and P/E table to disk without the index column.
ps_pe_final.to_csv(path_or_buf='forward_ps_pe.csv', index=False)