# Core code for Feature development

In [8]:
# Get all libraries
import numpy as np
import pandas as pd
import pprint
from tqdm import tqdm
import s3finance as s3c
import importlib
import gc

In [9]:
# Kevin's fill operations
def dofundsfills(df, col_name=None, lower=None, upper=None, domean=True, fillzero=True):
    """Clip and gap-fill one column of ``df`` in place, grouped by ``cusip``.

    Steps, in order:
      1. clip the column to ``lower`` / ``upper`` (a bound of None is skipped);
      2. forward-fill, then back-fill, missing values inside each cusip group;
      3. if ``domean``, fill what is still missing with the cusip's mean;
      4. if ``fillzero``, fill what is still missing with 0.

    Filling follows the current row order of ``df``; sort by date beforehand
    if a chronological fill is intended.

    Args:
        df: DataFrame holding a ``cusip`` column and ``col_name``.
        col_name: name of the column to clean.
        lower: optional lower clipping bound.
        upper: optional upper clipping bound.
        domean: enable step 3.
        fillzero: enable step 4.

    Returns:
        None. ``df[col_name]`` is replaced with the cleaned values.
    """
    keys = df['cusip']
    values = df[col_name]

    # Work on a detached Series and assign it back once: calling
    # ``df[col].clip(..., inplace=True)`` mutates a temporary and is not
    # guaranteed to reach ``df``.
    if lower is not None:
        values = values.clip(lower=lower)
    if upper is not None:
        values = values.clip(upper=upper)

    # GroupBy.ffill/bfill keep the original index, so the result can be
    # assigned straight back regardless of the pandas version.
    values = values.groupby(keys).ffill()
    values = values.groupby(keys).bfill()

    if domean:
        # NOTE: after ffill+bfill a group is either complete or entirely
        # missing, so its mean cannot fill anything new; kept so the flag
        # continues to behave as before.
        values = values.fillna(values.groupby(keys).transform('mean'))
    if fillzero:
        values = values.fillna(0)

    df[col_name] = values

def dopricesfills(df, col_name=None, lower=None, upper=None, domean=True, fillzero=True):
    """Clip and gap-fill one price-table column of ``df`` in place, per ``cusip``.

    Steps, in order:
      1. clip the column to ``lower`` / ``upper`` (a bound of None is skipped);
      2. forward-fill, then back-fill, missing values inside each cusip group;
      3. if ``domean``, fill what is still missing with the cusip's mean;
      4. if ``fillzero``, fill what is still missing with 0.

    Filling follows the current row order of ``df``; sort by date beforehand
    if a chronological fill is intended.

    Args:
        df: DataFrame holding a ``cusip`` column and ``col_name``.
        col_name: name of the column to clean.
        lower: optional lower clipping bound.
        upper: optional upper clipping bound.
        domean: enable step 3.
        fillzero: enable step 4.

    Returns:
        None. ``df[col_name]`` is replaced with the cleaned values.
    """
    group_keys = df['cusip']
    cleaned = df[col_name]

    # Build the result on a detached Series and write it back once; the
    # chained ``df[col].method(inplace=True)`` form may only touch a copy.
    if lower is not None:
        cleaned = cleaned.clip(lower=lower)
    if upper is not None:
        cleaned = cleaned.clip(upper=upper)

    # Index-preserving group fills (no groupby.apply, whose result index
    # differs between pandas versions).
    cleaned = cleaned.groupby(group_keys).ffill()
    cleaned = cleaned.groupby(group_keys).bfill()

    if domean:
        # NOTE: a group is either complete or entirely missing at this point,
        # so the mean fill cannot add values; kept for flag compatibility.
        cleaned = cleaned.fillna(cleaned.groupby(group_keys).transform('mean'))
    if fillzero:
        cleaned = cleaned.fillna(0)

    df[col_name] = cleaned

In [10]:
# Name of the S3 bucket handed to the s3finance client below.
AWS_BUCKET = "w210-wrds-data"

In [11]:
#
# Set up S3 access to the bucket for all financial data.
# `version` is passed through to s3finance as-is (presumably selects a
# specific data snapshot -- confirm against the s3finance module).

s3f = s3c.s3finance(AWS_BUCKET,version='1658902110.776109')


In [26]:
%%time
# Fetch the raw price summary through the s3finance client
# (presumably one stacked frame covering every security -- confirm).
stackedprices = s3f.getrawsummaryprices()

CPU times: user 16.1 s, sys: 9.93 s, total: 26 s
Wall time: 21.8 s


In [None]:
# Force a garbage-collection pass and report how many unreachable objects it found.
n = gc.collect()
print("Number of unreachable objects collected by GC:", n)

In [None]:
%%time
# Calculate price volatility: the 21-row rolling standard deviation of log
# returns of `adjusted_price`, scaled by sqrt(252), computed per cusip.
#
# The frame is ordered by (cusip, datadate) BEFORE returns are taken so the
# shift/rolling window really follows dates. (Computing the series on the
# unsorted group and assigning it to the sorted one only re-aligns labels; the
# window itself would still follow the raw row order.) Rows without a cusip
# are dropped, as the previous groupby-based version did implicitly.
stackedprices = (
    stackedprices.dropna(subset=['cusip'])
    .sort_values(['cusip', 'datadate'])
    .reset_index(drop=True)
)
previous_price = stackedprices.groupby('cusip')['adjusted_price'].shift()
log_returns = np.log(stackedprices['adjusted_price'] / previous_price)
stackedprices['volatility'] = (
    log_returns.groupby(stackedprices['cusip'])
    .transform(lambda returns: returns.rolling(21).std())
    * 252 ** .5
)

# Fill volatility gaps within each cusip and cap the value at 1;
# no mean fill and no zero fill.
dopricesfills(stackedprices,col_name = 'volatility',lower=None,upper=1,domean=False,fillzero=False)

In [None]:
%%time
# Fill in missing values.
# Dividend / EPS items: per the original note these "should be zero since they
# are periodic", so a missing value is replaced with 0 rather than carried.
periodic_cols = ['div', 'divd', 'dvi', 'dvrated', 'eps', 'epsmo']
stackedprices[periodic_cols] = stackedprices[periodic_cols].fillna(0)

# Every numeric column: forward-fill, then back-fill, in date order inside each
# cusip; anything still missing becomes 0. The group fills keep the original
# index, so values are written back by label instead of by row position.
stackedprices = stackedprices.sort_values(['cusip', 'datadate'])
numeric_cols = stackedprices.select_dtypes(include=np.number).columns.drop('cusip', errors='ignore')
filled = stackedprices.groupby('cusip')[numeric_cols].ffill()
filled = filled.groupby(stackedprices['cusip']).bfill()
stackedprices[numeric_cols] = filled.fillna(0)

In [None]:
# Sanity check: row count and the number of missing values left in each column.
print(len(stackedprices))
print(stackedprices.isna().sum())

In [None]:
%%time

# Save the prices summary through the s3finance client, then drop the
# reference so the frame can be reclaimed by the garbage collector.
s3f.putsummaryprices(stackedprices)
stackedprices = None

In [None]:
# Force a garbage-collection pass and report how many unreachable objects it found.
n = gc.collect()
print("Number of unreachable objects collected by GC:", n)

In [36]:
# Stacked funds operations start here: fetch the raw fund summary
# through the s3finance client.
stackedfunds = s3f.getrawsummaryfund()

In [None]:
col_name = 'divyield'

# Missing dividend yield becomes 0. No cap is applied (an upper clip of 0.2
# was disabled in the original). Assigned back explicitly because
# `stackedfunds[col].fillna(..., inplace=True)` may only modify a temporary.
stackedfunds[col_name] = stackedfunds[col_name].fillna(0)

# One row per ratio column: (column, lower clip, upper clip, domean).
# Each is passed to dofundsfills in this order; fillzero keeps its default.
FUND_FILL_SPECS = [
    ('sale_nwc', -1000, 1000, False),
    ('cash_conversion', None, 2000, True),
    ('inv_turn', None, 200, False),
    ('int_debt', None, 5, True),
    ('invt_act', None, 1, True),
    ('int_totdebt', None, 5, True),
    ('pretret_noa', -4, 4, False),
    ('pretret_earnat', -2, 2, False),
    ('rect_act', None, 1, True),
    ('ocf_lct', -8, 8, False),
    ('profit_lct', -8, 8, False),
    ('curr_debt', None, 1, True),
    ('quick_ratio', None, 10, True),
    ('efftax', -1, 1, True),
    ('intcov_ratio', -500, 500, True),
    ('dpr', -4, 4, False),
    ('fcf_ocf', -10, 1, False),
    ('peg_1yrforward', None, None, True),
    ('short_debt', 0, 1, False),
    ('pay_turn', -10, 100, False),
    ('roe', -5, 5, False),
    ('sale_equity', None, 30, True),
    ('capei', -100, 100, True),
    ('dltt_be', None, 10, False),
    ('ptb', None, 50, True),
    ('rect_turn', None, 200, True),
    ('pe_op_basic', -200, 200, True),
    ('aftret_invcapx', -1.5, 1.5, True),
    ('lt_ppent', None, 200, True),
    ('pe_inc', -100, 200, True),
    ('cfm', -5, 5, False),
    ('debt_capital', -5, 5, True),
    ('cash_debt', -5, 5, False),
    ('roce', -2, 2, False),
    ('at_turn', None, 2, False),
    ('sale_invcap', -1, 10, True),
    ('adv_sale', 0, 1, True),
    ('staff_sale', None, 1, False),
    ('gpm', -5, 1, True),
    ('evm', -100, 100, True),
    ('opmad', -5, 1, True),
    ('ps', None, 100, True),
    ('totdebt_invcap', None, 5, False),
    ('pcf', -100, 100, True),
    ('roa', -1, 1, False),
    ('accrual', -1, 1, False),
    ('aftret_equity', -5, 5, False),
    ('equity_invcap', -1, 1, True),
    ('capital_ratio', -1, 2, False),
    ('lt_debt', None, 1, False),
    ('gprof', -1, 2, False),
    ('cash_lt', None, 10, False),
    ('de_ratio', -50, 50, False),
    ('debt_assets', None, 2, True),
    ('rd_sale', None, 5, False),
]

for fund_col, fund_lower, fund_upper, fund_domean in FUND_FILL_SPECS:
    dofundsfills(stackedfunds, col_name=fund_col, lower=fund_lower,
                 upper=fund_upper, domean=fund_domean)

In [None]:
# Missing-value count per column, worst columns first.
stackedfunds.isna().sum().sort_values(ascending=False)

In [None]:

# Remove the ratio columns that are not carried into the saved fund summary.
unused_fund_columns = [
    'peg_trailing', 'peg_ltgforward', 'pe_op_dil', 'curr_ratio', 'cash_ratio',
    'intcov', 'bm', 'pe_exi', 'debt_ebitda', 'opmbd', 'ptpm', 'npm',
    'debt_invcap', 'aftret_eq', 'debt_at',
]
stackedfunds = stackedfunds.drop(columns=unused_fund_columns)

In [None]:
# Save the cleaned fund summary through the s3finance client, then drop the
# reference so the frame can be reclaimed by the garbage collector.
s3f.putsummaryfund(stackedfunds)

stackedfunds = None

In [None]:
# Force a garbage-collection pass and report how many unreachable objects it found.
n = gc.collect()
print("Number of unreachable objects collected by GC:", n)

In [12]:
#
# Fetch the raw quarterly and annual summaries through the s3finance client.
stackedquarter = s3f.getrawsummaryquarter()
stackedannual = s3f.getrawsummaryannual()

In [None]:
col_name = ['optvolq', 'market_cap']

# For each column: forward-fill inside each cusip (row order), then use the
# cusip's mean for what is still missing, then 0 as the last resort.
# GroupBy.ffill keeps the original index and the result is assigned back
# explicitly, so this does not depend on groupby.apply's result index or on
# `inplace=True` reaching the frame.
for i in col_name:
    filled = stackedquarter.groupby('cusip')[i].ffill()
    cusip_mean = filled.groupby(stackedquarter['cusip']).transform('mean')
    stackedquarter[i] = filled.fillna(cusip_mean).fillna(0)


In [None]:
# Quarterly fundamentals used for feature building below; `col_name` is read
# again by the later cell that derives the *_yoy_chg / *_pct_revenue columns.
col_name = ["revtq", "capsftq" , "optvolq" , "stkcpaq" , "xrdq" , "xsgaq" , "stkcoq" , "cshopq" , "cshiq" , "chq" , "dlttq" , "cheq" , "oiadpq" , "niq"]

# Missing values in these columns become 0. Done as one explicit assignment:
# `stackedquarter[c].fillna(0, inplace=True)` may only modify a temporary.
stackedquarter[col_name] = stackedquarter[col_name].fillna(0)

In [None]:
# identifier = stackedquarter['cusip']
def yoychg(df, identifier, feature, target_feature, periods=4):
    """Add a capped relative-change column comparing each row with the row
    ``periods`` positions earlier.

    The change is ``(value - earlier) / abs(earlier)``. It is only kept when
    the earlier row carries the same ``identifier`` value; otherwise the
    result is 0. The lookback is positional, so ``df`` must already be ordered
    by identifier and date for the result to be a year-over-year change
    (``periods=4`` presumably matches one row per quarter -- confirm against
    the data).

    Args:
        df: frame to read from and write to.
        identifier: name of the column identifying the entity (e.g. 'cusip').
        feature: name of the column whose change is measured.
        target_feature: name of the column that receives the result.
        periods: number of rows to look back (default 4).

    Returns:
        ``df[target_feature]``: values limited to [-5, 5], with undefined
        results (NaN, e.g. 0/0) replaced by 0. Division by zero yields +/-inf,
        which the cap turns into +/-5.
    """
    same_entity = df[identifier].eq(df[identifier].shift(periods))
    change = df[feature].diff(periods) / df[feature].abs().shift(periods)

    # Ceiling and floor of +/-500%; built as one expression and assigned once
    # because `df[col].fillna(0, inplace=True)` may only modify a temporary.
    df[target_feature] = (
        change.where(same_entity, 0.0).clip(lower=-5, upper=5).fillna(0)
    )
    return df[target_feature]

In [None]:
def pct_revenue(df, identifier, feature, target_feature, revenue_col='revtq'):
    """Add a column holding ``abs(feature) / abs(revenue)``, capped at 5.

    Args:
        df: frame to read from and write to.
        identifier: unused; kept so the signature matches the yoy helpers.
        feature: name of the numerator column.
        target_feature: name of the column that receives the result.
        revenue_col: name of the revenue column used as denominator
            (default 'revtq').

    Returns:
        ``df[target_feature]``. The ratio of two absolute values is never
        negative, so only the upper cap of 5 (500%) can apply. A zero
        denominator with a non-zero numerator gives inf and is therefore
        capped to 5; undefined results (0/0 or missing inputs) become 0.
    """
    ratio = df[feature].abs() / df[revenue_col].abs()

    # Assigned in one step: `df[col].fillna(0, inplace=True)` may only modify
    # a temporary and leave NaNs in the frame.
    df[target_feature] = ratio.clip(upper=5).fillna(0)
    return df[target_feature]

In [None]:
# Derive two features per quarterly column and remember the names created.
# NOTE(review): yoychg compares each row with the one 4 positions above it,
# so this presumably relies on stackedquarter being ordered by cusip and
# date -- confirm the raw summary arrives sorted.
new_columns_names = []
for i in col_name:
    yoy_col = i + '_yoy_chg'
    pct_col = i + '_pct_revenue'
    stackedquarter[yoy_col] = yoychg(stackedquarter, 'cusip', i, yoy_col)
    new_columns_names.append(yoy_col)
    stackedquarter[pct_col] = pct_revenue(stackedquarter, 'cusip', i, pct_col)
    new_columns_names.append(pct_col)


In [None]:
# Annual fundamentals used for feature building below; `col_name` is read
# again by the later cell that derives the *_yoy_chg / *_pct_revenue columns.
col_name = ["revt", "capsft" , "optvol" , "stkcpa" , "xrd" , "xsga" , "stkco" , 
            "cshi" , "ch" , "dltt" , "che" , "oiadp" , "ni"]

# Missing values in these columns become 0. Done as one explicit assignment:
# `stackedannual[c].fillna(0, inplace=True)` may only modify a temporary.
stackedannual[col_name] = stackedannual[col_name].fillna(0)


In [None]:
# identifier: name of the entity-id column in stackedannual, i.e. 'cusip'
def yoychg_annual(df, identifier, feature, target_feature, periods=1):
    """Add a capped relative-change column comparing each row with the row
    ``periods`` positions earlier (default: the previous row).

    The change is ``(value - earlier) / abs(earlier)``. It is only kept when
    the earlier row carries the same ``identifier`` value; otherwise the
    result is 0. The lookback is positional, so ``df`` must already be ordered
    by identifier and date.

    Args:
        df: frame to read from and write to.
        identifier: name of the column identifying the entity (e.g. 'cusip').
        feature: name of the column whose change is measured.
        target_feature: name of the column that receives the result.
        periods: number of rows to look back (default 1).

    Returns:
        ``df[target_feature]``: values limited to [-5, 5], with undefined
        results (NaN, e.g. 0/0) replaced by 0. Division by zero yields +/-inf,
        which the cap turns into +/-5.
    """
    same_entity = df[identifier].eq(df[identifier].shift(periods))

    # Numerator and denominator use the SAME lag. Dividing the 1-row
    # difference by the value 4 rows back mixes periods (and entities).
    change = df[feature].diff(periods) / df[feature].abs().shift(periods)

    # Ceiling and floor of +/-500%; assigned once because
    # `df[col].fillna(0, inplace=True)` may only modify a temporary.
    df[target_feature] = (
        change.where(same_entity, 0.0).clip(lower=-5, upper=5).fillna(0)
    )
    return df[target_feature]

In [None]:
def pct_revenue_annual(df, identifier, feature, target_feature, revenue_col='revt'):
    """Add a column holding ``abs(feature) / abs(revenue)``, capped at 5.

    Args:
        df: frame to read from and write to.
        identifier: unused; kept so the signature matches the yoy helpers.
        feature: name of the numerator column.
        target_feature: name of the column that receives the result.
        revenue_col: name of the revenue column used as denominator
            (default 'revt').

    Returns:
        ``df[target_feature]``. The ratio of two absolute values is never
        negative, so only the upper cap of 5 (500%) can apply. A zero
        denominator with a non-zero numerator gives inf and is therefore
        capped to 5; undefined results (0/0 or missing inputs) become 0.
    """
    share = df[feature].abs() / df[revenue_col].abs()

    # Assigned in one step: `df[col].fillna(0, inplace=True)` may only modify
    # a temporary and leave NaNs in the frame.
    df[target_feature] = share.clip(upper=5).fillna(0)
    return df[target_feature]

In [None]:
# Derive two features per annual column and remember the names created.
# NOTE(review): yoychg_annual compares each row with the one directly above
# it, so this presumably relies on stackedannual being ordered by cusip and
# date -- confirm the raw summary arrives sorted.
new_columns_names = []
for i in col_name:
    yoy_col = i + '_yoy_chg'
    pct_col = i + '_pct_revenue'
    stackedannual[yoy_col] = yoychg_annual(stackedannual, 'cusip', i, yoy_col)
    new_columns_names.append(yoy_col)
    stackedannual[pct_col] = pct_revenue_annual(stackedannual, 'cusip', i, pct_col)
    new_columns_names.append(pct_col)


In [16]:
# Source: https://stackoverflow.com/questions/41815079/pandas-merge-join-two-data-frames-on-multiple-columns
# Keep every quarterly row and attach the annual row that shares the same
# cusip and datadate; columns present in both frames get _x / _y suffixes.
quarter_annual_keys = ['cusip', 'datadate']
stackedquarter_annual = stackedquarter.merge(
    stackedannual, how='left', on=quarter_annual_keys)

In [None]:
# Save the processed quarterly and annual summaries through the s3finance client.
s3f.putsummaryquarter(stackedquarter)
s3f.putsummaryannual(stackedannual)

# Drop the references (the merged stackedquarter_annual frame is kept) ...
stackedquarter=None
stackedannual=None

# ... and force a garbage-collection pass, reporting what it found.
n = gc.collect()
print("Number of unreachable objects collected by GC:", n)

In [23]:
%%time
# Fetch the raw joined prices + fund summary through the s3finance client.
stackedjoin = s3f.getrawsummaryjoinpricesfund()

# Order rows by (cusip, datadate) once. Rows without a cusip are dropped, as
# the previous groupby-based version did implicitly.
stackedjoin = (
    stackedjoin.dropna(subset=['cusip'])
    .sort_values(['cusip', 'datadate'])
    .reset_index(drop=True)
)
stackedjoin['adjusted_price'] = stackedjoin['prccd'] / stackedjoin['ajexdi']

# Calculate price volatility: 21-row rolling standard deviation of log returns
# per cusip, scaled by sqrt(252).
previous_price = stackedjoin.groupby('cusip')['adjusted_price'].shift()
log_returns = np.log(stackedjoin['adjusted_price'] / previous_price)
stackedjoin['volatility'] = (
    log_returns.groupby(stackedjoin['cusip'])
    .transform(lambda returns: returns.rolling(21).std())
    * 252 ** .5
)

# Numeric columns: forward-fill, then back-fill, inside each cusip; anything
# still missing becomes 0. GroupBy.ffill/bfill keep the original index, so the
# assignment aligns by label (a groupby.apply result carries the group keys in
# its index on pandas >= 2.0 and can no longer be assigned back directly).
numeric_cols = stackedjoin.select_dtypes(include=np.number).columns.drop('cusip', errors='ignore')
filled = stackedjoin.groupby('cusip')[numeric_cols].ffill()
filled = filled.groupby(stackedjoin['cusip']).bfill()
stackedjoin[numeric_cols] = filled.fillna(0)

CPU times: user 38.3 s, sys: 4.29 s, total: 42.6 s
Wall time: 46.5 s


In [25]:
# Latest monthly_date present in the joined table.
stackedjoin['monthly_date'].max()

datetime.date(2021, 12, 31)

In [None]:
# Save the joined prices + fund table into the summaries.
# (String casts of ticker / adate / iid are currently disabled.)
# stackedjoin['ticker'] = stackedjoin['ticker'].astype(str)
# stackedjoin['adate'] = stackedjoin['adate'].astype(str)
# stackedjoin['iid'] = stackedjoin['iid'].astype(str)
s3f.putsummaryjoinpricesfund(stackedjoin)
# stackedjoin itself is kept; it is read again further down.
n = gc.collect()
print("Number of unreachable objects collected by GC:", n)

In [None]:
## Get SPY prices to do comparisons
#
spyprices_df = s3f.getstockprices("SPY")
# NOTE(review): `adjusted_price` is read below but not computed in this cell;
# it is presumably already present in the stored SPY prices -- confirm.

# Dividend / EPS items: a missing value is replaced with 0.
periodic_cols = ['div', 'divd', 'dvi', 'dvrated', 'eps', 'epsmo']
spyprices_df[periodic_cols] = spyprices_df[periodic_cols].fillna(0)

# Order rows by (cusip, datadate) BEFORE taking returns so the shift/rolling
# window follows dates rather than the raw row order. Rows without a cusip are
# dropped, as the previous groupby-based version did implicitly.
spyprices_df = (
    spyprices_df.dropna(subset=['cusip'])
    .sort_values(['cusip', 'datadate'])
    .reset_index(drop=True)
)

# Calculate volatility: 21-row rolling standard deviation of log returns,
# scaled by sqrt(252).
previous_price = spyprices_df.groupby('cusip')['adjusted_price'].shift()
log_returns = np.log(spyprices_df['adjusted_price'] / previous_price)
spyprices_df['volatility'] = (
    log_returns.groupby(spyprices_df['cusip'])
    .transform(lambda returns: returns.rolling(21).std())
    * 252 ** .5
)

# Numeric columns: forward-fill, then back-fill, per cusip; remaining gaps
# become 0. Index-preserving group fills are assigned back by label.
numeric_cols = spyprices_df.select_dtypes(include=np.number).columns.drop('cusip', errors='ignore')
filled = spyprices_df.groupby('cusip')[numeric_cols].ffill()
filled = filled.groupby(spyprices_df['cusip']).bfill()
spyprices_df[numeric_cols] = filled.fillna(0)

In [None]:
# Force a garbage-collection pass and report how many unreachable objects it found.
n = gc.collect()
print("Number of unreachable objects collected by GC:", n)

In [None]:
%%time
def _sort_by_cusip_and_date(frame):
    """Return a copy ordered by cusip then datadate, without cusip-less rows,
    on a fresh 0..n-1 index (what the former per-group sort produced)."""
    return (
        frame.dropna(subset=['cusip'])
        .sort_values(['cusip', 'datadate'])
        .reset_index(drop=True)
    )


def _fill_numeric_within_cusip(frame):
    """Forward-fill, then back-fill, numeric columns inside each cusip and set
    what is still missing to 0. Modifies and returns ``frame``.

    GroupBy.ffill/bfill keep the original index, so the values are assigned
    back by label on any pandas version.
    """
    numeric_cols = frame.select_dtypes(include=np.number).columns.drop('cusip', errors='ignore')
    filled = frame.groupby('cusip')[numeric_cols].ffill()
    filled = filled.groupby(frame['cusip']).bfill()
    frame[numeric_cols] = filled.fillna(0)
    return frame


# Adjusted prices on the joined ratios + prices table, ordered by cusip/date.
joinfundsprices = _sort_by_cusip_and_date(stackedjoin)
joinfundsprices['adjusted_price'] = joinfundsprices['prccd'] / joinfundsprices['ajexdi']
joinfundsprices = joinfundsprices.rename(columns={'gvkey_x': 'gvkey'}).drop(columns=['gvkey_y'])

# Fundamental ratios + prices + SPY: attach SPY's one_yr_chg for the same
# datadate. Both sides carry one_yr_chg, so the result has one_yr_chg_x
# (security) and one_yr_chg_y (SPY).
join_spy = pd.merge(
    joinfundsprices, spyprices_df[['datadate', 'one_yr_chg']], how="left",
    on="datadate"
)
n = gc.collect()
print("Number of unreachable objects collected by GC:", n)

# Some rows have a datadate of 0; discard them, then restore cusip/date order.
join_spy = join_spy[join_spy.datadate != 0]
join_spy = _sort_by_cusip_and_date(join_spy)

# Label: 1 when the security's one-year change beats SPY's, else 0
# (a comparison involving NaN is False and therefore yields 0).
join_spy['outperformed'] = np.where(join_spy['one_yr_chg_x'] > join_spy['one_yr_chg_y'], 1, 0)
join_spy = join_spy.rename(columns={'one_yr_chg_x': 'one_yr_chg', 'one_yr_chg_y': 'one_yr_chg_spy'})

# Join the fundamentals table to (ratios + prices + SPY).
# NOTE(review): dropping 'gvkey_y' / 'tic_y' here was disabled in the original.
# If stackedquarter_annual still carries 'gvkey_y', the merge below may create
# a clashing '_y' suffix -- confirm.
stackedquarter_annual = stackedquarter_annual.rename(columns={'gvkey_x': 'gvkey', 'tic_x': 'tic'})
stackedquarter_annual = _fill_numeric_within_cusip(_sort_by_cusip_and_date(stackedquarter_annual))

join_final = pd.merge(
    join_spy, stackedquarter_annual, how="left",
    left_on=['cusip', 'monthly_date'], right_on=['cusip', 'datadate']
).rename(columns={'gvkey_x': 'gvkey','market_cap_x':'market_cap','datadate_x':'datadate'}).drop(columns=['gvkey_y','market_cap_y','datadate_y'])

# Address any NAs left after the join.
join_final = _fill_numeric_within_cusip(_sort_by_cusip_and_date(join_final))

In [None]:
#join_final

In [None]:
# Sanity check: row count and missing values per column of the joined table.
print(len(stackedjoin))
stackedjoin.isna().sum()
# Releasing the frame is currently disabled:
#stackedjoin=None

In [None]:
# Total rows versus the number of rows labelled outperformed == 1.
print(len(join_final),sum(join_final['outperformed']))

In [None]:
# Force a garbage-collection pass and report how many unreachable objects it found.
n = gc.collect()
print("Number of unreachable objects collected by GC:", n)

In [None]:
%%time
# Save the updated SPY prices (with volatility and filled numeric columns).
s3f.putstockprices(spyprices_df,'SPY')
# Save all derived tables into the summaries:
# quarter+annual fundamentals, ratios+prices+SPY, and the final full join.
s3f.putsummaryquarter_annual(stackedquarter_annual)
s3f.putsummaryallratioprices(join_spy)
s3f.putsummaryallratiopricesfinancial(join_final)
# No-op; presumably keeps the cell from echoing the last call's return value.
pass

In [None]:
# print(len(stackedquarter_annual))
# stackedquarter_annual.isna().sum()

In [None]:
# print(len(join_spy))
# join_spy.isna().sum()

In [None]:
# print(len(join_final))
# print(join_final.isna().sum().to_string())
# print(len(join_final.dropna()))