In [1]:
import pandas as pd
import numpy as np
import wrds
from enum import Enum, IntEnum

In [2]:
conn = wrds.Connection()

Enter your WRDS username [harper]:yuchengxu
Enter your password:········
WRDS recommends setting up a .pgpass file.
Create .pgpass file now [y/n]?: n
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done


In [3]:
begdate = '2015-01-01'
enddate = '2022-12-30'

#### get price volume data

In [4]:
sp500 = conn.raw_sql(f"""
                        select a.*, b.date
                        from crsp.msp500list as a,
                        crsp.msf as b
                        where a.permno=b.permno
                        and b.date >= a.start and b.date<= a.ending
                        and b.date>='{begdate}'
                        and b.date<='{enddate}'
                        order by date;
                        """, date_cols=['start', 'ending', 'date'])

In [5]:
mse = conn.raw_sql("""
                        select comnam, namedt, nameendt, 
                        permno, ticker
                        from crsp.msenames
                        """, date_cols=['namedt', 'nameendt'])

# if nameendt is missing then set to today date
mse['nameendt']=mse['nameendt'].fillna(pd.to_datetime('today'))

In [6]:
# Merge with SP500 data
sp500_full = pd.merge(sp500, mse, how = 'left', on = 'permno')

# Impose the date range restrictions
sp500_full = sp500_full.loc[(sp500_full.date>=sp500_full.namedt) \
                            & (sp500_full.date<=sp500_full.nameendt)]

In [7]:
sp500 = sp500_full[['permno', 'comnam', 'ticker']].drop_duplicates()
sp500 = sp500.drop_duplicates(subset=['permno'])
sp500

Unnamed: 0,permno,comnam,ticker
3,10104.0,ORACLE CORP,ORCL
5,10107.0,MICROSOFT CORP,MSFT
10,10138.0,T ROWE PRICE GROUP INC,TROW
25,10145.0,HONEYWELL INTERNATIONAL INC,HON
35,10147.0,E M C CORP MA,EMC
...,...,...,...
331967,86288.0,COSTAR GROUP INC,CSGP
332913,16581.0,INVITATION HOMES INC,INVH
336895,12476.0,TARGA RESOURCES CORP,TRGP
341383,82276.0,ARCH CAPITAL GROUP LTD NEW,ACGL


In [8]:
permno_list = sp500['permno'].astype('int').to_list()
formatted_permnos = ",".join(["'{}'".format(permno) for permno in permno_list])

In [9]:
query = f"""
SELECT 
    permno, date, prc, vol, openprc, askhi, bidlo, ret, shrout
FROM 
    crsp_a_stock.dsf 
WHERE 
    permno IN ({formatted_permnos}) AND date BETWEEN '{begdate}' AND '{enddate}'
"""

price_vol = conn.raw_sql(query)

In [10]:
price_vol['date'] = pd.to_datetime(price_vol['date'])
price_vol['yyyy-mm'] = price_vol['date'].dt.strftime('%Y-%m')
price_vol

Unnamed: 0,permno,date,prc,vol,openprc,askhi,bidlo,ret,shrout,yyyy-mm
0,10104.0,2015-01-02,44.330002,15070165.0,45.020000,45.189800,43.970001,-0.014232,4391367.0,2015-01
1,10104.0,2015-01-05,43.590000,18375801.0,44.160000,44.250000,43.580002,-0.013986,4391367.0,2015-01
2,10104.0,2015-01-06,43.139999,19229461.0,44.060001,44.180000,42.990002,-0.010323,4391367.0,2015-01
3,10104.0,2015-01-07,43.150002,13502164.0,43.330002,43.520000,43.009998,0.000232,4391367.0,2015-01
4,10104.0,2015-01-08,43.410000,17516877.0,43.630001,43.939999,43.380001,0.006025,4391367.0,2015-01
...,...,...,...,...,...,...,...,...,...,...
221715,93436.0,2022-12-23,123.150002,166917212.0,126.370003,128.617294,121.019997,-0.017551,3157752.0,2022-12
221716,93436.0,2022-12-27,109.099998,208446557.0,117.495003,119.669998,108.760002,-0.114089,3157752.0,2022-12
221717,93436.0,2022-12-28,112.709999,220818648.0,110.349998,116.269997,108.239998,0.033089,3157752.0,2022-12
221718,93436.0,2022-12-29,121.820000,221592714.0,120.385002,123.570000,117.495003,0.080827,3157752.0,2022-12


In [11]:
query = f"""
SELECT 
    DISTINCT permno, gics, date
FROM 
    contrib_global_factor.global_factor
WHERE
    permno IN ({formatted_permnos}) AND date BETWEEN '{begdate}' AND '{enddate}'
"""

gics = conn.raw_sql(query)

In [12]:
gics['date'] = pd.to_datetime(gics['date'])
gics['yyyy-mm'] = gics['date'].dt.strftime('%Y-%m')
gics.drop(columns='date', inplace=True)
gics

Unnamed: 0,permno,gics,yyyy-mm
0,87717.0,20301010.0,2019-12
1,19502.0,30101010.0,2020-06
2,44329.0,35101010.0,2021-07
3,60442.0,40101015.0,2018-09
4,24328.0,10102020.0,2015-07
...,...,...,...
58326,92157.0,45203020.0,2018-12
58327,58246.0,40203010.0,2021-07
58328,10516.0,30202010.0,2015-10
58329,89641.0,45202030.0,2015-12


In [13]:
price_vol = pd.merge(price_vol, gics, how='left', on=['permno', 'yyyy-mm'])
price_vol = pd.merge(price_vol, sp500, how='left', on=['permno'])
price_vol = price_vol.set_index(['date', 'permno'])
price_vol

Unnamed: 0_level_0,Unnamed: 1_level_0,prc,vol,openprc,askhi,bidlo,ret,shrout,yyyy-mm,gics,comnam,ticker
date,permno,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-01-02,10104.0,44.330002,15070165.0,45.020000,45.189800,43.970001,-0.014232,4391367.0,2015-01,45103020.0,ORACLE CORP,ORCL
2015-01-05,10104.0,43.590000,18375801.0,44.160000,44.250000,43.580002,-0.013986,4391367.0,2015-01,45103020.0,ORACLE CORP,ORCL
2015-01-06,10104.0,43.139999,19229461.0,44.060001,44.180000,42.990002,-0.010323,4391367.0,2015-01,45103020.0,ORACLE CORP,ORCL
2015-01-07,10104.0,43.150002,13502164.0,43.330002,43.520000,43.009998,0.000232,4391367.0,2015-01,45103020.0,ORACLE CORP,ORCL
2015-01-08,10104.0,43.410000,17516877.0,43.630001,43.939999,43.380001,0.006025,4391367.0,2015-01,45103020.0,ORACLE CORP,ORCL
...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-23,93436.0,123.150002,166917212.0,126.370003,128.617294,121.019997,-0.017551,3157752.0,2022-12,25102010.0,TESLA INC,TSLA
2022-12-27,93436.0,109.099998,208446557.0,117.495003,119.669998,108.760002,-0.114089,3157752.0,2022-12,25102010.0,TESLA INC,TSLA
2022-12-28,93436.0,112.709999,220818648.0,110.349998,116.269997,108.239998,0.033089,3157752.0,2022-12,25102010.0,TESLA INC,TSLA
2022-12-29,93436.0,121.820000,221592714.0,120.385002,123.570000,117.495003,0.080827,3157752.0,2022-12,25102010.0,TESLA INC,TSLA


#### Signal

Price volume data should have index date, permno.

In [14]:
def future_retnd(data, col, hzn):
    return (data[col].unstack().shift(-hzn) / data[col].unstack() - 1).stack()

def signal_mean(data, col, hzn):
    return data[col].unstack().fillna(method = 'ffill').rolling(hzn).mean().stack()

def signal_max(data, col, hzn):
    return data[col].unstack().fillna(method = 'ffill').rolling(hzn).max().stack()

def signal_min(data, col, hzn):
    return data[col].unstack().fillna(method = 'ffill').rolling(hzn).min().stack()

def signal_std(data, col, hzn):
    return data[col].unstack().fillna(method = 'ffill').rolling(hzn).std().stack()

def signal_sum(data, col, hzn):
    return data[col].unstack().fillna(0).rolling(hzn).sum().fillna(0).stack()

def signal_change(data, col, hzn):
    signal = data[col].unstack().pct_change(hzn).stack()
    signal = signal.replace([np.inf, -np.inf], 0)
    return signal

def signal_ratio(data, col, short_hzn, long_hzn):
    return signal_mean(data, col, short_hzn) / signal_mean(data, col,long_hzn)

def signal_RSI(data, col, hzn):
    ret = data[col].unstack()
    pos_ret = pd.DataFrame(np.where(data[col].unstack() > 0, data[col].unstack(), 0), index=ret.index, columns=ret.columns).fillna('ffill').rolling(hzn).mean()
    neg_ret = pd.DataFrame(np.where(data[col].unstack() < 0, data[col].unstack(), 0), index=ret.index, columns=ret.columns).fillna('ffill').rolling(hzn).mean()
    RSI = 100 - 100 / (1 + pos_ret / neg_ret)
    return RSI.stack()

def signal_vwap_change(data, short_hzn, long_hzn):
    vwap_long = signal_sum(data, 'tvr', long_hzn) / signal_sum(data, 'vol', long_hzn)
    vwap_short = signal_sum(data, 'tvr', short_hzn) / signal_sum(data, 'vol', short_hzn)
    return vwap_short / vwap_long

def signal_min_max_range(data, col, short_hzn, long_hzn):
    return (signal_mean(data, col, short_hzn) - signal_min(data, col,long_hzn)) / (signal_max(data, col,long_hzn) - signal_min(data, col,long_hzn))


In [15]:
class CrossLevel(IntEnum):
    ALL = 1
    IND1 = 2
    IND2 = 3


def cross_rank(data: pd.DataFrame, signal_name: str, level: CrossLevel):
    x = data.copy()
    group_columns = ["date"]
    if level == CrossLevel.IND1:
        x['group_id'] = x['gics']%100
        group_columns.append("group_id")
    elif level == CrossLevel.IND2:
        x['group_id'] = x['gics']%10000
        group_columns.append("group_id")
    return x.groupby(group_columns)[signal_name].rank(pct=True)

In [16]:
price_vol['future_ret1d'] = future_retnd(price_vol, 'prc', 1)

In [17]:
price_vol["tvr"] = price_vol["vol"] * price_vol["prc"]
price_vol['s1'] = signal_change(price_vol, 'prc', 1)  # return
price_vol['s2'] = signal_mean(price_vol, 's1', 3)   # average return over past 3 days
price_vol['s3'] = signal_mean(price_vol, 's1', 15)   # average return over past 10 days
price_vol['s4'] = price_vol['s2'] - price_vol['s3']  # MACD
price_vol['s5'] = signal_std(price_vol, 's1', 10)  # volatility over past 10 days
price_vol['s6'] = signal_std(price_vol, 's1', 20)  # volatility over past 20 days
price_vol['s7'] = signal_RSI(price_vol, 's1', 10)  # RSI
price_vol['s8'] = signal_vwap_change(price_vol, 1, 3)  # vwap change
price_vol['s9'] = signal_vwap_change(price_vol, 3, 20) # vwap change
price_vol['s10'] = signal_ratio(price_vol, 'tvr', 1, 30) # turnover ratio
price_vol['s11'] = signal_min_max_range(price_vol, 'tvr', 2, 30)  # turnover min max ratio

In [18]:
signals = ['s1', 's2',  's4', 's8']
for level in [CrossLevel.ALL]:
    print('Generating', level)
    for f in signals:
        price_vol[f"{f}_cr{int(level)}"] = cross_rank(price_vol, f, level)

Generating CrossLevel.ALL


In [19]:
signals = [i for i in price_vol.columns if i.startswith('s') and i!='shrout']
labels = ['future_ret1d']

In [20]:
price_vol[signals + labels].corr('spearman')

Unnamed: 0,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s1_cr1,s2_cr1,s4_cr1,s8_cr1,future_ret1d
s1,1.0,0.519862,0.219453,0.45971,0.006294,0.00363,0.115091,0.862748,0.115044,0.014106,0.004424,0.773659,0.412466,0.357566,0.670758,-0.019908
s2,0.519862,1.0,0.396415,0.862132,0.004629,0.00876,0.21536,0.707546,0.413987,-0.00671,-0.003502,0.416466,0.790426,0.676358,0.561526,-0.019105
s3,0.219453,0.396415,1.0,-0.028695,-0.093829,-0.03472,0.364481,0.288324,0.907011,0.007311,-0.000493,0.182904,0.327779,-0.045035,0.23873,-0.017906
s4,0.45971,0.862132,-0.028695,1.0,0.035665,0.0119,0.073839,0.62291,0.026103,-0.016204,-0.009119,0.363908,0.682276,0.79958,0.4905,-0.009704
s5,0.006294,0.004629,-0.093829,0.035665,1.0,0.867296,-0.04778,0.007523,-0.112932,0.083176,0.072089,-0.010456,-0.014133,0.001434,-0.014895,0.013174
s6,0.00363,0.00876,-0.03472,0.0119,0.867296,1.0,-0.030879,0.004951,-0.051912,-0.009968,-0.039033,-0.010551,-0.012169,-0.002865,-0.014577,0.008367
s7,0.115091,0.21536,0.364481,0.073839,-0.04778,-0.030879,1.0,0.148907,0.419495,-0.028351,-0.025767,0.092485,0.17283,0.050455,0.118697,-0.010973
s8,0.862748,0.707546,0.288324,0.62291,0.007523,0.004951,0.148907,1.0,0.192646,0.005218,0.003428,0.674076,0.556776,0.481655,0.779185,-0.019055
s9,0.115044,0.413987,0.907011,0.026103,-0.112932,-0.051912,0.419495,0.192646,1.0,0.000742,-0.004379,0.102219,0.346319,0.008392,0.163958,-0.016386
s10,0.014106,-0.00671,0.007311,-0.016204,0.083176,-0.009968,-0.028351,0.005218,0.000742,1.0,0.777654,0.047936,0.054306,0.021483,0.050485,-0.003435
