# Tutorial for the Data API

This notebook is a tutorial on how to read the data stored in the `data` directory.
The reader functions have **very precise type annotations**, which help avoid *many* potential bugs and improve developer productivity and efficiency.

In [1]:
from data import *
import pandas as pd

## August 2023 Summary Statistics

This parses the `August2023Summary.csv` file.

In [2]:
# Load the records parsed from `August2023Summary.csv`.
summary = read_august_2023_summary()
# Bare lookup whose value is discarded (only the last expression of a cell is displayed).
# It exists as a type-annotation demo: change the "before_sample" key and see how the
# annotations respond (presumably a type checker flags unknown keys; confirm in `data`).
summary[0]["before_sample"]
# Render the records as a table.
pd.DataFrame(summary)

Unnamed: 0,implementation,before_sample,in_sample,post_sample_but_before_pub,post_pub,last_5_years,year_2022
0,Original Paper,0.55,0.67,0.42,0.32,0.36,0.66
1,HoldPer_1,0.53,0.72,0.49,0.32,0.19,0.8
2,HoldPer_3,0.36,0.62,0.4,0.32,0.38,0.75
3,HoldPer_6,0.3,0.56,0.33,0.28,0.33,0.75
4,HoldPer_12,0.24,0.47,0.26,0.23,0.28,0.62
5,ME_gt_NYSE20pct,0.28,0.49,0.26,0.23,0.26,0.76
6,NYSEonly,0.38,0.45,0.21,0.27,0.3,0.75
7,Price_gt_5,0.34,0.55,0.31,0.23,0.25,0.54
8,VWforce,0.45,0.43,0.26,0.16,0.15,0.58
9,Quintiles (cts only),0.43,0.62,0.39,0.29,0.31,0.66


## Daily Port Summary

This parses the `DailyPortSummary.csv` file.

In [3]:
# Read the records parsed from `DailyPortSummary.csv` and render them as a table.
pd.DataFrame(DailyPortfolios.read_summary())

Unnamed: 0,implementation,port,n_distinct,mean_nobs_years,mean_rbar_monthly
0,CTS_PREDICTOR_DECILE,port01,179,68.388603,0.684372
1,CTS_PREDICTOR_DECILE,port02,177,67.366893,0.777174
2,CTS_PREDICTOR_DECILE,port03,177,67.76174,0.923449
3,CTS_PREDICTOR_DECILE,port04,177,67.205356,0.981265
4,CTS_PREDICTOR_DECILE,port05,178,66.335573,1.016766
5,CTS_PREDICTOR_DECILE,port06,178,66.412831,1.058475
6,CTS_PREDICTOR_DECILE,port07,178,66.909416,1.091068
7,CTS_PREDICTOR_DECILE,port08,179,67.200715,1.118744
8,CTS_PREDICTOR_DECILE,port09,179,67.690168,1.177964
9,CTS_PREDICTOR_DECILE,port10,179,68.373453,1.201524


## Signal Doc

This parses the `SignalDoc.csv` file.

In [4]:
# Read the records parsed from `SignalDoc.csv` and render them as a table.
pd.DataFrame(read_signal_doc())

Unnamed: 0,acronym,cat_signal,op_predictability,signal_rep_quality,authors,year,long_description,journal,cat_form,cat_data,...,return_val,t_stat,stock_weight,ls_quantile,quantile_filter,portfolio_period,start_month,applied_filter,notes,detailed_definition
0,AbnormalAccruals,PREDICTOR,CLEAR,FAIR,Xie,2001,Abnormal Accruals,AR,CONTINUOUS,ACCOUNTING,...,0.916667,8.43,EW,ONE_TENTH,,ANNUALLY,JUNE,,OP is aggressive and lags accounting data by o...,Define Accruals as net income (ib) minus opera...
1,Accruals,PREDICTOR,CLEAR,GOOD,Sloan,1996,Accruals,AR,CONTINUOUS,ACCOUNTING,...,0.866667,4.71,EW,ONE_TENTH,,ANNUALLY,JUNE,abs(prc)>5,Table 6 year t+1 hedge. Only size adjusted an...,Annual change in current total assets (act) mi...
2,AccrualsBM,PREDICTOR,CLEAR,GOOD,Bartov and Kim,2004,Book-to-market and accruals,RFQA,DISCRETE,ACCOUNTING,...,0.206000,5.50,EW,TWO_TENTHS,,ANNUALLY,JUNE,,,Binary variable equal to 1 if stock is in the ...
3,Activism1,PREDICTOR,CLEAR,GOOD,Cremers and Nair,2005,Takeover vulnerability,JF,CONTINUOUS,THIRTEEN_F,...,0.902500,3.13,VW,ONE_QUARTER,,MONTHLY,JUNE,,works a bit better EW in Tab 3,24 minus Governance Index (G). Set to missing ...
4,AM,PREDICTOR,CLEAR,GOOD,Fama and French,1992,Total assets to market,JF,CONTINUOUS,ACCOUNTING,...,,5.69,EW,,,ANNUALLY,JUNE,,,Total assets (at) divided by market value of e...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326,SP_q,PLACEBO,INDIRECT,,"Barbee, Mukherji and Raines",1996,Sales-to-price quarterly,FAJ,CONTINUOUS,ACCOUNTING,...,,,,,,,,,,Ratio of annual sales (sale) to market value o...
327,tang_q,PLACEBO,INDIRECT,,Hahn and Lee,2009,Tangibility quarterly,JF,CONTINUOUS,ACCOUNTING,...,,,,,,,,,,Cash and short-term investments (che) plus .71...
328,Tax_q,PLACEBO,INDIRECT,,Lev and Nissim,2004,Taxable income to income (qtrly),AR,CONTINUOUS,ACCOUNTING,...,,,,,,,,,,Ratio of Taxes paid and tax share of net incom...
329,WW_Q,PLACEBO,INDIRECT,,Whited and Wu,2006,Whited-Wu index,RFS,CONTINUOUS,ACCOUNTING,...,,,,,,,,,"Insignificant in original paper,",Group data by 3 digit SIC code and month to co...


## Predictor

This parses the data for a single predictor, identified by its short name.

In [5]:
# `read_predictor` returns a pair: the dated records and a tuple of port names.
# NOTE(review): `vw=False` presumably selects the non-value-weighted variant; confirm in `data`.
values, ports = DailyPortfolios.read_predictor("BidAskSpread", vw=False)
# Print the port names; in the recorded output, table columns for ports that are
# not listed in this tuple are empty.
print(ports)
# Render the records as a table.
pd.DataFrame(values)

('port01', 'port02', 'port03', 'port04', 'port05', 'port06', 'portLS')


Unnamed: 0,date,port01,port02,port03,port04,port05,port06,port07,port08,port09,port10,portLS
0,1926-02-01,-0.174025,-0.354044,-0.443207,-0.368607,-0.545103,0.348814,,,,,0.522839
1,1926-02-02,0.529962,0.606180,0.612349,0.544363,0.402014,0.510969,,,,,-0.018993
2,1926-02-03,0.602922,0.439322,0.487145,0.350119,0.830027,1.000290,,,,,0.397368
3,1926-02-04,0.480899,0.315830,0.618086,0.682094,0.612923,0.112668,,,,,-0.368231
4,1926-02-05,0.073132,-0.163043,-0.051685,0.215276,0.220188,0.867627,,,,,0.794495
...,...,...,...,...,...,...,...,...,...,...,...,...
25518,2022-12-23,0.111071,0.325080,0.428410,0.615383,0.251195,-0.289216,,,,,-0.400287
25519,2022-12-27,-0.048859,-0.044397,-0.074606,-0.128443,-1.078780,-2.838162,,,,,-2.789303
25520,2022-12-28,-0.418795,-0.913687,-1.098363,-1.338935,-1.248627,-1.044593,,,,,-0.625797
25521,2022-12-29,0.590091,1.310446,1.417995,1.685517,2.933759,3.909154,,,,,3.319063


In [6]:
# Same predictor as the `vw=False` call, this time with `vw=True`; rebinds `values` and `ports`.
# NOTE(review): `vw=True` presumably selects the value-weighted variant; confirm in `data`.
values, ports = DailyPortfolios.read_predictor("BidAskSpread", vw=True)
# Print the port names; in the recorded output, table columns for ports that are
# not listed in this tuple are empty.
print(ports)
# Render the records as a table.
pd.DataFrame(values)

('port01', 'port02', 'port03', 'port04', 'port05', 'port06', 'portLS')


Unnamed: 0,date,port01,port02,port03,port04,port05,port06,port07,port08,port09,port10,portLS
0,1926-02-01,-0.132456,-0.473825,-0.586829,-0.017379,-0.432544,-0.515035,,,,,-0.382580
1,1926-02-02,0.486124,0.686540,0.577793,0.547870,0.502511,0.492223,,,,,0.006099
2,1926-02-03,0.446565,0.479530,0.435945,0.754013,0.828716,0.107086,,,,,-0.339479
3,1926-02-04,0.468391,0.506404,0.393217,0.882955,0.529893,-0.131754,,,,,-0.600146
4,1926-02-05,-0.026952,-0.292613,0.077437,0.145242,-0.065884,0.756747,,,,,0.783699
...,...,...,...,...,...,...,...,...,...,...,...,...
25518,2022-12-23,-0.034800,0.396762,0.360643,0.724822,0.633322,-0.970691,,,,,-0.935892
25519,2022-12-27,-0.176954,-0.097301,-0.272677,-0.140204,-1.027927,-3.185845,,,,,-3.008891
25520,2022-12-28,-0.115678,-0.968472,-1.298265,-1.221485,-1.431261,-0.389858,,,,,-0.274179
25521,2022-12-29,0.260518,1.463701,1.852204,1.525173,2.534998,4.734355,,,,,4.473837


In [7]:
# Call `read_cts_predictor` once for every combination of `frequency`
# ("decile", "quintile") and `vw` (False, True). Each result is bound to the
# throwaway name `_`, so the cell displays nothing.
_ = DailyPortfolios.read_cts_predictor("BidAskSpread", frequency="decile", vw=False)
_ = DailyPortfolios.read_cts_predictor("BidAskSpread", frequency="decile", vw=True)
_ = DailyPortfolios.read_cts_predictor("BidAskSpread", frequency="quintile", vw=False)
_ = DailyPortfolios.read_cts_predictor("BidAskSpread", frequency="quintile", vw=True)
# Drop the throwaway binding so it does not linger in the kernel namespace.
# NOTE(review): IPython also uses `_` for the previous output; presumably this
# `del` hands the name back to it. Confirm against the IPython docs.
del _

In [8]:
# Load all predictor portfolio records in one call. The result is kept in a
# variable because the following cell looks up a signal name from it.
all_predictor_ports = read_predictor_ports_full()
# Render the records as a table.
pd.DataFrame(all_predictor_ports)

Unnamed: 0,signal_name,port,date,ret,signal_lag,n_long,n_short
0,AM,port01,1951-07-31,7.661648,0.681619,67,0
1,AM,port01,1951-08-31,4.273648,0.639816,67,0
2,AM,port01,1951-09-28,1.315270,0.617079,67,0
3,AM,port01,1951-10-31,-3.942980,0.612266,67,0
4,AM,port01,1951-11-30,1.028670,0.637710,67,0
...,...,...,...,...,...,...,...
1183281,zerotradeAlt12,portLS,2022-08-31,-1.467892,,491,492
1183282,zerotradeAlt12,portLS,2022-09-30,4.607782,,490,491
1183283,zerotradeAlt12,portLS,2022-10-31,-5.818550,,492,492
1183284,zerotradeAlt12,portLS,2022-11-30,-2.581228,,491,492


In [9]:
# Take the signal name of the first record in `all_predictor_ports` and read that
# predictor with `vw=True`. This rebinds `values` from the earlier cells.
values, all_ports = DailyPortfolios.read_predictor(all_predictor_ports[0]["signal_name"], vw=True)
# Print the port names returned for this predictor.
print(all_ports)
# Render the records as a table.
pd.DataFrame(values)

('port01', 'port02', 'port03', 'port04', 'port05', 'portLS')


Unnamed: 0,date,port01,port02,port03,port04,port05,port06,port07,port08,port09,port10,portLS
0,1951-07-02,0.437656,0.751156,0.695964,0.483364,1.055240,,,,,,0.617584
1,1951-07-03,0.820599,0.450872,0.730964,0.701926,1.446899,,,,,,0.626300
2,1951-07-05,1.592783,1.939588,1.991927,1.007977,2.560996,,,,,,0.968213
3,1951-07-06,0.048693,0.082860,-0.352517,-0.007209,0.062074,,,,,,0.013380
4,1951-07-09,0.517417,0.223910,0.325406,0.047123,0.075076,,,,,,-0.442340
...,...,...,...,...,...,...,...,...,...,...,...,...
18024,2022-12-23,0.177421,0.809064,1.002917,0.938002,0.632599,,,,,,0.455179
18025,2022-12-27,-0.918393,-0.071020,0.172570,0.150531,-0.023797,,,,,,0.894595
18026,2022-12-28,-1.185585,-1.333443,-1.627837,-1.292865,-0.619723,,,,,,0.565863
18027,2022-12-29,2.234841,1.507996,1.489975,1.610972,1.584760,,,,,,-0.650081


In [10]:
# Read the individual predictor "BM" via `FirmLevelCharacteristics` and render the records as a table.
pd.DataFrame(FirmLevelCharacteristics.read_individual_predictor("BM"))

Unnamed: 0,permno,yyyymm,value
0,10000,198704,-2.895161
1,10000,198705,-2.895161
2,10001,198612,-0.104958
3,10001,198701,-0.104958
4,10001,198702,-0.104958
...,...,...,...
2636825,93436,202208,-3.647020
2636826,93436,202209,-3.647020
2636827,93436,202210,-3.647020
2636828,93436,202211,-3.647020
