In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Reload all modules imported with %aimport
%load_ext autoreload
%autoreload 1

In [2]:
import pandas as pd
idx = pd.IndexSlice

import datetime as dt
from datetime import date
from datetime import timedelta
import dateutil.parser as dup

%aimport trans.data
%aimport trans.gtrans
%aimport trans.reg
%aimport trans.regpipe

from trans.data import GetData
gd = GetData()
from trans.gtrans import *
from trans.reg import Reg, RegAttr
from trans.regpipe import RegPipe

## Verify prices (GetDataTransformer)

In [36]:
def verify_df(df, v_df, cols=None, debug=False, **params):
    """
    Compare a DataFrame against a verified reference DataFrame.

    Only the rows of `df` whose index labels lie between the smallest and
    largest index labels of `v_df` are compared. The comparison uses
    DataFrame.equals, i.e. it is an exact comparison, not a tolerance-based one.

    Parameters:
    --------------
    df: DataFrame. The frame under test.
    v_df: DataFrame. The verified reference frame.
    cols: optional column selector (list, Index, array, ...). When given, only
        these columns are compared; when None, all columns of v_df are compared.
    debug: Boolean. When True, print the reference frame's index range and
        shape and the columns of both frames.
    **params: accepted and ignored, so callers may pass extra keyword arguments.

    Side effects
    -------------
    Writes v_df to /tmp/verify.csv on every call (for hand-verification).

    Returns
    --------
    Boolean
    """
    min_d, max_d = v_df.index.min(), v_df.index.max()
    if debug:
        print("Verified df ({}, {}), shape {}".format(min_d, max_d, v_df.shape))
        print(df.columns)
        print(v_df.columns)

    # Output the verified df to a csv for hand-verification
    v_df.to_csv("/tmp/verify.csv")

    # Identity test, not `== None`: comparing a pandas Index / numpy array to
    # None is elementwise, and `not <array>` raises ValueError for more than
    # one element.
    if cols is not None:
        return df.loc[min_d:max_d, cols].equals(v_df.loc[:, cols])

    return df.loc[min_d:max_d, v_df.columns].equals(v_df.loc[:, :])

    
def verify_file(df, verified_df_file, cols=None, debug=False,**params):
    """
    Compare DataFrame to one that is stored in a file

    The reference frame is loaded through the notebook-level `gd` object
    (gd.load_data) and the comparison itself is delegated to verify_df.

    Parameters:
    --------------
    df: DataFrame
    verified_df_file: string. Name of pkl file containing verified DataFrame
    cols: optional column selector, forwarded to verify_df
    debug: Boolean, forwarded to verify_df
    **params: extra keyword arguments, forwarded to verify_df

    Returns
    --------
    Boolean
    """
    # NOTE(review): the file is described as a pkl file; if load_data unpickles
    # it, only load files from a trusted location -- confirm.
    v_df = gd.load_data(verified_df_file)

    # Forward the options: previously they were accepted but silently dropped,
    # so cols/debug had no effect when passed to verify_file.
    return verify_df(df, v_df, cols=cols, debug=debug, **params)
   


In [4]:
# Tickers to load; SPY comes first and is also passed below as `cal_ticker`.
sector_tickers = [
    "SPY",
    "XLY", "XLP", "XLE", "XLF", "XLV", "XLI",
    "XLB", "XLRE", "XLK", "XTL", "XLU",
]

# The transformer is fed an empty DataFrame as its input.
price_df = GetDataTransformer(sector_tickers, cal_ticker="SPY").fit_transform(pd.DataFrame())


In [37]:
# Compare the freshly built price frame with the stored reference frame.
verify_file( price_df, "verify_sectors_raw_df.pkl")

True

## Verify returns (pctTrans)

In [6]:
# Three-stage pipeline: select the 'Adj Close' attribute, apply pctTrans,
# then rename every level-0 label to "Pct".
pipe_pct = make_pipeline(
    GenSelectAttrsTransformer(['Adj Close'], dropSingle=False),
    pctTrans,
    GenRenameAttrsTransformer(lambda col: "Pct", level=0),
)
pct_df = pipe_pct.fit_transform(price_df)

transform: pandas version <= 0.20.


In [39]:
# Compare the pipeline output with the stored reference frame.
verify_file(pct_df, "verify_sectors_pct_df.pkl")

True

## Verify single regression

In [8]:
# Stored regression parameters; pull out the four values used by the cells below.
regParams = gd.load_data("verify_regParams.pkl")
start, end, step, window = (regParams[key] for key in ("start", "end", "step", "window"))


In [18]:
# Start of the regression period: one `window` back from `end`, plus one day.
regStarts = end - window + timedelta(days=1)
# Bare expression: displayed as cell output (ast_node_interactivity is "all").
regStarts, end
# Restrict the returns to the rows labelled regStarts..end.
pct_dfs = pct_df.loc[ regStarts:end,:]

# Regression with ("Pct", "SPY") as the only independent column.
# NOTE(review): judging by the printed "depCol" lines, regressSingle fits one
# regression per remaining column -- confirm against trans.regpipe.
rps = RegPipe( pct_dfs )
rps.indCols( [ idx["Pct", "SPY"] ] )
rps.regressSingle()

# Displayed as cell output.
rps.beta_df.shape

(datetime.datetime(2017, 6, 30, 0, 0), datetime.datetime(2017, 12, 29, 0, 0))

IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLB'), cols [('Pct', 'SPY'), ('Pct', 'XLB')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLE'), cols [('Pct', 'SPY'), ('Pct', 'XLE')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLF'), cols [('Pct', 'SPY'), ('Pct', 'XLF')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLI'), cols [('Pct', 'SPY'), ('Pct', 'XLI')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLK'), cols [('Pct', 'SPY'), ('Pct', 'XLK')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLP'), cols [('Pct', 'SPY'), ('Pct', 'XLP')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLRE'), cols [('Pct', 'SPY'), ('Pct', 'XLRE')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLU'), cols [('Pct', 'SPY'), ('Pct', 'XLU')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLV'), cols [('Pct', 'SPY'), ('Pct', 'XLV')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLY'), cols [('Pct', 'SPY'), ('Pct', 'XLY')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XTL'), cols [('Pct', 'SPY'), ('Pct', 'XTL')]


(1, 22)

In [40]:
# Compare the fitted beta frame with the stored reference frame.
verify_file( rps.beta_df, "verify_beta_df.pkl")

True

## Continuation: Verify residuals of single regression

In [12]:
# Settings handed to attrib_setup below.
# NOTE(review): their exact meaning is defined in trans.regpipe -- not visible here.
rollAmount = 0
fillMethod = "bfill"

# Attribution step, fed the same window of returns and the betas fitted above.
rps.attrib_setup(pct_dfs, rps.beta_df, rollAmount, fillMethod)
rps.attrib()

# Displayed as cell output.
rps.retAttr_df.shape

# Keep only the columns whose first-level label is "Error"
# (presumably the regression residuals, going by the variable name).
sector_residuals = rps.retAttr_df.loc[:, idx["Error",:]]

(127, 44)

In [41]:
# Compare the "Error" columns with the stored reference frame.
verify_file( sector_residuals, "sector_residuals.pkl")

True

## Verify stacked residual

In [30]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so dependencies are visible in one place.
from trans.stack_residual import Residual

# Build the stack from the full pct_df, reusing end/window/step loaded from
# verify_regParams.pkl and an explicit start date.
resStart = dup.parse("01/01/2016")
rstack = Residual(debug=True)
rstack.init(df=pct_df, start=resStart, end=end, window=window, step=step)
# The result is indexed and unpacked as (label, DataFrame) pairs in later cells.
resid_stack = rstack.repeated()
rstack.done()

nextChunk for period 2017-06-30 00:00:00 to 2017-12-29 00:00:00 shape: (127, 12)
repeated: chunk label 2017-12-29 00:00:00 with shape (127, 12)
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLB'), cols [('Pct', 'SPY'), ('Pct', 'XLB')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLE'), cols [('Pct', 'SPY'), ('Pct', 'XLE')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLF'), cols [('Pct', 'SPY'), ('Pct', 'XLF')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLI'), cols [('Pct', 'SPY'), ('Pct', 'XLI')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLK'), cols [('Pct', 'SPY'), ('Pct', 'XLK')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLP'), cols [('Pct', 'SPY'), ('Pct', 'XLP')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLRE'), cols [('Pct', 'SPY'), ('Pct', 'XLRE')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLU'), cols [('Pct', 'SPY'), ('Pct', 'XLU')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLV'), cols [('Pct', 'SPY'), ('Pct', 'XLV')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLY'), cols [('Pct'

In [32]:
# Stored reference stack; later cells unpack its elements as (label, DataFrame) pairs.
v_stack = gd.load_data("verify_resid_stack.pkl")


list

### Verify single regression matches first element of stack

In [72]:
# The residuals from the single regression should equal the first reference element.
(v_label, v_df) = v_stack[0]
verify_df(sector_residuals, v_df)

True

## Verify first element of stack

In [51]:
# NOTE(review): v_df is not assigned in this cell; it must still hold the
# value set from v_stack[0] in the previous cell, so the result depends on
# execution order.
(label, df) = resid_stack[0]
verify_df(df, v_df)

True

## Verify second element of stack

In [53]:
# Second element: computed stack entry vs. reference stack entry.
# Rebinds v_label/v_df and label/df from the previous cells.
(v_label, v_df) = v_stack[1]
(label, df) = resid_stack[1]
verify_df(df, v_df)


True

Timestamp('2017-06-02 00:00:00')

### Manually carry out the second single regression so that beta0 and beta1 can be compared by hand against a spreadsheet

In [66]:
# Second period: shift the end date back by one `step`, then derive the start
# exactly as for the first period (one `window` back, plus one day).
end2 = end - step
regStarts2 = end2 - window + timedelta(days=1)
# Displayed as cell output.
regStarts2, end2

(datetime.datetime(2017, 6, 2, 0, 0), datetime.datetime(2017, 12, 1, 0, 0))

In [71]:

# Same steps as the first single regression, applied to the second period.
pct_dfs2 = pct_df.loc[ regStarts2:end2,:]

rps2 = RegPipe( pct_dfs2 )
rps2.indCols( [ idx["Pct", "SPY"] ] )
rps2.regressSingle()


IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLB'), cols [('Pct', 'SPY'), ('Pct', 'XLB')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLE'), cols [('Pct', 'SPY'), ('Pct', 'XLE')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLF'), cols [('Pct', 'SPY'), ('Pct', 'XLF')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLI'), cols [('Pct', 'SPY'), ('Pct', 'XLI')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLK'), cols [('Pct', 'SPY'), ('Pct', 'XLK')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLP'), cols [('Pct', 'SPY'), ('Pct', 'XLP')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLRE'), cols [('Pct', 'SPY'), ('Pct', 'XLRE')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLU'), cols [('Pct', 'SPY'), ('Pct', 'XLU')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLV'), cols [('Pct', 'SPY'), ('Pct', 'XLV')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XLY'), cols [('Pct', 'SPY'), ('Pct', 'XLY')]
IndCols: [('Pct', 'SPY')], depCol ('Pct', 'XTL'), cols [('Pct', 'SPY'), ('Pct', 'XTL')]
