In [1]:
%load_ext lineapy

In [2]:
import lineapy

# Parameter Refactoring 

This notebook demonstrates how to use LineaPy to perform parameter refactoring.

It introduces two new LineaPy APIs related to parameter refactoring.

## A function that takes user-defined input parameters and returns a list of artifacts calculated by the same process

```
lineapy.get_function(
    artifact_list=[art_name1, art_name2, ...], 
    input_parameters=[var1, var2, ...]
) -> Callable
```

This returns a Python function that takes user-selected input variables and returns user-selected artifacts.

* Benefits
    + Easy to adjust input/return variables.
    + It is the same function used in `to_pipeline` and data scientists can validate the output pipeline in the notebook directly.
    + Extend LineaPy user base to everyone who writes a for-loop.

## Module definition 

```
lineapy.get_module_definition(
    artifact_list=[art_name1, art_name2, ...], 
    input_parameters=[var1, var2, ...]
) -> str
```

This returns the entire module definition as a string. 
Note, this should be the same as the `module.py` generated from the current `to_pipeline`.

* Benefits
    + Easily examine the output result directly in the notebook.
    + Export to the filesystem for other applications.
    
    





## Example in quantitative finance (pair trade)

What Is a Pairs Trade? 

A pairs trade is a trading strategy that involves matching a long position with a short position in two stocks with a __high correlation__.

### Required data
* You need daily (or weekly/monthly/hourly ...) returns of two stocks
* Starting time frame
* Correlation window size

In [3]:
import yfinance as yf
import numpy as np
import pandas as pd

# Pipeline inputs. These are kept as plain literal assignments because later
# cells pass the names `start`, `n_day`, `ticker1` and `ticker2` to
# `input_parameters` when asking LineaPy for a parameterized module/function.
start="2017-01-01"
n_day = 20

start_date=pd.Timestamp(start)

# First leg of the pair: fetch price history from `start_date`, add `chg_pct`
# (Close divided by the previous row's Close, minus 1), drop rows with missing
# values (the first row has no previous Close to compare against), then keep
# only the first `n_day` rows.
ticker1 = 'MSFT'
stock1 = yf.Ticker(ticker1)
history1 = stock1.history(start=start_date).assign(chg_pct = lambda df: df.Close/df.Close.shift(1)-1).dropna()[:n_day]
# Register the frame as the LineaPy artifact named 'history1'.
lineapy.save(history1, 'history1')

print(history1.head())

# Second leg of the pair: same processing as above for the other ticker.
ticker2 = 'AAPL'
stock2 = yf.Ticker(ticker2)
history2 = stock2.history(start=start_date).assign(chg_pct = lambda df: df.Close/df.Close.shift(1)-1).dropna()[:n_day]
# Register the frame as the LineaPy artifact named 'history2'.
lineapy.save(history2, 'history2')

print(history2.head())

# Correlation matrix of the two `chg_pct` series; the off-diagonal entries hold
# the correlation between the two tickers.
# NOTE(review): np.corrcoef needs both series to have the same length --
# presumably both tickers return at least `n_day` rows; confirm for short histories.
pair_correlation = np.corrcoef(history1.chg_pct, history2.chg_pct)
# Register the matrix as the artifact named 'correlation'. This is the cell's
# last expression, so the returned LineaArtifact is what the cell displays.
lineapy.save(pair_correlation, 'correlation')

                 Open       High        Low      Close    Volume  Dividends  \
Date                                                                          
2017-01-04  57.837964  58.087904  57.504710  57.671337  21340000        0.0   
2017-01-05  57.569509  58.004591  57.421397  57.671337  24876000        0.0   
2017-01-06  57.671330  58.458181  57.430649  58.171211  19922900        0.0   
2017-01-09  58.097152  58.393381  57.893500  57.986069  20382700        0.0   
2017-01-10  58.069380  58.384119  57.652813  57.967552  18593000        0.0   

            Stock Splits   chg_pct  
Date                                
2017-01-04             0 -0.004474  
2017-01-05             0  0.000000  
2017-01-06             0  0.008668  
2017-01-09             0 -0.003183  
2017-01-10             0 -0.000319  
                 Open       High        Low      Close     Volume  Dividends  \
Date                                                                           
2017-01-04  27.187240  27.3

LineaArtifact(name='correlation', _version=8)

In [4]:
# Display the correlation matrix computed in the previous cell.
pair_correlation

array([[ 1.        , -0.36153237],
       [-0.36153237,  1.        ]])

# Standard Pipeline

**Q: If our goal is `correlation`, why do we need both `history1` and `history2`?
A: Compliance! You need not only the result, but also the input data to verify that your result is reproducible.**

In [5]:
# Render the unparameterized module for all three artifacts, keeping the
# lineapy.save calls in the generated code, and show it as text.
module_source = lineapy.get_module_definition(
    ["history1", "history2", "correlation"],
    indentation=2,
    keep_lineapy_save=True,
)
print(module_source)


import copy
import numpy as np
import pandas as pd
import yfinance as yf

def get_start_date_n_day_for_artifact_history1_and_downstream():
  start = "2017-01-01"
  n_day = 20
  start_date = pd.Timestamp(start)
  return start_date, n_day

def get_history1(n_day, start_date):
  ticker1 = "MSFT"
  stock1 = yf.Ticker(ticker1)
  history1 = (
      stock1.history(start=start_date)
      .assign(chg_pct=lambda df: df.Close / df.Close.shift(1) - 1)
      .dropna()[:n_day]
  )
  return history1

def get_history2(n_day, start_date):
  ticker2 = "AAPL"
  stock2 = yf.Ticker(ticker2)
  history2 = (
      stock2.history(start=start_date)
      .assign(chg_pct=lambda df: df.Close / df.Close.shift(1) - 1)
      .dropna()[:n_day]
  )
  return history2

def get_correlation(history1, history2):
  pair_correlation = np.corrcoef(history1.chg_pct, history2.chg_pct)
  return pair_correlation

def pipeline():
  sessionartifacts = []
  start_date, n_day = get_start_date_n_day_for_artifact_history1_and_downstre

# Parameterized Pipeline

If you are an analyst who covers MSFT, you may want to identify pair-trade candidates.

In [6]:
# Same three artifacts, but expose `ticker2` as a pipeline input parameter.
module_source = lineapy.get_module_definition(
    ["history1", "history2", "correlation"],
    input_parameters=["ticker2"],
    indentation=2,
)
print(module_source)

import copy
import numpy as np
import pandas as pd
import yfinance as yf

def get_start_date_n_day_for_artifact_history1_and_downstream():
  start = "2017-01-01"
  n_day = 20
  start_date = pd.Timestamp(start)
  return start_date, n_day

def get_history1(n_day, start_date):
  ticker1 = "MSFT"
  stock1 = yf.Ticker(ticker1)
  history1 = (
      stock1.history(start=start_date)
      .assign(chg_pct=lambda df: df.Close / df.Close.shift(1) - 1)
      .dropna()[:n_day]
  )
  return history1

def get_history2(n_day, start_date, ticker2):
  stock2 = yf.Ticker(ticker2)
  history2 = (
      stock2.history(start=start_date)
      .assign(chg_pct=lambda df: df.Close / df.Close.shift(1) - 1)
      .dropna()[:n_day]
  )
  return history2

def get_correlation(history1, history2):
  pair_correlation = np.corrcoef(history1.chg_pct, history2.chg_pct)
  return pair_correlation

def pipeline(ticker2 = "AAPL"):
  sessionartifacts = []
  start_date, n_day = get_start_date_n_day_for_artifact_history1_and_do

# How about we just return the session as a module? Then we can use the parameterized pipeline as a function.

* Pure code refactoring/parametrization; no need to copy and paste code to reuse the result.
* Use the parameterized functions directly; easy to experiment and validate.


In [7]:
# Build a callable that recomputes the three artifacts with `ticker2` exposed
# as a keyword argument.
ft = lineapy.get_function(
    ['history1', 'history2', 'correlation'],
    input_parameters=['ticker2'],
)

# The loop variable is deliberately NOT named `ticker2`: reusing that name
# would rebind the notebook-level `ticker2` defined in the artifact cell and
# leave it pointing at the last candidate after this cell runs (hidden state).
for candidate in ['AMZN', 'SPY', 'QQQ']:
    print(candidate)
    # Index 2 selects the third requested artifact, 'correlation'.
    print(ft(ticker2=candidate)[2])


AMZN
[[ 1.         -0.01847983]
 [-0.01847983  1.        ]]
SPY
[[1.         0.28165898]
 [0.28165898 1.        ]]
QQQ
[[1.         0.26407754]
 [0.26407754 1.        ]]


# Another way for parameterization. 

You don't care about MSFT; you just want to find a highly correlated pair for a pair trade.

In [8]:
# Expose both tickers as pipeline input parameters.
module_source = lineapy.get_module_definition(
    ["history1", "history2", "correlation"],
    input_parameters=["ticker1", "ticker2"],
    indentation=2,
)
print(module_source)

import copy
import numpy as np
import pandas as pd
import yfinance as yf

def get_start_date_n_day_for_artifact_history1_and_downstream():
  start = "2017-01-01"
  n_day = 20
  start_date = pd.Timestamp(start)
  return start_date, n_day

def get_history1(n_day, start_date, ticker1):
  stock1 = yf.Ticker(ticker1)
  history1 = (
      stock1.history(start=start_date)
      .assign(chg_pct=lambda df: df.Close / df.Close.shift(1) - 1)
      .dropna()[:n_day]
  )
  return history1

def get_history2(n_day, start_date, ticker2):
  stock2 = yf.Ticker(ticker2)
  history2 = (
      stock2.history(start=start_date)
      .assign(chg_pct=lambda df: df.Close / df.Close.shift(1) - 1)
      .dropna()[:n_day]
  )
  return history2

def get_correlation(history1, history2):
  pair_correlation = np.corrcoef(history1.chg_pct, history2.chg_pct)
  return pair_correlation

def pipeline(
    ticker1 = "MSFT",
    ticker2 = "AAPL",
):
  sessionartifacts = []
  start_date, n_day = get_start_date_n_day_for_artif

In [9]:
# Callable version of the pipeline with both tickers as keyword arguments.
ft = lineapy.get_function(
    ["history1", "history2", "correlation"],
    input_parameters=["ticker1", "ticker2"],
)

ticker_pairs = [("MSFT", "AMZN"), ("SPY", "QQQ"), ("TWTR", "TSLA")]
for first, second in ticker_pairs:
    artifacts = ft(ticker1=first, ticker2=second)
    # Index 2 selects the third requested artifact, 'correlation'.
    print(artifacts[2])

[[ 1.         -0.01848058]
 [-0.01848058  1.        ]]
[[1.         0.77937329]
 [0.77937329 1.        ]]
[[1.         0.14237649]
 [0.14237649 1.        ]]


# If you want some backtesting, you need to change the `start`

In [10]:
# Additionally expose `start` so the pipeline can be run from other dates.
module_source = lineapy.get_module_definition(
    ["history1", "history2", "correlation"],
    input_parameters=["ticker1", "ticker2", "start"],
    indentation=2,
)
print(module_source)


import copy
import numpy as np
import pandas as pd
import yfinance as yf

def get_start_date_n_day_for_artifact_history1_and_downstream(start):
  n_day = 20
  start_date = pd.Timestamp(start)
  return start_date, n_day

def get_history1(n_day, start_date, ticker1):
  stock1 = yf.Ticker(ticker1)
  history1 = (
      stock1.history(start=start_date)
      .assign(chg_pct=lambda df: df.Close / df.Close.shift(1) - 1)
      .dropna()[:n_day]
  )
  return history1

def get_history2(n_day, start_date, ticker2):
  stock2 = yf.Ticker(ticker2)
  history2 = (
      stock2.history(start=start_date)
      .assign(chg_pct=lambda df: df.Close / df.Close.shift(1) - 1)
      .dropna()[:n_day]
  )
  return history2

def get_correlation(history1, history2):
  pair_correlation = np.corrcoef(history1.chg_pct, history2.chg_pct)
  return pair_correlation

def pipeline(
    start = "2017-01-01",
    ticker1 = "MSFT",
    ticker2 = "AAPL",
):
  sessionartifacts = []
  start_date, n_day = get_start_date_n_day_f

In [11]:
# Callable version of the pipeline with both tickers and the start date
# exposed as keyword arguments.
ft = lineapy.get_function(
    ['history1', 'history2', 'correlation'],
    input_parameters=['ticker1', 'ticker2', 'start'],
)

# The date loop variable is deliberately NOT named `start`: reusing that name
# would rebind the notebook-level `start` defined in the artifact cell and
# leave it set to the last backtest date after this cell runs (hidden state).
backtests = [
    ('MSFT', 'AMZN', '2018-01-01'),
    ('SPY', 'QQQ', '2008-09-30'),
    ('TWTR', 'TSLA', '2022-06-01'),
]
for t1, t2, start_day in backtests:
    # Index 2 selects the third requested artifact, 'correlation'.
    print(ft(ticker1=t1, ticker2=t2, start=start_day)[2])

[[1.         0.27812645]
 [0.27812645 1.        ]]
[[1.         0.94338018]
 [0.94338018 1.        ]]
[[1.         0.68685324]
 [0.68685324 1.        ]]


# If you want to tune the parameters of your trading strategy (n_day correlation)

In [12]:
# Expose the correlation window `n_day` as well; the default indentation is
# used for the generated module this time.
module_source = lineapy.get_module_definition(
    ["history1", "history2", "correlation"],
    input_parameters=["ticker1", "ticker2", "start", "n_day"],
)
print(module_source)

import copy
import numpy as np
import pandas as pd
import yfinance as yf

def get_start_date_n_day_for_artifact_history1_and_downstream(n_day, start):
    start_date = pd.Timestamp(start)
    return start_date, n_day

def get_history1(n_day, start_date, ticker1):
    stock1 = yf.Ticker(ticker1)
    history1 = (
        stock1.history(start=start_date)
        .assign(chg_pct=lambda df: df.Close / df.Close.shift(1) - 1)
        .dropna()[:n_day]
    )
    return history1

def get_history2(n_day, start_date, ticker2):
    stock2 = yf.Ticker(ticker2)
    history2 = (
        stock2.history(start=start_date)
        .assign(chg_pct=lambda df: df.Close / df.Close.shift(1) - 1)
        .dropna()[:n_day]
    )
    return history2

def get_correlation(history1, history2):
    pair_correlation = np.corrcoef(history1.chg_pct, history2.chg_pct)
    return pair_correlation

def pipeline(
    start = "2017-01-01",
    n_day = 20,
    ticker1 = "MSFT",
    ticker2 = "AAPL",
):
    sessionartifacts =

In [13]:
# Callable version of the pipeline with all four inputs exposed.
ft = lineapy.get_function(
    ["history1", "history2", "correlation"],
    input_parameters=["ticker1", "ticker2", "start", "n_day"],
)

# The last scenario omits `ticker1`, so the function's default for it is used.
scenarios = [
    dict(ticker1="QQQ", ticker2="SPY", start="2008-10-01", n_day=30),
    dict(ticker1="QQQ", ticker2="SPY", start="2009-10-01", n_day=30),
    dict(ticker2="SPY", start="2009-10-01", n_day=30),
]
for scenario in scenarios:
    # Index 2 selects the third requested artifact, 'correlation'.
    print(ft(**scenario)[2])


[[1.         0.94871212]
 [0.94871212 1.        ]]
[[1.         0.92359317]
 [0.92359317 1.        ]]
[[1.         0.16596815]
 [0.16596815 1.        ]]


# If we don't care about the compliance to keep the original data, we only need one of the artifacts

In [14]:
# Request only the 'correlation' artifact, with all four inputs exposed.
module_source = lineapy.get_module_definition(
    ["correlation"],
    input_parameters=["ticker1", "ticker2", "start", "n_day"],
)
print(module_source)

import numpy as np
import pandas as pd
import yfinance as yf

def get_correlation(n_day, start, ticker1, ticker2):
    start_date = pd.Timestamp(start)
    stock1 = yf.Ticker(ticker1)
    history1 = (
        stock1.history(start=start_date)
        .assign(chg_pct=lambda df: df.Close / df.Close.shift(1) - 1)
        .dropna()[:n_day]
    )
    stock2 = yf.Ticker(ticker2)
    history2 = (
        stock2.history(start=start_date)
        .assign(chg_pct=lambda df: df.Close / df.Close.shift(1) - 1)
        .dropna()[:n_day]
    )
    pair_correlation = np.corrcoef(history1.chg_pct, history2.chg_pct)
    return pair_correlation

def pipeline(
    start = "2017-01-01",
    n_day = 20,
    ticker1 = "MSFT",
    ticker2 = "AAPL",
):
    pair_correlation = get_correlation(n_day, start, ticker1, ticker2)
    return pair_correlation

if __name__=="__main__":
    pipeline()



In [15]:
# Callable that recomputes only the 'correlation' artifact.
ft = lineapy.get_function(
    ["correlation"],
    input_parameters=["ticker1", "ticker2", "start", "n_day"],
)

# `ticker1` is left out on purpose: omitted parameters still use their default value.
correlation_only = ft(ticker2="SPY", start="2009-10-01", n_day=30)
print(correlation_only)

[[1.         0.16596826]
 [0.16596826 1.        ]]
