In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import os

from utils import BaseSim

# Load True Data

In [2]:
# load data
data = {}
for file in os.listdir('data/clean_data'):
    print(file)
    ticker = file.split('.')[0] # retrieve ticker_name
    data[ticker] = pd.read_csv(filepath_or_buffer=os.path.join('data/clean_data/', file), header=0, index_col = 0, parse_dates=True, infer_datetime_format=True) # read data correctly


V.csv
NVDA.csv
GS.csv
JPM.csv
TSLA.csv
AAPL.csv


In [3]:
tickers = list(data.keys())

# Load Simulation

In [4]:
root = 'results/Thresholding'

In [5]:
# load simdata
simdata = {}
for ticker in data:
    simdata[ticker] = np.load(os.path.join(root, f'{ticker}/simulation.npy'))

In [6]:
# load metric_data
metric_data = {}
for ticker in data:
    print(ticker)
    metricdata = np.load(os.path.join(root, f'{ticker}/metrics.npz'))
    metric_data[ticker] = {}
    for arr in metricdata.files:
        metric_data[ticker][arr] = metricdata[arr]
    metricdata.close()

V
NVDA
GS
JPM
TSLA
AAPL


Assert Simulation is correct shape

In [7]:
pred_period = 140
# check to make sure simdata is of size (runs, X periods, 140, 3)
for ticker, d in simdata.items():
    assert d.shape == (d.shape[0], d.shape[1], pred_period, 3)

In [8]:
ticker = 'AAPL'
simdata[ticker][0, 0, 0]

array([-4.11117591e-04,  0.00000000e+00,  4.16600000e+01])

Observe a single sample of ticker AAPL

In [9]:
for stat, metric in metric_data[ticker].items():
    print(f'{stat}:\n', metric[0])


PnL:
 [-0.06076959 -0.08787595 -0.02876177  0.00377911 -0.0913253  -0.06102534
  0.0366747  -0.05594317 -0.05125415  0.01247897 -0.03967135 -0.04209786
  0.01145242 -0.00146771 -0.04609284 -0.11423108 -0.07058745 -0.0869484
 -0.0702989   0.09677402  0.01331458 -0.05592432 -0.06135503 -0.05021143
 -0.04495649  0.07308791 -0.00166539  0.01608497 -0.00885442 -0.06864438
 -0.06152895 -0.01844829  0.04419339 -0.0257204  -0.0422677  -0.03532152
  0.00227971 -0.03871018  0.03472967]
Sharpe:
 [        nan -5.68576614 -2.50579132 -1.27171919 -1.4729097  -1.64909657
 -0.91941982 -1.0214978  -1.10562877 -0.92384938 -0.97369159 -1.02603501
 -0.90014043 -0.84814706 -0.89941078 -0.94201568 -1.00076622 -1.05739287
 -1.10973614 -0.72483878 -0.67808592 -0.71122157 -0.74621791 -0.77331109
 -0.79597424 -0.64357562 -0.62850121 -0.59536837 -0.58961742 -0.62009384
 -0.64688731 -0.64830216 -0.58570649 -0.59292545 -0.60924003 -0.62166863
 -0.60814433 -0.62200589 -0.57885248]
Correlation:
 [ 0.09859163 -0.0207

# Computing Summary Statistics

Assumptions:
- Each individual month period can be considered a seperate MC prediction so it is acceptable to swap PnL, sharpe etc. for each period
- When capturing quantile runs, we use the final expected return as the measure for quantile location. Therefore, it does not correspond that a max run would also produce the max PnL, but rather a PnL that occurs if the max simulation ran happened.
- Quantiles are computed using a closest algorithm to avoid averaging when the number of simulations are even. Otherwise, since quantiles are computed on final expected return, a true median of 2n runs is undefined.

We wish to capture the Max, Median and Min possible Markov simulations


Min Max Medium of runs

In [10]:
# Actual Max/Min simulation. For each period, we track the simulation with the highest/lowest ending value
quantile_runs = {}
split = .5


for ticker in data:
    
    mid = int(split * len(data[ticker]))
    end = mid + simdata[ticker].shape[1] * simdata[ticker].shape[2]
    index = data[ticker].index[mid:end]
    
    min_pos = []
    max_pos = []
    median = []
    q1 = []
    q3 = []
    
    num_sims = len(simdata[ticker])
    num_periods = len(simdata[ticker][0])
    
    for periodid in range(num_periods):
        temp = []
        for simid in range(num_sims):
            temp.append(simdata[ticker][simid][periodid][:, 0].sum())
        temp = np.array(temp)
        min_pos.append(simdata[ticker][np.where(temp == temp.min())[0][0]][periodid])
        max_pos.append(simdata[ticker][np.where(temp == temp.max())[0][0]][periodid])
        median.append(simdata[ticker][np.where(temp == np.quantile(temp, .5, method = 'closest_observation'))[0][0]][periodid])
        q1.append(simdata[ticker][np.where(temp == np.quantile(temp, .25, method = 'closest_observation'))[0][0]][periodid])
        q3.append(simdata[ticker][np.where(temp == np.quantile(temp, .75, method = 'closest_observation'))[0][0]][periodid])

    quantile_runs[ticker] = {'Min':np.array(min_pos), 'Q1': np.array(q1), 'Median': np.array(median), 'Q3':(q3), 'Max': 
        np.array(max_pos)}

    

TypeError: _quantile_dispatcher() got an unexpected keyword argument 'method'

In [None]:
quantile_runs['TSLA']['Min'][0][:10]

array([[-6.62129779e-03, -2.48844778e-03,  5.61900000e+01],
       [-2.18636124e-02,  3.19829697e-03,  5.63700000e+01],
       [-6.45879726e-04,  6.01345993e-03,  5.67100000e+01],
       [ 9.31770985e-03, -2.37320443e-02,  5.53800000e+01],
       [ 8.66947450e-03,  7.01757266e-03,  5.57700000e+01],
       [ 6.20814303e-03, -1.79323949e-04,  5.57600000e+01],
       [-1.19511126e-02,  1.80277676e-03,  5.55200000e+01],
       [-4.36607409e-03, -2.70538536e-03,  5.53700000e+01],
       [-5.40314188e-03, -6.70474560e-03,  5.50000000e+01],
       [-2.31206804e-03, -8.76557840e-03,  5.45200000e+01]])

In [None]:
# Using computed quantiles to recompute metrics
m = BaseSim([])
quantile_metrics = {}
for ticker in tickers:
    quantile_metrics[ticker] = {}
    for q, run  in quantile_runs[ticker].items():
        run =np.expand_dims(run, axis = 0)
        quantile_metrics[ticker][q] = m.compute_metrics(run, strategy = 'B&H', rfrate = 0)

In [None]:
quantile_metrics['TSLA']['Median']

{'PnL': array([[-0.06960257,  0.10130129,  0.00309572, -0.10464378,  0.00300243,
         -0.04832377,  0.05222664, -0.14032773,  0.05587659,  0.1972465 ,
          0.12839415,  0.17312497, -0.01721737,  0.13096342, -0.09915621,
         -0.09247393,  0.13148408,  0.2157832 , -0.09648876,  0.08607192,
          0.00498799,  0.18966302,  0.23643178,  0.02187816, -0.01131578,
         -0.01375738,  0.091127  ,  0.05967734, -0.08124205, -0.00754395,
          0.04522023,  0.01068889,  0.20887638,  0.05762608, -0.05600441,
          0.05001449, -0.03491861, -0.19640275]]),
 'Sharpe': array([[        nan,  0.22649105,  0.19829156, -0.18001913, -0.15178181,
         -0.25437366, -0.10256321, -0.30048329, -0.18076505,  0.09817817,
          0.21034864,  0.32831852,  0.29998707,  0.37438558,  0.28403752,
          0.21307493,  0.27688634,  0.36061911,  0.29725666,  0.32774428,
          0.32138468,  0.3810858 ,  0.44132085,  0.43923423,  0.42405517,
          0.40891734,  0.4312241 ,  0.443143

# Metrics for actual Strategy

In [None]:
# model vs a basic buy and hold strategy
def buyholdret(true_data, start, end):
    return np.exp(true_data['log_returns'][start:end].sum()) - 1
    
def modelret(PnL):
    return np.exp(PnL.sum()) - 1

P/L

In [None]:

cols = ['Buy and Hold', 'Model BH']
pnl = {}
for ticker in data:
    pnl[ticker] = pd.DataFrame()
    for m, dat in quantile_metrics[ticker].items():
        # compute start and end
        start = int(.5 * len(data[ticker]))
        end = start + simdata[ticker].shape[1] * simdata[ticker].shape[2]
        bh = buyholdret(data[ticker], start, end)
        pred = modelret(dat['PnL'])
        temp = pd.DataFrame(data = dict(zip(cols, [bh, pred])), index = [m], columns = cols)
        pnl[ticker] = pd.concat([pnl[ticker], temp])
        

In [None]:
pnl['AAPL'] # Max/Min PnL are not necessarily max/min, but may be skewed to look like that because our stocks have a natural bullish trend so guessing downwards is usually bad....

Unnamed: 0,Buy and Hold,Model BH
Min,1.936181,-0.608007
Q1,1.936181,-0.231202
Median,1.936181,0.76235
Q3,1.936181,2.01542
Max,1.936181,1.936181


In [None]:
def compute_pred(single_sim):
    ''' computes the reals '''
    preds = []
    for period in single_sim:
        start_price = period[0][2]
        for i in range(len(period)):
            expected = np.exp(period[:, 0][:i+1].sum())
            preds.append(start_price * expected)
    return preds
                        

In [None]:
data[ticker]['close']

timestamp
2015-12-01 13:00:00+00:00     76.170
2015-12-01 14:00:00+00:00     76.390
2015-12-01 15:00:00+00:00     76.200
2015-12-02 13:00:00+00:00     76.710
2015-12-02 14:00:00+00:00     76.610
                              ...   
2022-03-25 11:00:00+00:00    217.500
2022-03-25 12:00:00+00:00    216.767
2022-03-25 13:00:00+00:00    218.250
2022-03-25 14:00:00+00:00    218.250
2022-03-25 15:00:00+00:00    216.760
Name: close, Length: 8261, dtype: float64

In [None]:
ticker = 'V'

Yactual = data[ticker]['close']
Xactual = range(len(Yactual))

mid = int(len(data[ticker]) * .5)

test = data[ticker]['close'].copy()

for x in quantile_runs[ticker]:
    Ypred = compute_pred(quantile_runs[ticker][x])
    Xpred = range(mid, mid + len(Ypred))
    df = pd.DataFrame(list(Ypred), index = data[ticker].index[mid:mid+len(Ypred)], columns = [x])
    assert data[ticker]['close'][mid] == quantile_runs[ticker][x][0][0][2] # check starting points are the same
    test = pd.concat([test, df], axis = 1)


fig = px.line(test, x = test.index, y = ['close'] + list(quantile_runs[ticker].keys()), )
for x in test.index[mid:mid + len(Ypred) + 1:140]:
    fig.add_vline(x = x, line_width = .5, line_color = 'red')
fig.show()