In [14]:
'''
This example is adapted from Hudson and Thames
https://hudson-and-thames-mlfinlab-premium.readthedocs-hosted.com/en/latest/sampling/sequential_boot.html
 
'''
import numpy as np
import pandas as pd
from mlfinlab.util import volatility
from mlfinlab.filters import filters
from mlfinlab.labeling import labeling
from mlfinlab.sampling import bootstrapping
from mlfinlab.sampling import concurrent
from mlfinlab.data_structures import standard_data_structures

# --- Load dollar bars ---------------------------------------------------------
# Alternative (disabled): build the dollar bars from a local tick file instead
# of downloading the pre-built sample used below.
# sp_read = pd.read_csv('../data/SP.csv')
# sp = sp_read.loc[pd.to_datetime(sp_read['date'].astype(str)) >= pd.Timestamp('2011-01-01')].copy()
# sp['date_time'] = pd.to_datetime(sp['date'].astype(str) + ' ' + sp['time'].astype(str))
# sp['date'] = sp['date_time'].dt.normalize()
# sp['date_time'] = pd.to_datetime(sp['date'].astype(str) + ' ' + sp['time'].astype(str))
# sp_processed = sp[['date_time', 'price', 'volume']].copy()
# sp_processed.columns = ['date_time', 'price', 'volume']
#
# data = standard_data_structures.get_dollar_bars(
#     sp_processed, threshold=1000000, batch_size=100000, verbose=False
# )

DOLLAR_BARS_URL = (
    "https://raw.githubusercontent.com/hudson-and-thames/example-data/main/dollar_bars.csv"
)
MAX_BARS = 2000  # keep only the first bars so the example doesn't run too long

dollar_bars_raw = pd.read_csv(DOLLAR_BARS_URL)

# Truncate once, parse the `date_time` column, and promote it to the index
# (the column itself is consumed by set_index, so no separate drop is needed).
data = (
    dollar_bars_raw.iloc[:MAX_BARS, :]
    .assign(date_time=lambda bars: pd.to_datetime(bars["date_time"]))
    .set_index("date_time")
)

# Independent copy of the truncated bars, taken before any filtering or extra
# columns are applied to `data`.
pdata1 = data.copy()

# --- Primary model: simple moving-average crossover ---------------------------
# Keep bars from 1 September 2011 onwards. Take an explicit copy so the column
# assignments below write to an independent frame instead of a slice of the
# loaded bars (assigning into a slice is what raises SettingWithCopyWarning).
data = data.loc["2011-09-01":].copy()

# Moving-average windows, in bars.
fast_window = 20
slow_window = 50

close = data["close"]
data["fast_mavg"] = close.rolling(
    window=fast_window, min_periods=fast_window, center=False
).mean()
data["slow_mavg"] = close.rolling(
    window=slow_window, min_periods=slow_window, center=False
).mean()

# Side of the bet: +1 when fast >= slow, -1 when fast < slow. Rows where either
# average is still NaN satisfy neither comparison and therefore stay NaN.
data["side"] = np.nan
long_signals = data["fast_mavg"] >= data["slow_mavg"]
short_signals = data["fast_mavg"] < data["slow_mavg"]
data.loc[long_signals, "side"] = 1
data.loc[short_signals, "side"] = -1

# Remove look-ahead bias by lagging the signal one bar.
data["side"] = data["side"].shift(1)

# Snapshot that still contains the NaN warm-up rows.
raw_data = data.copy()

# Drop every row with any NaN (moving-average warm-up and the lagged first row).
data = data.dropna(axis=0, how="any")
# --- Event sampling and triple-barrier events ---------------------------------
# Daily volatility of the close series; used both as the per-event target and,
# averaged, as the CUSUM threshold.
daily_vol = volatility.get_daily_vol(close=data["close"], lookback=50)

# Only the CUSUM filter needs a point estimate for volatility, so collapse the
# series to its mean over the date window.
cusum_threshold = daily_vol["2011-09-01":"2018-01-01"].mean()

# Symmetric CUSUM filter: timestamps of the sampled events.
cusum_events = filters.cusum_filter(data["close"], threshold=cusum_threshold)

# Vertical (time) barrier for the triple-barrier method, one day after each event.
vertical_barriers = labeling.add_vertical_barrier(
    t_events=cusum_events, close=data["close"], num_days=1
)

pt_sl = [1, 2]  # [profit-taking, stop-loss] settings passed to get_events
min_ret = 0.005  # presumably the minimum target return for an event to be kept -- confirm in mlfinlab docs

barrier_events = labeling.get_events(
    close=data["close"],
    t_events=cusum_events,
    pt_sl=pt_sl,
    target=daily_vol,
    min_ret=min_ret,
    num_threads=3,
    vertical_barrier_times=vertical_barriers,
    side_prediction=data["side"],
)

# Report the size of the events frame, its per-column NaN counts, and a preview.
events_shape = barrier_events.shape
nan_counts = barrier_events.isna().sum()
events_preview = barrier_events.head()
print(f"Shape of barrier_events: {events_shape}")
print(f"NaN counts in barrier_events:\n{nan_counts}")
print(f"\nFirst few rows:\n{events_preview}")

# Keep only fully populated rows and report the resulting size.
barrier_events_clean = barrier_events.dropna()
clean_shape = barrier_events_clean.shape
print(f"\nShape after dropping NaN: {clean_shape}")

# --- Indicator matrix, uniqueness, and sequential bootstrap -------------------
# Use the close prices from the dollar bars dataset as the price bars for the
# indicator matrix.
close_prices = pdata1[["close"]].copy()

# Create the indicator matrix.
triple_barrier_ind_mat = bootstrapping.get_ind_matrix(
    barrier_events_clean, close_prices
)

# MlFinlab can also get average label uniqueness on the indicator matrix ...
ind_mat_uniqueness = bootstrapping.get_ind_mat_average_uniqueness(
    triple_barrier_ind_mat
)
# ... or directly from the triple-barrier events (already a DataFrame, so no
# extra pd.DataFrame wrapper is needed).
av_unique = concurrent.get_av_uniqueness_from_triple_barrier(
    barrier_events_clean, close_prices, num_threads=1
)

# Draw a sequential bootstrap sample. The draw is random, so pass a fixed
# RandomState to make Restart & Run All reproduce the same indices.
BOOTSTRAP_SEED = 42
bootstrap_sample = bootstrapping.seq_bootstrap(
    triple_barrier_ind_mat,
    sample_length=4,
    warmup_samples=[1],
    random_state=np.random.RandomState(BOOTSTRAP_SEED),
)
bootstrap_sample

Shape of barrier_events: (89, 5)
NaN counts in barrier_events:
t1      0
trgt    0
side    0
pt      0
sl      0
dtype: int64

First few rows:
                                             t1      trgt  side  pt  sl
2015-01-07 16:55:16.638 2015-01-08 01:48:57.964  0.006039   1.0   1   2
2015-01-08 01:48:57.964 2015-01-08 18:11:19.540  0.008445   1.0   1   2
2015-01-08 15:00:21.581 2015-01-09 15:02:59.709  0.007834   1.0   1   2
2015-01-08 18:11:19.540 2015-01-09 18:14:11.989  0.007365   1.0   1   2
2015-01-09 14:29:19.229 2015-01-09 16:17:00.682  0.006032   1.0   1   2

Shape after dropping NaN: (89, 5)


2025-11-05 11:05:19.030936 100.0% apply_pt_sl_on_t1 done after 0.03 minutes. Remaining 0.0 minutes..


[1, 70, 42, 79]

In [15]:
# Preview the first five rows of the copy of the bars taken right after loading.
pdata1.head(n=5)

Unnamed: 0_level_0,open,high,low,close,cum_vol,cum_dollar,cum_ticks
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-01-01 23:00:23.723,2053.0,2056.75,2051.0,2056.75,34102,70001096.75,8478
2015-01-02 07:07:35.156,2056.75,2067.25,2056.25,2064.0,33968,70010061.25,14514
2015-01-02 09:35:57.204,2064.0,2067.25,2058.75,2060.5,33972,70087834.25,16152
2015-01-02 12:59:42.176,2060.5,2062.0,2057.75,2061.0,33985,70006169.75,15502
2015-01-02 14:19:33.847,2061.0,2064.25,2058.75,2063.75,33958,70000723.25,12332
