In [1]:
import polars as pl
import pandas as pd
import polars.datatypes as T
import numpy as np

from datetime import datetime, time
import sys
sys.path.insert(0, 'C:\\Users\\rockh\\Repositories\\Tsuro\\')

from tsuro.datasets import load_cba_trades
from tsuro.preprocessing import cast_strings_to_datetime

PSEUDO CODE
Function receives two arguments:
1) sample_bars [[y_1, tau_start_1, tau_end_1],[y_2, tau_start_2, tau_end_2] ..., [y_I, tau_start_I, tau_end_I]].
2) time_index [tau_0,tau_1,...,tau_t].

Easier to work with index label itself. We have no need for timestamps. At the end of the day we just want to create bootstrap samples for the sample_bars. Therefore, time_index is optional argument. Pass sample_bars through alone with tau_start, tau_end replaced with time index labels ([y_1, 1, 2] means that y_1 bar covers the period [tau_1,tau_2]). We will build at most a t x I indicator matrix. We only care about the period [tau_start_1, tau_end_I], so we can further reduce the complexity.

In [5]:
from tsuro.sampling import create_overlap_matrix
bars_df = pl.DataFrame({
        "bar_index": [1,2,3],
        "volume": [10, 20, 50],
        "time_start": ["10:00", "10:05", "10:10"],
        "time_end": ["10:03", "10:10", "10:15"]
        }
)

index = pl.Series(values = ["9:55", "10:00", "10:02", "10:03", "10:04", "10:05", "10:08", "10:10", "10:12", "10:15"])
overlap_matrix = create_overlap_matrix(
    bars_df, 
    time_start_col = "time_start",
    time_end_col = "time_end",
    time_index = index,
    index_col = "bar_index",
    remove_no_overlaps = True
)
overlap_matrix

index,time_start,time_end,bar_1_overlap,bar_2_overlap,bar_3_overlap
u32,str,str,i32,i32,i32
0,"""10:00""","""10:02""",1,0,0
1,"""10:02""","""10:03""",1,0,0
4,"""10:05""","""10:08""",0,1,0
5,"""10:08""","""10:10""",0,1,0
6,"""10:10""","""10:12""",0,0,1
7,"""10:12""","""10:15""",0,0,1


In [9]:
from tsuro.utils import transform_columns_to_index
index = pl.Series(values = ["9:55", "10:00", "10:02", "10:05", "10:08", "10:10", "10:12", "10:05"])

df = pl.DataFrame({
        "volume": [10, 20, 50],
        "time_start": ["10:00", "10:05", "10:10"],
        "time_end": ["10:10", "10:10", "10:15"]
        }
)

df2, index = transform_columns_to_index(df, columns = ["time_start", "time_end"], index = index, return_index = True)
df2

volume,time_start,time_end,time_start_index,time_end_index
i64,str,str,u32,u32
10,"""10:00""","""10:10""",0,4
20,"""10:05""","""10:10""",2,4
50,"""10:10""","""10:15""",4,6


In [11]:
df1, index = transform_columns_to_index(df, columns = ["time_start", "time_end"], return_index = True)
df1

volume,time_start,time_end,time_start_index,time_end_index
i64,str,str,u32,u32
10,"""10:00""","""10:10""",0,2
20,"""10:05""","""10:10""",1,2
50,"""10:10""","""10:15""",2,3


In [12]:
index

values,index
str,u32
"""10:00""",0
"""10:05""",1
"""10:10""",2
"""10:15""",3


In [19]:
index = pl.Series(values = ["10:15", "10:02", "10:05", "10:08", "10:10", "10:12", "10:00"])
index = index.sort()
print(index)

index = index.alias("values")
print(index)
index = pl.DataFrame(index)
index = index.with_columns(pl.int_range(pl.len(), dtype=pl.UInt32).alias("index"))
index

shape: (7,)
Series: '' [str]
[
	"10:00"
	"10:02"
	"10:05"
	"10:08"
	"10:10"
	"10:12"
	"10:15"
]
shape: (7,)
Series: 'values' [str]
[
	"10:00"
	"10:02"
	"10:05"
	"10:08"
	"10:10"
	"10:12"
	"10:15"
]


values,index
str,u32
"""10:00""",0
"""10:02""",1
"""10:05""",2
"""10:08""",3
"""10:10""",4
"""10:12""",5
"""10:15""",6


In [2]:
trades_df = load_cba_trades()
trades_df

Unnamed: 0_level_0,time,price,volume,dollar_value,market,condition
i64,str,f64,i64,f64,str,str
0,"""6/10/2021 17:00""",100.08,12904,1.2914e6,"""ASX""","""ET XT"""
1,"""6/10/2021 16:54""",100.08,19280,1929542.4,"""ASX""","""ET XT"""
2,"""6/10/2021 16:47""",100.3127,18517,1.8575e6,"""CXA""","""SX XT"""
3,"""6/10/2021 16:46""",100.08,5364,536829.12,"""CXA""",
4,"""6/10/2021 16:45""",100.08,24624,2.4644e6,"""ASX""","""ET XT"""
…,…,…,…,…,…,…
66572,"""6/10/2021 7:05""",95.01,6000,570060.0,"""ASX""","""EC XT"""
66573,"""6/10/2021 7:05""",94.01,1200,112812.0,"""ASX""","""EC"""
66574,"""6/10/2021 7:05""",94.01,18700,1.757987e6,"""ASX""","""EC XT"""
66575,"""6/10/2021 7:05""",94.0,1200,112800.0,"""ASX""","""EC"""


In [3]:
# Convert "Time" Column into Datetime Object
trades_df = cast_strings_to_datetime(trades_df, columns = ["time"])
trades_df = trades_df.rename(
   {"time": "date_time"}
)


# Grab Time Fields
trades_df = trades_df.with_columns(
    pl.col("date_time").dt.time().alias("time"),
    pl.col("date_time").dt.hour().alias("hour"),
    pl.col("date_time").dt.minute().alias("minute"),
    pl.col("date_time").dt.second().alias("second"),
    pl.col("date_time").dt.millisecond().alias("millisecond")
)

# Market Hours
trades_df = trades_df.filter(
    pl.col("time").is_between(
        time(9,0), time(17,0)
    )
)
trades_df

Unnamed: 0_level_0,date_time,price,volume,dollar_value,market,condition,time,hour,minute,second,millisecond
i64,datetime[μs],f64,i64,f64,str,str,time,i8,i8,i8,i32
0,2021-06-10 17:00:00,100.08,12904,1.2914e6,"""ASX""","""ET XT""",17:00:00,17,0,0,0
1,2021-06-10 16:54:00,100.08,19280,1929542.4,"""ASX""","""ET XT""",16:54:00,16,54,0,0
2,2021-06-10 16:47:00,100.3127,18517,1.8575e6,"""CXA""","""SX XT""",16:47:00,16,47,0,0
3,2021-06-10 16:46:00,100.08,5364,536829.12,"""CXA""",,16:46:00,16,46,0,0
4,2021-06-10 16:45:00,100.08,24624,2.4644e6,"""ASX""","""ET XT""",16:45:00,16,45,0,0
…,…,…,…,…,…,…,…,…,…,…,…
66537,2021-06-10 10:02:00,102.0,4,408.0,"""ASX""",,10:02:00,10,2,0,0
66538,2021-06-10 10:02:00,102.0,30,3060.0,"""ASX""",,10:02:00,10,2,0,0
66539,2021-06-10 10:02:00,102.0,215,21930.0,"""ASX""",,10:02:00,10,2,0,0
66540,2021-06-10 10:02:00,102.0,233,23766.0,"""ASX""",,10:02:00,10,2,0,0


In [5]:
from tsuro.data_structures import StandardBars

bars = StandardBars(datetime_col = "date_time", price_col = "price", volume_col = "volume")

In [6]:
pdf = bars.create_volume_bars(
    trades_df,
    order_by = "date_time"
)
pdf

bar_index,open_price,high_price,low_price,close_price,datetime_start,datetime_end
i32,f64,f64,f64,f64,datetime[μs],datetime[μs]
27,99.92,99.98,99.77,99.91,2021-06-10 12:44:00,2021-06-10 12:55:00
18,99.9,100.01,99.65,99.68,2021-06-10 11:09:00,2021-06-10 11:13:00
9,100.85,101.08,100.72,100.98,2021-06-10 10:28:00,2021-06-10 10:32:00
30,99.93,99.95,99.64,99.65,2021-06-10 13:24:00,2021-06-10 13:36:00
3,102.5,102.59,102.01,102.07,2021-06-10 10:05:00,2021-06-10 10:10:00
…,…,…,…,…,…,…
38,99.98,100.01,99.84,100.01,2021-06-10 15:02:00,2021-06-10 15:14:00
53,100.08,100.08,100.08,100.08,2021-06-10 16:10:00,2021-06-10 16:10:00
35,99.97,100.0,99.73,99.77,2021-06-10 14:24:00,2021-06-10 14:36:00
47,100.08,100.08,100.08,100.08,2021-06-10 16:10:00,2021-06-10 16:10:00


In [7]:
from tsuro.sampling import create_overlap_matrix

overlap_matrix, bars_df = create_overlap_matrix(
    pdf,
    time_start_col = "datetime_start",
    time_end_col = "datetime_end",
    index_col = "bar_index"
)
overlap_matrix

index,time_start,time_end,bar_27_overlap,bar_18_overlap,bar_9_overlap,bar_30_overlap,bar_3_overlap,bar_0_overlap,bar_6_overlap,bar_15_overlap,bar_12_overlap,bar_24_overlap,bar_21_overlap,bar_33_overlap,bar_48_overlap,bar_60_overlap,bar_42_overlap,bar_51_overlap,bar_63_overlap,bar_45_overlap,bar_36_overlap,bar_54_overlap,bar_39_overlap,bar_16_overlap,bar_4_overlap,bar_13_overlap,bar_28_overlap,bar_7_overlap,bar_10_overlap,bar_25_overlap,bar_1_overlap,bar_19_overlap,bar_31_overlap,bar_22_overlap,bar_55_overlap,bar_64_overlap,bar_40_overlap,bar_43_overlap,bar_52_overlap,bar_61_overlap,bar_37_overlap,bar_58_overlap,bar_34_overlap,bar_49_overlap,bar_46_overlap,bar_8_overlap,bar_11_overlap,bar_29_overlap,bar_32_overlap,bar_14_overlap,bar_17_overlap,bar_5_overlap,bar_20_overlap,bar_26_overlap,bar_2_overlap,bar_23_overlap,bar_56_overlap,bar_50_overlap,bar_44_overlap,bar_41_overlap,bar_38_overlap,bar_53_overlap,bar_35_overlap,bar_47_overlap,bar_62_overlap
u32,datetime[μs],datetime[μs],i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
0,2021-06-10 10:02:00,2021-06-10 10:05:00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2021-06-10 10:05:00,2021-06-10 10:10:00,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2021-06-10 10:10:00,2021-06-10 10:14:00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2021-06-10 10:14:00,2021-06-10 10:17:00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2021-06-10 10:17:00,2021-06-10 10:21:00,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
49,2021-06-10 16:20:00,2021-06-10 16:22:00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
50,2021-06-10 16:22:00,2021-06-10 16:26:00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
51,2021-06-10 16:26:00,2021-06-10 16:43:00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
52,2021-06-10 16:43:00,2021-06-10 16:45:00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
bars_df

bar_index,open_price,high_price,low_price,close_price,datetime_start,datetime_end,datetime_start_index,datetime_end_index
i32,f64,f64,f64,f64,datetime[μs],datetime[μs],u32,u32
27,99.92,99.98,99.77,99.91,2021-06-10 12:44:00,2021-06-10 12:55:00,25,26
18,99.9,100.01,99.65,99.68,2021-06-10 11:09:00,2021-06-10 11:13:00,16,17
9,100.85,101.08,100.72,100.98,2021-06-10 10:28:00,2021-06-10 10:32:00,7,8
30,99.93,99.95,99.64,99.65,2021-06-10 13:24:00,2021-06-10 13:36:00,28,29
3,102.5,102.59,102.01,102.07,2021-06-10 10:05:00,2021-06-10 10:10:00,1,2
…,…,…,…,…,…,…,…,…
38,99.98,100.01,99.84,100.01,2021-06-10 15:02:00,2021-06-10 15:14:00,36,37
53,100.08,100.08,100.08,100.08,2021-06-10 16:10:00,2021-06-10 16:10:00,44,44
35,99.97,100.0,99.73,99.77,2021-06-10 14:24:00,2021-06-10 14:36:00,33,34
47,100.08,100.08,100.08,100.08,2021-06-10 16:10:00,2021-06-10 16:10:00,44,44


In [9]:
bars_start_times = pdf.select("datetime_start").unique()
bars_end_times = pdf.select("datetime_end").unique()
bars_end_times

datetime_end
datetime[μs]
2021-06-10 16:10:00
2021-06-10 16:20:00
2021-06-10 10:08:00
2021-06-10 15:38:00
2021-06-10 16:43:00
…
2021-06-10 10:47:00
2021-06-10 13:43:00
2021-06-10 14:58:00
2021-06-10 13:02:00
