In [1]:
import os
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt

from utils.paths import ROOT, DATA_DIR, RAW_DIR, PROCESSED_DIR, EQUITY_DIR

## Preprocess data

#### Read a CSV file

In [2]:
# Load the raw multi-ticker export from the processed-data directory.
# The first data rows still carry header material (ticker symbols and a "Date"
# marker, see the preview below), so every column mixes text and numbers.
# low_memory=False makes pandas infer each column's dtype from the whole file
# in one pass instead of chunk by chunk, which avoids the mixed-type warning
# the default chunked parser raises on this file (presumably a DtypeWarning).
stocks_eq_pth = os.path.join(PROCESSED_DIR, "tech_stocks_us_equity_1d.csv")
stocks_eq_df = pd.read_csv(stocks_eq_pth, low_memory=False)
stocks_eq_df.head()

  stocks_eq_df = pd.read_csv(stocks_eq_pth)


Unnamed: 0,Price,Price.1,Price.2,Close,Close.1,Close.2,Close.3,Close.4,Close.5,Close.6,...,Volume.72,Volume.73,Volume.74,Volume.75,Volume.76,Volume.77,Volume.78,Volume.79,Volume.80,Volume.81
0,Ticker,ANSS,JNPR,AAPL,ACN,ADBE,ADI,ADP,ADSK,AKAM,...,TDY,TEL,TER,TRMB,TXN,TYL,UBER,VRSN,WDC,ZBRA
1,Date,,,,,,,,,,...,,,,,,,,,,
2,2015-01-02,,,24.26104164123535,74.25262451171875,72.33999633789062,44.51115036010742,65.98639678955078,59.529998779296875,63.25,...,145700,948900,1030300,1106000,4020100,216200,,684500,1278547,411800
3,2015-01-05,,,23.57756996154785,72.99890899658203,71.9800033569336,43.70171356201172,65.93885803222656,58.65999984741211,61.939998626708984,...,266200,953400,1877000,1254900,5599300,270300,,874400,2199752,420300
4,2015-01-06,,,23.57979393005371,72.47238159179688,70.52999877929688,42.67587661743164,65.3922348022461,57.5,60.66999816894531,...,243000,1448800,2377700,2291600,5468400,239600,,1289300,2665713,527500


#### Parse Columns

In [3]:
# Build one (Feature, Ticker) pair per column:
#   * the feature is the header text before the first "." (the preview above
#     shows headers such as "Close.3", where the suffix only disambiguates
#     repeated names);
#   * the ticker is read from row 0 of the same column.
parsed_columns = [
    (col_name.split('.')[0], stocks_eq_df.loc[0, col_name])
    for col_name in stocks_eq_df.columns
]

# Replace the flat header with a two-level (Feature, Ticker) column index.
new_index = pd.MultiIndex.from_tuples(parsed_columns, names=['Feature', 'Ticker'])
stocks_eq_df.columns = new_index

stocks_eq_df.head()

Feature,Price,Price,Price,Close,Close,Close,Close,Close,Close,Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Ticker,Ticker,ANSS,JNPR,AAPL,ACN,ADBE,ADI,ADP,ADSK,AKAM,...,TDY,TEL,TER,TRMB,TXN,TYL,UBER,VRSN,WDC,ZBRA
0,Ticker,ANSS,JNPR,AAPL,ACN,ADBE,ADI,ADP,ADSK,AKAM,...,TDY,TEL,TER,TRMB,TXN,TYL,UBER,VRSN,WDC,ZBRA
1,Date,,,,,,,,,,...,,,,,,,,,,
2,2015-01-02,,,24.26104164123535,74.25262451171875,72.33999633789062,44.51115036010742,65.98639678955078,59.529998779296875,63.25,...,145700,948900,1030300,1106000,4020100,216200,,684500,1278547,411800
3,2015-01-05,,,23.57756996154785,72.99890899658203,71.9800033569336,43.70171356201172,65.93885803222656,58.65999984741211,61.939998626708984,...,266200,953400,1877000,1254900,5599300,270300,,874400,2199752,420300
4,2015-01-06,,,23.57979393005371,72.47238159179688,70.52999877929688,42.67587661743164,65.3922348022461,57.5,60.66999816894531,...,243000,1448800,2377700,2291600,5468400,239600,,1289300,2665713,527500


#### Remove the 2 leftover header rows

In [4]:
# Rows 0 and 1 are header leftovers (the ticker symbols and the "Date" marker),
# so keep everything from row 2 onward and renumber the rows from zero.
steq_df = stocks_eq_df.iloc[2:].reset_index(drop=True)
steq_df.head()

Feature,Price,Price,Price,Close,Close,Close,Close,Close,Close,Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Ticker,Ticker,ANSS,JNPR,AAPL,ACN,ADBE,ADI,ADP,ADSK,AKAM,...,TDY,TEL,TER,TRMB,TXN,TYL,UBER,VRSN,WDC,ZBRA
0,2015-01-02,,,24.26104164123535,74.25262451171875,72.33999633789062,44.51115036010742,65.98639678955078,59.52999877929688,63.25,...,145700,948900,1030300,1106000,4020100,216200,,684500,1278547,411800
1,2015-01-05,,,23.57756996154785,72.99890899658203,71.9800033569336,43.70171356201172,65.93885803222656,58.65999984741211,61.93999862670898,...,266200,953400,1877000,1254900,5599300,270300,,874400,2199752,420300
2,2015-01-06,,,23.57979393005371,72.47238159179688,70.52999877929688,42.67587661743164,65.3922348022461,57.5,60.66999816894531,...,243000,1448800,2377700,2291600,5468400,239600,,1289300,2665713,527500
3,2015-01-07,,,23.910430908203125,73.99351501464844,71.11000061035156,43.12467956542969,65.962646484375,57.380001068115234,60.15999984741211,...,245300,1929000,902400,2096400,4577100,220300,,552400,2054222,467800
4,2015-01-08,,,24.829124450683597,75.1218490600586,72.91999816894531,43.88603591918945,67.47581481933594,58.79999923706055,61.56999969482422,...,240300,1635300,1626800,2036000,6295200,264700,,1282600,3351291,324400


#### Set Indexes

In [5]:
# The first column holds the trading dates as text; grab its column label.
first_col_name = steq_df.columns[0]

# Parse the dates, promote that column to the row index (dropping it from the
# columns) and name the resulting index 'Date'.
steq_df[first_col_name] = pd.to_datetime(steq_df[first_col_name])
steq_df = steq_df.set_index(first_col_name).rename_axis(index='Date')

steq_df.head()

Feature,Price,Price,Close,Close,Close,Close,Close,Close,Close,Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Ticker,ANSS,JNPR,AAPL,ACN,ADBE,ADI,ADP,ADSK,AKAM,AMAT,...,TDY,TEL,TER,TRMB,TXN,TYL,UBER,VRSN,WDC,ZBRA
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2015-01-02,,,24.26104164123535,74.25262451171875,72.33999633789062,44.51115036010742,65.98639678955078,59.52999877929688,63.25,21.80723571777344,...,145700,948900,1030300,1106000,4020100,216200,,684500,1278547,411800
2015-01-05,,,23.57756996154785,72.99890899658203,71.9800033569336,43.70171356201172,65.93885803222656,58.65999984741211,61.93999862670898,21.108293533325195,...,266200,953400,1877000,1254900,5599300,270300,,874400,2199752,420300
2015-01-06,,,23.57979393005371,72.47238159179688,70.52999877929688,42.67587661743164,65.3922348022461,57.5,60.66999816894531,20.313232421875,...,243000,1448800,2377700,2291600,5468400,239600,,1289300,2665713,527500
2015-01-07,,,23.910430908203125,73.99351501464844,71.11000061035156,43.12467956542969,65.962646484375,57.380001068115234,60.15999984741211,20.50544166564941,...,245300,1929000,902400,2096400,4577100,220300,,552400,2054222,467800
2015-01-08,,,24.829124450683597,75.1218490600586,72.91999816894531,43.88603591918945,67.47581481933594,58.79999923706055,61.56999969482422,21.16071128845215,...,240300,1635300,1626800,2036000,6295200,264700,,1282600,3351291,324400


#### Drop Price Column

In [6]:
# Naming: "steq" = stock equity, "filt" = filtered.
# Remove every column whose top-level (Feature) label is "Price".
steq_df = steq_df.drop("Price", axis=1, level="Feature")
steq_df.head()

Feature,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Ticker,AAPL,ACN,ADBE,ADI,ADP,ADSK,AKAM,AMAT,AMD,ANET,...,TDY,TEL,TER,TRMB,TXN,TYL,UBER,VRSN,WDC,ZBRA
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2015-01-02,24.26104164123535,74.25262451171875,72.33999633789062,44.51115036010742,65.98639678955078,59.52999877929688,63.25,21.80723571777344,2.6700000762939453,3.9175000190734863,...,145700,948900,1030300,1106000,4020100,216200,,684500,1278547,411800
2015-01-05,23.57756996154785,72.99890899658203,71.9800033569336,43.70171356201172,65.93885803222656,58.65999984741211,61.93999862670898,21.108293533325195,2.6600000858306885,3.987499952316284,...,266200,953400,1877000,1254900,5599300,270300,,874400,2199752,420300
2015-01-06,23.57979393005371,72.47238159179688,70.52999877929688,42.67587661743164,65.3922348022461,57.5,60.66999816894531,20.313232421875,2.630000114440918,3.988125085830689,...,243000,1448800,2377700,2291600,5468400,239600,,1289300,2665713,527500
2015-01-07,23.910430908203125,73.99351501464844,71.11000061035156,43.12467956542969,65.962646484375,57.380001068115234,60.15999984741211,20.50544166564941,2.5799999237060547,4.071249961853027,...,245300,1929000,902400,2096400,4577100,220300,,552400,2054222,467800
2015-01-08,24.829124450683597,75.1218490600586,72.91999816894531,43.88603591918945,67.47581481933594,58.79999923706055,61.56999969482422,21.16071128845215,2.609999895095825,4.132500171661377,...,240300,1635300,1626800,2036000,6295200,264700,,1282600,3351291,324400


#### Check feature nums

In [7]:
# Pull each column level once and reuse it for both the counts and the listings.
feature_level = steq_df.columns.get_level_values("Feature")
ticker_level = steq_df.columns.get_level_values("Ticker")

num_features = feature_level.nunique()
num_tickers = ticker_level.nunique()

print(f"Number of Features: {num_features}")
print(f"Number of Tickers: {num_tickers}")

# Distinct labels: all features, and only the first five tickers as a sample.
print("\nUnique Features:")
print(feature_level.unique())

print("\nUnique Tickers (sample):")
print(ticker_level.unique()[:5])

Number of Features: 5
Number of Tickers: 82

Unique Features:
Index(['Close', 'High', 'Low', 'Open', 'Volume'], dtype='object', name='Feature')

Unique Tickers (sample):
Index(['AAPL', 'ACN', 'ADBE', 'ADI', 'ADP'], dtype='object', name='Ticker')


#### Save File for Backup

In [8]:
# Write the cleaned frame (Date index, two-level column header) next to the
# raw export as a "v1" backup.
steq_pth = os.path.join(
    PROCESSED_DIR,
    "v1_tech_stocks_us_equity_1d.csv",
)
steq_df.to_csv(path_or_buf=steq_pth)

## Process
* steq_df = stock equity dataframe
* Number of Features: 5
* Number of Tickers: 82
* Unique Features: Index(['Close', 'High', 'Low', 'Open', 'Volume'], dtype='object', name='Feature')
* Unique Tickers (sample): Index(['AAPL', 'ACN', 'ADBE', 'ADI', 'ADP'], dtype='object', name='Ticker')

### Information Driven Bars

#### Tick Imbalance Bars

$
b_t = 
\begin{cases} 
    b_{t-1} & \text{if } \Delta p_t = 0 \\ 
    \frac{|\Delta p_t|}{\Delta p_t} & \text{if } \Delta p_t \neq 0 
\end{cases}
$ <br>  
$b_t \in \{-1, 1\}$, where +1 = buy (price up) and -1 = sell (price down). <br>
$b_t$ is the direction of the price move at step $t$.

The close price $C_t$ is used as the price $p_t$.

In [9]:
# Keep only the columns whose top-level (Feature) label is Close or Volume.
features_to_select = ['Close', 'Volume']
steq_df_selected = steq_df.loc[:, features_to_select]

steq_df_selected.head(2)

Feature,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Ticker,AAPL,ACN,ADBE,ADI,ADP,ADSK,AKAM,AMAT,AMD,ANET,...,TDY,TEL,TER,TRMB,TXN,TYL,UBER,VRSN,WDC,ZBRA
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2015-01-02,24.26104164123535,74.25262451171875,72.33999633789062,44.51115036010742,65.98639678955078,59.52999877929688,63.25,21.80723571777344,2.6700000762939453,3.9175000190734863,...,145700,948900,1030300,1106000,4020100,216200,,684500,1278547,411800
2015-01-05,23.57756996154785,72.99890899658203,71.9800033569336,43.70171356201172,65.93885803222656,58.65999984741211,61.93999862670898,21.108293533325195,2.6600000858306885,3.987499952316284,...,266200,953400,1877000,1254900,5599300,270300,,874400,2199752,420300


In [10]:
def trade_direction_signal(price_series, init_dir=1):
    """
    Compute the trade direction signal b_t from a price Series (tick rule).

    +1 = buy pressure (price went up), -1 = sell pressure (price went down).
    If the price did not change, the previous direction is carried forward.

    Missing prices are treated as "no new observation": the last known price
    is held, so a gap carries the previous direction forward instead of
    producing an artificial jump to zero and back.

    Parameters
    ----------
    price_series : pd.Series
        Series of closing prices (single symbol). Must be convertible to float.
    init_dir : int
        Direction assigned to the first observed price, which has no
        predecessor to compare against (default +1).

    Returns
    -------
    pd.Series
        Series named "Bt" on the same index as ``price_series``. Values are
        +1/-1 from the first observed price onward, and 0 for any rows before
        the first observed price (no signal yet).
    """
    # Hold the last known price across gaps; leading NaNs stay NaN.
    prices = price_series.astype(float).ffill().to_numpy()
    b = np.zeros(len(prices), dtype=float)

    observed = ~np.isnan(prices)
    if observed.any():
        # Position of the first real price; everything before it stays 0.
        start = int(np.argmax(observed))

        # Sign of each price change: +1, -1, or 0 when the price is flat.
        step = np.sign(np.diff(prices[start:]))
        # Flat steps become NaN so the forward fill below repeats the
        # previous direction, matching the b_t = b_{t-1} rule.
        step[step == 0] = np.nan

        # Seed the sequence with init_dir for the first observed price.
        signs = np.concatenate(([float(init_dir)], step))
        b[start:] = pd.Series(signs).ffill().to_numpy()

    return pd.Series(b, index=price_series.index, name="Bt")

In [11]:
# Work on a copy so steq_df itself is left untouched.
imba_bar_steq_df = steq_df.copy()

# Cast all close prices to float once, then add one ('Bt', ticker) column per
# ticker holding that ticker's direction signal.
close_prices = steq_df['Close'].astype(float)
for ticker, prices in close_prices.items():
    imba_bar_steq_df[('Bt', ticker)] = trade_direction_signal(prices)

# Drop the first row: it has no previous price to compare against.
imba_bar_steq_df = imba_bar_steq_df.iloc[1:]
imba_bar_steq_df.head(2)

Feature,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,...,Bt,Bt,Bt,Bt,Bt,Bt,Bt,Bt,Bt,Bt
Ticker,AAPL,ACN,ADBE,ADI,ADP,ADSK,AKAM,AMAT,AMD,ANET,...,TDY,TEL,TER,TRMB,TXN,TYL,UBER,VRSN,WDC,ZBRA
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2015-01-05,23.57756996154785,72.99890899658203,71.9800033569336,43.70171356201172,65.93885803222656,58.65999984741211,61.93999862670898,21.108293533325195,2.6600000858306885,3.987499952316284,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0
2015-01-06,23.57979393005371,72.47238159179688,70.52999877929688,42.67587661743164,65.3922348022461,57.5,60.66999816894531,20.313232421875,2.630000114440918,3.988125085830689,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0


$\theta_T = \sum_{t=1}^T b_t$ <br>

In [14]:
# theta_T is the running sum of b_t, computed for every ticker at once.
theta_df = imba_bar_steq_df['Bt'].astype(float).cumsum()

# Label the new block ('Theta', ticker) and reuse the frame's existing column
# level names; an unnamed MultiIndex here makes concat discard the level names
# for the whole frame.
theta_df.columns = pd.MultiIndex.from_product(
    [['Theta'], theta_df.columns],
    names=imba_bar_steq_df.columns.names,
)

# Drop any Theta block left by a previous run of this cell so that re-running
# does not append duplicate columns, then attach the fresh one.
imba_bar_steq_df = pd.concat(
    [imba_bar_steq_df.drop(columns='Theta', level=0, errors='ignore'), theta_df],
    axis=1,
)
imba_bar_steq_df.tail(5)

Unnamed: 0_level_0,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,...,Theta,Theta,Theta,Theta,Theta,Theta,Theta,Theta,Theta,Theta
Unnamed: 0_level_1,AAPL,ACN,ADBE,ADI,ADP,ADSK,AKAM,AMAT,AMD,ANET,...,TDY,TEL,TER,TRMB,TXN,TYL,UBER,VRSN,WDC,ZBRA
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2025-10-24,262.820007,247.649994,353.519989,238.009995,280.940002,312.880005,75.050003,228.75,252.919998,153.820007,...,217.0,169.0,185.0,115.0,185.0,205.0,-17.0,235.0,71.0,131.0
2025-10-27,268.809998,250.770004,357.799988,243.009995,280.529999,313.529999,75.489998,231.330002,259.670013,156.809998,...,216.0,170.0,186.0,114.0,186.0,206.0,-16.0,236.0,70.0,132.0
2025-10-28,269.0,253.350006,359.910004,239.350006,279.630005,311.079987,75.75,227.639999,258.01001,156.770004,...,215.0,169.0,185.0,113.0,185.0,207.0,-17.0,235.0,69.0,131.0
2025-10-29,269.700012,247.75,337.859985,235.039993,261.220001,297.079987,73.010002,235.75,264.329987,162.029999,...,214.0,170.0,186.0,112.0,184.0,206.0,-16.0,234.0,70.0,130.0
2025-10-30,271.399994,249.25,339.23999,232.899994,261.529999,300.769989,73.93,232.550003,254.839996,158.440002,...,215.0,171.0,187.0,111.0,185.0,207.0,-15.0,235.0,69.0,129.0


$T^* = \arg\min_{T} \left\{ |\theta_T| \geq E_0[T] \, \left| 2P[b_t = 1] - 1 \right| \right\}$

In [22]:
# For each ticker, count the dates on which |theta| is at or above the threshold.
threshold = 100
for ticker, theta in imba_bar_steq_df['Theta'].items():
    exceeds = theta.abs() >= threshold
    bar_idx = imba_bar_steq_df.index[exceeds]
    print(f"{ticker}: {len(bar_idx)}")


AAPL: 1285
ACN: 1897
ADBE: 1862
ADI: 1015
ADP: 1690
ADSK: 1366
AKAM: 1481
AMAT: 1369
AMD: 0
ANET: 1942
ANSS: 0
APH: 1684
AVGO: 1281
BR: 1453
CDNS: 1703
CDW: 1803
CPAY: 1454
CRM: 1356
CRWD: 0
CSCO: 893
CTSH: 0
DAY: 1351
DELL: 1734
ENPH: 0
EPAM: 1592
FFIV: 963
FI: 1844
FICO: 1645
FIS: 1617
FSLR: 0
FTNT: 1504
FTV: 0
GDDY: 1839
GEN: 216
GLW: 282
GRMN: 1190
HPE: 1036
HPQ: 1238
IBM: 579
INTC: 7
INTU: 1900
IT: 1623
JBL: 1108
JKHY: 1842
JNPR: 0
KEYS: 1236
KLAC: 1698
LDOS: 1597
LRCX: 1953
MCHP: 1071
MPWR: 1874
MSFT: 1579
MSI: 1496
MU: 0
NOW: 1921
NTAP: 1153
NVDA: 1737
NXPI: 0
ON: 1272
ORCL: 1538
PANW: 1391
PAYC: 1898
PAYX: 1613
PLTR: 0
PTC: 1802
QCOM: 87
QRVO: 0
ROP: 1896
SMCI: 1290
SNPS: 2041
STX: 1823
SWKS: 1361
TDY: 1503
TEL: 1261
TER: 1978
TRMB: 134
TXN: 1963
TYL: 1879
UBER: 0
VRSN: 1885
WDC: 0
ZBRA: 1217


In [25]:
# Map each ticker to the dates on which |theta| is at or above the threshold.
# NOTE(review): theta is a running sum that is never reset after a crossing, so
# every date past the first crossing is selected as long as |theta| stays at or
# above the threshold - confirm this is the intended bar definition.
threshold = 150
bar_idx_dict = {}

for ticker, theta in imba_bar_steq_df['Theta'].items():
    cond = theta.abs() >= threshold
    bar_idx_dict[ticker] = imba_bar_steq_df.index[cond]
bar_idx_dict

{'AAPL': DatetimeIndex(['2024-08-09', '2024-08-12', '2024-08-13', '2024-08-14',
                '2024-08-15', '2024-08-16', '2024-08-19', '2024-08-20',
                '2024-08-21', '2024-08-22',
                ...
                '2025-10-17', '2025-10-20', '2025-10-21', '2025-10-22',
                '2025-10-23', '2025-10-24', '2025-10-27', '2025-10-28',
                '2025-10-29', '2025-10-30'],
               dtype='datetime64[ns]', name='Date', length=308, freq=None),
 'ACN': DatetimeIndex(['2019-08-19', '2019-08-21', '2019-08-22', '2019-08-23',
                '2019-08-26', '2019-08-27', '2019-08-28', '2019-08-29',
                '2019-08-30', '2019-09-03',
                ...
                '2025-10-17', '2025-10-20', '2025-10-21', '2025-10-22',
                '2025-10-23', '2025-10-24', '2025-10-27', '2025-10-28',
                '2025-10-29', '2025-10-30'],
               dtype='datetime64[ns]', name='Date', length=1555, freq=None),
 'ADBE': DatetimeIndex(['2020-02-14', 