In [49]:
import pandas as pd 
import os 
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.stattools import coint



In [51]:
pairs_data = {}
pairs_key = {}

current_pair = 'pair1'
lookback = 100 
p_threshold = 0.05

pairs_data[current_pair] = [
    pd.read_csv(f"data/{f}", index_col=0) for f in os.listdir('data')
    if f.startswith(current_pair) and f.endswith('.csv')
]

pairs_key[current_pair] = [
    f.split('.')[0] for f in os.listdir('data')
    if f.startswith(current_pair) and f.endswith('.csv')
]

In [52]:

asset1 = pairs_data[current_pair][0].ffill()
asset2 = pairs_data[current_pair][1].ffill()
display(asset1)

Unnamed: 0_level_0,rtype,publisher_id,instrument_id,open,high,low,close,volume,symbol
ts_event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2025-10-20 00:00:00+00:00,33,1,651434,56.95,56.96,56.91,56.95,73,CLZ5
2025-10-20 00:01:00+00:00,33,1,651434,56.96,56.99,56.96,56.99,22,CLZ5
2025-10-20 00:02:00+00:00,33,1,651434,56.97,56.98,56.96,56.97,19,CLZ5
2025-10-20 00:03:00+00:00,33,1,651434,56.98,57.01,56.97,57.00,70,CLZ5
2025-10-20 00:04:00+00:00,33,1,651434,57.01,57.01,56.95,56.96,49,CLZ5
...,...,...,...,...,...,...,...,...,...
2025-10-24 20:55:00+00:00,33,1,651434,61.47,61.48,61.46,61.47,8,CLZ5
2025-10-24 20:56:00+00:00,33,1,651434,61.46,61.47,61.45,61.46,32,CLZ5
2025-10-24 20:57:00+00:00,33,1,651434,61.45,61.46,61.45,61.45,55,CLZ5
2025-10-24 20:58:00+00:00,33,1,651434,61.45,61.45,61.44,61.44,52,CLZ5


In [58]:
### getting close price at each candle 

asset1_close = asset1['close'].to_frame(name=pairs_key[current_pair][0])
asset2_close = asset2['close'].to_frame(name=pairs_key[current_pair][1])

pair_close = asset1_close.join(asset2_close, how='outer').dropna()

In [59]:
pair_close

Unnamed: 0_level_0,pair1_wti_oil_future,pair1_brent_oil_future
ts_event,Unnamed: 1_level_1,Unnamed: 2_level_1
2025-10-20 00:00:00+00:00,56.95,60.90
2025-10-20 00:01:00+00:00,56.99,60.90
2025-10-20 00:02:00+00:00,56.97,60.93
2025-10-20 00:03:00+00:00,57.00,60.97
2025-10-20 00:04:00+00:00,56.96,60.97
...,...,...
2025-10-24 20:55:00+00:00,61.47,65.13
2025-10-24 20:56:00+00:00,61.46,65.11
2025-10-24 20:57:00+00:00,61.45,65.12
2025-10-24 20:58:00+00:00,61.44,65.10


In [60]:
### verifying cointegration and stationarity
pair_close["cointegrated"] = 0
pair_close["residual"] = 0.0
pair_close["zscore"] = 0.0 

is_cointegrated = False 
lr = LinearRegression()

for i in range(lookback, len(pair_close), lookback):  
    x = pair_close[pairs_key[current_pair][0]].iloc[i-lookback:i].values[:,None]
    y = pair_close[pairs_key[current_pair][1]].iloc[i-lookback:i].values[:,None]

    if is_cointegrated:
        # Compute and normalize signal on forward window
        x_new = pair_close[pairs_key[current_pair][0]].iloc[i:i+lookback].values[:,None]
        y_new = pair_close[pairs_key[current_pair][1]].iloc[i:i+lookback].values[:,None]
        spread_back = y - lr.coef_ * x
        spread_forward = y_new - lr.coef_ * x_new
        zscore = (spread_forward - spread_back.mean()) / spread_back.std()
        pair_close.iloc[i:i+lookback, pair_close.columns.get_loc("cointegrated")] = 1
        pair_close.iloc[i:i+lookback, pair_close.columns.get_loc("residual")] = spread_forward
        pair_close.iloc[i:i+lookback, pair_close.columns.get_loc("zscore")] = zscore

    _, p, _ = coint(x,y)
    is_cointegrated = p < p_threshold
    lr.fit(x,y)



In [75]:
blocks = (pair_close['cointegrated'].diff().fillna(0) != 0).cumsum()
coint_blocks = blocks[pair_close['cointegrated'] == 1]
coint_period_ids = coint_blocks.unique()
num_plots = len(coint_period_ids)

In [100]:
# get unique dates from blocks

unique_dates = pd.to_datetime(blocks.index).normalize().unique()
unique_dates

DatetimeIndex(['2025-10-20 00:00:00+00:00', '2025-10-21 00:00:00+00:00',
               '2025-10-22 00:00:00+00:00', '2025-10-23 00:00:00+00:00',
               '2025-10-24 00:00:00+00:00'],
              dtype='datetime64[ns, UTC]', name='ts_event', freq=None)

In [68]:
# getting asset 1 with coint block 
asset1_coint_blocks = asset1.loc[coint_blocks.index]
asset1_coint_blocks

Unnamed: 0_level_0,rtype,publisher_id,instrument_id,open,high,low,close,volume,symbol
ts_event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2025-10-20 03:50:00+00:00,33,1,651434,57.01,57.02,57.01,57.02,21,CLZ5
2025-10-20 03:53:00+00:00,33,1,651434,57.00,57.00,56.98,56.99,6,CLZ5
2025-10-20 03:54:00+00:00,33,1,651434,56.99,56.99,56.98,56.99,6,CLZ5
2025-10-20 03:55:00+00:00,33,1,651434,56.98,57.00,56.98,56.99,4,CLZ5
2025-10-20 03:56:00+00:00,33,1,651434,56.98,56.98,56.97,56.97,21,CLZ5
...,...,...,...,...,...,...,...,...,...
2025-10-24 15:57:00+00:00,33,1,651434,62.28,62.29,62.26,62.27,114,CLZ5
2025-10-24 15:58:00+00:00,33,1,651434,62.27,62.29,62.23,62.28,291,CLZ5
2025-10-24 15:58:00+00:00,33,1,651434,62.27,62.29,62.23,62.28,291,CLZ5
2025-10-24 15:59:00+00:00,33,1,651434,62.28,62.29,62.27,62.27,174,CLZ5


In [78]:
for i, block_id in enumerate(coint_period_ids):
    # Create a mask for the current cointegrated period
    mask = (blocks == block_id) & (pair_close['cointegrated'] == 1)
    period_df = pair_close[mask]
    display(period_df)
    display(asset1.loc[period_df.index])
    break

Unnamed: 0_level_0,pair1_wti_oil_future,pair1_brent_oil_future,cointegrated,residual,zscore
ts_event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-10-20 03:50:00+00:00,57.02,60.98,1,5.678195,-1.279125
2025-10-20 03:53:00+00:00,56.99,60.96,1,5.687291,-0.322497
2025-10-20 03:54:00+00:00,56.99,60.96,1,5.687291,-0.322497
2025-10-20 03:55:00+00:00,56.99,60.96,1,5.687291,-0.322497
2025-10-20 03:56:00+00:00,56.97,60.95,1,5.696689,0.665822
...,...,...,...,...,...
2025-10-20 05:54:00+00:00,56.83,60.79,1,5.672470,-1.881256
2025-10-20 05:55:00+00:00,56.82,60.78,1,5.672169,-1.912947
2025-10-20 05:56:00+00:00,56.81,60.78,1,5.681867,-0.892937
2025-10-20 05:57:00+00:00,56.81,60.78,1,5.681867,-0.892937


Unnamed: 0_level_0,rtype,publisher_id,instrument_id,open,high,low,close,volume,symbol
ts_event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2025-10-20 03:50:00+00:00,33,1,651434,57.01,57.02,57.01,57.02,21,CLZ5
2025-10-20 03:53:00+00:00,33,1,651434,57.00,57.00,56.98,56.99,6,CLZ5
2025-10-20 03:54:00+00:00,33,1,651434,56.99,56.99,56.98,56.99,6,CLZ5
2025-10-20 03:55:00+00:00,33,1,651434,56.98,57.00,56.98,56.99,4,CLZ5
2025-10-20 03:56:00+00:00,33,1,651434,56.98,56.98,56.97,56.97,21,CLZ5
...,...,...,...,...,...,...,...,...,...
2025-10-20 05:54:00+00:00,33,1,651434,56.83,56.83,56.83,56.83,23,CLZ5
2025-10-20 05:55:00+00:00,33,1,651434,56.83,56.83,56.82,56.82,46,CLZ5
2025-10-20 05:56:00+00:00,33,1,651434,56.82,56.82,56.81,56.81,38,CLZ5
2025-10-20 05:57:00+00:00,33,1,651434,56.82,56.82,56.81,56.81,28,CLZ5
