In [116]:
import pandas as pd
# data types
from typing import Dict, Tuple, List
import glob
import os
import matplotlib.pyplot as plt

In [7]:
def load_raw_data_dic(folder_path: str) -> Dict:
    '''
    Load every CSV file found directly inside a folder into a dictionary of DataFrames.

    Parameters
    ----------
    folder_path : str
        Directory searched (non-recursively) for files matching ``*.csv``.

    Returns
    -------
    Dict
        Maps each file's base name, with only its trailing extension removed,
        to the DataFrame produced by ``pd.read_csv``. Empty when the folder
        contains no CSV files.

    Notes
    -----
    The loaded keys are printed as a quick sanity check.
    '''
    # glob yields matches in arbitrary, OS-dependent order; sorting makes the
    # resulting dictionary order reproducible between runs and machines.
    csv_files = sorted(glob.glob(os.path.join(folder_path, '*.csv')))

    dataframes_dic = {}
    for file in csv_files:
        # splitext drops only the final extension. str.replace('.csv', '')
        # would also delete '.csv' appearing in the middle of a file name.
        file_name = os.path.splitext(os.path.basename(file))[0]
        dataframes_dic[file_name] = pd.read_csv(file)

    # Show which frames were loaded
    print(dataframes_dic.keys())

    return dataframes_dic

In [9]:
# Load every CSV in the relative "data" folder into a dict of DataFrames keyed by file name.
data = load_raw_data_dic("data")

dict_keys(['amex', 'United_Health', 'Amgen', 'Merck_Co', 'Walmart', 'Apple', 'Verizon_Communications', 'Intel', 'Johnson_Johnson', 'Home_Depot', 'Caterpillar', 'Visa', 'JP_Morgan_Chase', 'Disney', 'IBM', 'Honeywell', 'Nike', 'Chevron_corp', '3M', 'Microsoft', 'Boeing', 'Coca_Cola_Company', 'Salesforce', 'Goldman_Sachs', 'Cisco', 'Travelers', 'McDonalds', 'Proctor_Gamble'])


In [11]:
# Pick the IBM frame out of the loaded dictionary; later cells use it as the working example.
IBM = data["IBM"]

In [42]:
# Sanity check: 381 bars per day x 2516 days should match the row count reported by IBM.info().
381*2516

958596

In [22]:
# Quick inspection of the raw frame: schema, summary statistics and the first 381 rows.
# NOTE(review): IBM.info() prints by itself and returns None, hence the stray "None" in the output;
# printing 381 rows also floods the cell output - a short head() is usually enough.
print(IBM.info())
print(IBM.describe(include="all"))
print(IBM.head(381))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958596 entries, 0 to 958595
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Date Time  958596 non-null  object 
 1   Open       958596 non-null  float64
 2   High       958596 non-null  float64
 3   Low        958596 non-null  float64
 4   Close      958596 non-null  float64
 5   Volume     958596 non-null  float64
 6   Date       958596 non-null  object 
 7   Time       958596 non-null  object 
dtypes: float64(5), object(3)
memory usage: 58.5+ MB
None
                  Date Time           Open           High            Low  \
count                958596  958596.000000  958596.000000  958596.000000   
unique               958596            NaN            NaN            NaN   
top     2010-01-04 09:35:00            NaN            NaN            NaN   
freq                      1            NaN            NaN            NaN   
mean                    NaN     117.038388

In [32]:
import pandas as pd
import numpy as np
from typing import Tuple, Dict

def clean_and_label(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build minute-level returns and attach day-level realized-variance labels.

    The input needs a "Date Time" column plus Open/High/Low/Close/Volume.
    Steps: parse and sort timestamps, compute 1-minute log returns inside each
    calendar day (first bar of a day gets 0), clip returns to +/-0.05, keep
    only days whose bar count equals the most common count, compute each day's
    realized variance (sum of squared returns), its log, and the following
    kept day's log-RV as the label. The final kept day has no label and is
    removed.

    Returns a minute-level frame with columns
    ts, day, Open, High, Low, Close, Volume, ret_1m, rv, log_rv, log_rv_tplus1.
    """
    minute = df.copy()

    # Timestamp parsing and chronological ordering
    minute["ts"] = pd.to_datetime(minute["Date Time"])
    minute = minute.sort_values("ts").reset_index(drop=True)
    minute["day"] = minute["ts"].dt.date

    # Log returns are differenced per day, so no overnight return leaks in;
    # the clip acts as a light winsorization against bad ticks.
    minute["log_close"] = np.log(minute["Close"].astype(float))
    intraday_diff = minute.groupby("day")["log_close"].diff()
    minute["ret_1m"] = intraday_diff.fillna(0.0).clip(-0.05, 0.05)

    # Keep only days with the typical (modal) number of bars, e.g. drop half-days
    bars_by_day = minute.groupby("day")["ret_1m"].size()
    typical_bars = bars_by_day.mode().iat[0]
    full_days = bars_by_day.index[bars_by_day == typical_bars]
    minute = minute[minute["day"].isin(full_days)].copy()

    # One row per day: realized variance, its log, and the next day's log as target
    per_day = minute.groupby("day")["ret_1m"].agg(
        rv=lambda r: np.sum(np.square(r)),
    ).reset_index()
    per_day["log_rv"] = np.log(per_day["rv"].replace(0.0, 1e-12))
    per_day["log_rv_tplus1"] = per_day["log_rv"].shift(-1)

    # Broadcast the day-level values onto every minute row of that day
    day_level_cols = ["day", "rv", "log_rv", "log_rv_tplus1"]
    minute = minute.merge(per_day[day_level_cols], on="day", how="left")

    # The final day has nothing to predict, so it is removed
    final_day = per_day["day"].iloc[-1]
    minute = minute[minute["day"] != final_day].copy()

    output_cols = [
        "ts", "day",
        "Open", "High", "Low", "Close", "Volume",
        "ret_1m", "rv", "log_rv", "log_rv_tplus1",
    ]
    return minute[output_cols]

def temporal_split_days(df: pd.DataFrame, train_frac=0.6, val_frac=0.2):
    """
    Split the distinct values of ``df["day"]`` chronologically into three lists.

    The sorted days are cut at ``int(n * train_frac)`` and then
    ``int(n * val_frac)`` further on; everything after the second cut is the
    test set. Because the cuts follow sorted order, no later day ends up in an
    earlier split.

    Returns (train_days, val_days, test_days).
    """
    ordered_days = sorted(df["day"].unique())
    total = len(ordered_days)

    train_end = int(total * train_frac)
    val_end = train_end + int(total * val_frac)

    return (
        ordered_days[:train_end],
        ordered_days[train_end:val_end],
        ordered_days[val_end:],
    )

def build_day_windows(
    df: pd.DataFrame,
    context_days: int = 21,
    feature_cols: Tuple[str, ...] = ("ret_1m",),
) -> Tuple[np.ndarray, np.ndarray, Dict]:
    """
    Build rolling multi-day samples from a minute-level frame.

    Each sample stacks the intraday features of ``context_days`` consecutive
    days (in sorted day order) and is labelled with the ``log_rv`` value of the
    day that follows the window.

    Parameters
    ----------
    df : pd.DataFrame
        Minute-level rows with columns "day", "ret_1m", "log_rv" and every
        name in ``feature_cols``. Rows are assumed to be in time order within
        each day - this function does not re-sort them.
    context_days : int
        Number of consecutive days per sample. Must be >= 1.
    feature_cols : tuple of str
        Columns used as per-bar input features.

    Returns
    -------
    X : np.ndarray, float32, shape [N, context_days * bars_per_day, len(feature_cols)]
    y : np.ndarray, float32, shape [N]
    meta : dict
        "bars_per_day" (modal bar count), "samples" (list of
        (first_context_day, last_context_day, target_day)) and "features".

    When no window can be built (too few days, or every window contains an
    irregular day) X and y are returned empty with the shapes above and N == 0.

    Raises
    ------
    ValueError
        If ``context_days`` is smaller than 1.
    """
    if context_days < 1:
        raise ValueError(f"context_days must be >= 1, got {context_days!r}")

    cols = list(feature_cols)

    # Canonical number of bars per day = most common day length
    day_sizes = df.groupby("day")["ret_1m"].size()
    bars_per_day = int(day_sizes.mode().iat[0]) if len(day_sizes) else 0

    # Convert every day to numpy exactly once; each day takes part in up to
    # `context_days` windows, so doing this inside the window loop repeats work.
    day_feats, day_label = {}, {}
    for d, g in df.groupby("day"):
        day_feats[d] = g[cols].to_numpy(dtype=np.float32)
        day_label[d] = g["log_rv"].iloc[0]  # constant within the day
    days = sorted(day_feats)
    is_full = {d: len(day_feats[d]) == bars_per_day for d in days}

    X_list, y_list, day_idx_list = [], [], []

    for i in range(len(days) - context_days):
        context = days[i : i + context_days]   # the input window
        target_day = days[i + context_days]    # the day right after the window

        # Windows containing a day of irregular length cannot be stacked densely
        if not all(is_full[d] for d in context):
            continue

        X_list.append(np.vstack([day_feats[d] for d in context]))
        y_list.append(day_label[target_day])
        day_idx_list.append((context[0], context[-1], target_day))

    if X_list:
        X = np.stack(X_list, axis=0)           # [N, T, d_in]
    else:
        # np.stack raises on an empty list; return a correctly shaped empty batch
        X = np.empty((0, context_days * bars_per_day, len(cols)), dtype=np.float32)
    y = np.array(y_list, dtype=np.float32)     # [N]

    meta = {"bars_per_day": bars_per_day, "samples": day_idx_list, "features": feature_cols}
    return X, y, meta


In [33]:
# Make sure Date Time is parsed as datetime
# NOTE(review): this overwrites the column on the shared IBM frame in place, so every later
# cell sees datetimes instead of the strings that were originally loaded.
IBM["Date Time"] = pd.to_datetime(IBM["Date Time"])

In [34]:
# Build a small three-day sample of IBM for eyeballing the raw bars.
# Requires the "Date Time" column to already be datetime (the .dt accessor is used below).

# Sort chronologically
df_ibm = IBM.sort_values("Date Time").reset_index(drop=True)

# Extract the date part
df_ibm["day"] = df_ibm["Date Time"].dt.date

# Get the first 3 unique days
first_3_days = df_ibm["day"].unique()[:3]

# Slice those days
df_first3 = df_ibm[df_ibm["day"].isin(first_3_days)].copy()

# Inspect
print(first_3_days)
print(df_first3.head(20))

[datetime.date(2010, 1, 4) datetime.date(2010, 1, 5)
 datetime.date(2010, 1, 6)]
             Date Time     Open     High      Low    Close   Volume  \
0  2010-01-04 09:35:00  85.2401  85.2789  85.1624  85.2207  42796.0   
1  2010-01-04 09:36:00  85.2271  85.4732  85.2142  85.4343  65921.0   
2  2010-01-04 09:37:00  85.4214  85.4732  85.3048  85.4680  60876.0   
3  2010-01-04 09:38:00  85.4732  85.5250  85.4279  85.5056  36812.0   
4  2010-01-04 09:39:00  85.4946  85.5056  85.4214  85.4538  29232.0   
5  2010-01-04 09:40:00  85.4732  85.5250  85.4214  85.4214  33407.0   
6  2010-01-04 09:41:00  85.4214  85.6092  85.4214  85.6027  35586.0   
7  2010-01-04 09:42:00  85.5865  85.6351  85.5444  85.6027  27420.0   
8  2010-01-04 09:43:00  85.5962  85.5962  85.5444  85.5703  13890.0   
9  2010-01-04 09:44:00  85.5548  85.6221  85.5509  85.6221  20309.0   
10 2010-01-04 09:45:00  85.6221  85.7646  85.6156  85.6804  51192.0   
11 2010-01-04 09:46:00  85.6610  85.6804  85.6156  85.6804  23012.0

In [35]:
# Dump the first 50 rows as a nested {column: {index: value}} dict for inspection.
df_first3.head(50).to_dict()

{'Date Time': {0: Timestamp('2010-01-04 09:35:00'),
  1: Timestamp('2010-01-04 09:36:00'),
  2: Timestamp('2010-01-04 09:37:00'),
  3: Timestamp('2010-01-04 09:38:00'),
  4: Timestamp('2010-01-04 09:39:00'),
  5: Timestamp('2010-01-04 09:40:00'),
  6: Timestamp('2010-01-04 09:41:00'),
  7: Timestamp('2010-01-04 09:42:00'),
  8: Timestamp('2010-01-04 09:43:00'),
  9: Timestamp('2010-01-04 09:44:00'),
  10: Timestamp('2010-01-04 09:45:00'),
  11: Timestamp('2010-01-04 09:46:00'),
  12: Timestamp('2010-01-04 09:47:00'),
  13: Timestamp('2010-01-04 09:48:00'),
  14: Timestamp('2010-01-04 09:49:00'),
  15: Timestamp('2010-01-04 09:50:00'),
  16: Timestamp('2010-01-04 09:51:00'),
  17: Timestamp('2010-01-04 09:52:00'),
  18: Timestamp('2010-01-04 09:53:00'),
  19: Timestamp('2010-01-04 09:54:00'),
  20: Timestamp('2010-01-04 09:55:00'),
  21: Timestamp('2010-01-04 09:56:00'),
  22: Timestamp('2010-01-04 09:57:00'),
  23: Timestamp('2010-01-04 09:58:00'),
  24: Timestamp('2010-01-04 09:59:00'

In [193]:
def preprocess_stock(df_raw, patch_len, train_frac, val_frac):
    """
    Turn one stock's minute bars into scaled per-day token sequences with next-day log-RV labels.

    Pipeline: parse and sort timestamps -> within-day 1-minute log returns ->
    daily realized variance (RV) and next-day label -> aggregate minutes into
    patches of ``patch_len`` bars -> add time-of-day and calendar encodings ->
    chronological train/val/test split -> standardise using train statistics.

    Parameters
    ----------
    df_raw : pd.DataFrame
        Minute bars with at least the columns "Date Time", "Close", "High",
        "Low" and "Volume". The frame is copied, not modified.
    patch_len : int
        Number of consecutive bars aggregated into one token; 1 keeps one
        token per minute. Must be >= 1.
    train_frac, val_frac : float
        Fractions of the usable days assigned to train and validation (in time
        order); the remaining days form the test set.

    Returns
    -------
    dict
        "X_train"/"X_val"/"X_test": float32 arrays [N_days, T_tokens, d_in],
        "y_train"/"y_val"/"y_test": float32 arrays [N_days] holding the log-RV
        of the day after each input day, and "meta" with the token count,
        feature names, modal bar count, split sizes, scaler statistics and
        ``patch_len``.

    Raises
    ------
    ValueError
        If ``patch_len`` is smaller than 1, or if no day provides a complete
        set of full patches.

    Notes
    -----
    Prints the distribution of bars per day as a data-quality check.
    """
    if patch_len < 1:
        raise ValueError(f"patch_len must be >= 1, got {patch_len!r}")

    df = df_raw.copy()

    # Parse timestamps, order rows in time and add a calendar-day helper column
    df["Date Time"] = pd.to_datetime(df["Date Time"])
    df = df.sort_values("Date Time")
    df["day"] = df["Date Time"].dt.date

    # 1-minute log returns from Close, differenced within each day so that no
    # overnight move is included; the first bar of a day gets a 0 return.
    df["log_close"] = np.log(df["Close"].astype(float))
    df["ret_1m"] = df.groupby("day")["log_close"].diff().fillna(0.0)

    # Bars per day: print the distribution and keep the most common length
    day_return_count = df.groupby("day")["ret_1m"].size()
    print(day_return_count.value_counts())
    bars_mode = int(day_return_count.mode().iat[0])

    # Daily realized variance = sum of squared intraday returns
    daily = (df.groupby("day")["ret_1m"]
               .agg(rv=lambda x: np.sum(x**2))
               .reset_index())

    # Guard the log against an exactly-zero RV
    daily["log_rv"] = np.log(daily["rv"].replace(0.0, 1e-12))
    # Label for the inputs of day d is the log-RV of day d+1
    daily["log_rv_tplus1"] = daily["log_rv"].shift(-1)

    # Days without a label (the last day) are removed from both frames
    valid_days = daily.dropna(subset=["log_rv_tplus1"])["day"]
    df = df[df["day"].isin(valid_days)].copy()
    daily = daily[daily["day"].isin(valid_days)].reset_index(drop=True)

    # Position of each bar inside its day, and the patch it belongs to.
    # For patch_len == 1 the integer division is the identity (1-minute tokens).
    df["idx_in_day"] = df.groupby("day").cumcount()
    df["patch_id"] = (df["idx_in_day"] // patch_len).astype(int)

    # Number of complete patches in a day of typical length
    tokens_full = bars_mode // patch_len
    if tokens_full < 1:
        raise ValueError(
            f"patch_len={patch_len!r} exceeds the typical day length ({bars_mode} bars); "
            "no full patch can be formed"
        )

    # Per-patch aggregates
    patch = df.groupby(["day", "patch_id"]).agg(
        r_sum=("ret_1m", "sum"),                       # summed returns
        r_abs=("ret_1m", lambda x: np.abs(x).sum()),   # summed absolute returns
        r_sq =("ret_1m", lambda x: np.sum(x**2)),      # summed squared returns
        hi   =("High", "max"),                         # patch high (not used as a feature)
        lo   =("Low", "min"),                          # patch low (not used as a feature)
        vol  =("Volume", "sum"),                       # traded volume
        r_last=("ret_1m", "last"),                     # last 1-minute return
        n    =("ret_1m", "size"),                      # bars in the patch (full patch == patch_len)
    ).reset_index()

    # Drop the trailing patch that a typical day cannot fill completely
    patch = patch[patch["patch_id"] < tokens_full].copy()

    # Relative time-of-day encoding in [0, 1], mapped onto the unit circle
    if tokens_full > 1:
        patch["pos"] = patch["patch_id"] / (tokens_full - 1)
    else:
        patch["pos"] = 0.0
    patch["pos_sin"] = np.sin(2 * np.pi * patch["pos"])
    patch["pos_cos"] = np.cos(2 * np.pi * patch["pos"])

    # Calendar encodings, computed once per day and then attached to every patch
    cal = patch[["day"]].drop_duplicates().copy()
    cal_dt = pd.to_datetime(cal["day"])

    dow = cal_dt.dt.weekday                      # day of week
    cal["dow_sin"] = np.sin(2 * np.pi * dow / 7.0)
    cal["dow_cos"] = np.cos(2 * np.pi * dow / 7.0)

    dom = cal_dt.dt.day                          # day of month
    cal["dom_sin"] = np.sin(2 * np.pi * dom / 31.0)
    cal["dom_cos"] = np.cos(2 * np.pi * dom / 31.0)

    moy = cal_dt.dt.month                        # month of year
    cal["moy_sin"] = np.sin(2 * np.pi * moy / 12.0)
    cal["moy_cos"] = np.cos(2 * np.pi * moy / 12.0)

    # Month-end flag (0/1)
    cal["is_month_end"] = cal_dt.dt.is_month_end.astype(np.int8)

    patch = patch.merge(cal, on="day", how="left")

    feat_cols = [
        "r_sum", "r_abs", "r_sq", "vol", "r_last",                          # patch stats
        "pos_sin", "pos_cos",                                               # time-of-day
        "dow_sin", "dow_cos", "dom_sin", "dom_cos", "moy_sin", "moy_cos",   # calendar
        "is_month_end",                                                     # flag (0/1)
    ]

    # ------------ BUILD X
    # Map each day to its target
    label_map = dict(zip(daily["day"], daily["log_rv_tplus1"]))
    X_days, y_days = [], []

    # A single groupby pass replaces one boolean mask over the whole frame per day
    for d, day_p in patch.groupby("day", sort=True):
        # Keep a day only if it has every token AND every token is complete.
        # Counting rows alone lets a short day through whose last kept patch
        # holds fewer than patch_len bars.
        if len(day_p) != tokens_full or (day_p["n"] != patch_len).any():
            continue
        day_p = day_p.sort_values("patch_id")
        X_days.append(day_p[feat_cols].to_numpy(dtype=np.float32))   # [T_tokens, d_in]
        y_days.append(float(label_map[d]))                           # scalar log-RV target

    if not X_days:
        raise ValueError("no day with a complete set of full patches; nothing to build")

    X = np.stack(X_days, axis=0)            # [N_days, T_tokens, d_in]
    y = np.array(y_days, dtype=np.float32)  # [N_days]

    # ------------- Train - Val - Test - Split (chronological, by day)
    N = X.shape[0]
    n_tr = int(N * train_frac)
    n_va = int(N * val_frac)

    idx_tr = slice(0, n_tr)
    idx_va = slice(n_tr, n_tr + n_va)
    idx_te = slice(n_tr + n_va, N)

    X_train, y_train = X[idx_tr], y[idx_tr]
    X_val, y_val = X[idx_va], y[idx_va]
    X_test, y_test = X[idx_te], y[idx_te]

    # ------------------- Scaling
    # Statistics come from the training days only (flattened over days and
    # tokens) so that validation/test information never reaches the scaler.
    flat_train = X_train.reshape(-1, X_train.shape[-1])
    mu = flat_train.mean(axis=0, keepdims=True)          # [1, d_in]
    sd = flat_train.std(axis=0, keepdims=True) + 1e-8    # epsilon avoids division by zero

    def scale(arr):
        return (arr - mu) / sd

    X_train = scale(X_train)
    X_val = scale(X_val)
    X_test = scale(X_test)

    return {
        "X_train": X_train, "y_train": y_train,
        "X_val":   X_val, "y_val":   y_val,
        "X_test":  X_test, "y_test":  y_test,
        "meta": {
            "tokens_per_day": tokens_full,
            "feature_names": feat_cols,
            "bars_per_day_mode": bars_mode,
            "train_days": int(n_tr),
            "val_days": int(n_va),
            "test_days": int(N - n_tr - n_va),
            "scaler_mean": mu.astype(np.float32),
            "scaler_std": sd.astype(np.float32),
            "patch_len_minutes": patch_len,
        },
    }

In [194]:
# Run the pipeline on IBM with 20-minute patches and a 70/15/15 chronological split.
# NOTE(review): the result is a dict of arrays plus metadata, so `X` is a misleading name.
X = preprocess_stock(IBM, 20, 0.7, 0.15)

ret_1m
381    2516
Name: count, dtype: int64


In [132]:
# Display the raw IBM frame (truncated by pandas).
IBM

Unnamed: 0,Date Time,Open,High,Low,Close,Volume,Date,Time
0,2010-01-04 09:35:00,85.2401,85.2789,85.1624,85.2207,42796.0,2010-01-04,09:35:00
1,2010-01-04 09:36:00,85.2271,85.4732,85.2142,85.4343,65921.0,2010-01-04,09:36:00
2,2010-01-04 09:37:00,85.4214,85.4732,85.3048,85.4680,60876.0,2010-01-04,09:37:00
3,2010-01-04 09:38:00,85.4732,85.5250,85.4279,85.5056,36812.0,2010-01-04,09:38:00
4,2010-01-04 09:39:00,85.4946,85.5056,85.4214,85.4538,29232.0,2010-01-04,09:39:00
...,...,...,...,...,...,...,...,...
958591,2019-12-31 15:51:00,114.7837,114.8095,114.7794,114.8095,20723.0,2019-12-31,15:51:00
958592,2019-12-31 15:52:00,114.8095,114.8438,114.8052,114.8438,24968.0,2019-12-31,15:52:00
958593,2019-12-31 15:53:00,114.8524,114.9038,114.8524,114.9038,39332.0,2019-12-31,15:53:00
958594,2019-12-31 15:54:00,114.9038,114.9296,114.8910,114.9038,20436.0,2019-12-31,15:54:00


In [140]:
# Inspect the rows around a day boundary (positions 376-389) to check idx_in_day / patch_id.
# NOTE(review): `tmp` is not defined in any visible cell - this relies on hidden kernel state
# and will fail after Restart & Run All; define it explicitly or delete the cell.
tmp.iloc[376:390]

Unnamed: 0,Date Time,Open,High,Low,Close,Volume,Date,Time,day,log_close,ret_1m,idx_in_day,patch_id
376,2010-01-04 15:51:00,85.7905,85.8034,85.771,85.7905,18650.0,2010-01-04,15:51:00,2010-01-04,4.451908,-0.00015,376,18
377,2010-01-04 15:52:00,85.7905,85.8034,85.771,85.7775,19197.0,2010-01-04,15:52:00,2010-01-04,4.451757,-0.000152,377,18
378,2010-01-04 15:53:00,85.784,85.8099,85.7775,85.797,19559.0,2010-01-04,15:53:00,2010-01-04,4.451984,0.000227,378,18
379,2010-01-04 15:54:00,85.8099,85.8229,85.797,85.8047,26483.0,2010-01-04,15:54:00,2010-01-04,4.452074,9e-05,379,18
380,2010-01-04 15:55:00,85.8099,85.8358,85.797,85.8229,39052.0,2010-01-04,15:55:00,2010-01-04,4.452286,0.000212,380,19
381,2010-01-05 09:35:00,84.6767,84.8516,84.6767,84.7415,123004.0,2010-01-05,09:35:00,2010-01-05,4.439605,0.0,0,0
382,2010-01-05 09:36:00,84.7415,84.7544,84.6638,84.7318,101782.0,2010-01-05,09:36:00,2010-01-05,4.439491,-0.000114,1,0
383,2010-01-05 09:37:00,84.7415,84.7933,84.7221,84.7577,30386.0,2010-01-05,09:37:00,2010-01-05,4.439797,0.000306,2,0
384,2010-01-05 09:38:00,84.7544,84.7609,84.7091,84.7285,45357.0,2010-01-05,09:38:00,2010-01-05,4.439452,-0.000345,3,0
385,2010-01-05 09:39:00,84.7285,84.7285,84.6832,84.7026,37723.0,2010-01-05,09:39:00,2010-01-05,4.439146,-0.000306,4,0


In [205]:
def stack_context_days(X_days, y_days, context_window=22):
    """
    Turn per-day token arrays into rolling multi-day sequences.

    Window i covers days i .. i + context_window - 1, flattened along the
    token axis, and takes the label stored for the LAST day of the window
    (``y_days[i + context_window - 1]``).

    Parameters
    ----------
    X_days : np.ndarray, shape [N_days, T_tokens, d]
        Per-day token features in chronological order.
    y_days : sequence of length >= N_days
        One label per day, aligned with ``X_days``.
    context_window : int
        Number of consecutive days per sequence. Must be >= 1.

    Returns
    -------
    (X, y) : tuple of float32 arrays
        X has shape [N_days - context_window + 1, context_window * T_tokens, d]
        and y has shape [N_days - context_window + 1]. If there are fewer days
        than ``context_window`` both arrays are empty (first dimension 0)
        rather than an error being raised.

    Raises
    ------
    ValueError
        If ``context_window`` is smaller than 1.
    """
    if context_window < 1:
        # A zero-length window would silently pick y_days[-1] as its label
        raise ValueError(f"context_window must be >= 1, got {context_window!r}")

    N, T, d = X_days.shape  # n_trading_days, tokens_per_day, features_per_token
    n_windows = N - context_window + 1

    if n_windows <= 0:
        # np.stack cannot handle an empty list; return correctly shaped empties
        return (
            np.empty((0, context_window * T, d), dtype=np.float32),
            np.empty((0,), dtype=np.float32),
        )

    # Slide the rolling window over the days
    Xc = [
        X_days[i:i + context_window].reshape(context_window * T, d)  # [seq_len, d]
        for i in range(n_windows)
    ]
    # Label of the last input day in each window
    yc = [y_days[i + context_window - 1] for i in range(n_windows)]

    return np.stack(Xc).astype(np.float32), np.array(yc, dtype=np.float32)

# Preprocess IBM with 15-minute patches, then build 22-day rolling contexts.
# Each split is stacked separately, so no window spans a split boundary.
out = preprocess_stock(df_ibm, patch_len=15, train_frac=0.7, val_frac=0.15)
Xtr_ctx, ytr_ctx = stack_context_days(out["X_train"], out["y_train"], context_window=22)
Xva_ctx, yva_ctx = stack_context_days(out["X_val"],   out["y_val"],   context_window=22)
Xte_ctx, yte_ctx = stack_context_days(out["X_test"],  out["y_test"],  context_window=22)


ret_1m
381    2516
Name: count, dtype: int64


In [208]:
# Per-day training tensor: (days, tokens per day, features).
out["X_train"].shape

(1760, 25, 14)

In [212]:
# Context-stacked training tensor: (windows, context_window * tokens per day, features).
Xtr_ctx.shape

(1739, 550, 14)

In [209]:
# Spot-check the first training window and its label.
first_seq = Xtr_ctx[0]        # [seq_len, d]
first_target = ytr_ctx[0]     # scalar

print("First seq shape:", first_seq.shape)
print("First target:", first_target)

First seq shape: (550, 14)
First target: -8.786377


# ARCHIVE

In [44]:
import numpy as np
import pandas as pd
from typing import Dict, Tuple

def preprocess_one_stock(
    df_raw: pd.DataFrame,
    patch_len: int = 15,           # bars per token; set to 1 to skip patching
    train_frac: float = 0.7,
    val_frac: float = 0.15,
    clip_return: float = None,     # clip per-minute returns to +/-clip_return; None disables clipping
    drop_half_days: bool = True,   # drop days not matching the modal intraday length
) -> Dict[str, np.ndarray]:
    """
    Preprocess a single-stock minute dataframe into per-day token sequences and next-day log-RV labels.

    Parameters
    ----------
    df_raw : pd.DataFrame
        Minute bars with at least the columns "Date Time", "High", "Low",
        "Close" and "Volume". The frame is copied, not modified.
    patch_len : int
        Bars aggregated into one token; values <= 1 keep 1-minute tokens.
    train_frac, val_frac : float
        Chronological fractions for train and validation; the rest is test.
    clip_return : float or None
        If given, 1-minute log returns are clipped to [-clip_return, clip_return].
    drop_half_days : bool
        If True, days whose bar count differs from the most common count are
        removed BEFORE labels are built, so the label of a day is the log-RV
        of the next *kept* day. If False, such days still provide labels but
        are skipped as inputs whenever they cannot fill every patch.

    Returns
    -------
    dict
        X_train, y_train, X_val, y_val, X_test, y_test, plus "meta".
        Shapes: X_* = [N_days, T_tokens, d_in], y_* = [N_days].

    Raises
    ------
    ValueError
        If no day provides a complete set of full patches.
    """
    df = df_raw.copy()

    # ---- 1) Parse & sort time; derive calendar day ----
    df["Date Time"] = pd.to_datetime(df["Date Time"], errors="coerce")
    df = df.sort_values("Date Time").reset_index(drop=True)
    df["day"] = df["Date Time"].dt.date

    # ---- 2) Within-day 1-minute log returns ----
    # Differencing per day keeps overnight moves out; the first bar of a day gets 0.
    df["log_close"] = np.log(df["Close"].astype(float))
    df["ret_1m"] = df.groupby("day")["log_close"].diff().fillna(0.0)

    if clip_return is not None:
        df["ret_1m"] = df["ret_1m"].clip(-clip_return, clip_return)

    # ---- 3) Bars per day & optional half-day removal ----
    day_counts = df.groupby("day")["ret_1m"].size()
    bars_mode = int(day_counts.mode().iat[0])
    if drop_half_days:
        regular_days = day_counts.index[day_counts == bars_mode]
        df = df[df["day"].isin(regular_days)].copy()

    # ---- 4) Day-level realized variance and next-day label ----
    daily = (df.groupby("day")["ret_1m"]
               .agg(rv=lambda x: np.sum(x**2))
               .reset_index())
    # Guard the log against an exactly-zero RV
    daily["log_rv"] = np.log(daily["rv"].replace(0.0, 1e-12))
    # label for inputs from day d is log_rv of day d+1
    daily["log_rv_tplus1"] = daily["log_rv"].shift(-1)

    # Keep only days that have a next-day label
    valid_days = daily.dropna(subset=["log_rv_tplus1"])["day"]
    df = df[df["day"].isin(valid_days)].copy()
    daily = daily[daily["day"].isin(valid_days)].reset_index(drop=True)

    # ---- 5) Build per-day tokens (patching) ----
    bars_per_token = patch_len if patch_len > 1 else 1
    df["idx_in_day"] = df.groupby("day").cumcount()
    df["patch_id"] = (df["idx_in_day"] // bars_per_token).astype(int)

    # number of *full* tokens per typical day (the last partial patch is dropped)
    tokens_full = bars_mode // bars_per_token

    # aggregate per (day, patch_id)
    patch = df.groupby(["day", "patch_id"]).agg(
        r_sum=("ret_1m", "sum"),
        r_abs=("ret_1m", lambda x: np.abs(x).sum()),
        r_sq =("ret_1m", lambda x: np.sum(x**2)),
        hi   =("High", "max"),
        lo   =("Low", "min"),
        vol  =("Volume", "sum"),
        r_last=("ret_1m", "last"),
        n    =("ret_1m", "size"),     # bars in the patch
    ).reset_index()

    # keep only the first 'tokens_full' patches per day (drop partial tail)
    patch = patch[patch["patch_id"] < tokens_full].copy()

    # log high/low range per patch; non-finite or undefined values become 0
    patch["range_hl"] = (
        np.log(patch["hi"] / patch["lo"].replace(0, np.nan))
        .replace([np.inf, -np.inf], 0.0)
        .fillna(0.0)
    )
    patch = patch.drop(columns=["hi", "lo"])

    # time-of-day encodings
    if tokens_full > 1:
        patch["pos"] = patch["patch_id"] / (tokens_full - 1)
    else:
        patch["pos"] = 0.0
    patch["pos_sin"] = np.sin(2 * np.pi * patch["pos"])
    patch["pos_cos"] = np.cos(2 * np.pi * patch["pos"])

    feat_cols = ["r_sum", "r_abs", "r_sq", "range_hl", "vol", "r_last", "pos_sin", "pos_cos"]

    # ---- 6) Build X (per-day tokens x features) and y (next-day log-RV) ----
    label_map = dict(zip(daily["day"], daily["log_rv_tplus1"]))
    X_days, y_days = [], []

    for d, day_p in patch.groupby("day", sort=True):
        # a day must have every token, and every token must be complete
        if len(day_p) != tokens_full or (day_p["n"] != bars_per_token).any():
            continue
        day_p = day_p.sort_values("patch_id")
        X_days.append(day_p[feat_cols].to_numpy(dtype=np.float32))  # [T_tokens, d_in]
        y_days.append(float(label_map[d]))                          # scalar target

    if not X_days:
        raise ValueError("no day with a complete set of full patches; nothing to build")

    X = np.stack(X_days, axis=0)            # [N_days, T_tokens, d_in]
    y = np.array(y_days, dtype=np.float32)  # [N_days]

    # ---- 7) Temporal split (by day order) ----
    N = X.shape[0]
    n_tr = int(N * train_frac)
    n_va = int(N * val_frac)
    idx_tr = slice(0, n_tr)
    idx_va = slice(n_tr, n_tr + n_va)
    idx_te = slice(n_tr + n_va, N)

    X_tr, y_tr = X[idx_tr], y[idx_tr]
    X_va, y_va = X[idx_va], y[idx_va]
    X_te, y_te = X[idx_te], y[idx_te]

    # ---- 8) Feature scaling with train stats only ----
    flat_train = X_tr.reshape(-1, X_tr.shape[-1])
    mu = flat_train.mean(axis=0, keepdims=True)          # [1, d_in]
    sd = flat_train.std(axis=0, keepdims=True) + 1e-8    # epsilon avoids division by zero

    def scale(arr):
        return (arr - mu) / sd

    X_tr = scale(X_tr)
    X_va = scale(X_va)
    X_te = scale(X_te)

    return {
        "X_train": X_tr, "y_train": y_tr,
        "X_val":   X_va, "y_val":   y_va,
        "X_test":  X_te, "y_test":  y_te,
        "meta": {
            "tokens_per_day": tokens_full,
            "feature_names": feat_cols,
            "bars_per_day_mode": bars_mode,
            "train_days": int(n_tr),
            "val_days": int(n_va),
            "test_days": int(N - n_tr - n_va),
            "scaler_mean": mu.astype(np.float32),
            "scaler_std": sd.astype(np.float32),
            "patch_len_minutes": patch_len,
        },
    }


In [63]:
# Scratch copy of the IBM frame for step-by-step experiments.
df = IBM.copy()

In [65]:
# Parse timestamps (unparseable values become NaT), sort chronologically and add a calendar-day column.
df["Date Time"] = pd.to_datetime(df["Date Time"], errors="coerce")
#df = df.dropna(subset=["Date Time"]).sort_values("Date Time").reset_index(drop=True)
df = df.sort_values("Date Time").reset_index(drop=True)
df["day"] = df["Date Time"].dt.date

In [66]:
# Display the scratch frame after the "day" column was added.
df

Unnamed: 0,Date Time,Open,High,Low,Close,Volume,Date,Time,day
0,2010-01-04 09:35:00,85.2401,85.2789,85.1624,85.2207,42796.0,2010-01-04,09:35:00,2010-01-04
1,2010-01-04 09:36:00,85.2271,85.4732,85.2142,85.4343,65921.0,2010-01-04,09:36:00,2010-01-04
2,2010-01-04 09:37:00,85.4214,85.4732,85.3048,85.4680,60876.0,2010-01-04,09:37:00,2010-01-04
3,2010-01-04 09:38:00,85.4732,85.5250,85.4279,85.5056,36812.0,2010-01-04,09:38:00,2010-01-04
4,2010-01-04 09:39:00,85.4946,85.5056,85.4214,85.4538,29232.0,2010-01-04,09:39:00,2010-01-04
...,...,...,...,...,...,...,...,...,...
958591,2019-12-31 15:51:00,114.7837,114.8095,114.7794,114.8095,20723.0,2019-12-31,15:51:00,2019-12-31
958592,2019-12-31 15:52:00,114.8095,114.8438,114.8052,114.8438,24968.0,2019-12-31,15:52:00,2019-12-31
958593,2019-12-31 15:53:00,114.8524,114.9038,114.8524,114.9038,39332.0,2019-12-31,15:53:00,2019-12-31
958594,2019-12-31 15:54:00,114.9038,114.9296,114.8910,114.9038,20436.0,2019-12-31,15:54:00,2019-12-31


In [64]:
# Display the scratch frame.
# NOTE(review): execution count 64 precedes the cell shown above it (66) and the output lacks
# the "day" column - the cells were run out of order, so a fresh top-to-bottom run will differ.
df

Unnamed: 0,Date Time,Open,High,Low,Close,Volume,Date,Time
0,2010-01-04 09:35:00,85.2401,85.2789,85.1624,85.2207,42796.0,2010-01-04,09:35:00
1,2010-01-04 09:36:00,85.2271,85.4732,85.2142,85.4343,65921.0,2010-01-04,09:36:00
2,2010-01-04 09:37:00,85.4214,85.4732,85.3048,85.4680,60876.0,2010-01-04,09:37:00
3,2010-01-04 09:38:00,85.4732,85.5250,85.4279,85.5056,36812.0,2010-01-04,09:38:00
4,2010-01-04 09:39:00,85.4946,85.5056,85.4214,85.4538,29232.0,2010-01-04,09:39:00
...,...,...,...,...,...,...,...,...
958591,2019-12-31 15:51:00,114.7837,114.8095,114.7794,114.8095,20723.0,2019-12-31,15:51:00
958592,2019-12-31 15:52:00,114.8095,114.8438,114.8052,114.8438,24968.0,2019-12-31,15:52:00
958593,2019-12-31 15:53:00,114.8524,114.9038,114.8524,114.9038,39332.0,2019-12-31,15:53:00
958594,2019-12-31 15:54:00,114.9038,114.9296,114.8910,114.9038,20436.0,2019-12-31,15:54:00


In [105]:
# plot data
# NOTE(review): `daily` is only assigned inside the preprocessing functions in the visible
# cells; at notebook scope it must be built first or this cell fails on a fresh kernel.
# The recorded SyntaxError output does not match this source and looks stale.
# Compute realized volatility (square root of RV), scale to percent
daily["realized_vol_pct"] = np.sqrt(daily["rv"]) * 100

plt.figure(figsize=(12,5))
plt.plot(daily["day"], daily["realized_vol_pct"], lw=0.8, color="steelblue")
plt.title("Daily Realized Volatility (IBM, %)")
plt.ylabel("Volatility (%)")
plt.xlabel("Date")
plt.grid(True, alpha=0.3)
plt.show()

SyntaxError: invalid syntax (3325032303.py, line 1)