In [1]:
import numpy as np
from tqdm import tqdm
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
from model import GalerkinSARIMA
import time

In [3]:
import numpy as np
import pandas as pd
from statsmodels.tsa.arima_process import ArmaProcess

def make_trading_index(
    start_date="2024-01-02",
    n_sessions=10,
    freq="5min",
    session_open="09:30",
    session_close="16:00",
    tz="America/New_York",
):
    """
    Return a tz-aware DatetimeIndex of intraday bar start times.

    One block of stamps is produced per business day (Mon-Fri), covering the
    half-open window [session_open, session_close) in steps of `freq`, so the
    last bar of a session starts at `session_close - freq`.

    Parameters
    ----------
    start_date : first candidate session date (passed to `pd.bdate_range`).
    n_sessions : number of business days to generate.
    freq       : bar width, both as a timedelta string and a date_range freq.
    session_open, session_close : wall-clock session bounds, e.g. "09:30".
    tz         : timezone the session times are localized to.

    Returns
    -------
    pd.DatetimeIndex named "Timestamp". It is empty (but still tz-aware and
    named) when no session is long enough to hold a single bar.

    Raises
    ------
    ValueError if `session_close` is not strictly after `session_open`.

    Note: market holidays and half-days are NOT skipped.
    """
    # Mon-Fri calendar days that will each host one session
    business_days = pd.bdate_range(start=start_date, periods=n_sessions, tz=tz)

    # wall-clock bounds and the bar width
    open_time = pd.to_datetime(session_open).time()
    close_time = pd.to_datetime(session_close).time()
    bar_width = pd.to_timedelta(freq)

    per_session = []
    for day in business_days:
        calendar_day = day.date()
        first_bar = pd.Timestamp.combine(calendar_day, open_time).tz_localize(tz)
        closing = pd.Timestamp.combine(calendar_day, close_time).tz_localize(tz)
        if not closing > first_bar:
            raise ValueError("session_close must be after session_open")

        # The close is exclusive, so the final bar begins one width before it.
        final_bar = closing - bar_width
        if final_bar >= first_bar:
            per_session.append(
                pd.date_range(start=first_bar, end=final_bar, freq=freq)
            )
        # otherwise the session cannot fit even one bar and is dropped

    if len(per_session) == 0:
        return pd.DatetimeIndex([], tz=tz, name="Timestamp")

    head, *tail = per_session
    return head.append(tail).rename("Timestamp")


def simulate_ticker_bars(
    symbol="SYN",
    # index control (either pass a ready index, or we’ll build one)
    index: pd.DatetimeIndex | None = None,
    start_date="2024-01-02",
    n_sessions=10,
    freq="5min",
    session_open="09:30",
    session_close="16:00",
    tz="America/New_York",
    # price process params
    start_price=100.0,
    ar=(0.6,),     # AR coefficients (phi_1..phi_p), statsmodels convention
    ma=(0.3,),     # MA coefficients (theta_1..theta_q), statsmodels convention
    d=1,           # differencing order for log price (use 1 for random-walk-like)
    mu_ann=0.08,   # annualized drift (e.g., 8%)
    sigma_ann=0.20,# annualized vol (e.g., 20%)
    intraday_vol_amp=0.6,  # U-shape amplitude (0 -> flat)
    seed=7,
    sessions_per_year=252, # used to convert annualized params to per-bar
):
    """
    Simulate ARIMA(p,d,q) on log-prices with optional intraday U-shaped volatility.
    Returns a DataFrame with columns: Symbol, Price, LogReturn, Session.
    """
    rng = np.random.default_rng(seed)

    # Build or standardize index
    if index is None:
        index = make_trading_index(
            start_date=start_date,
            n_sessions=n_sessions,
            freq=freq,
            session_open=session_open,
            session_close=session_close,
            tz=tz,
        )
    if index.tz is None:
        index = index.tz_localize(tz)
    n = len(index)
    if n == 0:
        return pd.DataFrame(columns=["Symbol", "Price", "LogReturn", "Session"]).set_index(
            pd.DatetimeIndex([], tz=tz, name="Timestamp")
        )

    # Count bars per session (assumes uniform schedule)
    # If a ready-made index was passed with irregular sessions, this uses the modal count.
    session_dates = pd.to_datetime(index.tz_convert(tz)).date
    counts = pd.Series(session_dates).value_counts().sort_values(ascending=False)
    bars_per_session = int(counts.iloc[0])

    bars_per_year = sessions_per_year * max(1, bars_per_session)
    mu_bar = mu_ann / bars_per_year
    sigma_bar = sigma_ann / np.sqrt(bars_per_year)

    # ARMA structure for d-th difference of log-price (returns if d=1)
    ar_poly = np.r_[1, -np.array(ar)]
    ma_poly = np.r_[1,  np.array(ma)]
    ap = ArmaProcess(ar_poly, ma_poly)
    if not ap.isstationary:
        raise ValueError("AR params are not stationary (roots inside unit circle).")

    # U-shaped intraday volatility profile, tiled per session
    x = np.linspace(0, 1, bars_per_session, endpoint=True)
    shape_one = 1.0 + intraday_vol_amp * (4.0 * (x - 0.5) ** 2)  # >= 1
    reps = int(np.ceil(n / bars_per_session))
    vol_shape = np.tile(shape_one, reps)[:n]
    vol_per_bar = sigma_bar * vol_shape

    # Heteroskedastic ARMA-ish returns: filter a standard normal series, then scale
    # (Scaling post-filter is an approximation—good enough for sim purposes.)
    z = rng.standard_normal(n)
    base = ap.generate_sample(nsample=n, distrvs=rng.standard_normal, scale=1.0)

    x_t = base * vol_per_bar + mu_bar  # d-th difference of log-price (returns if d=1)

    # Integrate d times to obtain log-price
    y = x_t.copy()
    for _ in range(d):
        y = np.cumsum(y)

    log_prices = np.log(start_price) + y
    prices = np.exp(log_prices)

    df = pd.DataFrame(
        {
            "Symbol": symbol,
            "Price": prices,
            "LogReturn": np.r_[np.nan, np.diff(log_prices)],
            "Session": session_dates,
        },
        index=index,
    )
    df.index.name = "Timestamp"
    return df


# ----------------- Examples -----------------
if __name__ == "__main__":
    # Demo runs of simulate_ticker_bars at three bar widths.
    #
    # Each print uses sep="\n" (and end="\n\n" for the trailing blank line)
    # instead of embedding "\n" in the arguments: with print's default " "
    # separator every DataFrame would start with a stray space, shifting its
    # header row one column to the right of the data rows.

    # A) 5-minute bars for 5 sessions
    df_5m = simulate_ticker_bars(
        symbol="SIM5",
        start_date="2024-01-02",
        n_sessions=5,
        freq="5min",
        start_price=120.0,
        ar=(0.4, -0.2),
        ma=(0.3,),
        d=1,
        mu_ann=0.10,
        sigma_ann=0.25,
        intraday_vol_amp=0.7,
        seed=123,
    )
    print("5-min sample:", df_5m.head(), df_5m.tail(), sep="\n", end="\n\n")

    # B) 1-minute bars for 2 sessions
    df_1m = simulate_ticker_bars(
        symbol="SIM1",
        start_date="2024-01-02",
        n_sessions=2,
        freq="1min",
        start_price=50.0,
        ar=(0.6,),
        ma=(0.2,),
        sigma_ann=0.30,
        seed=42,
    )
    print("1-min sample:", df_1m.head(), sep="\n", end="\n\n")

    # C) 15-minute bars for 10 sessions with flatter intraday vol
    df_15m = simulate_ticker_bars(
        symbol="SIM15",
        start_date="2024-01-02",
        n_sessions=10,
        freq="15min",
        start_price=250.0,
        intraday_vol_amp=0.2,
    )
    print("15-min sample:", df_15m.head(), sep="\n", end="\n\n")


5-min sample:
                           Symbol       Price  LogReturn     Session
Timestamp                                                          
2024-01-02 09:30:00-05:00   SIM5  120.948424        NaN  2024-01-02
2024-01-02 09:35:00-05:00   SIM5  121.584829   0.005248  2024-01-02
2024-01-02 09:40:00-05:00   SIM5  120.890560  -0.005727  2024-01-02
2024-01-02 09:45:00-05:00   SIM5  120.558873  -0.002747  2024-01-02
2024-01-02 09:50:00-05:00   SIM5  121.168824   0.005047  2024-01-02 
                           Symbol       Price  LogReturn     Session
Timestamp                                                          
2024-01-08 15:35:00-05:00   SIM5  122.313930  -0.001330  2024-01-08
2024-01-08 15:40:00-05:00   SIM5  122.746418   0.003530  2024-01-08
2024-01-08 15:45:00-05:00   SIM5  123.147842   0.003265  2024-01-08
2024-01-08 15:50:00-05:00   SIM5  123.251711   0.000843  2024-01-08
2024-01-08 15:55:00-05:00   SIM5  123.178409  -0.000595  2024-01-08 

1-min sample:
               