In [4]:
# Cell 1 ────────────────────────────────────────────────────────────────────────────────
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

FILES = {
    "AMD":  "/workspaces/Stock-Market-Prediction/Microstructure/BATS_AMD, 1_09023.csv",
    "NVDA": "/workspaces/Stock-Market-Prediction/Microstructure/BATS_NVDA, 1_a7f0c.csv",
    "ASML": "/workspaces/Stock-Market-Prediction/Microstructure/BATS_ASML, 1_0f78d.csv",
}

TIMEZONE = "US/Eastern"

SESSION_BINS = {
    "open_30":        ("09:30", "10:00"),
    "lunch_1130_1300": ("11:30", "13:00"),
    "close_30":       ("15:30", "16:00"),
}

plt.rcParams.update({"font.size": 11, "figure.dpi": 110})


In [11]:
# Cell 2 ────────────────────────────────────────────────────────────────────────────────
from io import StringIO
import pathlib, re

def _has_header(sample_line: str) -> bool:
    """True if the first non-empty token looks alphabetic (e.g. 'ts' or 'Date')."""
    token = sample_line.split(",")[0].strip()
    return bool(re.match(r"[A-Za-z]", token))

def load_symbol(path: str | Path) -> pd.DataFrame:
    """
    Read a 1-minute OHLCV CSV and return a tz-aware DataFrame (US/Eastern).

    • Works whether the file **has a header** or is **raw rows only**  
      (like the snippet you pasted).  
    • Auto-detects timestamp timezone and converts to US/Eastern.
    """
    path = Path(path).expanduser().resolve()
    with path.open("r") as fh:
        first_line = fh.readline()

    names = ["ts", "open", "high", "low", "close", "volume"]
    header_opt = 0 if _has_header(first_line) else None

    df = pd.read_csv(path, header=header_opt, names=names)

    # ---------- timestamp parsing & tz conversion ----------
    ts = pd.to_datetime(df["ts"], errors="coerce")

    if ts.dt.tz is None:                                     # tz-naïve
        iso_col = df["ts"].astype(str)
        if iso_col.str.endswith("Z").any() or iso_col.str.contains("+00:00").any():
            ts = ts.dt.tz_localize("UTC").dt.tz_convert(TIMEZONE)
        else:
            ts = ts.dt.tz_localize(TIMEZONE)
    else:                                                    # tz-aware
        ts = ts.dt.tz_convert(TIMEZONE)

    df["ts"] = ts
    df = df.dropna(subset=["ts"]).set_index("ts")
    df[["open","high","low","close","volume"]] = df[
        ["open","high","low","close","volume"]
    ].apply(pd.to_numeric, errors="coerce")

    # ---------- intraday helpers ----------
    market_open = 9 * 3600 + 30 * 60  # 09:30
    df["sec"] = (
        df.index.hour * 3600 + df.index.minute * 60 + df.index.second - market_open
    )
    df["date"] = df.index.date
    df = df[(df["sec"] >= 0) & (df["sec"] <= 2340)].copy()

    return df


In [12]:
# Cell 3 ────────────────────────────────────────────────────────────────────────────────
def per_day_vwap(df: pd.DataFrame) -> pd.Series:
    return df.groupby("date").apply(
        lambda x: np.average(x["close"], weights=x["volume"])
    )

def per_minute_aggregates(df: pd.DataFrame, daily_vwap: pd.Series) -> pd.DataFrame:
    df = df.copy()
    df["vwap_d"]     = df["date"].map(daily_vwap)
    df["delta_vwap"] = df["close"] - df["vwap_d"]

    df["logret"]  = np.log(df["close"]).diff()
    df["abs_ret"] = df["logret"].abs()
    mu_abs        = df.groupby("date")["abs_ret"].transform("mean")
    df["rel_vol"] = df["abs_ret"] / mu_abs

    vol_day         = df.groupby("date")["volume"].transform("sum")
    df["vol_share"] = df["volume"] / vol_day
    return df

def average_intraday(df: pd.DataFrame, field: str) -> pd.Series:
    return df.groupby("sec")[field].mean().sort_index()

def summarise_sessions(df: pd.DataFrame, field: str) -> pd.Series:
    return pd.Series(
        {
            name: df.between_time(t0, t1)[field].mean()
            for name, (t0, t1) in SESSION_BINS.items()
        }
    )


In [13]:
# Cell 4 ────────────────────────────────────────────────────────────────────────────────
data = {}
for sym, fn in FILES.items():
    path = Path(fn).expanduser().resolve()
    if not path.exists():
        raise FileNotFoundError(f"{path} not found – update FILES paths.")
    data[sym] = load_symbol(path)

for sym, df in data.items():
    data[sym] = per_minute_aggregates(df, per_day_vwap(df))


  ts = pd.to_datetime(df["ts"], errors="coerce")


AttributeError: Can only use .dt accessor with datetimelike values