# ChronoTick 2: Data Exploration

This notebook explores the sensor data collected for clock drift prediction.
It walks through the preprocessing pipeline, the sensor categories, and the
characteristics of the target variable (`adj_freq_ppm`) across all four machines.

## Setup

**Local:** Run from `tick2/` directory with `uv run jupyter lab`.

**Colab:** Upload two folders to Google Drive under `My Drive/chronotick2/`:
1. `tick2/` -- the Python package (this repo's `tick2/` directory)
2. `data/24h_snapshot/` -- the sensor CSVs (from `sensors/data/24h_snapshot/`)

Then run the setup cell below.

In [None]:
# === Colab Setup (skip if running locally) ===
import os

IN_COLAB = "COLAB_GPU" in os.environ or os.path.exists("/content")

if IN_COLAB:
    from google.colab import drive
    drive.mount("/content/drive")

    # Install tick2 from Drive
    # Expects tick2/ package uploaded to My Drive/chronotick2/tick2/
    !pip install -q /content/drive/MyDrive/chronotick2/tick2/

    DATA_DIR = "/content/drive/MyDrive/chronotick2/data"
else:
    DATA_DIR = None  # uses default (../sensors/data/)

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from tick2.data.preprocessing import (
    TARGET_COL,
    get_feature_cols,
    load_all,
    load_machine,
)
from tick2.data.splits import extract_samples, temporal_split

# Plot styling. The figure defaults are applied after the seaborn theme so
# the theme call cannot override them.
sns.set_theme(style="whitegrid", font_scale=1.1)
plt.rcParams.update({"figure.figsize": (12, 4), "figure.dpi": 100})

# DATA_DIR comes from the setup cell; a falsy value is passed on as None.
data_dir = None if not DATA_DIR else Path(DATA_DIR)

## 1. Load All Machines

In [None]:
# Load every machine from the 24h snapshot. Each entry of `datasets` maps a
# machine name to a (frame, column -> category mapping) pair.
datasets = load_all(data_dir=data_dir, snapshot="24h_snapshot")

# One summary line per machine: row count, feature count, distinct categories.
for name, (df, cats) in datasets.items():
    n_rows = len(df)
    n_features = len(get_feature_cols(df))
    categories = sorted(set(cats.values()))
    print(f"{name:16s}: {n_rows:6d} rows, {n_features:3d} features, categories: {categories}")

## 2. Target Variable: adj_freq_ppm

The target is the kernel NTP frequency correction in parts per million (PPM).
This represents the clock drift that NTP is actively compensating for.

In [None]:
# One stacked panel per machine showing the raw target series.
n_machines = len(datasets)
fig, axes = plt.subplots(
    n_machines,
    1,
    figsize=(14, 3 * n_machines),
    sharex=False,
    squeeze=False,  # always a 2-D array, so one machine needs no special case
)

for ax, (name, (df, _cats)) in zip(axes[:, 0], datasets.items()):
    ax.plot(df.index, df[TARGET_COL], linewidth=0.3, alpha=0.8)
    ax.set(ylabel="adj_freq_ppm", title=f"{name} - Clock Drift (adj_freq_ppm)")

fig.tight_layout()
plt.show()

## 3. Target Statistics

In [None]:
# Per-machine summary of the target column, rendered as a table.
stats = []
for name, (df, _cats) in datasets.items():
    target = df[TARGET_COL]
    lowest, highest = target.min(), target.max()

    # Statistics that share the four-decimal formatting.
    four_decimal_fields = [
        ("Mean (ppm)", target.mean()),
        ("Std (ppm)", target.std()),
        ("Min", lowest),
        ("Max", highest),
        ("Range", highest - lowest),
    ]

    row = {"Machine": name}
    row.update({label: f"{value:.4f}" for label, value in four_decimal_fields})
    row["Rows"] = len(df)
    # NOTE(review): rows / 3600 is hours only if there is one row per second — confirm.
    row["Duration (h)"] = f"{len(df) / 3600:.1f}"
    stats.append(row)

pd.DataFrame(stats)

## 4. Sensor Categories by Machine

In [None]:
# Count features per category per machine
cat_counts = [
    {
        "Machine": name,
        "Category": cat,
        "Features": list(cats.values()).count(cat),
    }
    for name, (_df, cats) in datasets.items()
    for cat in sorted(set(cats.values()))
]

# Reshape to categories x machines; a category missing on a machine counts as 0.
cat_df = pd.DataFrame(cat_counts)
pivot = cat_df.pivot(index="Category", columns="Machine", values="Features")
pivot = pivot.fillna(0).astype(int)

fig, ax = plt.subplots(figsize=(10, 6))
pivot.plot(kind="barh", ax=ax)
ax.set(xlabel="Number of features", title="Sensor Features by Category")
fig.tight_layout()
plt.show()

## 5. Autocorrelation of Target

In [None]:
# Autocorrelation of the target for each machine, one panel per machine.
n_machines = len(datasets)
fig, axes = plt.subplots(1, n_machines, figsize=(4 * n_machines, 3))
axes = np.atleast_1d(axes)  # a single machine yields a bare Axes

for ax, (name, (df, _cats)) in zip(axes, datasets.items()):
    target = df[TARGET_COL].values
    n_obs = len(target)

    # Lags are counted in samples: steps of 60, capped at 3600 and at half the
    # series length. The minute axis below assumes one sample per second —
    # TODO confirm against the sampling rate.
    max_lag = min(3600, n_obs // 2)
    lags = np.arange(0, max_lag, 60)

    acf = []
    for lag in lags:
        # Correlate the series with itself shifted by `lag`; at lag 0 both
        # slices are the full series.
        leading = target[: n_obs - lag]
        shifted = target[lag:]
        acf.append(np.corrcoef(leading, shifted)[0, 1])

    ax.plot(lags / 60, acf)
    ax.set(xlabel="Lag (minutes)", ylabel="ACF", title=name)
    ax.axhline(y=0, color='gray', linestyle='--', linewidth=0.5)

plt.suptitle("Autocorrelation of adj_freq_ppm", y=1.02)
plt.tight_layout()
plt.show()

## 6. Sample Windows Preview

Preview the (context, horizon) windows that will be used for benchmarking.

In [None]:
# Show a few sample windows for the first available machine.
# The window geometry is defined once here so the extract_samples call, the
# subplot grid and the figure title cannot drift apart.
CONTEXT_LEN = 512
HORIZON_LEN = 60
N_SAMPLES = 3

first_name = next(iter(datasets))
first_df = datasets[first_name][0]

samples = extract_samples(
    first_df,
    TARGET_COL,
    context_len=CONTEXT_LEN,
    horizon_len=HORIZON_LEN,
    n_samples=N_SAMPLES,
)

# squeeze=False keeps `axes` two-dimensional even when N_SAMPLES is 1.
fig, axes = plt.subplots(N_SAMPLES, 1, figsize=(12, 8), squeeze=False)
for ax, s in zip(axes[:, 0], samples):
    ctx_vals = s.context[TARGET_COL].values
    hz_vals = s.horizon_true.values

    # The horizon is drawn immediately after the context on a shared
    # positional axis.
    t_ctx = np.arange(len(ctx_vals))
    t_hz = np.arange(len(ctx_vals), len(ctx_vals) + len(hz_vals))

    ax.plot(t_ctx, ctx_vals, label="Context", color="steelblue")
    ax.plot(t_hz, hz_vals, label="Horizon (truth)", color="coral", linewidth=2)
    # Dashed line marks the context/horizon boundary.
    ax.axvline(x=len(ctx_vals), color="gray", linestyle="--", alpha=0.5)
    ax.set_ylabel("adj_freq_ppm")
    ax.legend(loc="upper right")
    ax.set_title(f"Sample at idx={s.start_idx}")

# NOTE(review): labelling timesteps as seconds assumes one row per second — confirm.
axes[-1, 0].set_xlabel("Timestep (seconds)")
plt.suptitle(
    f"{first_name}: Context ({CONTEXT_LEN}s) + Horizon ({HORIZON_LEN}s) Windows",
    y=1.01,
)
plt.tight_layout()
plt.show()

## 7. Feature Correlation with Target

In [None]:
# For each machine, plot the 20 features with the largest absolute
# correlation with the target, coloured by whether the feature's category is
# one of the temperature categories.
TEMPERATURE_CATEGORIES = ("CPU Core Temp", "CPU Package Temp", "Non-CPU Temp")

for name, (df, cats) in datasets.items():
    features = get_feature_cols(df)
    if not features:
        continue

    # Correlations that come out as NaN are dropped so they cannot occupy a
    # top-20 slot as an empty bar.
    corrs = (
        df[features]
        .corrwith(df[TARGET_COL])
        .abs()
        .dropna()
        .sort_values(ascending=False)
    )
    if corrs.empty:
        continue
    top_20 = corrs.head(20)

    fig, ax = plt.subplots(figsize=(10, 5))
    colors = [
        "steelblue" if cats.get(col, "") in TEMPERATURE_CATEGORIES else "coral"
        for col in top_20.index
    ]
    top_20.plot(kind="barh", ax=ax, color=colors)
    ax.set_xlabel("|Correlation| with adj_freq_ppm")
    ax.set_title(f"{name}: Top 20 Correlated Features")
    ax.invert_yaxis()  # strongest correlation at the top

    # Proxy rectangles explain the colour coding, which the bars alone do not.
    ax.legend(
        handles=[
            plt.Rectangle((0, 0), 1, 1, color="steelblue", label="Temperature sensor"),
            plt.Rectangle((0, 0), 1, 1, color="coral", label="Other sensor"),
        ],
        loc="lower right",
    )
    plt.tight_layout()
    plt.show()
    plt.close(fig)  # release the figure; one is created per machine