# 01 - Data Loading and Exploration

This notebook demonstrates loading NAB (Numenta Anomaly Benchmark) data and exploring its structure.

## Contents
1. Loading NAB datasets
2. Visualising time-series with anomaly labels
3. Creating sliding windows
4. Creating DataLoaders and examining class balance

In [None]:
import sys
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from threatsim.data import (
    load_nab_data,
    create_anomaly_mask,
    create_windows,
    normalise_windows,
    get_dataloaders,
    DEFAULT_DATASETS,
)

# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*".
# Try the new name first, then the pre-3.6 name, and finally fall back to the
# default style so this cell never aborts a Restart & Run All.
_style_candidates = ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
_plot_style = next(
    (name for name in _style_candidates if name in plt.style.available),
    'default',
)
plt.style.use(_plot_style)

## 1. Loading NAB Data

The NAB dataset contains real-world time-series with labelled anomalies. Let's load one of the default datasets.

In [None]:
# Machine-temperature series from the NAB "realKnownCause" group
dataset_name = "realKnownCause/machine_temperature_system_failure.csv"

# load_nab_data hands back the series frame plus the anomaly timestamps
# associated with it.
df, anomaly_timestamps = load_nab_data(dataset_name)

# One print call, one argument per output line.
print(
    f"Dataset: {dataset_name}",
    f"Shape: {df.shape}",
    f"Anomaly timestamps: {anomaly_timestamps}",
    f"\nFirst few rows:",
    sep="\n",
)

# Bare last expression so Jupyter renders the preview as a rich table.
df.head()

In [None]:
# Summary statistics of the `value` column; the describe() result is the
# cell's last expression, so Jupyter displays it directly.
print("Value statistics:")
df.loc[:, 'value'].describe()

## 2. Visualising Time-Series with Anomalies

Let's plot the time-series and highlight the anomaly regions.

In [None]:
# Build a per-point anomaly mask from the labelled timestamps.
# NOTE(review): window_minutes=30 presumably sets the width of the region
# marked around each timestamp — confirm against threatsim.data.
anomaly_mask = create_anomaly_mask(
    df,
    anomaly_timestamps,
    window_minutes=30,
)

# Report how much of the series is flagged, as a count and a percentage.
n_total_points = len(df)
n_anomalous_points = anomaly_mask.sum()
anomalous_points_pct = 100 * anomaly_mask.mean()
print(f"Total points: {n_total_points}")
print(f"Anomaly points: {n_anomalous_points} ({anomalous_points_pct:.2f}%)")

In [None]:
# Draw the whole series once, then overlay the points flagged by the mask.
fig, ax = plt.subplots(figsize=(14, 5))

timestamps = df['timestamp']
readings = df['value']
ax.plot(timestamps, readings, 'b-', linewidth=0.5, alpha=0.8, label='Value')

# Positions where the mask equals 1. Skip the overlay (and its legend entry)
# entirely when nothing is flagged.
anomaly_indices = np.asarray(anomaly_mask == 1).nonzero()[0]
if anomaly_indices.size:
    ax.scatter(
        timestamps.iloc[anomaly_indices],
        readings.iloc[anomaly_indices],
        c='red', s=5, alpha=0.5, label='Anomaly Region',
    )

# Labels and title set in one call.
ax.set(
    xlabel='Timestamp',
    ylabel='Temperature',
    title=f'Machine Temperature System Failure Dataset',
)
ax.legend()
plt.tight_layout()
plt.show()

## 3. Creating Sliding Windows

We convert the time-series into overlapping windows for the transformer model.

In [None]:
# Cut the series into sliding windows (window_size=50, step_size=10) and get
# one label per window back from create_windows.
values = df['value'].values.astype(np.float32)
windows, window_labels = create_windows(
    values,
    anomaly_mask,
    window_size=50,
    step_size=10,
)

# Collect the report lines first, then emit them in a single print.
window_report = [
    f"Number of windows: {len(windows)}",
    f"Window shape: {windows.shape}",
    f"Anomalous windows: {window_labels.sum()} ({100 * window_labels.mean():.2f}%)",
]
print("\n".join(window_report))

In [None]:
# Visualise some example windows
def _spread_sample(candidates, stride, count):
    """Pick up to `count` entries of `candidates`, one every `stride` positions.

    Positions that would run past the end are clamped to the last entry and
    duplicates are dropped, so a short candidate list yields fewer examples
    instead of raising IndexError or returning the same entry repeatedly.

    Args:
        candidates: 1-D sequence of window indices to sample from.
        stride: Distance between consecutive sampled positions.
        count: Maximum number of entries to return.

    Returns:
        List of the selected entries in ascending position order (empty when
        `candidates` is empty).
    """
    if len(candidates) == 0:
        return []
    last = len(candidates) - 1
    positions = sorted({min(k * stride, last) for k in range(count)})
    return [candidates[p] for p in positions]


n_examples = 4  # one subplot column per example window
fig, axes = plt.subplots(2, n_examples, figsize=(14, 6))

# Window-level indices get their own names so they do not overwrite the
# point-level `anomaly_indices` computed for the full time-series plot.
normal_window_indices = np.where(window_labels == 0)[0]
anomalous_window_indices = np.where(window_labels == 1)[0]

# (axes row, chosen window indices, line style, title prefix)
# Normal windows are sampled every 100 positions and anomalous ones every 10,
# so the examples come from different parts of the series.
example_rows = (
    (axes[0], _spread_sample(normal_window_indices, 100, n_examples), 'b-', 'Normal'),
    (axes[1], _spread_sample(anomalous_window_indices, 10, n_examples), 'r-', 'Anomaly'),
)

for row_axes, chosen, line_style, title_prefix in example_rows:
    for i, ax in enumerate(row_axes):
        # Panels beyond the available examples stay empty but keep the label.
        if i < len(chosen):
            idx = chosen[i]
            ax.plot(windows[idx], line_style)
            ax.set_title(f'{title_prefix} (idx={idx})')
        ax.set_xlabel('Time step')

plt.suptitle('Example Windows: Normal (top) vs Anomalous (bottom)')
plt.tight_layout()
plt.show()

In [None]:
# Run the windows through normalise_windows, then print mean/std of the
# input and of the result for comparison.
normalised = normalise_windows(windows)

# (stage label, array, decimals): the raw values are shown with 2 decimals,
# the normalised values with 4.
for stage, arr, ndigits in (("Before", windows, 2), ("After", normalised, 4)):
    print(
        f"{stage} normalisation - "
        f"Mean: {arr.mean():.{ndigits}f}, Std: {arr.std():.{ndigits}f}"
    )

## 4. Creating DataLoaders

Let's use the `get_dataloaders` convenience function to create train/val/test DataLoaders, along with a class weight for handling class imbalance.

In [None]:
# Build the train/val/test loaders, and the class weight returned with them,
# from the default dataset list.
loader_config = dict(
    dataset_names=DEFAULT_DATASETS,
    window_size=50,
    step_size=10,
    batch_size=32,
)
train_loader, val_loader, test_loader, class_weight = get_dataloaders(**loader_config)

# Number of batches per split
for split_name, loader in (
    ("Training", train_loader),
    ("Validation", val_loader),
    ("Test", test_loader),
):
    print(f"{split_name} batches: {len(loader)}")

print(f"Class weight (for handling imbalance): {class_weight.item():.2f}")

In [None]:
# Pull the first batch from the training loader and look at its contents.
batch = next(iter(train_loader))
windows_batch, labels_batch = batch

# Count both classes up front: summing the labels gives the anomalies and
# summing (1 - labels) gives the normal windows.
batch_anomalies = labels_batch.sum().item()
batch_normals = (1 - labels_batch).sum().item()

print(
    f"Batch windows shape: {windows_batch.shape}",
    f"Batch labels shape: {labels_batch.shape}",
    f"Labels in batch: {batch_anomalies} anomalies, {batch_normals} normal",
    sep="\n",
)

## Summary

We have successfully:
1. Loaded NAB time-series data with anomaly labels
2. Visualised the data and anomaly regions
3. Created sliding windows for model input
4. Set up PyTorch DataLoaders with class weighting

Next: See `03_model_training.ipynb` for training the transformer model.