# 01 — Data Exploration

Load the bearing vibration CSV files (synthetic or NASA), inspect their
structure, and plot raw waveforms and basic frequency spectra.

**Prerequisites:** run `python ../scripts/download_data.py --mode synthetic`
so that `data/raw/bearing_*.csv` files exist.

In [None]:
# Standard-library helpers for path handling and import-path manipulation.
import sys, os
from pathlib import Path

# Resolve paths so we can import backend modules.
# NOTE(review): "../.." is resolved against the current working directory, so
# this cell assumes the notebook is executed from its own folder — confirm
# against the repository layout.
PROJECT_ROOT = Path(os.path.abspath("../.."))
# Insert at index 0 so the project's backend directory is searched first.
sys.path.insert(0, str(PROJECT_ROOT / "backend"))

# Numerical / plotting stack used by the cells below.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.fft import fft, fftfreq

# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline
# Default (width, height) for figures that do not pass their own figsize.
plt.rcParams["figure.figsize"] = (12, 4)

In [None]:
# ---------------------------------------------------------------------------
# 1. Load all bearing CSVs
# ---------------------------------------------------------------------------
DATA_DIR = PROJECT_ROOT / "data" / "raw"
# sorted() keeps "the first bearing" deterministic across filesystems.
csvs = sorted(DATA_DIR.glob("bearing_*.csv"))
if not csvs:
    # Fail with an actionable message instead of an IndexError on csvs[0].
    raise FileNotFoundError(
        f"No bearing_*.csv files found in {DATA_DIR}. "
        "Run `python ../scripts/download_data.py --mode synthetic` first."
    )

print(f"Found {len(csvs)} bearing files:")
for p in csvs:
    print(f"  {p.name}  ({p.stat().st_size / 1024:.0f} KB)")

# Load first bearing as primary example; `df` is reused by every later cell.
df = pd.read_csv(csvs[0])
print(f"\nShape: {df.shape}")
# Last expression of the cell so the notebook renders the preview table.
df.head()

In [None]:
# ---------------------------------------------------------------------------
# 2. Basic statistics per phase
# ---------------------------------------------------------------------------
# Summarise only the signal columns this file actually contains: a fixed
# column list raises KeyError as soon as one of them (e.g. "temperature",
# which is treated as optional further down) is absent.
wanted_cols = ["ch1", "ch2", "ch3", "ch4", "temperature"]
stat_cols = [col for col in wanted_cols if col in df.columns]

if "phase" in df.columns and stat_cols:
    print(df.groupby("phase")[stat_cols].describe())
else:
    # No phase labels (or none of the expected columns): describe everything.
    print(df.describe())

In [None]:
# ---------------------------------------------------------------------------
# 3. Raw waveform plots — one per phase
# ---------------------------------------------------------------------------
# Without a "phase" column the whole recording is drawn as one pseudo-phase.
if "phase" in df.columns:
    phases = df["phase"].unique()
else:
    phases = ["all"]

n_rows = len(phases)
# squeeze=False always yields a 2-D array of Axes, so the single-row case
# needs no special handling; take the only column as a 1-D sequence.
fig, axes = plt.subplots(
    nrows=n_rows, ncols=1, figsize=(14, 3 * n_rows), sharex=False, squeeze=False
)
axes = axes[:, 0]

for ax, phase in zip(axes, phases):
    if phase == "all":
        subset = df
    else:
        subset = df[df["phase"] == phase]
    # Only the first 5000 samples of ch1 are drawn.
    ch1_head = subset["ch1"].values[:5000]
    ax.plot(ch1_head, linewidth=0.5)
    ax.set(title=f"Phase: {phase}", ylabel="Amplitude")

# Label the x-axis once, on the bottom panel.
axes[-1].set_xlabel("Sample index")
plt.tight_layout()
plt.show()

In [None]:
# ---------------------------------------------------------------------------
# 4. FFT spectrum — compare normal vs anomaly phase
# ---------------------------------------------------------------------------
SAMPLE_RATE = 20000  # samples per second; scales the frequency axis in Hz

def plot_spectrum(signal, ax, label, n=4096, sample_rate=SAMPLE_RATE):
    """Plot the magnitude spectrum of the first ``n`` samples of ``signal``.

    Parameters
    ----------
    signal : array-like
        1-D sequence of samples.
    ax : matplotlib Axes
        Axes to draw on (log-scaled y axis via ``semilogy``).
    label : str
        Legend label for the plotted curve.
    n : int, optional
        Maximum number of leading samples to transform. A shorter signal is
        transformed in full.
    sample_rate : float, optional
        Sampling frequency in Hz; defaults to ``SAMPLE_RATE``.

    Raises
    ------
    ValueError
        If ``signal`` contains no samples.
    """
    seg = np.asarray(signal)[:n]
    if len(seg) == 0:
        raise ValueError("plot_spectrum: signal is empty")
    S = np.abs(fft(seg))
    # Build the frequency axis from the actual segment length: using ``n``
    # here breaks the boolean mask below whenever the signal is shorter.
    freqs = fftfreq(len(seg), d=1.0 / sample_rate)
    # Keep strictly positive frequencies only (drops DC and the mirrored half).
    pos = freqs > 0
    ax.semilogy(freqs[pos], S[pos], linewidth=0.7, label=label)
    ax.set_xlabel("Frequency (Hz)")
    ax.set_ylabel("Magnitude")
    ax.legend()

MIN_FFT_SAMPLES = 4096  # matches plot_spectrum's default window length

fig, ax = plt.subplots(1, 1, figsize=(12, 4))

# Same guard as the earlier cells: files without a "phase" column are
# treated as a single "all" segment instead of raising KeyError.
if "phase" in df.columns:
    segments = {
        phase: df.loc[df["phase"] == phase, "ch1"].values
        for phase in ["normal", "anomaly"]
    }
else:
    segments = {"all": df["ch1"].values}

for label, samples in segments.items():
    if len(samples) >= MIN_FFT_SAMPLES:
        plot_spectrum(samples, ax, label=label)
    else:
        # Say why a curve is missing rather than leaving a silent gap.
        print(
            f"Skipping '{label}': {len(samples)} samples "
            f"(need at least {MIN_FFT_SAMPLES})"
        )

ax.set_title("Frequency Spectrum — Normal vs Anomaly")
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# ---------------------------------------------------------------------------
# 5. Temperature vs vibration envelope
# ---------------------------------------------------------------------------
if "temperature" in df.columns:
    # Stride through the data so each trace has only a few thousand points.
    step = max(1, len(df) // 2000)
    vibration = df["ch1"].values[::step]
    temperature = df["temperature"].values[::step]

    # Two stacked panels sharing the (subsampled) sample-index axis.
    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(14, 5))

    ax1.plot(vibration, color="steelblue", linewidth=0.6)
    ax1.set(ylabel="Vibration (ch1)", title="Vibration & Temperature Correlation")

    ax2.plot(temperature, color="darkorange", linewidth=0.8)
    ax2.set(ylabel="Temperature (°C)", xlabel="Sample index (subsampled)")

    plt.tight_layout()
    plt.show()

---
**Next notebook:** `02_feature_engineering.ipynb` — extract and visualise all 30 features.