# 02 — Feature Engineering

Run the full feature extraction pipeline on bearing data and visualise:
- Distribution of all 30 features
- Correlation heatmap
- Normal vs anomaly feature comparison
- Ranking of the most discriminating features (normalised mean separation)

**Prerequisites:** `data/raw/bearing_*.csv` must exist (run `download_data.py`).

In [None]:
import sys, os
from pathlib import Path

# Resolve the project root as two directories above the current working
# directory, then put `<root>/backend` first on sys.path so the `app.*`
# imports below can be resolved.
PROJECT_ROOT = Path(os.path.abspath("../.."))
sys.path.insert(0, str(PROJECT_ROOT / "backend"))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from app.preprocessing.signal_processing import window_signal
from app.preprocessing.feature_extraction import FeatureExtractor, FEATURE_NAMES

%matplotlib inline
# Default figure size for plots that do not pass their own `figsize`.
plt.rcParams["figure.figsize"] = (14, 5)

In [None]:
# ---------------------------------------------------------------------------
# 1. Load a bearing recording, preferring one that contains anomalies
# ---------------------------------------------------------------------------
DATA_DIR = PROJECT_ROOT / "data" / "raw"
candidates = sorted(DATA_DIR.glob("bearing_*.csv"))
if not candidates:
    # Fail with an actionable message instead of an IndexError further down.
    raise FileNotFoundError(
        f"No bearing_*.csv files found in {DATA_DIR}. "
        "Run download_data.py to create them."
    )

# Pick the first file whose `phase` column contains an "anomaly" label;
# fall back to the first available file otherwise.
df = None
first_df = None  # kept so the fallback does not re-read the first CSV
for path in candidates:
    candidate_df = pd.read_csv(path)
    if first_df is None:
        first_df = candidate_df
    if "phase" in candidate_df.columns and "anomaly" in candidate_df["phase"].values:
        df = candidate_df
        print(f"Using {path.name} (contains anomalies)")
        break
if df is None:
    df = first_df
    print(f"Using {candidates[0].name} (no anomaly phase)")

phase_counts = df["phase"].value_counts().to_dict() if "phase" in df.columns else "N/A"
print(f"Rows: {len(df):,}  |  Phases: {phase_counts}")

In [None]:
# ---------------------------------------------------------------------------
# 2. Window + extract features
# ---------------------------------------------------------------------------
from app.preprocessing.signal_processing import DEFAULT_WINDOW_SIZE, DEFAULT_HOP_SIZE

# Channel 1 as float64 samples; one phase label per sample ("all" when the
# file carries no `phase` column).
signal = df["ch1"].values.astype(np.float64)
if "phase" in df.columns:
    phases = df["phase"].values
else:
    phases = np.full(len(signal), "all")

# Slice the signal into windows using window_signal's default settings.
windows = window_signal(signal)
print(f"Windows: {windows.shape}")

# One feature row per window, one column per entry in FEATURE_NAMES.
extractor = FeatureExtractor()
features = extractor.extract(windows)
print(f"Features: {features.shape}")

# Label each window with the phase of its first sample.
# NOTE(review): assumes window i starts at i * DEFAULT_HOP_SIZE, i.e. that
# window_signal's default hop is DEFAULT_HOP_SIZE — confirm against the helper.
n_windows = len(windows)
starts = np.arange(n_windows) * DEFAULT_HOP_SIZE
window_phases = phases[starts]
is_anomaly = (window_phases == "anomaly").astype(int)

feat_df = pd.DataFrame(features, columns=FEATURE_NAMES).assign(is_anomaly=is_anomaly)
feat_df.describe()

In [None]:
# ---------------------------------------------------------------------------
# 3. Feature distributions — normal vs anomaly
# ---------------------------------------------------------------------------
# Size the grid from FEATURE_NAMES rather than a hard-coded count so the plot
# stays correct if the feature set changes.
n_features = len(FEATURE_NAMES)
cols = 5
rows = (n_features + cols - 1) // cols  # ceiling division: keep a partial last row

# squeeze=False keeps `axes` 2-D even for a single row, so flatten() always works.
fig, axes = plt.subplots(rows, cols, figsize=(20, rows * 3), squeeze=False)
axes = axes.flatten()

normal_mask = feat_df["is_anomaly"] == 0
anomaly_mask = feat_df["is_anomaly"] == 1

for ax, name in zip(axes, FEATURE_NAMES):
    normal_vals = feat_df.loc[normal_mask, name].values
    anomaly_vals = feat_df.loc[anomaly_mask, name].values

    # Skip empty groups so a bearing with only one class still renders cleanly.
    if len(normal_vals) > 0:
        ax.hist(normal_vals, bins=40, alpha=0.6, label="normal", color="steelblue", density=True)
    if len(anomaly_vals) > 0:
        ax.hist(anomaly_vals, bins=40, alpha=0.6, label="anomaly", color="tomato", density=True)
    ax.set_title(name, fontsize=9)
    ax.legend(fontsize=7)

# Hide grid cells left over when n_features is not a multiple of `cols`.
for unused_ax in axes[n_features:]:
    unused_ax.set_visible(False)

plt.suptitle("Feature Distributions: Normal vs Anomaly", fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# ---------------------------------------------------------------------------
# 4. Correlation heatmap (normal data only)
# ---------------------------------------------------------------------------
# Pairwise correlation between features, computed on normal windows only.
normal_df = feat_df.loc[feat_df["is_anomaly"] == 0, FEATURE_NAMES]
corr = normal_df.corr()

fig, ax = plt.subplots(figsize=(14, 11))
# Fixed [-1, 1] colour range so the diverging colormap is centred on zero.
im = ax.imshow(corr.values, cmap="RdBu_r", vmin=-1, vmax=1, aspect="auto")
fig.colorbar(im, ax=ax, shrink=0.8)

# One tick per feature; derived from FEATURE_NAMES so tick and label counts
# always match.
tick_positions = range(len(FEATURE_NAMES))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(FEATURE_NAMES, rotation=90, fontsize=7)
ax.set_yticklabels(FEATURE_NAMES, fontsize=7)
ax.set_title("Feature Correlation Heatmap (Normal Data)")
plt.tight_layout()
plt.show()

In [None]:
# ---------------------------------------------------------------------------
# 5. Top discriminating features (mean separation)
# ---------------------------------------------------------------------------
has_anomalies = feat_df["is_anomaly"].sum() > 0

if not has_anomalies:
    print("No anomaly samples in this bearing — skipping separation analysis.")
else:
    normal_feats = feat_df.loc[feat_df["is_anomaly"] == 0, FEATURE_NAMES]
    anomaly_feats = feat_df.loc[feat_df["is_anomaly"] == 1, FEATURE_NAMES]

    # Effect-size-like score per feature: absolute shift of the anomaly mean
    # from the normal mean, scaled by the normal-class standard deviation.
    # The small epsilon avoids division by zero for constant features.
    mean_shift = anomaly_feats.mean() - normal_feats.mean()
    normal_scale = normal_feats.std() + 1e-8
    separation = (mean_shift / normal_scale).abs().sort_values(ascending=False)

    fig, ax = plt.subplots(figsize=(10, 5))
    top_features = separation.head(15)
    top_features.plot(kind="barh", ax=ax, color="darkorange")
    ax.set_xlabel("Normalised mean separation")
    ax.set_title("Top 15 Most Discriminating Features")
    ax.invert_yaxis()  # largest separation at the top of the chart
    plt.tight_layout()
    plt.show()
    print(separation.head(10))

---
**Next step:** Run `train_models.py` to train the baseline Autoencoder on these features.