## Cell 1 — Setup — Imports and Data Loading Functions

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Directory holding the raw CMAPSS text files, relative to the current working directory.
# NOTE(review): "data" appears twice in this path — confirm it matches the repository layout.
RAW = os.path.join("..", "data", "data", "raw")

# FD001 column names in file order: unit id, cycle counter, 3 operating settings, 21 sensors.
COLS = [
    "unit",
    "cycle",
    *(f"op{i}" for i in range(1, 4)),
    *(f"s{i}" for i in range(1, 22)),
]

def load_fd001(split="train"):
    """Read one of the FD001 text files from ``RAW`` into a DataFrame.

    Parameters
    ----------
    split : str
        ``"train"`` or ``"test"`` to read the whitespace-separated table with
        the column names in ``COLS``; ``"rul"`` to read the single-column file
        of RUL values (one row per test unit).

    Returns
    -------
    pandas.DataFrame
        The parsed table. For ``"rul"`` it has a single column named ``RUL``.

    Raises
    ------
    ValueError
        If ``split`` is not one of the three supported names.
    """
    filenames = {
        "train": "train_FD001.txt",
        "test": "test_FD001.txt",
        "rul": "RUL_FD001.txt",
    }
    try:
        filename = filenames[split]
    except (KeyError, TypeError):
        # Unknown (or unhashable) split name: report it the same way for both.
        raise ValueError("split must be 'train', 'test', or 'rul'") from None

    path = os.path.join(RAW, filename)
    if split == "rul":
        # single-column file with RUL per test unit
        return pd.read_csv(path, sep=r"\s+", header=None, names=["RUL"])
    return pd.read_csv(path, sep=r"\s+", header=None, names=COLS, engine="python")

# Load the three FD001 tables in a fixed order: training runs, test runs, test RUL labels.
train, test, rul = (load_fd001(split) for split in ("train", "test", "rul"))

# Preview the first rows of the training table.
train.head()


## Cell 2 — Data Overview — Shapes, Columns, and Sanity Checks

In [None]:
# Shapes of the three loaded tables.
print("Train shape:", train.shape)
print("Test shape:", test.shape)
print("RUL (test labels) shape:", rul.shape)

print("\nTrain columns:", list(train.columns))

# Null check over every column of both splits (not just the first few train
# columns); only columns that actually contain missing values are listed.
for split_name, frame in (("train", train), ("test", test)):
    null_counts = frame.isna().sum()
    print(f"\nNulls in {split_name}: {int(null_counts.sum())} total")
    if null_counts.any():
        print(null_counts[null_counts > 0])

n_train_units = train["unit"].nunique()
n_test_units = test["unit"].nunique()
print("\nUnits in train:", n_train_units)
print("Units in test:", n_test_units)

# The RUL file is read as one row per test unit, so the counts must agree
# before the labels can be aligned with the test set.
assert len(rul) == n_test_units, (
    f"RUL rows ({len(rul)}) do not match the number of test units ({n_test_units})"
)


## Cell 3 — Feature Engineering — Compute RUL Labels for Training

In [None]:
# Training RUL label: each unit's largest cycle number minus the row's own cycle,
# so the last recorded cycle of every unit gets RUL == 0.
max_cycle = train.groupby("unit")["cycle"].transform("max")

# assign() returns a new frame, leaving `train` itself untouched.
train_labeled = train.assign(RUL=max_cycle - train["cycle"])
train_labeled.head()


## Cell 4 — Exploration — Distribution of Cycles per Unit

In [None]:
# Length of each training run = the largest cycle number recorded for the unit.
cycles_per_unit = train.groupby("unit")["cycle"].max()
avg_cycles = cycles_per_unit.mean().round(2)
print("Average cycles to failure:", avg_cycles)

# Histogram of run lengths, drawn on an explicit Axes.
fig, ax = plt.subplots()
cycles_per_unit.hist(bins=20, ax=ax)
ax.set_title("Distribution of run-to-failure cycles (train)")
ax.set_xlabel("cycles")
ax.set_ylabel("count of units")
plt.show()


## Cell 5 — Sensor Drift Analysis — Early vs. Late Cycle Comparison

In [None]:
# Compare sensor values early vs. just before failure to see drift
def summarize_sensor_drift(df, sensor="s2", tail=20, head=20):
    """Describe one sensor at the start and at the end of every unit's run.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``unit`` and ``cycle`` columns plus the ``sensor`` column.
    sensor : str
        Name of the column to summarize.
    tail : int
        Number of final cycles per unit that form the "late" window.
    head : int
        Number of initial cycles per unit that form the "early" window.

    Returns
    -------
    tuple
        ``(early_stats, late_stats)``: the ``describe()`` output of ``sensor``
        over the pooled early rows and over the pooled late rows of all units.

    Notes
    -----
    For a unit with fewer than ``head + tail`` cycles the two windows overlap,
    so some rows contribute to both summaries.
    """
    # Sort and group once; both windows are slices of the same ordered groups,
    # so there is no need to re-sort the whole frame for each of them.
    per_unit = df.sort_values(["unit", "cycle"]).groupby("unit")
    # early = first 'head' cycles of each unit
    early = per_unit.head(head)
    # late = last 'tail' cycles of each unit
    late = per_unit.tail(tail)
    return early[sensor].describe(), late[sensor].describe()

# Sensors to compare between the first and the last 20 cycles of every training unit.
drift_sensors = ["s2", "s3", "s4", "s7", "s9", "s11", "s12", "s13", "s14", "s15"]

for s in drift_sensors:
    early_stats, late_stats = summarize_sensor_drift(train, sensor=s, head=20, tail=20)
    print(f"\nSensor {s} — early 20 cycles vs last 20 cycles")
    print("Early:\n", early_stats)
    print("Late:\n", late_stats)


## Cell 6 — Visualization — Sensor Trends for a Sample Unit

In [None]:
# Pick one training row with a fixed random_state (so the choice is repeatable)
# and take its unit id; then collect that unit's rows in cycle order.
unit_id = int(train["unit"].sample(1, random_state=7).iloc[0])
u = train.loc[train["unit"] == unit_id].sort_values("cycle")

# One line per sensor on a shared axis.
fig, ax = plt.subplots()
for sensor_name in ("s2", "s3", "s4"):
    ax.plot(u["cycle"], u[sensor_name], label=sensor_name)
ax.set_title(f"Sensors over time — Unit {unit_id}")
ax.set_xlabel("cycle")
ax.set_ylabel("sensor value")
ax.legend()
plt.show()


## Cell 7 — Correlation Analysis — Sensor Relationship Heatmap

In [None]:
# Matplotlib-only heatmap to avoid seaborn for now
sensors = [c for c in train.columns if c.startswith("s")]
corr = train[sensors].corr()

# A column with zero variance has an undefined (NaN) correlation with every
# column, which would render as blank bands. Drop such all-NaN rows/columns
# from the plotted matrix only; `sensors` and `corr` keep their full contents.
corr_plot = corr.dropna(axis=0, how="all").dropna(axis=1, how="all")
plot_labels = list(corr_plot.columns)
tick_positions = range(len(plot_labels))

fig, ax = plt.subplots(figsize=(7, 6))
# Correlations live in [-1, 1]: pin the colour limits and use a diverging
# colormap so that zero sits at the neutral midpoint colour.
im = ax.imshow(
    corr_plot.to_numpy(),
    interpolation="nearest",
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
)
ax.set_title("Sensor correlation (train)")
fig.colorbar(im, ax=ax)
ax.set_xticks(tick_positions)
ax.set_xticklabels(plot_labels, rotation=90, fontsize=7)
ax.set_yticks(tick_positions)
ax.set_yticklabels(plot_labels, fontsize=7)
fig.tight_layout()
plt.show()


## Cell 8 — Label Validation — Check RUL Progression per Unit

In [None]:
# Reuse the unit chosen for the sensor plot and list its cycle/RUL pairs in cycle order.
uid = unit_id
tmp = train_labeled.loc[train_labeled["unit"] == uid, ["cycle", "RUL"]].sort_values("cycle")

# Show the first and the last ten rows side by side as a tuple.
(tmp.head(10), tmp.tail(10))
