# Imports y Configuración Inicial

In [27]:
# Imports y configuración
import os
import sys
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Paths
# ROOT is the directory the notebook/kernel was started from.
ROOT = Path.cwd()
DATA_DIR = ROOT / "CMAPSSData"  # adjust if your folder is named differently
# Location of the raw C-MAPSS text files. The default is a machine-specific
# absolute path; set the CMAPSS_RAW_DIR environment variable to point at
# another folder without editing this cell.
RAW_DIR = Path(os.environ.get(
    "CMAPSS_RAW_DIR",
    r"C:\Users\jucep\OneDrive\Escritorio\Proyecto CMAPSS\MANTENIMIENTO-PREDICTIVO\CMAPSSData\raw",
))
# Every artifact produced by the EDA is written under this folder.
EDA_OUT = ROOT / "eda_outputs"
EDA_OUT.mkdir(exist_ok=True)

# Visual defaults
plt.rcParams["figure.figsize"] = (12, 6)
sns.set_context("talk")

# Seed NumPy's global generator so random selections are repeatable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Funciones Utilitarias

In [28]:
# Funciones reutilizables solicitadas
def read_cmapss(path):
    """Read a whitespace-delimited C-MAPSS file into a labelled DataFrame.

    The file is parsed without a header row and its columns are named
    ``unit``, ``cycle``, ``op1``-``op3`` and ``s1``-``s21`` (26 in total).
    """
    sensor_names = [f"s{idx}" for idx in range(1, 22)]
    header = ["unit", "cycle", "op1", "op2", "op3", *sensor_names]
    return pd.read_csv(path, sep=r"\s+", header=None, names=header)

def compute_rul(df):
    """Attach a remaining-useful-life (RUL) column to a training dataset.

    The RUL of a row is the largest ``cycle`` observed for its ``unit`` minus
    the row's own ``cycle``. The input frame is left untouched.

    Returns a tuple ``(labelled, rul_per_unit)``: a copy of ``df`` with the
    extra ``RUL`` column, and an independent frame holding only ``unit``,
    ``cycle`` and ``RUL`` (one row per input row).
    """
    labelled = df.copy()
    last_cycle = labelled.groupby("unit")["cycle"].transform("max")
    labelled["RUL"] = last_cycle - labelled["cycle"]
    rul_per_unit = labelled.loc[:, ["unit", "cycle", "RUL"]].copy()
    return labelled, rul_per_unit

def summary_stats(df):
    """Summarize each described column with percentiles and missing-value share.

    Uses ``DataFrame.describe`` with the 25th, 50th, 75th and 99th
    percentiles and adds ``missing_pct``, the percentage of NaN entries per
    column.

    Returns a DataFrame with one row per described column and the columns
    ``column, count, mean, std, min, 25%, 50%, 75%, 99%, max, missing_pct``.
    """
    stats = df.describe(percentiles=[0.25, 0.5, 0.75, 0.99]).T
    # Index-aligned assignment: only the columns present in ``stats`` keep a value.
    stats["missing_pct"] = df.isna().mean() * 100
    cols_keep = ["count", "mean", "std", "min", "25%", "50%", "75%", "99%", "max", "missing_pct"]
    # Name the axis explicitly so the label column is always called "column",
    # even when the input's columns axis already carries a name.
    return stats[cols_keep].rename_axis("column").reset_index()

def per_unit_stats(df):
    """Build one summary row per unit.

    Each row holds the unit id, ``cycles_count`` (number of distinct cycle
    values), ``cycle_length`` (max cycle - min cycle + 1) and, for every
    column whose name starts with ``"s"``, the value in the unit's first and
    last row as ``first_<col>`` / ``last_<col>``. "First" and "last" follow
    the row order of ``df``; no sorting by cycle is performed here.
    """
    sensor_cols = [name for name in df.columns if name.startswith("s")]
    records = []
    for unit_id, block in df.groupby("unit"):
        cycles = block["cycle"]
        # Edge rows of this unit, taken positionally.
        opening = block.iloc[0]
        closing = block.iloc[-1]
        record = {
            "unit": unit_id,
            "cycles_count": cycles.nunique(),
            "cycle_length": cycles.max() - cycles.min() + 1,
        }
        # One flat column per sensor for the first and last readings.
        record.update({f"first_{name}": opening[name] for name in sensor_cols})
        record.update({f"last_{name}": closing[name] for name in sensor_cols})
        records.append(record)
    return pd.DataFrame(records)

def select_representative_units(df, n_random=3):
    """Pick representative units: shortest, median and longest run, plus random ones.

    Run length is the number of distinct ``cycle`` values per ``unit``. With
    the default ``n_random=3`` and at least six units this yields six ids.

    Args:
        df: frame with ``unit`` and ``cycle`` columns.
        n_random: how many additional units to draw (without replacement)
            from the units not already chosen as min/median/max.

    Returns:
        List of plain Python ``int`` unit ids: the distinct min/median/max
        units first (in that order), followed by the random picks.

    Raises:
        ValueError: if fewer than ``n_random`` units remain to draw from.
    """
    lengths = df.groupby("unit")["cycle"].nunique().reset_index(name="len")
    # Sort once; a stable sort makes tie-breaking between equal lengths
    # deterministic (lowest unit id first, since groupby sorts its keys).
    ordered = [int(u) for u in lengths.sort_values("len", kind="stable")["unit"]]
    anchors = [ordered[0], ordered[len(ordered) // 2], ordered[-1]]
    # With very few units the three anchors can coincide; keep each id once.
    anchors = list(dict.fromkeys(anchors))

    # Sorted pool: the draw must not depend on set iteration order, otherwise
    # the same seed could give different picks.
    pool = sorted(set(ordered) - set(anchors))
    if n_random > len(pool):
        raise ValueError(
            f"Cannot draw {n_random} random units: only {len(pool)} remain "
            "after selecting the min/median/max units"
        )
    # Uses NumPy's global generator so a prior np.random.seed() still applies.
    picks = np.random.choice(pool, size=n_random, replace=False) if n_random else []
    return anchors + [int(u) for u in picks]

def _window_slope(values):
    """Return the least-squares slope of ``values`` against 0..len(values)-1."""
    if len(values) < 2:
        return 0.0
    idx = np.arange(len(values))
    design = np.vstack([idx, np.ones(len(idx))]).T
    slope, _intercept = np.linalg.lstsq(design, values, rcond=None)[0]
    return slope


def rolling_features_by_unit(df, sensors, windows=(5, 10, 20)):
    """Add per-unit rolling mean, std and OLS slope features for each sensor.

    For every window ``w`` and sensor ``s`` three columns are produced, in
    this order: ``{s}_rm_{w}`` (rolling mean, min_periods=1), ``{s}_rstd_{w}``
    (rolling std, NaN replaced by 0) and ``{s}_slope_{w}`` (slope of a linear
    fit over the window, min_periods=2, NaN replaced by 0). Rolling windows
    never cross unit boundaries because they are computed inside a
    ``groupby("unit")``.

    Args:
        df: frame containing ``unit`` and the sensor columns.
        sensors: iterable of sensor column names.
        windows: iterable of window sizes (in rows).

    Returns:
        A new DataFrame: the input columns followed by the feature columns.
        The input is not modified. If a feature name already exists in
        ``df`` the freshly computed column replaces it.
    """
    grouped = df.groupby("unit")
    features = {}
    for w in windows:
        for s in sensors:
            by_unit = grouped[s]
            features[f"{s}_rm_{w}"] = by_unit.transform(
                lambda x: x.rolling(window=w, min_periods=1).mean()
            )
            features[f"{s}_rstd_{w}"] = by_unit.transform(
                lambda x: x.rolling(window=w, min_periods=1).std().fillna(0)
            )
            features[f"{s}_slope_{w}"] = by_unit.transform(
                lambda x: x.rolling(window=w, min_periods=2).apply(_window_slope, raw=True).fillna(0)
            )

    if not features:
        # Nothing requested (empty sensors or windows): just hand back a copy.
        return df.copy()

    # Attach all features in one concat instead of inserting columns one by
    # one, which fragments the frame when many features are generated.
    replaced = [name for name in features if name in df.columns]
    base = df.drop(columns=replaced)
    return pd.concat([base, pd.concat(features, axis=1)], axis=1)

def save_fig(fig, filepath, dpi=150):
    """Save a figure with a tight layout, then close it.

    Args:
        fig: the Matplotlib figure to write.
        filepath: destination passed to ``fig.savefig``. When it is a string
            or path-like object, missing parent directories are created first.
        dpi: output resolution; defaults to 150.
    """
    # Only touch the filesystem for real paths; anything else is passed
    # through to savefig unchanged.
    if isinstance(filepath, (str, os.PathLike)):
        Path(filepath).parent.mkdir(parents=True, exist_ok=True)
    fig.tight_layout()
    fig.savefig(filepath, dpi=dpi)
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

# Carga de Datos

In [29]:
# Load the FD001 train/test splits and the RUL values file.
import pandas as pd
# Kept for any later cell that references it; read_cmapss assigns the same names.
col_names = ["unit","cycle","op1","op2","op3"] + [f"s{i}" for i in range(1,22)]
train_path = RAW_DIR / "train_FD001.txt"
test_path  = RAW_DIR / "test_FD001.txt"
rul_path   = RAW_DIR / "RUL_FD001.txt"

# Fail early, naming the first file that cannot be found.
for p in (train_path, test_path, rul_path):
    if not p.exists():
        raise FileNotFoundError(f"No existe: {p}")

# Use the shared reader so every cell parses C-MAPSS files the same way.
train = read_cmapss(train_path)
test = read_cmapss(test_path)
rul_truth = pd.read_csv(rul_path, sep=r"\s+", header=None, names=["RUL_true"])

print("Lectura OK:", train.shape, test.shape, rul_truth.shape)
display(train.head(3))

Lectura OK: (20631, 26) (13096, 26) (100, 1)


Unnamed: 0,unit,cycle,op1,op2,op3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442


In [20]:
# 1) Read the train/test files
train_path = RAW_DIR / "train_FD001.txt"
test_path = RAW_DIR / "test_FD001.txt"
rul_path = RAW_DIR / "RUL_FD001.txt"

# Collect every missing input so the error lists all of them at once.
missing = [p for p in (train_path, test_path, rul_path) if not p.exists()]
if missing:
    raise FileNotFoundError(f"Faltan archivos en raw/: {missing}")

train = read_cmapss(train_path)
test = read_cmapss(test_path)
rul_truth = pd.read_csv(
    rul_path,
    sep=r"\s+",
    header=None,
    names=["RUL_true"],
)

FileNotFoundError: Faltan archivos en raw/: [WindowsPath('c:/Users/jucep/OneDrive/Escritorio/Proyecto CMAPSS/mantenimiento-predictivo/notebooks/CMAPSSData/raw/train_FD001.txt'), WindowsPath('c:/Users/jucep/OneDrive/Escritorio/Proyecto CMAPSS/mantenimiento-predictivo/notebooks/CMAPSSData/raw/test_FD001.txt'), WindowsPath('c:/Users/jucep/OneDrive/Escritorio/Proyecto CMAPSS/mantenimiento-predictivo/notebooks/CMAPSSData/raw/RUL_FD001.txt')]

# Resumen general y summary_stats.csv

# Chequeos de Calidad

# Cálculo de RUL y guardado rul_per_unit.csv 

# Count units vs cycles plot

# Selección de 6 unidades representativas

# Trajectory samples plot (6 unidades, sensors s2,s3,s7,s15)

# Operative conditions (histograms y hexbin)

# Correlación (Pearson y Spearman) y heatmap

# PCA (normalizar sensores y scree plot)

# Temporal properties — ACF/PACF para 4 sensores

# RUL trajectories plot

# Candidate features generation and top_features.csv

# Leakage checks y recomendaciones 

# Validación scheme recommendation 

Recomendación de validación:
- Grouped K-Fold por unidad (k=5), donde cada fold contiene unidades completas. Evita la fuga temporal: los ciclos de una misma unidad nunca se reparten entre entrenamiento y validación.
- Leave-one-unit-out para analizar la generalización a un motor no visto.
- Para LSTM/GRU: usar validación temporal interna con el último x% de ciclos de cada unidad (holdout por ciclo) y, además, grouped folds por unidad para estimar la varianza.


# Exports adicionales y checks automáticos

# README de la carpeta eda_outputs

# Mensaje final y checklist impreso

In [10]:
# Final message: print a short recap of the EDA run.
# NOTE: this cell reads `train`, `selected_units` and `EDA_OUT`, which it does
# not define itself. The captured output below shows the first line printed and
# then a NameError for `train`, so the cells that define them must run first.
print("EDA completo — artifacts guardados en ./eda_outputs/")
print(f"Número de unidades procesadas: {train['unit'].nunique()}")
print("Unidades seleccionadas (6):", selected_units)
print("Top features file:", EDA_OUT / "top_features.csv")
print("Recomendación rápida: usar StandardScaler para sensores; ventanas iniciales prioritarias: 5, 10, 20; priorizar slopes y rolling std como features.")

EDA completo — artifacts guardados en ./eda_outputs/


NameError: name 'train' is not defined