In [433]:
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture

pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

## Datos

In [434]:
data_dir = "./CMAPSSData"
print("data_dir:", data_dir)
assert os.path.exists(data_dir), f"data_dir not found: {data_dir}"

# common column names used by C-MAPSS
COL_NAMES = (['unit', 'cycle', 'op_setting_1', 'op_setting_2', 'op_setting_3'] +
             [f'sensor_{i+1}' for i in range(21)])

def discover_files(data_dir):
    """
    Return dict mapping FD ids ('FD001',...) to dicts with keys possibly 'train','test','rul'.
    This version is tolerant to filename variations and case.
    """
    mapping = {}
    for path in glob.glob(os.path.join(data_dir, "*")):
        name = os.path.basename(path).upper()
        # detect FD id present in filename
        for fd in ['FD001','FD002','FD003','FD004']:
            if fd in name:
                if fd not in mapping:
                    mapping[fd] = {}
                if 'TRAIN' in name:
                    mapping[fd]['train'] = path
                if 'TEST' in name:
                    mapping[fd]['test'] = path
                if 'RUL' in name or name.startswith('RUL_') or 'RUL' in name:
                    mapping[fd]['rul'] = path
                # also accept files named like 'train_FD001.txt' etc.
                # note: we allow multiple matches; last one wins (fine for typical datasets)
    return mapping

def load_cmapss_pair(train_path, test_path):
    """Load a single FD dataset duo"""
    train = pd.read_csv(train_path, sep='\s+', header=None, names=COL_NAMES)

    test = pd.read_csv(test_path, sep='\s+', header=None, names=COL_NAMES)

    train[['unit','cycle']] = train[['unit','cycle']].astype(int)
    test[['unit','cycle']] = test[['unit','cycle']].astype(int)
    return train.reset_index(drop=True), test.reset_index(drop=True)

# discover dataset files
mapping = discover_files(data_dir)
print("Discovered dataset keys and file types:")
for k,v in mapping.items():
    print(k, v.keys())

# --- Choose dataset FDID to run ---
FDID = 'FD001'

# If chosen FDID doesn't have train+test, pick the first available FD that has both
if FDID not in mapping or 'train' not in mapping[FDID] or 'test' not in mapping[FDID]:
    print(f"Requested {FDID} is missing train/test. Searching for first FD with both train and test...")
    chosen = None
    for fd, files in mapping.items():
        if 'train' in files and 'test' in files:
            chosen = fd
            break
    if chosen is None:
        raise FileNotFoundError(f"No FD dataset with both train and test found in {data_dir}. Mapping: {mapping}")
    print(f"Switching to available dataset: {chosen}")
    FDID = chosen

train_path = mapping[FDID]['train']
test_path  = mapping[FDID]['test']

print("Using:", train_path, test_path)
train_df, test_df = load_cmapss_pair(train_path, test_path)
print(f"Loaded {FDID}: train rows={len(train_df)} (units={train_df['unit'].nunique()}), test rows={len(test_df)} (units={test_df['unit'].nunique()})")

data_dir: ./CMAPSSData
Discovered dataset keys and file types:
FD001 dict_keys(['test', 'train', 'rul'])
FD003 dict_keys(['rul', 'test', 'train'])
FD002 dict_keys(['train', 'rul', 'test'])
FD004 dict_keys(['rul', 'test', 'train'])
Using: ./CMAPSSData/train_FD001.txt ./CMAPSSData/test_FD001.txt
Loaded FD001: train rows=20631 (units=100), test rows=13096 (units=100)


  train = pd.read_csv(train_path, sep='\s+', header=None, names=COL_NAMES)
  test = pd.read_csv(test_path, sep='\s+', header=None, names=COL_NAMES)


In [435]:
train_df.head()

Unnamed: 0,unit,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,21.61,554.36,2388.06,9046.19,1.3,47.47,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,47.49,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,21.61,554.26,2388.08,9052.94,1.3,47.27,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,1.3,47.13,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,21.61,554.0,2388.06,9055.15,1.3,47.28,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [436]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20631 entries, 0 to 20630
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   unit          20631 non-null  int64  
 1   cycle         20631 non-null  int64  
 2   op_setting_1  20631 non-null  float64
 3   op_setting_2  20631 non-null  float64
 4   op_setting_3  20631 non-null  float64
 5   sensor_1      20631 non-null  float64
 6   sensor_2      20631 non-null  float64
 7   sensor_3      20631 non-null  float64
 8   sensor_4      20631 non-null  float64
 9   sensor_5      20631 non-null  float64
 10  sensor_6      20631 non-null  float64
 11  sensor_7      20631 non-null  float64
 12  sensor_8      20631 non-null  float64
 13  sensor_9      20631 non-null  float64
 14  sensor_10     20631 non-null  float64
 15  sensor_11     20631 non-null  float64
 16  sensor_12     20631 non-null  float64
 17  sensor_13     20631 non-null  float64
 18  sensor_14     20631 non-nu

In [437]:
train_df.describe(include='all')

Unnamed: 0,unit,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
count,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0
mean,51.506568,108.807862,-9e-06,2e-06,100.0,518.67,642.680934,1590.523119,1408.933782,14.62,21.609803,553.367711,2388.096652,9065.242941,1.3,47.541168,521.41347,2388.096152,8143.752722,8.442146,0.03,393.210654,2388.0,100.0,38.816271,23.289705
std,29.227633,68.88099,0.002187,0.000293,0.0,0.0,0.500053,6.13115,9.000605,5.3292e-15,0.001389,0.885092,0.070985,22.08288,0.0,0.267087,0.737553,0.071919,19.076176,0.037505,3.469531e-18,1.548763,0.0,0.0,0.180746,0.108251
min,1.0,1.0,-0.0087,-0.0006,100.0,518.67,641.21,1571.04,1382.25,14.62,21.6,549.85,2387.9,9021.73,1.3,46.85,518.69,2387.88,8099.94,8.3249,0.03,388.0,2388.0,100.0,38.14,22.8942
25%,26.0,52.0,-0.0015,-0.0002,100.0,518.67,642.325,1586.26,1402.36,14.62,21.61,552.81,2388.05,9053.1,1.3,47.35,520.96,2388.04,8133.245,8.4149,0.03,392.0,2388.0,100.0,38.7,23.2218
50%,52.0,104.0,0.0,0.0,100.0,518.67,642.64,1590.1,1408.04,14.62,21.61,553.44,2388.09,9060.66,1.3,47.51,521.48,2388.09,8140.54,8.4389,0.03,393.0,2388.0,100.0,38.83,23.2979
75%,77.0,156.0,0.0015,0.0003,100.0,518.67,643.0,1594.38,1414.555,14.62,21.61,554.01,2388.14,9069.42,1.3,47.7,521.95,2388.14,8148.31,8.4656,0.03,394.0,2388.0,100.0,38.95,23.3668
max,100.0,362.0,0.0087,0.0006,100.0,518.67,644.53,1616.91,1441.49,14.62,21.61,556.06,2388.56,9244.59,1.3,48.53,523.38,2388.56,8293.72,8.5848,0.03,400.0,2388.0,100.0,39.43,23.6184


Se puede notar que op_setting_3, sensor_1, sensor_10, sensor_18 y sensor_19 siempre se mantiene constante, por lo que no aporta ninguna información

In [438]:
# Si se usa FD003, sensor 10 si es importante
train_df.drop(['op_setting_3', 'sensor_1', 'sensor_10', 'sensor_18', 'sensor_19'], axis=1, inplace=True)
units = train_df['unit'].nunique()

In [439]:
train_df = train_df.groupby('unit').apply(lambda x: x.head(int(len(x) * 0.4)))
train_df.reset_index(drop=True, inplace=True)

  train_df = train_df.groupby('unit').apply(lambda x: x.head(int(len(x) * 0.4)))


In [None]:
# Seleccionamos las características para el modelo (unit y cycle no afectan)
df_st = train_df[['op_setting_1', 'op_setting_2'] + [f'sensor_{i}' for i in range(2, 10)] + [f'sensor_{i}' for i in range(11, 18)] + [f'sensor_{i}' for i in range(20, 22)]]

# Normalizar los datos
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_st)
y_good = [1] * len(df_scaled)
num_anomalies = int(len(df_scaled) * 0.1)
y_good[:num_anomalies] = [0] * num_anomalies

# Uso SMOTE para aumentar datos
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_aug, y_resampled = smote.fit_resample(df_scaled, y_good)

print(f"Datos antes del aumento: {len(df_scaled)}")
print(f"Datos después del aumento: {len(X_aug)}")

df_aug = pd.DataFrame(X_aug, columns=df_st.columns)
df_aug['anomaly'] = y_resampled
df_aug.drop(['anomaly'], axis=1, inplace=True)
df_aug.describe(include='all')


Datos antes del aumento: 8216
Datos después del aumento: 14790


Unnamed: 0,op_setting_1,op_setting_2,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_20,sensor_21
count,14790.0,14790.0,14790.0,14790.0,14790.0,14790.0,14790.0,14790.0,14790.0,14790.0,14790.0,14790.0,14790.0,14790.0,14790.0,14790.0,14790.0,14790.0,14790.0
mean,0.005182,0.010762,-0.147247,-0.12872,-0.194806,-3.552714e-15,-0.06946,0.175783,-0.168201,0.006729,-0.212399,0.198717,-0.165608,0.043055,-0.138064,-1.0408340000000001e-17,-0.161127,0.151281,0.147579
std,0.960527,0.966876,0.97221,0.943853,0.982209,0.0,1.15372,0.987713,1.006276,1.092008,0.990535,0.998883,1.032234,1.107387,0.951988,0.0,0.96909,0.956903,0.95435
min,-4.018562,-2.034039,-3.201655,-3.466857,-3.539543,-3.552714e-15,-5.033223,-3.348777,-2.967798,-2.966884,-2.972259,-3.385003,-3.090141,-2.933661,-3.580933,-1.0408340000000001e-17,-3.799446,-3.867522,-3.936486
25%,-0.640337,-0.679703,-0.84269,-0.776496,-0.937396,-3.552714e-15,0.19868,-0.534875,-0.939165,-0.780037,-0.989962,-0.52964,-0.9506,-0.702589,-0.783747,-1.0408340000000001e-17,-1.059172,-0.48491,-0.527868
50%,0.053819,0.000867,-0.158412,-0.142209,-0.290931,-3.552714e-15,0.19868,0.285107,-0.303783,-0.007674,-0.3376,0.314857,-0.312306,0.086616,-0.19975,-1.0408340000000001e-17,-0.29466,0.206987,0.216802
75%,0.655421,0.78557,0.524438,0.506587,0.509282,-3.552714e-15,0.19868,0.908305,0.536204,0.887561,0.521528,0.955678,0.545762,0.927408,0.47101,-1.0408340000000001e-17,0.581536,0.831853,0.840368
max,3.524598,2.035773,3.74121,3.670101,3.321401,-3.552714e-15,0.19868,3.468775,3.302522,2.810449,2.869811,3.029428,3.272689,2.820024,3.668922,-1.0408340000000001e-17,3.210125,3.897109,3.371616


In [441]:
df_aug.head()

Unnamed: 0,op_setting_1,op_setting_2,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_20,sensor_21
0,-0.316397,-1.355737,-1.566463,0.530011,-0.447737,-3.552714e-15,0.19868,0.735323,-0.017059,-1.266683,0.578803,-0.429136,-0.726804,0.087869,-0.021845,-1.0408340000000001e-17,-0.29466,1.05264,0.828892
1,0.886806,-1.016586,-0.681851,0.984103,-0.019771,-3.552714e-15,0.19868,-0.245504,-0.385902,-1.520159,0.693353,0.817555,0.182171,-0.805311,0.440912,-1.0408340000000001e-17,-0.29466,0.591375,0.88755
2,-1.982371,1.01832,-0.145722,0.163738,0.15883,-3.552714e-15,0.19868,0.574532,0.351783,-0.459626,-0.566701,1.099066,-0.545009,-0.58734,-0.085803,-1.0408340000000001e-17,-2.047053,0.206987,-0.124949
3,0.331481,0.000867,-0.145722,-0.950073,-0.233754,-3.552714e-15,0.19868,0.880035,0.905046,-0.873318,-1.368554,1.983816,0.363966,-0.512178,-1.951879,-1.0408340000000001e-17,-0.29466,-0.331155,0.253782
4,-0.871722,-0.677435,-0.09211,-0.937221,0.499181,-3.552714e-15,0.19868,0.156474,-0.017059,-0.19539,-0.509426,0.636584,-0.363214,-0.515936,0.350618,-1.0408340000000001e-17,0.581536,-0.1774,0.642714


In [None]:
from sklearn.mixture import GaussianMixture

# Crear el modelo GMM
gmm = GaussianMixture(n_components=1, random_state=42)

gmm.fit(df_aug)

y_prob = gmm.predict_proba(df_aug)

# Evaluar el resultado
df_good = pd.DataFrame(df_aug, columns=df_st.columns)
df_good['probabilidad_normal'] = y_prob.max(axis=1)

# Visualizar las probabilidades de normalidad
df_good['anomalía'] = df_good['probabilidad_normal'] < 0.5 

print(df_good[['probabilidad_normal', 'anomalía']].head())

   probabilidad_normal  anomalía
0                  1.0     False
1                  1.0     False
2                  1.0     False
3                  1.0     False
4                  1.0     False
