In [1]:
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Parámetros

In [None]:
RUL_THRESHOLD = 0.5 # NOTE(review): not referenced in the visible cells; presumably a cutoff applied to RUL — confirm meaning and units

## Datos

In [14]:
# Directory expected to hold the C-MAPSS text files (relative to the working directory).
data_dir = "./CMAPSSData"
print("data_dir:", data_dir)
# Stop here with a clear message if the path is missing, before file discovery runs.
assert os.path.exists(data_dir), f"data_dir not found: {data_dir}"

# Column labels applied to the headerless C-MAPSS text files: 'unit' and
# 'cycle', three operational settings, then 21 sensor channels (26 in total).
COL_NAMES = [
    'unit',
    'cycle',
    *(f'op_setting_{n}' for n in range(1, 4)),
    *(f'sensor_{n}' for n in range(1, 22)),
]

def discover_files(data_dir):
    """
    Map each FD subset id found in ``data_dir`` to the files that belong to it.

    Matching is done on the upper-cased base name, so it is case-insensitive
    and tolerant to naming variations such as 'train_FD001.txt' or
    'RUL_FD001.txt'.

    Parameters
    ----------
    data_dir : str
        Directory whose direct children are scanned (no recursion).

    Returns
    -------
    dict
        ``{'FD001': {'train': path, 'test': path, 'rul': path}, ...}``.
        An FD id is present as soon as one file name contains it; each of the
        'train' / 'test' / 'rul' keys is present only if a matching file was
        found. When several files match the same role, the one that sorts last
        by path wins.
    """
    mapping = {}
    # glob returns entries in filesystem-dependent order; sorting makes the
    # "last match wins" rule reproducible across machines and runs.
    for path in sorted(glob.glob(os.path.join(data_dir, "*"))):
        # Only regular files can be loaded later; skip sub-directories whose
        # names happen to contain an FD id.
        if not os.path.isfile(path):
            continue
        name = os.path.basename(path).upper()
        for fd in ('FD001', 'FD002', 'FD003', 'FD004'):
            if fd not in name:
                continue
            entry = mapping.setdefault(fd, {})
            if 'TRAIN' in name:
                entry['train'] = path
            if 'TEST' in name:
                entry['test'] = path
            if 'RUL' in name:
                entry['rul'] = path
    return mapping

def load_cmapss_pair(train_path, test_path, rul_path=None):
    """Load the train and test tables of a single FD subset.

    Both files are parsed as headerless, whitespace-delimited text with
    COL_NAMES as column labels, and the 'unit' and 'cycle' columns are cast
    to int.

    Parameters
    ----------
    train_path, test_path : str
        Paths of the training and test files.
    rul_path : str, optional
        Accepted so callers can pass the RUL file, but it is not read here:
        this function does not compute or attach any RUL column.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``, each with a fresh 0..n-1 index.
    """
    def _read(path):
        # Raw string: '\s' inside a normal literal is an invalid escape
        # sequence and triggers a SyntaxWarning on recent Python versions.
        frame = pd.read_csv(path, sep=r'\s+', header=None, names=COL_NAMES)
        frame[['unit', 'cycle']] = frame[['unit', 'cycle']].astype(int)
        return frame.reset_index(drop=True)

    return _read(train_path), _read(test_path)

# Discover which FD subsets are present and which file roles each one has.
mapping = discover_files(data_dir)
print("Discovered dataset keys and file types:")
for k,v in mapping.items():
    print(k, v.keys())

# --- Choose dataset FDID to run (change to FD002/FD003/FD004 as needed) ---
FDID = 'FD001'   # <------ change here if you want FD002/FD003/FD004

# If the chosen FDID doesn't have train+test, fall back to the first FD that has both.
if FDID not in mapping or 'train' not in mapping[FDID] or 'test' not in mapping[FDID]:
    print(f"Requested {FDID} is missing train/test. Searching for first FD with both train and test...")
    chosen = next(
        (fd for fd, files in mapping.items() if 'train' in files and 'test' in files),
        None,
    )
    if chosen is None:
        raise FileNotFoundError(f"No FD dataset with both train and test found in {data_dir}. Mapping: {mapping}")
    print(f"Switching to available dataset: {chosen}")
    FDID = chosen

train_path = mapping[FDID]['train']
test_path  = mapping[FDID]['test']
# The RUL file is optional, so use .get(): None is passed when it was not found.
# Defining it here keeps the cell runnable on a fresh kernel (it was previously
# referenced without ever being assigned).
rul_path   = mapping[FDID].get('rul')

print("Using:", train_path, test_path)
train_df, test_df = load_cmapss_pair(train_path, test_path, rul_path)
print(f"Loaded {FDID}: train rows={len(train_df)} (units={train_df['unit'].nunique()}), test rows={len(test_df)} (units={test_df['unit'].nunique()})")

data_dir: ./CMAPSSData
Discovered dataset keys and file types:
FD001 dict_keys(['train', 'rul', 'test'])
FD003 dict_keys(['train', 'rul', 'test'])
FD004 dict_keys(['rul', 'train', 'test'])
FD002 dict_keys(['train', 'rul', 'test'])
Using: ./CMAPSSData/train_FD001.txt ./CMAPSSData/test_FD001.txt
Loaded FD001: train rows=20631 (units=100), test rows=13096 (units=100)


In [15]:
# Show the first rows of the training frame.
train_df.head()

Unnamed: 0,unit,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [16]:
# Print column dtypes and non-null counts for the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20631 entries, 0 to 20630
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   unit          20631 non-null  int64  
 1   cycle         20631 non-null  int64  
 2   op_setting_1  20631 non-null  float64
 3   op_setting_2  20631 non-null  float64
 4   op_setting_3  20631 non-null  float64
 5   sensor_1      20631 non-null  float64
 6   sensor_2      20631 non-null  float64
 7   sensor_3      20631 non-null  float64
 8   sensor_4      20631 non-null  float64
 9   sensor_5      20631 non-null  float64
 10  sensor_6      20631 non-null  float64
 11  sensor_7      20631 non-null  float64
 12  sensor_8      20631 non-null  float64
 13  sensor_9      20631 non-null  float64
 14  sensor_10     20631 non-null  float64
 15  sensor_11     20631 non-null  float64
 16  sensor_12     20631 non-null  float64
 17  sensor_13     20631 non-null  float64
 18  sensor_14     20631 non-nu

In [19]:
# Summary statistics for every column of the training frame.
train_df.describe(include='all')

Unnamed: 0,unit,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
count,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,...,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0
mean,51.506568,108.807862,-9e-06,2e-06,100.0,518.67,642.680934,1590.523119,1408.933782,14.62,...,521.41347,2388.096152,8143.752722,8.442146,0.03,393.210654,2388.0,100.0,38.816271,23.289705
std,29.227633,68.88099,0.002187,0.000293,0.0,0.0,0.500053,6.13115,9.000605,1.7764e-15,...,0.737553,0.071919,19.076176,0.037505,1.3878120000000003e-17,1.548763,0.0,0.0,0.180746,0.108251
min,1.0,1.0,-0.0087,-0.0006,100.0,518.67,641.21,1571.04,1382.25,14.62,...,518.69,2387.88,8099.94,8.3249,0.03,388.0,2388.0,100.0,38.14,22.8942
25%,26.0,52.0,-0.0015,-0.0002,100.0,518.67,642.325,1586.26,1402.36,14.62,...,520.96,2388.04,8133.245,8.4149,0.03,392.0,2388.0,100.0,38.7,23.2218
50%,52.0,104.0,0.0,0.0,100.0,518.67,642.64,1590.1,1408.04,14.62,...,521.48,2388.09,8140.54,8.4389,0.03,393.0,2388.0,100.0,38.83,23.2979
75%,77.0,156.0,0.0015,0.0003,100.0,518.67,643.0,1594.38,1414.555,14.62,...,521.95,2388.14,8148.31,8.4656,0.03,394.0,2388.0,100.0,38.95,23.3668
max,100.0,362.0,0.0087,0.0006,100.0,518.67,644.53,1616.91,1441.49,14.62,...,523.38,2388.56,8293.72,8.5848,0.03,400.0,2388.0,100.0,39.43,23.6184


Se puede notar que `op_setting_3` se mantiene constante (mín = máx = 100.0, desviación estándar 0), por lo que no aporta ninguna información y se elimina del conjunto de entrenamiento.

In [25]:
# Drop op_setting_3 from the training frame.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell
# idempotent: re-running it no longer raises KeyError once the column is gone.
train_df = train_df.drop(columns=['op_setting_3'], errors='ignore')

In [28]:
# Count of distinct values in the 'unit' column of the training frame.
units = train_df['unit'].nunique()