# Feature Engineering – Maintenance prédictive (Valve)

Objectif :
- Extraire des caractéristiques statistiques par cycle
- Construire un DataFrame final prêt pour le Machine Learning


In [1]:
import numpy as np
import pandas as pd
from pathlib import Path


In [2]:
DATA_DIR = Path("data/raw/condition+monitoring+of+hydraulic+systems")

SENSORS_100HZ = ["PS1","PS2","PS3","PS4","PS5","PS6","EPS1"]
SENSORS_10HZ  = ["FS1","FS2"]
SENSORS_1HZ   = ["TS1","TS2","TS3","TS4","VS1"]

SENSORS = SENSORS_100HZ + SENSORS_10HZ + SENSORS_1HZ


In [3]:
PROFILE_PATH = DATA_DIR / "profile.txt"

profile = pd.read_csv(
    PROFILE_PATH,
    delim_whitespace=True,
    header=None,
    encoding="latin1"
)

profile.shape


  profile = pd.read_csv(


(2205, 5)

In [4]:
profile.head()



Unnamed: 0,0,1,2,3,4
0,3,100,0,130,1
1,3,100,0,130,1
2,3,100,0,130,1
3,3,100,0,130,1
4,3,100,0,130,1


In [5]:
# cible binaire : 1 = valve OK (100%), 0 = défaut
valve_state = (profile.iloc[:, 1] == 100).astype(int)

valve_state.value_counts()


1
1    1125
0    1080
Name: count, dtype: int64

In [6]:
def load_sensor_matrix(sensor_name):
    path = DATA_DIR / f"{sensor_name}.txt"

    df = pd.read_csv(
        path,
        delim_whitespace=True,
        header=None,
        encoding="latin1"
    )

    # conversion explicite en float
    df = df.apply(pd.to_numeric, errors="coerce")

    return df


In [7]:
# Chargement de tous les capteurs dans un dictionnaire
sensor_data = {}

for sensor in SENSORS:
    df = load_sensor_matrix(sensor)
    sensor_data[sensor] = df
    print(f"{sensor} chargé : {df.shape}")


  df = pd.read_csv(
  df = pd.read_csv(


PS1 chargé : (2205, 6000)


  df = pd.read_csv(


PS2 chargé : (2205, 6000)


  df = pd.read_csv(


PS3 chargé : (2205, 6000)


  df = pd.read_csv(


PS4 chargé : (2205, 6000)


  df = pd.read_csv(


PS5 chargé : (2205, 6000)


  df = pd.read_csv(


PS6 chargé : (2205, 6000)


  df = pd.read_csv(


EPS1 chargé : (2205, 6000)


  df = pd.read_csv(


FS1 chargé : (2205, 600)


  df = pd.read_csv(


FS2 chargé : (2205, 600)
TS1 chargé : (2205, 60)
TS2 chargé : (2205, 60)
TS3 chargé : (2205, 60)


  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(


TS4 chargé : (2205, 60)
VS1 chargé : (2205, 60)


  df = pd.read_csv(


In [8]:
def extract_features(signal):
    feats = {}

    feats["mean"] = np.mean(signal)
    feats["std"] = np.std(signal)
    feats["rms"] = np.sqrt(np.mean(signal**2))
    feats["iqr"] = np.percentile(signal, 75) - np.percentile(signal, 25)

    # temporel
    feats["trend"] = np.polyfit(np.arange(len(signal)), signal, 1)[0]
    feats["max_diff"] = np.max(np.abs(np.diff(signal)))
    feats["autocorr"] = np.corrcoef(signal[:-1], signal[1:])[0,1]
    feats["energy"] = np.sum(signal**2)

    # fréquentiel
    fft = np.abs(np.fft.rfft(signal))
    feats["dominant_freq"] = np.argmax(fft)
    feats["spectral_entropy"] = -np.sum((fft/fft.sum()) * np.log(fft/fft.sum() + 1e-12))

    return feats


In [9]:
import numpy as np

def extract_features(signal):
    feats = {}

    # statistiques
    feats["mean"] = np.mean(signal)
    feats["std"] = np.std(signal)
    feats["rms"] = np.sqrt(np.mean(signal**2))
    feats["iqr"] = np.percentile(signal, 75) - np.percentile(signal, 25)

    # temporelles
    feats["trend"] = np.polyfit(np.arange(len(signal)), signal, 1)[0]
    feats["max_diff"] = np.max(np.abs(np.diff(signal)))
    feats["autocorr"] = np.corrcoef(signal[:-1], signal[1:])[0, 1]
    feats["energy"] = np.sum(signal**2)

    # fréquentielles
    fft = np.abs(np.fft.rfft(signal))
    psd = fft / np.sum(fft)

    feats["dominant_freq"] = np.argmax(fft)
    feats["spectral_entropy"] = -np.sum(psd * np.log(psd + 1e-12))

    return feats


In [10]:
dataset_rows = []

n_cycles = sensor_data[SENSORS[0]].shape[0]

for cycle in range(n_cycles):
    row = {}

    for sensor in SENSORS:
        signal = sensor_data[sensor].iloc[cycle].values
        signal = np.nan_to_num(signal)

        feats = extract_features(signal)

        for k, v in feats.items():
            row[f"{sensor}_{k}"] = v

    row["valve_ok"] = valve_state.iloc[cycle]
    dataset_rows.append(row)

dataset = pd.DataFrame(dataset_rows)
dataset.shape


  c /= stddev[:, None]
  psd = fft / np.sum(fft)


(2205, 141)

In [11]:
dataset.head()

Unnamed: 0,PS1_mean,PS1_std,PS1_rms,PS1_iqr,PS1_trend,PS1_max_diff,PS1_autocorr,PS1_energy,PS1_dominant_freq,PS1_spectral_entropy,...,VS1_std,VS1_rms,VS1_iqr,VS1_trend,VS1_max_diff,VS1_autocorr,VS1_energy,VS1_dominant_freq,VS1_spectral_entropy,valve_ok
0,160.673492,13.938147,161.276914,15.12,-0.004043,14.75,0.999791,156061500.0,0,1.834864,...,0.026852,0.577575,0.05,-0.000618,0.038,0.926329,20.015539,0,0.576063,1
1,160.60332,14.117791,161.222636,15.13,-0.004177,2.93,0.999959,155956400.0,0,1.331912,...,0.027013,0.566494,0.0315,-0.000858,0.037,0.929533,19.254955,0,0.569901,1
2,160.34772,14.191436,160.974495,15.28,-0.004211,3.06,0.999956,155476700.0,0,1.348098,...,0.036422,0.577683,0.046,-0.001313,0.035,0.936527,20.023034,0,0.665683,1
3,160.188088,14.226617,160.818594,15.31,-0.004222,2.72,0.999957,155175700.0,0,1.347256,...,0.033184,0.570233,0.04425,-0.000875,0.045,0.927742,19.509942,0,0.594516,1
4,160.000472,14.275244,160.636028,15.3,-0.004249,3.08,0.999955,154823600.0,0,1.361982,...,0.033203,0.578321,0.04425,-0.001028,0.049,0.923011,20.067284,0,0.599769,1


In [12]:
from pathlib import Path

PROCESSED_DIR = Path("data/processed")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

dataset_path = PROCESSED_DIR / "dataset.csv"
dataset.to_csv(dataset_path, index=False)

dataset_path


WindowsPath('data/processed/dataset.csv')