# Implementation: Robust Data Pipeline

**Goal**: Generate synthetic engine sensor data and validate it against an explicit schema (a data contract) before exploring it.

In [None]:
import pandas as pd
import numpy as np
import pandera as pa
from pandera.typing import DataFrame, Series
import logging

# 1. Setup Logging
# Records at INFO and above are emitted as "<timestamp> - <level> - <message>".
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# Module-scoped logger used by the rest of the pipeline.
logger = logging.getLogger(__name__)

# 2. Synthetic Data Generator
def generate_engine_data(
    engine_id: int,
    max_cycles: int,
    rng: "np.random.Generator | None" = None,
) -> pd.DataFrame:
    """Simulate one engine's run-to-failure sensor history.

    Two noisy sensor channels are produced, both driven by a shared
    degradation term that grows exponentially as ``cycle`` approaches
    ``max_cycles``.

    Args:
        engine_id: Identifier written to every row of the ``engine_id`` column.
        max_cycles: Number of cycles (rows) to simulate; cycles are numbered
            ``1..max_cycles``.
        rng: Optional NumPy ``Generator`` used for the sensor noise. When
            omitted, the global ``np.random`` state is used, so existing
            callers (including ones relying on ``np.random.seed``) are
            unaffected.

    Returns:
        A DataFrame with columns ``engine_id``, ``cycle``, ``sensor_temp``,
        ``sensor_vib`` and ``RUL`` (``max_cycles - cycle``, reaching 0 on the
        last row).
    """
    cycles = np.arange(1, max_cycles + 1)

    # Pick the noise source once; both channels draw from it in a fixed order
    # (temperature first, then vibration) so seeded runs are reproducible.
    normal = np.random.normal if rng is None else rng.normal

    # Base sensors (Temperature, Vibration)
    # Degradation signal: exponential rise near failure,
    # from ~exp(0)/100 at the start to exp(5)/100 at the final cycle.
    degradation = np.exp(cycles / max_cycles * 5) / 100

    sensor_1 = 100 + normal(0, 1, max_cycles) + degradation * 20  # Temp
    sensor_2 = 0.5 + normal(0, 0.05, max_cycles) + degradation  # Vibration

    # Target: Remaining Useful Life (RUL)
    rul = max_cycles - cycles

    return pd.DataFrame({
        "engine_id": engine_id,
        "cycle": cycles,
        "sensor_temp": sensor_1,
        "sensor_vib": sensor_2,
        "RUL": rul,
    })

# Simulate a fleet of 10 engines (ids 1 through 10).
# Each engine's lifetime is drawn at random from [50, 200) cycles right
# before its data is generated, so engines fail at different times.
df_list = [
    generate_engine_data(engine_no, np.random.randint(50, 200))
    for engine_no in range(1, 11)
]

# Stack the per-engine frames into one table with a fresh 0..n-1 index.
raw_df = pd.concat(df_list, ignore_index=True)
logger.info(f"Generated raw data: {raw_df.shape}")

# 3. Define Schema (The Contract)
# Newer pandera releases expose the model base class as `DataFrameModel` and
# deprecate the old `SchemaModel` name. Prefer the new name when it exists and
# fall back to the old one so the schema also works on older installations.
try:
    _SchemaBase = pa.DataFrameModel
except AttributeError:
    _SchemaBase = pa.SchemaModel


class SensorSchema(_SchemaBase):
    """Column contract for the generated engine sensor table.

    Each attribute declares one required column, its dtype, and the value
    bounds enforced by ``SensorSchema.validate``.
    """

    engine_id: Series[int] = pa.Field(ge=1)
    cycle: Series[int] = pa.Field(ge=1)
    sensor_temp: Series[float] = pa.Field(ge=50, le=500)  # Temp constraints
    sensor_vib: Series[float] = pa.Field(ge=0, le=10)     # Vibration constraints
    RUL: Series[int] = pa.Field(ge=0)

# 4. Validate
try:
    validated_df = SensorSchema.validate(raw_df)
except pa.errors.SchemaError as e:
    logger.error(f"Data Validation FAILED ❌: {e}")
    # Stop here: without a successful validation `validated_df` is either
    # undefined (NameError below) or left over from an earlier notebook run,
    # so the EDA must not continue on unvalidated data.
    raise
else:
    logger.info("Data Schema Validation PASSED ✅")

# 5. Quick EDA
# Only reached when validation succeeded.
print(validated_df.head())
print(validated_df.describe())