# 1. CONFIGURACIÓN INICIAL Y LIBRERÍAS

In [1]:
# ===================================================================
# BLOQUE 1: CONFIGURACIÓN INICIAL Y LIBRERÍAS
# ===================================================================
# Importar librerías esenciales para limpieza
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings

# Configuración para mejor visualización
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Configurar pandas para mejor visualización
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

# Verificar instalación exitosa
print("✅ Librerías importadas exitosamente")
print(f"📊 Pandas versión: {pd.__version__}")
print(f"🔢 NumPy versión: {np.__version__}")
print(f"📈 Matplotlib versión: {plt.matplotlib.__version__}")

✅ Librerías importadas exitosamente
📊 Pandas versión: 2.2.2
🔢 NumPy versión: 2.0.2
📈 Matplotlib versión: 3.10.0


# 2. CARGA DE DATOS

In [2]:
# ===================================================================
# BLOQUE 2: CARGA DE DATOS
# ===================================================================
# Opción 1: Subir archivo desde tu computadora
from google.colab import files

print("📂 Selecciona tu archivo CSV desde tu computadora:")
uploaded = files.upload()

# Obtener el nombre del archivo subido
filename = list(uploaded.keys())[0]
print(f"📄 Archivo detectado: {filename}")

# Cargar el dataset con manejo de errores
try:
    df = pd.read_csv(filename, encoding='utf-8')
except UnicodeDecodeError:
    print("⚠️ Problema de encoding, intentando con latin-1...")
    df = pd.read_csv(filename, encoding='latin-1')

# Información básica de carga
print(f"✅ Dataset cargado exitosamente")
print(f"📏 Dimensiones: {df.shape[0]:,} filas × {df.shape[1]} columnas")
print(f"💾 Memoria usada: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Opción 2: Cargar desde URL (comentado)
# df = pd.read_csv('https://tu-url-del-dataset.csv')

📂 Selecciona tu archivo CSV desde tu computadora:


Saving predictive_maintenance.csv to predictive_maintenance.csv
📄 Archivo detectado: predictive_maintenance.csv
✅ Dataset cargado exitosamente
📏 Dimensiones: 10,000 filas × 10 columnas
💾 Memoria usada: 2.33 MB


# 3. ANÁLISIS EXPLORATORIO DE DATOS (EDA)

In [3]:
# ===================================================================
# BLOQUE 3: ANÁLISIS EXPLORATORIO DE DATOS (EDA)
# ===================================================================
# Información general del dataset
print("="*50)
print("📊 REPORTE DE CALIDAD DE DATOS")
print("="*50)

# 1. INFORMACIÓN BÁSICA
print("\n🔢 INFORMACIÓN BÁSICA:")
print(f"Filas: {df.shape[0]:,}")
print(f"Columnas: {df.shape[1]}")
print(f"Memoria total: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# 2. TIPOS DE DATOS
print("\n📋 TIPOS DE DATOS:")
print(df.dtypes.value_counts())

# 3. ANÁLISIS DE VALORES FALTANTES
print("\n❓ VALORES FALTANTES:")
missing_data = df.isnull().sum()
missing_percent = 100 * missing_data / len(df)
missing_table = pd.DataFrame({
    'Columna': missing_data.index,
    'Faltantes': missing_data.values,
    'Porcentaje': missing_percent.values
}).sort_values('Porcentaje', ascending=False)

# Mostrar solo columnas con valores faltantes
missing_table_filtered = missing_table[missing_table['Faltantes'] > 0]
if len(missing_table_filtered) > 0:
    print(missing_table_filtered.to_string(index=False))
else:
    print("✅ No hay valores faltantes en el dataset")

# 4. DUPLICADOS
print("\n🔄 DUPLICADOS:")
duplicates = df.duplicated().sum()
print(f"Filas duplicadas: {duplicates:,} ({100*duplicates/len(df):.2f}%)")

# 5. ESTADÍSTICAS DESCRIPTIVAS
print("\n📊 ESTADÍSTICAS DESCRIPTIVAS:")
print("\nColumnas numéricas:")
print(df.describe())

print("\nColumnas categóricas:")
print(df.describe(include=['object']))

📊 REPORTE DE CALIDAD DE DATOS

🔢 INFORMACIÓN BÁSICA:
Filas: 10,000
Columnas: 10
Memoria total: 2.33 MB

📋 TIPOS DE DATOS:
int64      4
object     3
float64    3
Name: count, dtype: int64

❓ VALORES FALTANTES:
✅ No hay valores faltantes en el dataset

🔄 DUPLICADOS:
Filas duplicadas: 0 (0.00%)

📊 ESTADÍSTICAS DESCRIPTIVAS:

Columnas numéricas:
               UDI  Air temperature [K]  Process temperature [K]  \
count  10000.00000         10000.000000             10000.000000   
mean    5000.50000           300.004930               310.005560   
std     2886.89568             2.000259                 1.483734   
min        1.00000           295.300000               305.700000   
25%     2500.75000           298.300000               308.800000   
50%     5000.50000           300.100000               310.100000   
75%     7500.25000           301.500000               311.100000   
max    10000.00000           304.500000               313.800000   

       Rotational speed [rpm]   Torque [Nm]

# 4. LIMPIEZA COMPLETA DE DATOS

In [4]:
# ===================================================================
# BLOQUE 4: LIMPIEZA COMPLETA DE DATOS
# ===================================================================
# Crear una copia para mantener los datos originales
df_original = df.copy()
df_clean = df.copy()

print("📋 INICIANDO PROCESO DE LIMPIEZA...")
print(f"📊 Dataset original: {df_clean.shape}")

# 1. ELIMINAR DUPLICADOS
print("\n🔄 ELIMINANDO DUPLICADOS:")
initial_rows = len(df_clean)
df_clean = df_clean.drop_duplicates()
duplicates_removed = initial_rows - len(df_clean)
print(f"   • Duplicados eliminados: {duplicates_removed:,}")
print(f"   • Filas restantes: {len(df_clean):,}")

# 2. LIMPIAR NOMBRES DE COLUMNAS
print("\n📝 ESTANDARIZANDO NOMBRES DE COLUMNAS:")
print("   Nombres originales:", list(df_clean.columns[:5]), "...")

df_clean.columns = (df_clean.columns
                   .str.strip()           # Quitar espacios al inicio/final
                   .str.lower()           # Convertir a minúsculas
                   .str.replace(' ', '_')     # Espacios por guiones bajos
                   .str.replace('[^a-zA-Z0-9_]', '', regex=True)) # Solo alfanuméricos
print("   Nombres estandarizados:", list(df_clean.columns[:5]), "...")

# 3. MANEJO DE VALORES FALTANTES
print("\n🔧 MANEJO DE VALORES FALTANTES:")
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
categorical_cols = df_clean.select_dtypes(include=['object']).columns

for col in numeric_cols:
    missing_count = df_clean[col].isnull().sum()
    if missing_count > 0:
        median_value = df_clean[col].median()
        df_clean[col] = df_clean[col].fillna(median_value)
        print(f"   • {col}: {missing_count:,} valores → mediana ({median_value:.2f})")

for col in categorical_cols:
    missing_count = df_clean[col].isnull().sum()
    if missing_count > 0:
        mode_values = df_clean[col].mode()
        fill_value = mode_values[0] if len(mode_values) > 0 else 'Unknown'
        df_clean[col] = df_clean[col].fillna(fill_value)
        print(f"   • {col}: {missing_count:,} valores → '{fill_value}'")

# 4. DETECTAR Y TRATAR OUTLIERS
print("\n🎯 TRATAMIENTO DE OUTLIERS:")
for col in numeric_cols:
    Q1 = df_clean[col].quantile(0.25)
    Q3 = df_clean[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers_mask = (df_clean[col] < lower_bound) | (df_clean[col] > upper_bound)
    outliers_count = outliers_mask.sum()

    if outliers_count > 0:
        outlier_percentage = (outliers_count / len(df_clean)) * 100
        if outlier_percentage > 5:  # Winsorización si >5% outliers
            df_clean[col] = np.where(df_clean[col] < lower_bound, lower_bound, df_clean[col])
            df_clean[col] = np.where(df_clean[col] > upper_bound, upper_bound, df_clean[col])
            print(f"   • {col}: {outliers_count:,} outliers winsorizados ({outlier_percentage:.1f}%)")
        else:
            print(f"   • {col}: {outliers_count:,} outliers mantenidos ({outlier_percentage:.1f}%)")

print(f"\n✅ LIMPIEZA COMPLETADA")
print(f"📊 Dimensiones finales: {df_clean.shape}")
print(f"❓ Valores faltantes restantes: {df_clean.isnull().sum().sum()}")

📋 INICIANDO PROCESO DE LIMPIEZA...
📊 Dataset original: (10000, 10)

🔄 ELIMINANDO DUPLICADOS:
   • Duplicados eliminados: 0
   • Filas restantes: 10,000

📝 ESTANDARIZANDO NOMBRES DE COLUMNAS:
   Nombres originales: ['UDI', 'Product ID', 'Type', 'Air temperature [K]', 'Process temperature [K]'] ...
   Nombres estandarizados: ['udi', 'product_id', 'type', 'air_temperature_k', 'process_temperature_k'] ...

🔧 MANEJO DE VALORES FALTANTES:

🎯 TRATAMIENTO DE OUTLIERS:
   • rotational_speed_rpm: 418 outliers mantenidos (4.2%)
   • torque_nm: 69 outliers mantenidos (0.7%)
   • target: 339 outliers mantenidos (3.4%)

✅ LIMPIEZA COMPLETADA
📊 Dimensiones finales: (10000, 10)
❓ Valores faltantes restantes: 0


# 5. EXPORTAR DATASET LIMPIO (¡IMPORTANTE PARA ACTIVIDAD 6!)

In [5]:
# ===================================================================
# BLOQUE 5: EXPORTAR DATASET LIMPIO (¡IMPORTANTE PARA ACTIVIDAD 6!)
# ===================================================================
print("💾 GUARDANDO DATASET LIMPIO...")

# Opción 1: Guardar como CSV (recomendado para Power BI)
filename_clean = 'cleaned_predictive_maintenance.csv'
df_clean.to_csv(filename_clean, index=False, encoding='utf-8')
print(f"✅ Dataset limpio guardado como: {filename_clean}")

# Opción 2: Descargar automáticamente a tu computadora
from google.colab import files
files.download(filename_clean)
print(f"📥 Archivo descargado a tu computadora")

# Verificación final
print(f"📊 Archivo final: {df_clean.shape[0]:,} filas × {df_clean.shape[1]} columnas")
print(f"📁 Tamaño: {df_clean.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"🎯 Listo para usar en Actividad 6!")

💾 GUARDANDO DATASET LIMPIO...
✅ Dataset limpio guardado como: cleaned_predictive_maintenance.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

📥 Archivo descargado a tu computadora
📊 Archivo final: 10,000 filas × 10 columnas
📁 Tamaño: 2.33 MB
🎯 Listo para usar en Actividad 6!


# 6. Validación Pre-Análisis

In [6]:
# ===================================================================
# BLOQUE 6: SETUP INICIAL / VALIDACIÓN INICIAL PREDICTIVE MAINTENANCE
# ===================================================================

import pandas as pd
import numpy as np

# Cargar dataset
df = df_clean

print("\u2699\ufe0f PREDICTIVE MAINTENANCE - VALIDACIÓN INICIAL")
print("=" * 50)
print(f"Shape: {df.shape}")

# Información básica Industrial IoT
print(f"\n🏭 OVERVIEW INDUSTRIAL IoT:")
print(f"Total Equipment Units: {df['udi'].nunique():,}")
print(f"Total Product IDs: {df['product_id'].nunique():,}")
print(f"Equipment Types: {df['type'].value_counts().to_dict()}")
print(f"Failure Rate: {df['target'].mean():.3f} ({df['target'].mean()*100:.1f}%)")

# Sensor data ranges
print(f"\n📊 SENSOR DATA RANGES:")
print(f"Air Temperature: {df['air_temperature_k'].min():.1f}K - {df['air_temperature_k'].max():.1f}K")
print(f"Process Temperature: {df['process_temperature_k'].min():.1f}K - {df['process_temperature_k'].max():.1f}K")
print(f"Rotational Speed: {df['rotational_speed_rpm'].min()} - {df['rotational_speed_rpm'].max()} rpm")
print(f"Torque: {df['torque_nm'].min():.1f} - {df['torque_nm'].max():.1f} Nm")
print(f"Tool Wear: {df['tool_wear_min'].min()} - {df['tool_wear_min'].max()} min")

# Verificar calidad de datos
print(f"\n🔍 CALIDAD DE DATOS:")
print(f"Missing values: {df.isnull().sum().sum()}")
print(f"Duplicates: {df.duplicated().sum()}")

# Identificar variables sospechosas para VDRER
print(f"\n🔍 VARIABLES 'SOSPECHOSAS' PARA VDRER:")
print(f"udi: {df['udi'].min()} - {df['udi'].max()} (¿secuencial?)")
print(f"product_id: Ejemplo '{df['product_id'].iloc[0]}'")
print(f"type: {df['type'].unique()} (¿solo quality levels?)")

# Failure types analysis
if 'failure_type' in df.columns:
    print(f"\nFailure Types:")
    print(df['failure_type'].value_counts())

print(f"\n✅ Dataset listo para aplicar Framework VDRER")

⚙️ PREDICTIVE MAINTENANCE - VALIDACIÓN INICIAL
Shape: (10000, 10)

🏭 OVERVIEW INDUSTRIAL IoT:
Total Equipment Units: 10,000
Total Product IDs: 10,000
Equipment Types: {'L': 6000, 'M': 2997, 'H': 1003}
Failure Rate: 0.034 (3.4%)

📊 SENSOR DATA RANGES:
Air Temperature: 295.3K - 304.5K
Process Temperature: 305.7K - 313.8K
Rotational Speed: 1168 - 2886 rpm
Torque: 3.8 - 76.6 Nm
Tool Wear: 0 - 253 min

🔍 CALIDAD DE DATOS:
Missing values: 0
Duplicates: 0

🔍 VARIABLES 'SOSPECHOSAS' PARA VDRER:
udi: 1 - 10000 (¿secuencial?)
product_id: Ejemplo 'M14860'
type: ['M' 'L' 'H'] (¿solo quality levels?)

Failure Types:
failure_type
No Failure                  9652
Heat Dissipation Failure     112
Power Failure                 95
Overstrain Failure            78
Tool Wear Failure             45
Random Failures               18
Name: count, dtype: int64

✅ Dataset listo para aplicar Framework VDRER


# 7. FASE 1: VALIDAR - Verificación de Dataset Limpio -Fases del Framework VDRER

In [7]:
# ====================================================================
# BLOQUE 7: ✅ (SIN CARGAR CSV) 🔍 FASE 1: VALIDAR - Verificación de Dataset Predictive Maintenance
# ====================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')

print("\n" + "="*70)
print("⚙️ ACTIVIDAD 6: PREDICTIVE MAINTENANCE - EXTRACCIÓN DE VALOR VDRER")
print("="*70)

# ===== FASE 1: VALIDAR =====
df = df_clean
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('[\[\]]', '', regex=True)
print(f"✅ Dataset cargado: {df.shape[0]:,} filas × {df.shape[1]} columnas")

# Overview del sistema industrial
print(f"\n🏠 OVERVIEW INDUSTRIAL SYSTEM:")
print(f"🔧 Equipment Units: {df['udi'].nunique():,}")
print(f"📦 Product Types: {df['product_id'].nunique():,}")
print(f"⚡ Equipment Classes: {df['type'].value_counts().to_dict()}")
print(f"💥 Overall Failure Rate: {df['target'].mean():.3f} ({df['target'].mean()*100:.1f}%)")

# Sensor data summary
print(f"\n📊 SENSOR DATA SUMMARY:")
sensor_cols = ['air_temperature_k', 'process_temperature_k',
               'rotational_speed_rpm', 'torque_nm', 'tool_wear_min']
for col in sensor_cols:
    print(f"   {col}: {df[col].min():.1f} - {df[col].max():.1f} (mean: {df[col].mean():.1f})")


⚙️ ACTIVIDAD 6: PREDICTIVE MAINTENANCE - EXTRACCIÓN DE VALOR VDRER
✅ Dataset cargado: 10,000 filas × 10 columnas

🏠 OVERVIEW INDUSTRIAL SYSTEM:
🔧 Equipment Units: 10,000
📦 Product Types: 10,000
⚡ Equipment Classes: {'L': 6000, 'M': 2997, 'H': 1003}
💥 Overall Failure Rate: 0.034 (3.4%)

📊 SENSOR DATA SUMMARY:
   air_temperature_k: 295.3 - 304.5 (mean: 300.0)
   process_temperature_k: 305.7 - 313.8 (mean: 310.0)
   rotational_speed_rpm: 1168.0 - 2886.0 (mean: 1538.8)
   torque_nm: 3.8 - 76.6 (mean: 40.0)
   tool_wear_min: 0.0 - 253.0 (mean: 108.0)


# 8. FASE 2: DESCUBRIR - Identificar Variables Aparentemente "Inútiles" -Fases del Framework VDRER

In [8]:
# ====================================================================
# BLOQUE 8: 🔍 FASE 2: DESCUBRIR - Identificar Variables Industriales "Inútiles"
# ====================================================================

print(f"\n🔍 FASE 2: DESCUBRIR - Variables 'inútiles' industriales")

insights_found = []

# UDI patterns analysis
print(f"\n📊 ANÁLISIS UDI:")
print(f"Rango: {df['udi'].min()} - {df['udi'].max()}")
udi_gaps = df['udi'].diff().describe()
print(f"Gaps en secuencia UDI: mean={udi_gaps['mean']:.1f}, std={udi_gaps['std']:.1f}")

# Product ID structure analysis
print(f"\n📊 ANÁLISIS PRODUCT ID:")
print(f"Formato típico: {df['product_id'].iloc[0]}")
product_prefixes = df['product_id'].str[0].value_counts()
print(f"Prefijos encontrados: {product_prefixes.to_dict()}")


🔍 FASE 2: DESCUBRIR - Variables 'inútiles' industriales

📊 ANÁLISIS UDI:
Rango: 1 - 10000
Gaps en secuencia UDI: mean=1.0, std=0.0

📊 ANÁLISIS PRODUCT ID:
Formato típico: M14860
Prefijos encontrados: {'L': 6000, 'M': 2997, 'H': 1003}


# 9. FASE 3: REUTILIZAR - Extraer Valor de Variables "Inútiles" -Fases del Framework VDRER

In [9]:
# ====================================================================
# BLOQUE 9: ♻️ FASE 3: REUTILIZAR - Extraer Valor de Códigos Industriales "Inútiles"
# ====================================================================

print(f"\n♻️ FASE 3: REUTILIZAR - Extrayendo valor de códigos industriales")

# Equipment installation cohorts
df['installation_cohort'] = pd.qcut(df['udi'], q=5, labels=['Gen1', 'Gen2', 'Gen3', 'Gen4', 'Gen5'])

# Product family
df['product_family'] = df['product_id'].str[0]

# Equipment age proxy
df['equipment_age_proxy'] = (df['udi'] - df['udi'].min()) / (df['udi'].max() - df['udi'].min())

# Failure risk by type
failure_by_type = df.groupby('type')['target'].agg(['mean', 'count'])
print(f"\n⚠️ FAILURE RATE BY EQUIPMENT TYPE:")
for eq_type in failure_by_type.index:
    failure_rate = failure_by_type.loc[eq_type, 'mean']
    count = failure_by_type.loc[eq_type, 'count']
    print(f"   Type {eq_type}: {failure_rate:.3f} failure rate ({count:,} units)")

insights_found.extend([
    "UDI revela equipment installation cohorts con diferentes reliability",
    "Product ID estructura indica product families con failure patterns",
    "Type classification correlaciona con failure rates específicos"
])


♻️ FASE 3: REUTILIZAR - Extrayendo valor de códigos industriales

⚠️ FAILURE RATE BY EQUIPMENT TYPE:
   Type H: 0.021 failure rate (1,003 units)
   Type L: 0.039 failure rate (6,000 units)
   Type M: 0.028 failure rate (2,997 units)


# 10. FASE 4: ENGINEER - Crear Features Derivadas con Valor -Fases del Framework VDRER

In [10]:
# ================================================================
# BLOQUE 10: ⚙️ FASE 4: ENGINEER - Crear Features Derivadas de Reliability
# ================================================================

print(f"\n🔧 FASE 4: ENGINEER - Creando features de reliability")

new_features_created = []

# Renombrar columnas para cumplir con convención: minúsculas, sin espacios, con guiones bajos
column_rename_map = {
    'Air temperature [K]': 'air_temperature_k',
    'Process temperature [K]': 'process_temperature_k',
    'Rotational speed [rpm]': 'rotational_speed_rpm',
    'Torque [Nm]': 'torque_nm',
    'Tool wear [min]': 'tool_wear_min',
    'Product ID': 'product_id',
    'Type': 'type',
    'Target': 'target'
}
df.rename(columns=column_rename_map, inplace=True)

# Equipment Reliability Index
df['reliability_index'] = (
    (1 - df['target']) * 0.4 +
    (1 - df['equipment_age_proxy']) * 0.2 +
    (df['tool_wear_min'] / df['tool_wear_min'].max()) * -0.2 +
    (df['type'].map({'L': 0.6, 'M': 0.8, 'H': 1.0})) * 0.2
)

# Maintenance Efficiency Score
df['operating_stress'] = (
    (df['air_temperature_k'] - df['air_temperature_k'].mean()).abs() / df['air_temperature_k'].std() +
    (df['process_temperature_k'] - df['process_temperature_k'].mean()).abs() / df['process_temperature_k'].std() +
    (df['torque_nm'] - df['torque_nm'].mean()).abs() / df['torque_nm'].std()
) / 3

df['maintenance_efficiency'] = np.where(
    df['target'] == 0,
    1 / (df['operating_stress'] + 0.1),
    0.1
)

# Failure Risk Assessment
scaler = StandardScaler()
risk_features = [
    'air_temperature_k', 'process_temperature_k',
    'rotational_speed_rpm', 'torque_nm', 'tool_wear_min'
]
risk_scaled = scaler.fit_transform(df[risk_features])
df['failure_risk_score'] = np.sqrt(np.sum(risk_scaled**2, axis=1))

# Equipment Lifecycle Stage
df['lifecycle_stage'] = pd.cut(df['equipment_age_proxy'],
    bins=[0, 0.2, 0.5, 0.8, 1.0],
    labels=['new_install', 'operating', 'mature', 'end_of_life']
)

# Equipment Criticality Assessment
df['criticality_score'] = (
    df['failure_risk_score'] * 0.4 +
    (df['type'].map({'L': 1, 'M': 2, 'H': 3})) * 0.3 +
    df['operating_stress'] * 0.3
)

df['criticality_level'] = pd.cut(df['criticality_score'],
    bins=[0, 1, 2, 3, float('inf')],
    labels=['low', 'medium', 'high', 'critical']
)

new_features_created.extend([
    'reliability_index', 'maintenance_efficiency', 'failure_risk_score',
    'lifecycle_stage', 'criticality_level', 'operating_stress'
])

print(f"✅ Features industriales creadas: {len(new_features_created)}")

# Performance analysis por cohort
cohort_analysis = df.groupby('installation_cohort').agg({
    'target': 'mean',
    'reliability_index': 'mean',
    'maintenance_efficiency': 'mean'
}).round(3)

print(f"\n📈 PERFORMANCE BY INSTALLATION COHORT:")
for cohort in cohort_analysis.index:
    failure_rate = cohort_analysis.loc[cohort, 'target']
    reliability = cohort_analysis.loc[cohort, 'reliability_index']
    efficiency = cohort_analysis.loc[cohort, 'maintenance_efficiency']
    print(f"   {cohort}: Failure={failure_rate:.3f}, Reliability={reliability:.3f}, Efficiency={efficiency:.3f}")


🔧 FASE 4: ENGINEER - Creando features de reliability
✅ Features industriales creadas: 6

📈 PERFORMANCE BY INSTALLATION COHORT:
   Gen1: Failure=0.026, Reliability=0.624, Efficiency=1.000
   Gen2: Failure=0.024, Reliability=0.585, Efficiency=1.530
   Gen3: Failure=0.077, Reliability=0.521, Efficiency=0.913
   Gen4: Failure=0.022, Reliability=0.506, Efficiency=1.680
   Gen5: Failure=0.020, Reliability=0.469, Efficiency=1.309


# 11. FASE 5: REPORTAR - Documentar Insights y Exportar Dataset -Fases del Framework VDRER

In [11]:
# ================================================================
# BLOQUE 11: 📋 FASE 5: REPORTAR - Documentar Insights Predictive Maintenance y Exportar Dataset
# ================================================================

print(f"\n📋 FASE 5: REPORTAR - Exportando dataset industrial enriquecido")

print(f"\n📊 REPORTE FINAL INDUSTRIAL:")
print(f"   💎 Insights de reliability extraídos: {len(insights_found)}")
print(f"   ⚙️ Features de maintenance creadas: {len(new_features_created)}")
print(f"   📈 Dimensiones finales: {df.shape}")

# Equipment criticality distribution
criticality_dist = df['criticality_level'].value_counts()
print(f"\n🚨 EQUIPMENT CRITICALITY DISTRIBUTION:")
for level in ['Low', 'Medium', 'High', 'Critical']:
    if level in criticality_dist.index:
        count = criticality_dist[level]
        pct = count / len(df) * 100
        print(f"   {level}: {count:,} equipos ({pct:.1f}%)")

# Lifecycle stage analysis
lifecycle_dist = df['lifecycle_stage'].value_counts()
print(f"\n📅 EQUIPMENT LIFECYCLE DISTRIBUTION:")
for stage in lifecycle_dist.index:
    count = lifecycle_dist[stage]
    pct = count / len(df) * 100
    print(f"   {stage}: {count:,} equipos ({pct:.1f}%)")

# Export enhanced dataset
try:
    filename = 'predictive_maintenance_value_extracted_python.csv'
    df.to_csv(filename, index=False, encoding='utf-8')
    print(f"✅ Guardado como: {filename}")

    # Download if in Colab
    try:
        from google.colab import files
        files.download(filename)
        print(f"📥 ¡Archivo descargado automáticamente!")
    except:
        print(f"📝 Archivo listo para descargar manualmente")

except Exception as e:
    print(f"⚠️ Error en exportación: {e}")

print("\n" + "="*70)
print("🎉 PREDICTIVE MAINTENANCE VDRER COMPLETADO")
print("🚀 ¡Dataset listo para Industrial IoT Dashboard en Power BI!")
print("="*70)

print(f"\n📚 INDUSTRIAL INSIGHTS SUMMARY:")
print(f"✅ Has aplicado VDRER a datos de sensores industriales")
print(f"✅ Transformaste IDs 'administrativos' en equipment intelligence")
print(f"✅ Creaste reliability index, maintenance efficiency, failure prediction")
print(f"✅ Preparaste base para predictive maintenance analytics")
print(f"\n💡 ¡Has convertido códigos industriales en oro de maintenance intelligence!")


📋 FASE 5: REPORTAR - Exportando dataset industrial enriquecido

📊 REPORTE FINAL INDUSTRIAL:
   💎 Insights de reliability extraídos: 3
   ⚙️ Features de maintenance creadas: 6
   📈 Dimensiones finales: (10000, 20)

🚨 EQUIPMENT CRITICALITY DISTRIBUTION:

📅 EQUIPMENT LIFECYCLE DISTRIBUTION:
   operating: 3,000 equipos (30.0%)
   mature: 3,000 equipos (30.0%)
   end_of_life: 2,000 equipos (20.0%)
   new_install: 1,999 equipos (20.0%)
✅ Guardado como: predictive_maintenance_value_extracted_python.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

📥 ¡Archivo descargado automáticamente!

🎉 PREDICTIVE MAINTENANCE VDRER COMPLETADO
🚀 ¡Dataset listo para Industrial IoT Dashboard en Power BI!

📚 INDUSTRIAL INSIGHTS SUMMARY:
✅ Has aplicado VDRER a datos de sensores industriales
✅ Transformaste IDs 'administrativos' en equipment intelligence
✅ Creaste reliability index, maintenance efficiency, failure prediction
✅ Preparaste base para predictive maintenance analytics

💡 ¡Has convertido códigos industriales en oro de maintenance intelligence!
