In [94]:
import unittest
import pandas as pd
import os
from glob import glob
from concurrent.futures import ThreadPoolExecutor
from typing import List, Dict
from time import perf_counter
import traceback
import plotly.express as px

# .env
# variables de entorno encontrar la ruta de los datos
DATA_DIR = os.path.abspath(os.path.join("..", "data-set"))

COLUMN_MAPPING = {
    "sensor": [
        'ShiftDate', 'Shift', 'TimeStamp', 'RecordDuration', 'Equipment',
        'TruckFleet', 'FuelLevel', 'FuelLevelLiters', 'FuelGauge', 'Speed',
        'RPM', 'Ralenti', 'Latitude', 'Longitude', 'Elevation'
    ],
    "time_model": [
        'ShiftDate', 'Shift', 'TimeStamp', 'RecordDuration', 'Equipment',
        'TruckFleet', 'Status', 'Category', 'Event'
    ],
    "cycle": [
        'travel_empty_seconds', 'haul_loaded_seconds', 'waiting_loader_seconds',
        'spotting_time_seconds', 'loading_time_seconds', 'waiting_dump_seconds',
        'reversing_seconds', 'tipping_time_seconds', 'total_travel_time_seconds',
        'operational_time_seconds', 'total_cycle_time_seconds', 'loaded_distance_m',
        'empty_distance_m', 'equivalent_distance_m', 'measured_tonnage',
        'reported_tonnage', 'loading_easting', 'loading_northing',
        'loading_elevation_cm', 'loading_slope_percent', 'dump_easting',
        'dump_northing', 'dump_elevation_cm', 'dump_slope_percent',
        'loading_timestamp', 'dumping_timestamp', 'ShiftDate', 'shift_type',
        'truck_name', 'truck_model', 'fleet_name', 'loading_face', 'material_type',
        'dump_type', 'dump_location', 'TimeStamp'
    ]
}

# config.py
# pip install python-dotenv
""""
implementar en un futuro
import os
import json
from dotenv import load_dotenv

load_dotenv()

DATA_DIR = os.getenv('DATA_DIR', '/ruta/por/defecto')
COLUMN_MAPPING = json.loads(os.getenv('COLUMN_MAPPING', '{"sensor":[], "time_model":[], "cycle":[]}'))
"""
# clase para cargar los datos
# from config import DATA_DIR, COLUMN_MAPPING
class DataLoader:
    def __init__(self, truck: str):

        self.base_dir = DATA_DIR
        self.truck = truck.upper()
        self._validate_truck_exists() 

    # validar que el camion exista
    def _validate_truck_exists(self):
        """Valida que existan archivos para el camión especificado"""
        pattern = os.path.join(self.base_dir, "train_data_*", "test_*", f"{self.truck}_*.csv")
        files = glob(pattern)
        if not files:
            raise ValueError(f"No se encontraron archivos para el camión {self.truck}")

    def _generate_file_patterns(self, truck: str, data_type: str) -> list:
        """Genera patrones de búsqueda para tipo de conjunto y año"""
        return [
            os.path.join(
                self.base_dir,
                f"train_data_{data_type}",
                f"test_*",
                f"{self.truck}_*.csv"
            )
        ]
    
    def _find_matching_files(self, patterns: list) -> list:
        """Encuentra archivos que coincidan con los patrones"""
        return [file for pattern in patterns for file in glob(pattern)]
    
    def _load_single_file(self, file_path: str, data_type: str) -> pd.DataFrame:
        """Carga y procesa un solo archivo CSV"""
        try:
            # Detectar separador
            sep = self._detect_separator(file_path)
            
            # Cargar datos
            df = pd.read_csv(
                file_path,
                sep=sep,
                header=None,
                names=COLUMN_MAPPING[data_type],
                encoding='utf-8-sig',
                on_bad_lines='warn'
            )
            
            # Procesar fechas
            date_cols = ['ShiftDate', 'TimeStamp']
            for col in date_cols:
                if col in df.columns:
                    df[col] = self._parse_date_column(df[col])
            
            return df

        except Exception as e:
            print(f"Error en {file_path}: {str(e)}")
            return pd.DataFrame()

    def _detect_separator(self, file_path: str) -> str:
        """Detecta el separador leyendo la primera línea"""
        separadores = [';', '\t', ',']
        try:
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                primera_linea = f.readline().strip()
            return next((s for s in separadores if s in primera_linea), ',')
        except:
            return ','

    def _parse_date_column(self, col: pd.Series) -> pd.Series:
        """Convierte a datetime y maneja zonas horarias"""
        try:
            # Forzar UTC si no tiene zona horaria
            return pd.to_datetime(col, errors='coerce', utc=True).dt.tz_convert(None)
        except Exception as e:
            print(f"Error parseando fechas: {str(e)}")
            return col  # Devuelve original para debug 
        
    def load_data(self) -> Dict[str, pd.DataFrame]:
        """Carga todos los tipos de datos en DataFrames separados"""
        
        datasets = {}
        
        # Iterar sobre cada tipo de datos definido en COLUMN_MAPPING (sensor, time_model, cycle)
        for data_type in COLUMN_MAPPING:
            try:
                patterns = self._generate_file_patterns(self.truck, data_type)
                files = self._find_matching_files(patterns)
                
                if not files:
                    print(f"Advertencia: No se encontraron archivos de {data_type}")
                    datasets[data_type] = pd.DataFrame()
                    continue
                
                with ThreadPoolExecutor() as executor:
                    dfs = list(executor.map(lambda f: self._load_single_file(f, data_type), files))
                
                combined_df = pd.concat(dfs, ignore_index=True)
                datasets[data_type] = combined_df.sort_values('TimeStamp') if 'TimeStamp' in combined_df else combined_df

                ## feedback para ver si se cargaron los datos correctamente
                print(f"feedback {data_type.upper()}: Cargados {len(combined_df)} registros")
            
            except KeyError:
                print(f"Columnas no definidas para {data_type}")
                datasets[data_type] = pd.DataFrame()
        
        return datasets


In [98]:
class ETLDataProcessor:
    def __init__(self, truck: str):
        self.processed = {}
        self.column_mapping = COLUMN_MAPPING  
        self.raw_datasets = DataLoader(truck).load_data()
        
        # 2. Validación de entrada
        if not isinstance(self.raw_datasets, dict):
            raise TypeError("Se esperaba un diccionario de DataFrames")
            
    def _clean_data(self):
        """Limpieza robusta respetando las reglas especificadas"""
        for dtype, df in self.raw_datasets.items():
            if df.empty:
                self.processed[dtype] = df
                continue

            # 1. Validar columnas obligatorias
            required_cols = ['ShiftDate', 'TimeStamp']
            missing = [col for col in required_cols if col not in df.columns]
            if missing:
                raise ValueError(f"Columnas críticas faltantes en {dtype}: {missing}")

            # 2. Eliminar filas con ShiftDate o TimeStamp nulos (ambos deben tener valor)
            df = df.dropna(subset=required_cols, how='any')

            # 3. Limpieza específica por tipo de dataset
            if dtype == 'sensor':
                # Eliminar filas donde todos los campos numéricos son nulos
                df = df.dropna(subset=['Speed', 'RPM', 'FuelLevel'], how='all')
                
            elif dtype == 'time_model':
                # Eliminar filas sin Status
                df = df.dropna(subset=['Status'])

            self.processed[dtype] = df

    def _transform_columns(self):
        """Transformaciones de formato y tipos"""
        for dtype, df in self.processed.items():
            if df.empty:
                continue
                
            # Convertir fechas a formatos correctos
            df['ShiftDate'] = pd.to_datetime(df['ShiftDate'])
            df['TimeStamp'] = pd.to_datetime(df['TimeStamp']).dt.tz_localize(None)

            # columnas agregadas
            df.insert(loc=1, column="ShiftYear", value=df['ShiftDate'].dt.year)
            df.insert(loc=2, column="ShiftMonth", value=df['ShiftDate'].dt.month)
            df.insert(loc=3, column="ShiftDay", value=df['ShiftDate'].dt.day)

            # modificar la columna a solo hora minuto y segundo
            df['TimeStamp'] = pd.to_datetime(df['TimeStamp']).dt.time
            
            # agregar metadatos
            df.insert(loc=len(df.columns), column="data_type", value=dtype)

            self.processed[dtype] = df
                
    def run_etl(self) -> dict:
        self._clean_data()
        self._transform_columns()
        return self.processed
        

In [100]:
from typing import Dict, Union
import pandas as pd

class DataValidator:
    def __init__(self, raw_data: Dict[str, pd.DataFrame], processed_data: Dict[str, pd.DataFrame]):
        """
        Args:
            raw_data: Diccionario con datasets crudos
            processed_data: Diccionario con datasets procesados
        """
        self.raw_data = self._validate_input(raw_data, "raw")
        self.processed_data = self._validate_input(processed_data, "processed")
        self.truck_id = self._detect_truck_id()
        self.required_columns = {
            'sensor': ['ShiftDate', 'TimeStamp', 'Speed', 'RPM', 'FuelLevel'],
            'time_model': ['ShiftDate', 'TimeStamp', 'Status'],
            'cycle': []
        }

    def _validate_input(self, data: dict, data_type: str) -> Dict[str, pd.DataFrame]:
        """Valida estructura de los datos de entrada"""
        if not isinstance(data, dict):
            raise TypeError(f"{data_type} debe ser un diccionario")
        for key, df in data.items():
            if not isinstance(key, str):
                raise TypeError(f"Claves en {data_type} deben ser strings")
            if not isinstance(df, pd.DataFrame):
                raise TypeError(f"Valores en {data_type} deben ser DataFrames")
        return data

    def _detect_truck_id(self) -> str:
        """Detecta ID del camión desde datos procesados o crudos"""
        for dataset in [self.processed_data, self.raw_data]:
            for dtype in ['sensor', 'time_model', 'cycle']:
                df = dataset.get(dtype, pd.DataFrame())
                if not df.empty and 'Equipment' in df.columns:
                    return df['Equipment'].iloc[0]
        return 'N/A'

    def _calculate_metrics(self, df: pd.DataFrame) -> Dict[str, str]:
        """Calcula métricas para un DataFrame individual"""
        metrics = {
            'filas': '0',
            'columnas': '0',
            'nulos': '0',
            'duplicados': '0',
            'rango_temporal': 'N/A'
        }
        
        if not df.empty:
            metrics.update({
                'filas': f"{len(df):,}",
                'columnas': f"{len(df.columns):,}",
                'nulos': f"{df.isnull().sum().sum():,}",
                'duplicados': f"{df.duplicated().sum():,}"
            })
            
            if 'ShiftDate' in df.columns:
                valid_dates = pd.to_datetime(df['ShiftDate'], errors='coerce').dropna()
                if not valid_dates.empty:
                    min_date = valid_dates.min().strftime('%Y-%m-%d')
                    max_date = valid_dates.max().strftime('%Y-%m-%d')
                    metrics['rango_temporal'] = f"{min_date} - {max_date}"
        
        return metrics

    def generate_comparison_table(self) -> Dict[str, Dict[str, Dict[str, str]]]:
        """Genera tabla comparativa entre datos crudos y procesados"""
        results = {}
        
        for dtype in ['sensor', 'time_model', 'cycle']:
            raw_df = self.raw_data.get(dtype, pd.DataFrame())
            processed_df = self.processed_data.get(dtype, pd.DataFrame())
            
            results[dtype] = {
                'crudo': self._calculate_metrics(raw_df),
                'procesado': self._calculate_metrics(processed_df)
            }
        
        return results

    def print_comparison(self):
        """Imprime tabla comparativa formateada"""
        comparison = self.generate_comparison_table()
        
        print(f"\n{'='*65}")
        print(f" COMPARATIVO DATOS CRUDOS vs PROCESADOS - CAMIÓN: {self.truck_id} ")
        print(f"{'='*65}\n")
        
        # Encabezados
        header = (
            f"{'Dataset':<12} | {'Tipo':<8} | {'Filas':>10} | {'Columnas':>10} | "
            f"{'Nulos':>10} | {'Duplicados':>12} | {'Rango Temporal'}"
        )
        print(header)
        print("-"*95)
        
        # Filas
        for dtype, data in comparison.items():
            print(f"{dtype.upper():<12} | {'CRUDO':<8} | "
                  f"{data['crudo']['filas']:>10} | {data['crudo']['columnas']:>10} | "
                  f"{data['crudo']['nulos']:>10} | {data['crudo']['duplicados']:>12} | "
                  f"{data['crudo']['rango_temporal']}")
            
            print(f"{'':<12} | {'PROCESADO':<8} | "
                  f"{data['procesado']['filas']:>10} | {data['procesado']['columnas']:>10} | "
                  f"{data['procesado']['nulos']:>10} | {data['procesado']['duplicados']:>12} | "
                  f"{data['procesado']['rango_temporal']}")
            print("-"*95)

In [101]:
# Cargar datos
loader = DataLoader("T-210")
raw_data = loader.load_data()

# Procesar datos
processor = ETLDataProcessor("T-210")
processed_data = processor.run_etl()

# Validar y comparar
validator = DataValidator(raw_data, processed_data)
validator.print_comparison()

feedback SENSOR: Cargados 443246 registros
feedback TIME_MODEL: Cargados 9226 registros
Advertencia: No se encontraron archivos de cycle
feedback SENSOR: Cargados 443246 registros
feedback TIME_MODEL: Cargados 9226 registros
Advertencia: No se encontraron archivos de cycle

 COMPARATIVO DATOS CRUDOS vs PROCESADOS - CAMIÓN: T-210 

Dataset      | Tipo     |      Filas |   Columnas |      Nulos |   Duplicados | Rango Temporal
-----------------------------------------------------------------------------------------------
SENSOR       | CRUDO    |    443,246 |         15 |         35 |            0 | 2024-02-01 - 2025-04-01
             | PROCESADO |    443,246 |         19 |         35 |            0 | 2024-02-01 - 2025-04-01
-----------------------------------------------------------------------------------------------
TIME_MODEL   | CRUDO    |      9,226 |          9 |          0 |            0 | 2024-02-01 - 2025-04-01
             | PROCESADO |      9,226 |         13 |          0 |  

In [99]:
# Flujo de uso
#loader = DataLoader(DATA_DIR, "T-210")
processor = ETLDataProcessor("T-210")
clean_data = processor.run_etl()

# Resultados esperados:
print("Columnas sensor:", clean_data['time_model'].columns.tolist())
# ['ShiftDate', 'TimeStamp', 'Speed', 'RPM', 'FuelLevel', 
#  'Latitude', 'Longitude', 'Hour', 'ShiftMonth']

print("Coordenadas corregidas:", 
      clean_data['sensor'][['Latitude', 'Longitude']].describe())
# Latitude: min -90, max 90
# Longitude: min -180, max 180
clean_data['sensor'].head(5)

feedback SENSOR: Cargados 443246 registros
feedback TIME_MODEL: Cargados 9226 registros
Advertencia: No se encontraron archivos de cycle
Columnas sensor: ['ShiftDate', 'ShiftYear', 'ShiftMonth', 'ShiftDay', 'Shift', 'TimeStamp', 'RecordDuration', 'Equipment', 'TruckFleet', 'Status', 'Category', 'Event', 'data_type']
Coordenadas corregidas:            Latitude     Longitude
count  4.432390e+05  4.432390e+05
mean  -7.595287e+07 -2.419300e+08
std    1.627874e+05  5.146528e+05
min   -7.662248e+07 -2.421211e+08
25%   -7.596555e+07 -2.419518e+08
50%   -7.594814e+07 -2.419389e+08
75%   -7.593820e+07 -2.419094e+08
max    0.000000e+00  0.000000e+00


Unnamed: 0,ShiftDate,ShiftYear,ShiftMonth,ShiftDay,Shift,TimeStamp,RecordDuration,Equipment,TruckFleet,FuelLevel,FuelLevelLiters,FuelGauge,Speed,RPM,Ralenti,Latitude,Longitude,Elevation,data_type
0,2024-02-01,2024,2,1,D,07:01:30,,T-210,CAT 789C,31.74,1015.68,Medium,0.0,0.0,Ralenti,-76001573.0,-241968218.0,417059.0,sensor
1,2024-02-01,2024,2,1,D,07:04:30,180.0,T-210,CAT 789C,31.07,994.24,Medium,22.0,0.0,Moviendose,-75996785.0,-241969167.0,416300.0,sensor
2,2024-02-01,2024,2,1,D,07:05:30,60.0,T-210,CAT 789C,30.985,991.68,Medium,32.0,0.0,Moviendose,-75999602.0,-241955269.0,411169.0,sensor
3,2024-02-01,2024,2,1,D,07:06:30,60.0,T-210,CAT 789C,31.07,994.24,Medium,22.0,0.0,Moviendose,-76001260.0,-241953481.0,408329.0,sensor
4,2024-02-01,2024,2,1,D,07:07:00,30.0,T-210,CAT 789C,31.235,999.68,Medium,19.0,0.0,Moviendose,-75996360.0,-241950079.0,407360.0,sensor


In [61]:
clean_data['sensor'].head(5)

Unnamed: 0,ShiftDate,Shift,TimeStamp,RecordDuration,Equipment,TruckFleet,FuelLevel,FuelLevelLiters,FuelGauge,Speed,RPM,Ralenti,Latitude,Longitude,Elevation,ShiftMonth
0,2024-02-01,D,07:01:30,,T-210,CAT 789C,31.74,1015.68,Medium,0.0,0.0,Ralenti,-76001573.0,-241968218.0,417059.0,2
1,2024-02-01,D,07:04:30,180.0,T-210,CAT 789C,31.07,994.24,Medium,22.0,0.0,Moviendose,-75996785.0,-241969167.0,416300.0,2
2,2024-02-01,D,07:05:30,60.0,T-210,CAT 789C,30.985,991.68,Medium,32.0,0.0,Moviendose,-75999602.0,-241955269.0,411169.0,2
3,2024-02-01,D,07:06:30,60.0,T-210,CAT 789C,31.07,994.24,Medium,22.0,0.0,Moviendose,-76001260.0,-241953481.0,408329.0,2
4,2024-02-01,D,07:07:00,30.0,T-210,CAT 789C,31.235,999.68,Medium,19.0,0.0,Moviendose,-75996360.0,-241950079.0,407360.0,2


In [26]:
# Flujo de uso
loader = DataLoader(DATA_DIR, "T-210")
raw_datasets = loader.load_data()

# Inspeccionar cada dataset
for dtype, df in raw_datasets.items():
    print(f"\n{'='*50}")
    print(f"DATASET: {dtype.upper()}".center(50))
    print(f"{'='*50}\n")
    
    if df.empty:
        print("¡Dataset vacío!")
        continue
    
    # Mostrar tipos de datos
    print(" TIPOS DE DATOS ".center(50, "-"))
    print(df.dtypes)
    
    # Mostrar primeras filas
    print("\n PRIMERAS 3 FILAS ".center(50, "-"))
    print(df.head(3))
    
    # Mostrar valores nulos por columna
    print("\n VALORES NULOS ".center(50, "-"))
    print(df.isnull().sum())
    
    # Mostrar información general (incluye memoria usada)
    print("\n INFORMACIÓN GENERAL ".center(50, "-"))
    df.info()

feedback SENSOR: Cargados 443246 registros
feedback TIME_MODEL: Cargados 9226 registros
Advertencia: No se encontraron archivos de cycle

                 DATASET: SENSOR                  

----------------- TIPOS DE DATOS -----------------
ShiftDate          datetime64[ns]
Shift                      object
TimeStamp          datetime64[ns]
RecordDuration            float64
Equipment                  object
TruckFleet                 object
FuelLevel                 float64
FuelLevelLiters           float64
FuelGauge                  object
Speed                     float64
RPM                       float64
Ralenti                    object
Latitude                  float64
Longitude                 float64
Elevation                 float64
dtype: object
---------------
 PRIMERAS 3 FILAS ----------------
   ShiftDate Shift           TimeStamp  RecordDuration Equipment TruckFleet  \
0 2024-02-01     D 2024-02-01 07:01:30             NaN     T-210   CAT 789C   
1 2024-02-01     D 2024-02

In [58]:
import pandas as pd
from typing import Dict

class DataValidator:
    def __init__(self, datasets: Dict[str, pd.DataFrame]):
        self.datasets = datasets
        self.truck_id = self._detect_truck_id()
        self.required_columns = {
            'sensor': ['ShiftDate', 'TimeStamp', 'Speed', 'RPM', 'FuelLevel'],
            'time_model': ['ShiftDate', 'TimeStamp', 'Status'],
            'cycle': []  # Permitir dataset vacío
        }

    def _detect_truck_id(self) -> str:
        for df in self.datasets.values():
            if not df.empty and 'Equipment' in df.columns:
                return df['Equipment'].iloc[0]
        return 'N/A'

    def generate_comparison_table(self) -> Dict[str, Dict[str, str]]:
        results = {}
        for dtype in ['sensor', 'time_model', 'cycle']:
            df = self.datasets.get(dtype, pd.DataFrame())
            
            rango_temporal = 'N/A'
            if not df.empty and 'ShiftDate' in df.columns:
                valid_dates = df['ShiftDate'].dropna()
                if not valid_dates.empty:
                    rango_temporal = f"{valid_dates.min().date()} a {valid_dates.max().date()}"

            results[dtype] = {
                'Dimensiones': f"{len(df):,} x {len(df.columns)}" if not df.empty else 'N/A',
                'Valores Nulos': f"{df.isnull().sum().sum():,}" if not df.empty else 'N/A',
                'Duplicados': f"{df.duplicated().sum():,}" if not df.empty else 'N/A',
                'Rango Temporal': rango_temporal
            }
        return results

class ETLDataProcessor:
    def __init__(self, raw_datasets: Dict[str, pd.DataFrame]):
        self.raw_validator = DataValidator(raw_datasets)
        self.processed_datasets = {}
        self.processed_validator = None

    def _clean_data(self) -> None:
        self.processed_datasets = {k: v.copy() for k, v in self.raw_validator.datasets.items()}
        
        for dtype, df in self.processed_datasets.items():
            if df.empty:
                continue

            # Mantener registros con al menos una fecha válida
            if 'ShiftDate' in df.columns and 'TimeStamp' in df.columns:
                df = df[df[['ShiftDate', 'TimeStamp']].notna().any(axis=1)]

            # Limpieza específica
            if dtype == 'sensor':
                df = df.dropna(subset=['Speed', 'RPM', 'FuelLevel'], how='all')
            elif dtype == 'time_model':
                df = df.dropna(subset=['Status'])

            self.processed_datasets[dtype] = df

    def _transform_columns(self) -> None:
        for dtype, df in self.processed_datasets.items():
            if df.empty:
                continue

            # Conversión de tipos numéricos
            if dtype == 'sensor':
                df['Speed'] = pd.to_numeric(df['Speed'], errors='coerce')
                df['RPM'] = pd.to_numeric(df['RPM'], errors='coerce')

            # Componentes temporales
            if 'ShiftDate' in df.columns:
                df['ShiftYear'] = df['ShiftDate'].dt.year
                df['ShiftMonth'] = df['ShiftDate'].dt.month
                df['ShiftDay'] = df['ShiftDate'].dt.day
            if 'TimeStamp' in df.columns:
                df['Hour'] = df['TimeStamp'].dt.hour

            self.processed_datasets[dtype] = df

    def _apply_schema(self) -> None:
        COLUMN_MAPPING = {
            'sensor': [
                'ShiftYear', 'ShiftMonth', 'ShiftDay', 'Hour',
                'Speed', 'RPM', 'FuelLevel', 'FuelLevelLiters'
            ],
            'time_model': [
                'ShiftYear', 'ShiftMonth', 'ShiftDay', 'Hour',
                'Status', 'Category', 'Event'
            ],
            'cycle': []  # Sin columnas requeridas
        }

        for dtype, df in self.processed_datasets.items():
            if df.empty:
                continue

            allowed_cols = [col for col in COLUMN_MAPPING[dtype] if col in df.columns]
            self.processed_datasets[dtype] = df[allowed_cols] if allowed_cols else df

    def run_etl(self) -> 'ETLDataProcessor':
        self._clean_data()
        self._transform_columns()
        self._apply_schema()
        
        self.processed_validator = DataValidator(self.processed_datasets)
        schema_issues = self.processed_validator.validate_schema()
        
        if any(schema_issues.values()):
            print("⚠️ ALERTA: Problemas de esquema detectados")
            for dtype, issues in schema_issues.items():
                print(f"- {dtype.upper()}: {', '.join(issues)}")
        
        return self

    def show_comparison(self) -> None:
        raw_table = self.raw_validator.generate_comparison_table()
        processed_table = self.processed_validator.generate_comparison_table()
        
        print(f"\n{'='*60}")
        print("🚀 COMPARATIVO ETL COMPLETO".center(60))
        print(f"{'='*60}\n")
        
        for dtype in ['sensor', 'time_model', 'cycle']:
            print(f"📊 {dtype.upper()} ".ljust(60, '▬'))
            print(f"{'MÉTRICA':<25} | {'ORIGINAL':<15} | {'PROCESADO':<15}")
            print(f"{'-'*60}")
            
            for metric in ['Dimensiones', 'Valores Nulos', 'Duplicados', 'Rango Temporal']:
                orig = raw_table[dtype].get(metric, 'N/A')
                proc = processed_table[dtype].get(metric, 'N/A')
                print(f"{metric:<25} | {orig:<15} | {proc:<15}")
            print("\n")

In [22]:
# Cargar datos
loader = DataLoader(DATA_DIR, "T-210")
raw_datasets = loader.load_data()

# Validación inicial
initial_validator = DataValidator(raw_datasets)
print("VALIDACIÓN INICIAL:")
for dtype, issues in initial_validator.validate_schema().items():
    print(f".Dataset {dtype.upper()}: {issues if issues else 'Esquema válido'}")

# Ejecución ETL
etl = ETLDataProcessor(raw_datasets)
etl.run_etl()
etl.show_comparison()

feedback SENSOR: Cargados 443246 registros
feedback TIME_MODEL: Cargados 9226 registros
Advertencia: No se encontraron archivos de cycle
VALIDACIÓN INICIAL:
.Dataset SENSOR: Esquema válido
.Dataset TIME_MODEL: Esquema válido
.Dataset CYCLE: ['Dataset vacío']


KeyError: 'ShiftDate'

In [4]:
sensor_df.head(1)

Unnamed: 0,ShiftDate,Shift,TimeStamp,RecordDuration,Equipment,TruckFleet,FuelLevel,FuelLevelLiters,FuelGauge,Speed,RPM,Ralenti,Latitude,Longitude,Elevation
0,2024-02-01,D,2024-02-01 07:01:30.000,,T-210,CAT 789C,31.74,1015.68,Medium,0.0,0.0,Ralenti,-76001573.0,-241968218.0,417059.0


In [5]:
# Flujo de uso
loader = DataLoader(DATA_DIR, "T-210")
raw_datasets = loader.load_data()

# Validación inicial
initial_validator = DataValidator(raw_datasets)
print("VALIDACIÓN INICIAL:")
for dtype, issues in initial_validator.validate_schema().items():
    print(f".Dataset {dtype.upper()}: {issues if issues else 'Esquema válido'}")

# Ejecución ETL con validación integrada
etl = ETLDataProcessor(raw_datasets)
etl.run_etl()
etl.show_comparison()

feedback SENSOR: Cargados 443246 registros
feedback TIME_MODEL: Cargados 9226 registros
Advertencia: No se encontraron archivos de cycle
VALIDACIÓN INICIAL:
.Dataset SENSOR: ['ShiftDate no es datetime64']
.Dataset TIME_MODEL: Esquema válido
.Dataset CYCLE: ['Dataset vacío']
⚠️ ALERTA: Problemas de esquema detectados
- SENSOR: ShiftDate, TimeStamp
- TIME_MODEL: ShiftDate, TimeStamp
- CYCLE: Dataset vacío


AttributeError: 'str' object has no attribute 'strftime'

In [15]:

truck_name = "T-210"

# Cargar los datos
loader = DataLoader(base_dir=DATA_DIR, truck=truck_name)
raw_datasets = loader.load_data()
# Ejecutar el pipeline ETL
etl = ETLDataProcessor(raw_datasets)
etl.run_etl()
etl.show_comparison()


feedback SENSOR: Cargados 443246 registros
feedback TIME_MODEL: Cargados 9226 registros
Advertencia: No se encontraron archivos de cycle


ValueError: NaTType does not support strftime

In [6]:
time_model_df.head(1)
#print(len(time_model_df.columns))
# sensor_df.tail(10)
# cycle_df.head(10)

Unnamed: 0,ShiftDate,Shift,TimeStamp,RecordDuration,Equipment,TruckFleet,Status,Category,Event,YearMonth,DataType
0,2024-02-01,D,2024-02-01 09:01:14,7067,T-210,CAT 789C,Operativo,efectivo,Producción,2024-02,time_model


In [124]:
# Cargar datos originales
loader = DataLoader(DATA_DIR, "T-234")
raw_datasets = loader.load_data()

# Procesar y comparar
etl_processor = ETLDataProcessor(raw_datasets)
etl_processor.run_etl().show_comparison()
for dtype in self.processed_datasets:
    df = self.processed_datasets[dtype]

    # 1. Limpiar columnas no estándar
    df = self._enforce_standard_columns(df, dtype)

    # 2. Limpieza genérica
    if 'TimeStamp' in df.columns:
        df.dropna(subset=['TimeStamp'], inplace=True)

    # 3. Limpieza específica
    if dtype == 'sensor':
        df.dropna(subset=['Speed', 'RPM'], inplace=True)
        df = df[(df['Speed'] > 0) & (df['RPM'] > 0)]

    elif dtype == 'time_model':
        df = df[df['Status'].isin(['active', 'idle'])]

    # 👉 4. ¡GUARDAR LOS CAMBIOS!
    self.processed_datasets[dtype] = df


feedback SENSOR: Cargados 6789 registros
feedback TIME_MODEL: Cargados 316 registros
Advertencia: No se encontraron archivos de cycle


🚀 REPORTE COMPARATIVO ANTES/DESPUÉS
─────────────────────── raw_datasets ───────────────────────

DATOS CRUDOS PARA CAMIÓN: T-234

🔍 Dataset: SENSOR
  📏 Dimensiones: 6,789 filas x 18 columnas

  🧹 CALIDAD DE DATOS:

  • Valores nulos por columna:
    - Truck: 0
    - ShiftDate: 0
    - Shift: 0
    - TimeStamp: 0
    - RecordDuration: 13
    - Equipment: 0
    - TruckFleet: 0
    - FuelLevel: 0
    - FuelLevelLiters: 0
    - FuelGauge: 0
    - Speed: 0
    - RPM: 0
    - Ralenti: 0
    - Latitude: 1
    - Longitude: 1
    - Elevation: 1
    - YearMonth: 0
    - DataType: 0
  • Valores nulos: 16.00
  • Registros duplicados: 0.00

  ⏱️  ANÁLISIS TEMPORAL:
  • Rango: 2024-02-01 12:58:00 - 2025-03-23 15:52:00
  • Frecuencia media: 0 days 01:28:16.529169121

🔍 Dataset: TIME_MODEL
  📏 Dimensiones: 316 filas x 12 columnas

  🧹 CALIDAD DE DATOS:

  • Valores n

IndexError: tuple index out of range

In [49]:
os.getcwd()

'C:\\Users\\JoseFM\\Desktop\\Sancristobal\\AI-DataAssistant\\Docs'

In [50]:
os.path.abspath(os.path.join(os.getcwd(), ".."))

'C:\\Users\\JoseFM\\Desktop\\Sancristobal\\AI-DataAssistant'

In [17]:
from ipywidgets import interact, widgets
import plotly.express as px

def prepare_daily_data(df: pd.DataFrame, truck_id: str) -> pd.DataFrame:
    # Filtrar por camión
    df = df[df['Truck'] == truck_id].copy()
    
    # Asegurar zona horaria
    df['fecha'] = df['TimeStamp'].dt.tz_localize('UTC').dt.tz_convert('America/Santiago')
    
    # Extraer componentes
    df['mes'] = df['fecha'].dt.month_name(locale='es')
    df['dia_mes'] = df['fecha'].dt.day
    df['año'] = df['fecha'].dt.year

    # Agrupar por día
    daily_counts = df.groupby(['año', 'mes', 'dia_mes']).size().reset_index(name='conteo')

    # Orden de meses en español
    month_order = ['Enero', 'Febrero', 'Marzo', 'Abril', 'Mayo', 'Junio',
                   'Julio', 'Agosto', 'Septiembre', 'Octubre', 'Noviembre', 'Diciembre']
    daily_counts['mes'] = pd.Categorical(daily_counts['mes'], categories=month_order, ordered=True)

    return daily_counts.sort_values(['año', 'mes', 'dia_mes'])


def plot_daily_histogram_interactive(df_sensor: pd.DataFrame, truck_id: str):
    # Prepara datos
    hist_data = prepare_daily_data(df_sensor, truck_id)
    meses_disponibles = hist_data['mes'].dropna().unique()

    @interact(
        Mes=widgets.Dropdown(
            options=meses_disponibles,
            description='Seleccionar Mes:',
            style={'description_width': 'initial'}
        )
    )
    def plot_histogram(Mes):
        # Filtrar mes
        filtered = hist_data[hist_data['mes'] == Mes]

        fig = px.bar(
            filtered,
            x='dia_mes',
            y='conteo',
            color='año',
            barmode='group',
            title=f'Registros diarios de {truck_id} - {Mes}',
            labels={'dia_mes': 'Día del Mes', 'conteo': 'Número de Registros'},
            template='plotly_white'
        )

        fig.update_layout(
            xaxis=dict(
                tickmode='linear',
                dtick=1,
                range=[0.5, 31.5]
            ),
            hovermode='x unified',
            height=500
        )

        # Línea de promedio
        avg = filtered['conteo'].mean()
        fig.add_hline(
            y=avg,
            line_dash="dot",
            line_color="red",
            annotation_text=f'Promedio: {avg:.1f}',
            annotation_position="top right"
        )

        fig.show()


In [18]:
loader = DataLoader(DATA_DIR, "T-210")
raw_datasets = loader.load_data()

# Dataset de sensores crudo
df_sensor = raw_datasets['sensor']

# Mostrar histograma diario por mes del camión T-234
plot_daily_histogram_interactive(df_sensor, truck_id='T-210')


feedback SENSOR: Cargados 443246 registros
feedback TIME_MODEL: Cargados 9226 registros
Advertencia: No se encontraron archivos de cycle


interactive(children=(Dropdown(description='Seleccionar Mes:', options=('Febrero', 'Marzo', 'Abril', 'Mayo', '…