In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder, KBinsDiscretizer, PolynomialFeatures, FunctionTransformer, TargetEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, RFE, RFECV, SelectFromModel, mutual_info_regression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc, precision_recall_curve, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import HistGradientBoostingClassifier, VotingClassifier, VotingRegressor, AdaBoostClassifier, RandomForestClassifier, HistGradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.base import BaseEstimator, RegressorMixin
from imblearn.over_sampling import SMOTE
from typing import List, Tuple, Any, Dict, Literal
from scipy.stats import uniform, randint
from collections import Counter
import warnings
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from dataclasses import dataclass
import logging

warnings.filterwarnings('ignore')


In [2]:
with open(r'C:\Users\USER\Desktop\MTPE\Analitica_MTPE\clean_data.csv', encoding='utf-8') as f:
    data = pd.read_csv(f)
# display full columns
pd.set_option('display.max_columns', None)

In [3]:
#Aplicando target encoder a todas las variables categoricas excluyendo la varivle FECHA_ACCIDENTE
categorical_columns = data.select_dtypes(include=['object']).columns

for column in categorical_columns:
    if column != 'FECHA_ACCIDENTE' and column != 'GRAVEDAD_ACCIDENTE':
        target_encoder = TargetEncoder()
        data[column] = target_encoder.fit_transform(data[column].values.reshape(-1,1), data['GRAVEDAD_ACCIDENTE'])
# transformar la variable objetivo a binario donde 1 es grave y 0 es leve
data['GRAVEDAD_ACCIDENTE'] = data['GRAVEDAD_ACCIDENTE'].apply(lambda x: 1 if x == 'ACCIDENTE INCAPACITANTE' else 0)
# RobustScaler para las variables numericas
scaler = RobustScaler()
data[['EDAD']] = scaler.fit_transform(data[['EDAD']])

In [4]:
data.value_counts('GRAVEDAD_ACCIDENTE')

GRAVEDAD_ACCIDENTE
0    9667
1    2310
Name: count, dtype: int64

In [5]:
data.to_csv(r'C:\Users\USER\Desktop\MTPE\Analitica_MTPE\clean_data_target_encoder.csv', index=False)

In [6]:
# construir un modelo de clasificación  XGBoost primero sin seleccion de variables
X_1 = data.drop(['GRAVEDAD_ACCIDENTE','FECHA_CORTE','PERIODO_REGISTRO','FECHA_ACCIDENTE','DIAS_DESCANZO','MONTO_DESCANSO'], axis=1)
y_1 = data['GRAVEDAD_ACCIDENTE']
from models.classifier_model import TreeEnsemblePipeline, Pipeline
clf_pipeline = TreeEnsemblePipeline()
model, X_test, y_test = clf_pipeline.build_model(X_1, y_1)

In [7]:
# Definir las variables para entrenar el modelo de regresion
X_2 = data.drop(['GRAVEDAD_ACCIDENTE','FECHA_CORTE','PERIODO_REGISTRO','FECHA_ACCIDENTE','DIAS_DESCANZO','MONTO_DESCANSO'], axis=1)
y_2 = data['DIAS_DESCANZO']

from models.model_pipeline import OptimizedEnsemblePipeline, ModelConfig

# Configura y ejecuta el modelo
config = ModelConfig(winsor_limits=(0.10, 0.90))
days_pipeline = OptimizedEnsemblePipeline(config)
model, X_test, y_test, training_rmse = days_pipeline.build_model(X_2, y_2)

2024-11-03 12:13:25,979 - INFO - Starting model building process with 11977 samples...
2024-11-03 12:13:25,989 - INFO - Original data statistics:
2024-11-03 12:13:25,990 - INFO - Original statistics:
2024-11-03 12:13:25,992 - INFO - Mean: 4.46
2024-11-03 12:13:25,993 - INFO - Std: 6.92
2024-11-03 12:13:25,996 - INFO - Min: 0.00
2024-11-03 12:13:25,996 - INFO - Max: 180.00
2024-11-03 12:13:26,001 - INFO - Quartiles:
0.25    1.0
0.50    3.0
0.75    5.0
Name: DIAS_DESCANZO, dtype: float64
2024-11-03 12:13:26,004 - INFO - Number of potential outliers: 638
2024-11-03 12:13:26,007 - INFO - 
Winsorized data statistics:
2024-11-03 12:13:26,007 - INFO - Winsorized statistics:
2024-11-03 12:13:26,009 - INFO - Mean: 3.29
2024-11-03 12:13:26,011 - INFO - Std: 2.52
2024-11-03 12:13:26,011 - INFO - Min: 0.00
2024-11-03 12:13:26,011 - INFO - Max: 8.00
2024-11-03 12:13:26,016 - INFO - Quartiles:
0.25    1.0
0.50    3.0
0.75    5.0
dtype: float64
2024-11-03 12:13:26,019 - INFO - Number of potential out

In [8]:
# Definir las variables para entrenar el modelo de regresion para MONTO_DESCANSO
X_3 = data.drop(['GRAVEDAD_ACCIDENTE','FECHA_CORTE','PERIODO_REGISTRO','FECHA_ACCIDENTE','DIAS_DESCANZO','MONTO_DESCANSO'], axis=1)
y_3 = data['MONTO_DESCANSO']
from models.model_pipeline import OptimizedEnsemblePipeline, ModelConfig
# Crear la configuración
config = ModelConfig(
    n_features=12,
    test_size=0.2,
    cv_folds=3,
    winsor_limits=(0.10, 0.90)
)

# Inicializar la pipeline
amount_pipeline = OptimizedEnsemblePipeline(config)

# Entrenar el modelo
model, X_test, y_test, training_rmse = amount_pipeline.build_model(X_3, y_3)

# Evaluar el modelo
metrics = amount_pipeline.evaluate_model(model, X_test, y_test)

2024-11-03 12:16:48,720 - INFO - Starting model building process with 11977 samples...
2024-11-03 12:16:48,730 - INFO - Original data statistics:
2024-11-03 12:16:48,730 - INFO - Original statistics:
2024-11-03 12:16:48,733 - INFO - Mean: 314.20
2024-11-03 12:16:48,734 - INFO - Std: 825.92
2024-11-03 12:16:48,734 - INFO - Min: 0.00
2024-11-03 12:16:48,737 - INFO - Max: 42227.42
2024-11-03 12:16:48,740 - INFO - Quartiles:
0.25     65.81
0.50    151.24
0.75    297.16
Name: MONTO_DESCANSO, dtype: float64
2024-11-03 12:16:48,743 - INFO - Number of potential outliers: 917
2024-11-03 12:16:48,746 - INFO - 
Winsorized data statistics:
2024-11-03 12:16:48,746 - INFO - Winsorized statistics:
2024-11-03 12:16:48,748 - INFO - Mean: 207.37
2024-11-03 12:16:48,751 - INFO - Std: 191.03
2024-11-03 12:16:48,752 - INFO - Min: 0.00
2024-11-03 12:16:48,754 - INFO - Max: 613.87
2024-11-03 12:16:48,755 - INFO - Quartiles:
0.25     65.81
0.50    151.24
0.75    297.16
dtype: float64
2024-11-03 12:16:48,761 -

TypeError: 'NoneType' object is not iterable

In [11]:
# Script 1: save_models.py (para usar en Jupyter Notebook)
import joblib
from pathlib import Path

def save_trained_models(clf_pipeline, days_pipeline, amount_pipeline, output_dir=r'C:\Users\USER\Desktop\MTPE\Analitica_MTPE\trained_models'):
    """
    Guarda los modelos entrenados y sus características seleccionadas.
    
    Parámetros:
    -----------
    clf_pipeline : TreeEnsemblePipeline
        Pipeline entrenado para clasificación de gravedad
    days_pipeline : OptimizedEnsemblePipeline
        Pipeline entrenado para predicción de días
    amount_pipeline : OptimizedEnsemblePipeline
        Pipeline entrenado para predicción de monto
    output_dir : str
        Directorio donde se guardarán los modelos
    """
    # Crear directorio si no existe
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    
    # Asegurarse de que los selectores de características estén ajustados
    clf_pipeline.feature_selector.fit(clf_pipeline.X_train, clf_pipeline.y_train)
    days_pipeline.feature_selector.fit(days_pipeline.X_train, days_pipeline.y_train)
    amount_pipeline.feature_selector.fit(amount_pipeline.X_train, amount_pipeline.y_train)
    
    # Guardar modelos
    joblib.dump(clf_pipeline, f'{output_dir}/clf_pipeline.joblib')
    joblib.dump(days_pipeline, f'{output_dir}/days_pipeline.joblib')
    joblib.dump(amount_pipeline, f'{output_dir}/amount_pipeline.joblib')
    
    # Guardar características seleccionadas
    features = {
        'clf': clf_pipeline.feature_selector.get_support(),
        'days': days_pipeline.feature_selector.get_support(),
        'amount': amount_pipeline.feature_selector.get_support()
    }
    joblib.dump(features, f'{output_dir}/selected_features.joblib')
    
    print(f"Modelos y características guardados en {output_dir}/")

# Uso en Jupyter Notebook:
save_trained_models(
    clf_pipeline=clf_pipeline,  # Tu pipeline de clasificación
    days_pipeline=days_pipeline,  # Tu pipeline de días
    amount_pipeline=amount_pipeline  # Tu pipeline de monto
)


AttributeError: 'TreeEnsemblePipeline' object has no attribute 'X_train'