In [None]:
###Script de Preprocesamiento para Gradient Boosting y Random Forest###
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.metrics import classification_report, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the data (replace the path with your own dataset)
df = pd.read_csv('tus_datos.csv')

## 1. Initial setup -------------------------------------------------------------

# Problem type: 'classification' or 'regression' (adjust to your case)
problem_type = 'classification'

# Fail fast on a typo: the rest of the notebook treats any value other than
# 'classification' as regression, so a misspelling would silently switch models.
if problem_type not in ('classification', 'regression'):
    raise ValueError(
        f"problem_type must be 'classification' or 'regression', got {problem_type!r}")

# Target variable (adjust to your case)
target_column = 'Engagement'  # Example; replace with your real target

# Separate features and target
X = df.drop(columns=[target_column])
y = df[target_column]

# Train/test split (80-20)
TEST_SIZE = 0.2

# For classification, stratify so both splits keep the class proportions.
# Stratification is only possible when every class has at least two samples and
# each split is large enough to hold one sample per class; otherwise fall back
# to the plain random split.
stratify_labels = None
if problem_type == 'classification':
    class_counts = y.value_counts(dropna=False)
    n_test = int(np.ceil(TEST_SIZE * len(y)))
    n_train = len(y) - n_test
    if (len(class_counts) > 0
            and class_counts.min() >= 2
            and min(n_test, n_train) >= len(class_counts)):
        stratify_labels = y

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=42, stratify=stratify_labels)

In [None]:
## 2. Preprocessing -------------------------------------------------------------

# Identify column types (adjust to your dataset).
# 'number' matches every numeric dtype (int32, float32, unsigned, ...), not only
# int64/float64. Columns that end up in neither list are discarded by the
# ColumnTransformer below, so a too-narrow selection silently loses features.
# Timedelta columns are excluded explicitly because they are not plain numbers.
numeric_cols = X.select_dtypes(include='number', exclude='timedelta').columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Remove irrelevant columns (timestamps, IDs, etc.)
cols_to_drop = ['comentario_timestamp']  # Adjust to your case
numeric_cols = [col for col in numeric_cols if col not in cols_to_drop]
categorical_cols = [col for col in categorical_cols if col not in cols_to_drop]

# Transformations:
# - Standard scaling for numeric columns (consider RobustScaler when the data
#   has heavy outliers)
# - One-hot encoding for categorical columns (for high-cardinality columns,
#   consider target encoding instead)

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name; use whichever keyword this installation accepts.
_ohe_dense_kwarg = (
    'sparse_output' if 'sparse_output' in OneHotEncoder().get_params() else 'sparse')

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', **{_ohe_dense_kwarg: False}))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)])


In [None]:
## 3. Full pipelines ------------------------------------------------------------

# Pick the estimator family once; only the selected estimators are constructed.
if problem_type == 'classification':
    rf_estimator = RandomForestClassifier(random_state=42)
    xgb_estimator = XGBClassifier(random_state=42, eval_metric='logloss')
    lgbm_estimator = LGBMClassifier(random_state=42)
else:
    rf_estimator = RandomForestRegressor(random_state=42)
    xgb_estimator = XGBRegressor(random_state=42)
    lgbm_estimator = LGBMRegressor(random_state=42)

# Every pipeline chains the same `preprocessor` object with its estimator.
# The final step is named 'classifier' even when it holds a regressor.

# Random Forest
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf_estimator),
])

# XGBoost
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb_estimator),
])

# LightGBM
lgbm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', lgbm_estimator),
])

In [None]:
## 4. Training and evaluation ---------------------------------------------------

# Models to compare, keyed by the label used in the printed report
models = {
    'Random Forest': rf_pipeline,
    'XGBoost': xgb_pipeline,
    'LightGBM': lgbm_pipeline
}

# Fit each pipeline on the training split, score it on the test split, and
# print its ten most important features.
for name, model in models.items():
    print(f"\n--- Entrenando {name} ---")

    # Training
    model.fit(X_train, y_train)

    # Prediction
    y_pred = model.predict(X_test)

    # Evaluation
    if problem_type == 'classification':
        print(f"Resultados de {name}:")
        print(classification_report(y_test, y_pred))
    else:
        mse = mean_squared_error(y_test, y_pred)
        print(f"MSE de {name}: {mse:.4f}")

    # Feature importance (tree-based estimators only)
    estimator = model.named_steps['classifier']
    if hasattr(estimator, 'feature_importances_'):
        # Read the fitted preprocessor from this pipeline rather than from the
        # notebook-level variable, so the names always belong to the model
        # being reported.
        fitted_preprocessor = model.named_steps['preprocessor']

        # Output column order follows the ColumnTransformer definition:
        # numeric columns first, then the encoded categorical columns.
        all_features = list(numeric_cols)

        # With no categorical columns the encoder is never fitted, and asking it
        # for feature names would raise, so skip it entirely in that case.
        if categorical_cols:
            cat_steps = fitted_preprocessor.named_transformers_['cat'].named_steps
            if 'onehot' in cat_steps:
                ohe = cat_steps['onehot']
                all_features.extend(ohe.get_feature_names_out(categorical_cols))
            else:
                all_features.extend(categorical_cols)

        importances = estimator.feature_importances_
        feat_imp = pd.Series(importances, index=all_features).sort_values(ascending=False)

        print(f"\nTop 10 características importantes de {name}:")
        print(feat_imp.head(10))