In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# ML avanzado
from sklearn.preprocessing import StandardScaler, TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (
    HistGradientBoostingRegressor, 
    RandomForestRegressor,
    ExtraTreesRegressor
)
from sklearn.linear_model import Ridge, HuberRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans

# XGBoost y LightGBM para mejor rendimiento
try:
    import xgboost as xgb
    import lightgbm as lgb
    ADVANCED_MODELS = True
except:
    ADVANCED_MODELS = False
    print("‚ö†Ô∏è XGBoost/LightGBM no disponibles, usando solo sklearn")

import warnings
warnings.filterwarnings('ignore')

print("‚úÖ Librer√≠as cargadas")
print(f"Modelos avanzados: {'S√≠' if ADVANCED_MODELS else 'No'}")

‚úÖ Librer√≠as cargadas
Modelos avanzados: S√≠


In [5]:
# Carga de datos
train_full_df = pd.read_csv("train.csv", sep=';')
test_df = pd.read_csv("test.csv", sep=';')

print(f"üìä Train shape: {train_full_df.shape}")
print(f"üìä Test shape: {test_df.shape}")
print(f"\nüîç Columnas en train: {train_full_df.columns.tolist()[:10]}...")
print(f"\nüìà Estad√≠sticas b√°sicas de Production:")
print(train_full_df['Production'].describe())

üìä Train shape: (95339, 33)
üìä Test shape: (2250, 33)

üîç Columnas en train: ['ID', 'id_season', 'aggregated_family', 'family', 'category', 'fabric', 'color_name', 'color_rgb', 'image_embedding', 'length_type']...

üìà Estad√≠sticas b√°sicas de Production:
count     95339.000000
mean      28927.421055
std       34792.567183
min          90.000000
25%        6800.000000
50%       19266.000000
75%       37426.000000
max      403172.000000
Name: Production, dtype: float64


## 1. An√°lisis de Series Temporales por Producto

In [7]:
# Analizar patrones temporales ANTES de agregar
print("üîç Analizando patrones temporales...")

# Verificar columnas disponibles
print(f"Columnas disponibles: {train_full_df.columns.tolist()[:15]}...")

# Convertir fechas
train_full_df['phase_in'] = pd.to_datetime(train_full_df['phase_in'], format='%d/%m/%Y', errors='coerce')
train_full_df['phase_out'] = pd.to_datetime(train_full_df['phase_out'], format='%d/%m/%Y', errors='coerce')

# Features temporales por producto (sin usar 'week' que no existe)
# Contamos el n√∫mero de registros por ID como proxy de n√∫mero de semanas
weekly_stats = train_full_df.groupby('ID').agg({
    'weekly_demand': ['mean', 'std', 'max', 'min', 'sum', 'count']
}).reset_index()

weekly_stats.columns = ['ID', 'demand_mean', 'demand_std', 'demand_max', 'demand_min', 'demand_sum', 'num_weeks']

# Calcular coeficiente de variaci√≥n (volatilidad)
weekly_stats['demand_cv'] = weekly_stats['demand_std'] / (weekly_stats['demand_mean'] + 1)

# Tendencia: comparar primera mitad vs segunda mitad
def calculate_trend(group):
    if len(group) < 4:
        return 0
    mid = len(group) // 2
    first_half = group.iloc[:mid]['weekly_demand'].mean()
    second_half = group.iloc[mid:]['weekly_demand'].mean()
    return (second_half - first_half) / (first_half + 1)

trend_by_id = train_full_df.groupby('ID').apply(calculate_trend).reset_index()
trend_by_id.columns = ['ID', 'demand_trend']

weekly_stats = weekly_stats.merge(trend_by_id, on='ID', how='left')

print(f"‚úÖ Features temporales creadas: {weekly_stats.shape}")
print(f"\nEjemplos de volatilidad (CV):")
print(weekly_stats[['ID', 'demand_cv', 'demand_trend']].head(10))

üîç Analizando patrones temporales...
Columnas disponibles: ['ID', 'id_season', 'aggregated_family', 'family', 'category', 'fabric', 'color_name', 'color_rgb', 'image_embedding', 'length_type', 'silhouette_type', 'waist_type', 'neck_lapel_type', 'sleeve_length_type', 'heel_shape_type']...
‚úÖ Features temporales creadas: (9843, 9)

Ejemplos de volatilidad (CV):
   ID  demand_cv  demand_trend
0   1   0.477898     -0.531418
1   2   0.243655      0.239921
2   3   0.393046      0.709627
3   4   0.744410     -0.715652
4   6   0.173167     -0.188633
5   7   0.253665      0.098152
6   8   0.129080      0.071335
7  10   0.333518     -0.254777
8  11   1.229824     -1.036939
9  12   0.460106     -0.021007
‚úÖ Features temporales creadas: (9843, 9)

Ejemplos de volatilidad (CV):
   ID  demand_cv  demand_trend
0   1   0.477898     -0.531418
1   2   0.243655      0.239921
2   3   0.393046      0.709627
3   4   0.744410     -0.715652
4   6   0.173167     -0.188633
5   7   0.253665      0.098152
6  

## 2. Estimaci√≥n de Demanda Real (ajustando por stockouts)

In [9]:
print("üîç Estimando demanda real (ajustando stockouts)...")

# Verificar columnas disponibles antes de agregar
print(f"Columnas disponibles en train_full_df: {train_full_df.columns.tolist()[:30]}...")

# Definir columnas que queremos agregar (si existen)
desired_columns = {
    'weekly_demand': 'sum',
    'Production': 'first',
    'id_season': 'first',
    'aggregated_family': 'first',
    'family': 'first',
    'category': 'first',
    'fabric': 'first',
    'color_name': 'first',
    'image_embedding': 'first',
    'length_type': 'first',
    'silhouette_type': 'first',
    'waist_type': 'first',
    'neck_lapel_type': 'first',
    'sleeve_length_type': 'first',
    'heel_shape_type': 'first',
    'toecap_type': 'first',
    'woven_structure': 'first',
    'knit_structure': 'first',
    'print_type': 'first',
    'archetype': 'first',
    'moment': 'first',
    'occasion': 'first',  # Puede ser 'occasion' en vez de 'ocassion'
    'ocassion': 'first',  # O puede ser 'ocassion'
    'life_cycle_length': 'first',
    'num_stores': 'first',
    'num_sizes': 'first',
    'has_plus_sizes': 'first',
    'price': 'first',
    'phase_in': 'first',
    'phase_out': 'first'
}

# Filtrar solo las columnas que existen
agg_dict = {col: agg_func for col, agg_func in desired_columns.items() if col in train_full_df.columns}

print(f"\nColumnas que se van a agregar: {list(agg_dict.keys())}")

# Agregar datos a nivel de producto
train_agg = train_full_df.groupby('ID').agg(agg_dict).reset_index()

train_agg.rename(columns={'weekly_demand': 'total_demand'}, inplace=True)

# Merge con features temporales
train_agg = train_agg.merge(weekly_stats, on='ID', how='left')

# CLAVE: Estimar demanda real cuando hubo stockout
# Si demanda >= 95% de producci√≥n, probablemente hubo stockout
train_agg['utilization_rate'] = train_agg['total_demand'] / (train_agg['Production'] + 1)
train_agg['likely_stockout'] = (train_agg['utilization_rate'] > 0.95).astype(int)

# Ajustar demanda estimada
# Si hubo stockout, asumir que la demanda real era mayor
train_agg['estimated_true_demand'] = train_agg['total_demand'].copy()
stockout_mask = train_agg['likely_stockout'] == 1

# Para stockouts, estimar demanda = producci√≥n * (1 + factor de ajuste basado en popularidad)
# El factor depende del tipo de producto y precio
train_agg.loc[stockout_mask, 'estimated_true_demand'] = (
    train_agg.loc[stockout_mask, 'Production'] * 1.15  # Asumir 15% m√°s de demanda
)

print(f"‚úÖ Productos con probable stockout: {stockout_mask.sum()} / {len(train_agg)} ({stockout_mask.mean()*100:.1f}%)")
print(f"\nEjemplos de ajuste:")
print(train_agg[stockout_mask][['ID', 'Production', 'total_demand', 'estimated_true_demand', 'utilization_rate']].head())

üîç Estimando demanda real (ajustando stockouts)...
Columnas disponibles en train_full_df: ['ID', 'id_season', 'aggregated_family', 'family', 'category', 'fabric', 'color_name', 'color_rgb', 'image_embedding', 'length_type', 'silhouette_type', 'waist_type', 'neck_lapel_type', 'sleeve_length_type', 'heel_shape_type', 'toecap_type', 'woven_structure', 'knit_structure', 'print_type', 'archetype', 'moment', 'phase_in', 'phase_out', 'life_cycle_length', 'num_stores', 'num_sizes', 'has_plus_sizes', 'price', 'year', 'num_week_iso']...

Columnas que se van a agregar: ['weekly_demand', 'Production', 'id_season', 'aggregated_family', 'family', 'category', 'fabric', 'color_name', 'image_embedding', 'length_type', 'silhouette_type', 'waist_type', 'neck_lapel_type', 'sleeve_length_type', 'heel_shape_type', 'toecap_type', 'woven_structure', 'knit_structure', 'print_type', 'archetype', 'moment', 'life_cycle_length', 'num_stores', 'num_sizes', 'has_plus_sizes', 'price', 'phase_in', 'phase_out']
‚úÖ P

## 3. Features Avanzadas: Clustering y Similitud

In [10]:
print("üîç Creando features de similitud y clustering...")

# Procesar embeddings
def parse_embedding(embedding_str):
    if pd.isna(embedding_str) or embedding_str == "":
        return None
    try:
        return np.array(embedding_str.split(','), dtype=np.float32)
    except:
        return None

train_agg['emb_array'] = train_agg['image_embedding'].apply(parse_embedding)
test_df['emb_array'] = test_df['image_embedding'].apply(parse_embedding)

# Detectar dimensi√≥n
first_valid = train_agg['emb_array'].dropna().iloc[0]
EMB_DIM = len(first_valid)
print(f"Dimensi√≥n embedding: {EMB_DIM}")

# Rellenar
default_emb = np.zeros(EMB_DIM)
train_embeddings = np.stack(
    train_agg['emb_array'].apply(lambda x: x if x is not None else default_emb).tolist()
)
test_embeddings = np.stack(
    test_df['emb_array'].apply(lambda x: x if x is not None else default_emb).tolist()
)

# Clustering visual (productos similares)
print("Aplicando K-Means clustering...")
n_clusters = 50  # Agrupar en 50 clusters de estilo visual
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
train_agg['visual_cluster'] = kmeans.fit_predict(train_embeddings)
test_df['visual_cluster'] = kmeans.predict(test_embeddings)

# Features por cluster (para detectar canibalismo)
cluster_stats = train_agg.groupby('visual_cluster').agg({
    'Production': ['mean', 'std', 'count'],
    'estimated_true_demand': 'mean',
    'price': 'mean'
}).reset_index()
cluster_stats.columns = ['visual_cluster', 'cluster_prod_mean', 'cluster_prod_std', 
                          'cluster_size', 'cluster_demand_mean', 'cluster_price_mean']

train_agg = train_agg.merge(cluster_stats, on='visual_cluster', how='left')
test_df = test_df.merge(cluster_stats, on='visual_cluster', how='left')

# k-NN features (productos m√°s similares)
print("Calculando k-NN features...")
knn = NearestNeighbors(n_neighbors=11, metric='cosine')
knn.fit(train_embeddings)

# Para test
distances_test, indices_test = knn.kneighbors(test_embeddings, n_neighbors=10)
test_df['knn_mean_production'] = [train_agg.iloc[idx]['Production'].mean() for idx in indices_test]
test_df['knn_mean_demand'] = [train_agg.iloc[idx]['estimated_true_demand'].mean() for idx in indices_test]
test_df['knn_mean_price'] = [train_agg.iloc[idx]['price'].mean() for idx in indices_test]

# Para train (excluyendo el mismo producto)
distances_train, indices_train = knn.kneighbors(train_embeddings, n_neighbors=11)
indices_train_excl = indices_train[:, 1:]  # Excluir el primero (√©l mismo)
train_agg['knn_mean_production'] = [train_agg.iloc[idx]['Production'].mean() for idx in indices_train_excl]
train_agg['knn_mean_demand'] = [train_agg.iloc[idx]['estimated_true_demand'].mean() for idx in indices_train_excl]
train_agg['knn_mean_price'] = [train_agg.iloc[idx]['price'].mean() for idx in indices_train_excl]

print(f"‚úÖ Features de clustering y similitud creadas")

üîç Creando features de similitud y clustering...
Dimensi√≥n embedding: 512
Aplicando K-Means clustering...
Dimensi√≥n embedding: 512
Aplicando K-Means clustering...
Calculando k-NN features...
Calculando k-NN features...
‚úÖ Features de clustering y similitud creadas
‚úÖ Features de clustering y similitud creadas


## 4. Features de Negocio Avanzadas

In [11]:
print("üîç Creando features de negocio avanzadas...")

# Para train y test
for df in [train_agg, test_df]:
    # Intensidad de distribuci√≥n
    df['stores_per_week'] = df['num_stores'] / (df['life_cycle_length'] + 1)
    df['total_sku_count'] = df['num_stores'] * df['num_sizes']
    if 'has_plus_sizes' in df.columns:
        df['has_plus_sizes'] = df['has_plus_sizes'].map({True: 1, False: 0, 'true': 1, 'false': 0}).fillna(0)
        df['total_sku_count'] = df['total_sku_count'] * (1 + 0.3 * df['has_plus_sizes'])
    
    # Segmento de precio
    df['price_segment'] = pd.cut(df['price'], bins=[0, 20, 40, 60, 100, 200], 
                                   labels=['Budget', 'Mid', 'Mid-High', 'Premium', 'Luxury'])
    
    # Ratio precio/stores (indicador de target de mercado)
    df['price_store_ratio'] = df['price'] / (df['num_stores'] + 1)
    
    # Duraci√≥n del ciclo (temporadas largas = m√°s volumen)
    df['is_long_cycle'] = (df['life_cycle_length'] > 12).astype(int)
    
    # Tipo de categor√≠a (impacto en volumen)
    df['is_basics'] = df['category'].isin(['T-shirt', 'Jeans', 'Tops', 'Bottoms']).astype(int)
    df['is_outerwear'] = df['category'].isin(['Jackets', 'Coats', 'Puffer coats', 'Blazers']).astype(int)
    df['is_special'] = df['category'].isin(['Dresses', 'Swimwear', 'Intimate']).astype(int)

# Features espec√≠ficas de train (usando demanda hist√≥rica)
train_agg['demand_per_store'] = train_agg['estimated_true_demand'] / (train_agg['num_stores'] + 1)
train_agg['demand_per_week'] = train_agg['estimated_true_demand'] / (train_agg['life_cycle_length'] + 1)
train_agg['production_efficiency'] = train_agg['Production'] / (train_agg['estimated_true_demand'] + 1)

# Features de temporada (tendencias entre temporadas)
season_stats = train_agg.groupby('id_season').agg({
    'Production': ['mean', 'std'],
    'estimated_true_demand': 'mean',
    'price': 'mean'
}).reset_index()
season_stats.columns = ['id_season', 'season_prod_mean', 'season_prod_std', 
                         'season_demand_mean', 'season_price_mean']

train_agg = train_agg.merge(season_stats, on='id_season', how='left')

# Para test, usar stats de la √∫ltima temporada como proxy
if 'id_season' in test_df.columns:
    test_df = test_df.merge(season_stats, on='id_season', how='left')
else:
    # Usar √∫ltima temporada
    last_season_stats = season_stats.iloc[-1]
    for col in ['season_prod_mean', 'season_prod_std', 'season_demand_mean', 'season_price_mean']:
        test_df[col] = last_season_stats[col]

# Features de categor√≠a
category_stats = train_agg.groupby('category').agg({
    'Production': ['mean', 'std', 'median'],
    'estimated_true_demand': 'mean',
    'utilization_rate': 'mean'
}).reset_index()
category_stats.columns = ['category', 'cat_prod_mean', 'cat_prod_std', 'cat_prod_median',
                           'cat_demand_mean', 'cat_util_mean']

train_agg = train_agg.merge(category_stats, on='category', how='left')
test_df = test_df.merge(category_stats, on='category', how='left')

print(f"‚úÖ Features de negocio creadas")
print(f"Train shape: {train_agg.shape}")
print(f"Test shape: {test_df.shape}")

üîç Creando features de negocio avanzadas...
‚úÖ Features de negocio creadas
Train shape: (9843, 70)
Test shape: (2250, 60)


## 5. Preparaci√≥n de Features para Modelado

In [12]:
print("üîç Preparando features para modelado...")

# TARGET
TARGET = 'Production'

# Features num√©ricas
numerical_features = [
    # B√°sicas
    'life_cycle_length', 'num_stores', 'num_sizes', 'price',
    
    # Temporales
    'demand_mean', 'demand_std', 'demand_max', 'demand_min', 'demand_cv', 'demand_trend', 'num_weeks',
    
    # Demanda estimada
    'estimated_true_demand', 'utilization_rate',
    
    # Clustering
    'cluster_prod_mean', 'cluster_prod_std', 'cluster_size', 'cluster_demand_mean', 'cluster_price_mean',
    
    # k-NN
    'knn_mean_production', 'knn_mean_demand', 'knn_mean_price',
    
    # Negocio
    'stores_per_week', 'total_sku_count', 'price_store_ratio',
    'is_long_cycle', 'is_basics', 'is_outerwear', 'is_special',
    
    # Temporada
    'season_prod_mean', 'season_prod_std', 'season_demand_mean', 'season_price_mean',
    
    # Categor√≠a
    'cat_prod_mean', 'cat_prod_std', 'cat_prod_median', 'cat_demand_mean', 'cat_util_mean'
]

# Features solo en train
train_only_features = ['demand_per_store', 'demand_per_week', 'production_efficiency']

# Features categ√≥ricas (usar Target Encoding)
categorical_features = [
    'aggregated_family', 'family', 'category', 'fabric', 
    'length_type', 'silhouette_type', 'print_type', 
    'archetype', 'moment', 'ocassion', 'price_segment',
    'visual_cluster'
]

# Filtrar features que existen
numerical_features = [f for f in numerical_features if f in train_agg.columns]
train_only_features = [f for f in train_only_features if f in train_agg.columns]
categorical_features = [f for f in categorical_features if f in train_agg.columns]

print(f"Features num√©ricas: {len(numerical_features)}")
print(f"Features categ√≥ricas: {len(categorical_features)}")
print(f"Features solo train: {len(train_only_features)}")

# Combinar
all_train_features = numerical_features + train_only_features + categorical_features
all_test_features = numerical_features + categorical_features

print(f"\nTotal features train: {len(all_train_features)}")
print(f"Total features test: {len(all_test_features)}")

üîç Preparando features para modelado...
Features num√©ricas: 37
Features categ√≥ricas: 11
Features solo train: 3

Total features train: 51
Total features test: 48


## 6. Validaci√≥n Temporal (Simular cambios de tendencia)

In [13]:
print("üîç Configurando validaci√≥n temporal...")

# Split temporal: √∫ltimas temporadas para validaci√≥n
seasons = sorted(train_agg['id_season'].unique())
print(f"Temporadas disponibles: {seasons}")

# Usar las √∫ltimas 2 temporadas para validaci√≥n
val_seasons = seasons[-2:]
train_seasons = seasons[:-2]

print(f"Train seasons: {train_seasons}")
print(f"Validation seasons: {val_seasons}")

train_mask = train_agg['id_season'].isin(train_seasons)
val_mask = train_agg['id_season'].isin(val_seasons)

X_train = train_agg[train_mask][all_train_features].copy()
y_train = train_agg[train_mask][TARGET].copy()

X_val = train_agg[val_mask][all_train_features].copy()
y_val = train_agg[val_mask][TARGET].copy()

print(f"\n‚úÖ Train: {X_train.shape}, Val: {X_val.shape}")
print(f"\nDistribuci√≥n del target:")
print(f"Train - Media: {y_train.mean():.0f}, Mediana: {y_train.median():.0f}")
print(f"Val - Media: {y_val.mean():.0f}, Mediana: {y_val.median():.0f}")

üîç Configurando validaci√≥n temporal...
Temporadas disponibles: [np.int64(86), np.int64(87), np.int64(88), np.int64(89)]
Train seasons: [np.int64(86), np.int64(87)]
Validation seasons: [np.int64(88), np.int64(89)]

‚úÖ Train: (5079, 51), Val: (4764, 51)

Distribuci√≥n del target:
Train - Media: 24317, Mediana: 17456
Val - Media: 25218, Mediana: 18703


## 7. Entrenamiento de Modelos (Ensemble Avanzado)

In [15]:
print("üéØ Entrenando modelos...")

# Preprocesamiento
from sklearn.preprocessing import OrdinalEncoder

# Para features categ√≥ricas, usar TargetEncoder con menos folds para evitar errores
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', TargetEncoder(cv=2, smooth='auto'))  # Reducir CV folds de 5 a 2
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features + train_only_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Modelos
models = {}

# Modelo 1: HistGradientBoosting (robusto a outliers)
models['HGB'] = Pipeline([
    ('preprocessor', preprocessor),
    ('model', HistGradientBoostingRegressor(
        loss='poisson',  # Mejor para conteos
        max_iter=300,
        learning_rate=0.05,
        max_depth=12,
        min_samples_leaf=15,
        l2_regularization=1.0,
        random_state=42
    ))
])

# Modelo 2: Random Forest
models['RF'] = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(
        n_estimators=300,
        max_depth=20,
        min_samples_leaf=10,
        max_features='sqrt',
        random_state=42,
        n_jobs=-1
    ))
])

# Modelo 3: ExtraTrees (mayor diversidad)
models['ET'] = Pipeline([
    ('preprocessor', preprocessor),
    ('model', ExtraTreesRegressor(
        n_estimators=300,
        max_depth=25,
        min_samples_leaf=8,
        random_state=43,
        n_jobs=-1
    ))
])

# Modelo 4: Huber Regressor (robusto a outliers)
models['Huber'] = Pipeline([
    ('preprocessor', preprocessor),
    ('model', HuberRegressor(
        epsilon=1.5,
        max_iter=200,
        alpha=0.01
    ))
])

# Entrenar y evaluar cada modelo
predictions_val = {}
scores = {}

for name, model in models.items():
    print(f"\nüîÑ Entrenando {name}...")
    try:
        model.fit(X_train, y_train)
        
        pred_val = model.predict(X_val)
        pred_val = np.maximum(pred_val, 0)  # No negativos
        predictions_val[name] = pred_val
        
        mae = mean_absolute_error(y_val, pred_val)
        rmse = np.sqrt(mean_squared_error(y_val, pred_val))
        r2 = r2_score(y_val, pred_val)
        
        scores[name] = {'MAE': mae, 'RMSE': rmse, 'R2': r2}
        print(f"  MAE: {mae:.2f}, RMSE: {rmse:.2f}, R¬≤: {r2:.4f}")
    except Exception as e:
        print(f"  ‚ö†Ô∏è Error entrenando {name}: {e}")
        continue

print("\n" + "="*60)
print("üìä RESUMEN DE MODELOS (Validaci√≥n)")
print("="*60)
if scores:
    scores_df = pd.DataFrame(scores).T
    print(scores_df.sort_values('MAE'))
else:
    print("‚ö†Ô∏è No se pudo entrenar ning√∫n modelo")

üéØ Entrenando modelos...

üîÑ Entrenando HGB...
  MAE: 1233.37, RMSE: 3581.84, R¬≤: 0.9820

üîÑ Entrenando RF...
  MAE: 1233.37, RMSE: 3581.84, R¬≤: 0.9820

üîÑ Entrenando RF...
  MAE: 11282.61, RMSE: 19953.35, R¬≤: 0.4420

üîÑ Entrenando ET...
  MAE: 11282.61, RMSE: 19953.35, R¬≤: 0.4420

üîÑ Entrenando ET...


KeyboardInterrupt: 

## 8. Ensemble √ìptimo (Weighted Average)

In [None]:
print("üéØ Creando ensemble √≥ptimo...")

# Pesos basados en MAE inverso (mejor modelo = mayor peso)
mae_scores = np.array([scores[name]['MAE'] for name in models.keys()])
weights = 1 / mae_scores
weights = weights / weights.sum()

print("Pesos del ensemble:")
for name, weight in zip(models.keys(), weights):
    print(f"  {name}: {weight:.3f}")

# Predicci√≥n ensemble en validaci√≥n
ensemble_val = np.zeros(len(y_val))
for name, weight in zip(models.keys(), weights):
    ensemble_val += weight * predictions_val[name]

# Evaluar ensemble
mae_ensemble = mean_absolute_error(y_val, ensemble_val)
rmse_ensemble = np.sqrt(mean_squared_error(y_val, ensemble_val))
r2_ensemble = r2_score(y_val, ensemble_val)

print(f"\n{'='*60}")
print(f"üèÜ ENSEMBLE FINAL")
print(f"{'='*60}")
print(f"MAE: {mae_ensemble:.2f}")
print(f"RMSE: {rmse_ensemble:.2f}")
print(f"R¬≤: {r2_ensemble:.4f}")
print(f"\nMejora vs mejor modelo individual: {(scores_df['MAE'].min() - mae_ensemble):.2f} puntos")

## 9. Reentrenamiento con TODOS los datos

In [None]:
print("üîÑ Reentrenando modelos con TODOS los datos...")

X_full = train_agg[all_train_features].copy()
y_full = train_agg[TARGET].copy()

final_models = {}

for name, model in models.items():
    print(f"  Entrenando {name}...")
    final_model = model.__class__(**model.get_params())
    final_model.fit(X_full, y_full)
    final_models[name] = final_model

print("‚úÖ Modelos finales entrenados")

## 10. Predicciones en Test

In [None]:
print("üéØ Generando predicciones en test...")

# Preparar test (rellenar features que solo est√°n en train)
for feat in train_only_features:
    if feat not in test_df.columns:
        test_df[feat] = 0  # O usar un valor m√°s inteligente

X_test = test_df[all_train_features].copy()

# Predicciones de cada modelo
predictions_test = {}
for name, model in final_models.items():
    pred = model.predict(X_test)
    pred = np.maximum(pred, 0)
    predictions_test[name] = pred
    print(f"  {name} - Media: {pred.mean():.0f}, Min: {pred.min():.0f}, Max: {pred.max():.0f}")

# Ensemble final
final_predictions = np.zeros(len(X_test))
for name, weight in zip(models.keys(), weights):
    final_predictions += weight * predictions_test[name]

# Redondear y asegurar no negativos
final_predictions = np.round(final_predictions).astype(int)
final_predictions = np.maximum(final_predictions, 0)

print(f"\nüìä Estad√≠sticas de predicciones finales:")
print(f"  Media: {final_predictions.mean():.0f}")
print(f"  Mediana: {np.median(final_predictions):.0f}")
print(f"  Min: {final_predictions.min()}, Max: {final_predictions.max()}")
print(f"  Std: {final_predictions.std():.0f}")

# Comparar con train
print(f"\nüìä Comparaci√≥n con train:")
print(f"  Train Media: {y_full.mean():.0f}")
print(f"  Train Mediana: {y_full.median():.0f}")
print(f"  Ratio Test/Train: {final_predictions.mean() / y_full.mean():.2f}")

## 11. Ajustes Finales y Guardado

In [None]:
print("üíæ Guardando predicciones...")

# Crear submission
submission = pd.DataFrame({
    'ID': test_df['ID'],
    'Production': final_predictions
})

# Guardar
submission.to_csv('submission_advanced_v2.csv', index=False, sep=';')
print(f"‚úÖ Archivo guardado: submission_advanced_v2.csv")

print(f"\nüìã Primeras 20 predicciones:")
print(submission.head(20))

# An√°lisis de distribuci√≥n
plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
plt.hist(y_full, bins=50, alpha=0.7, label='Train', edgecolor='black')
plt.hist(final_predictions, bins=50, alpha=0.7, label='Test', edgecolor='black')
plt.xlabel('Production')
plt.ylabel('Frecuencia')
plt.title('Distribuci√≥n Train vs Test')
plt.legend()
plt.yscale('log')

plt.subplot(1, 3, 2)
plt.boxplot([y_full, final_predictions], labels=['Train', 'Test'])
plt.ylabel('Production')
plt.title('Boxplot Comparativo')

plt.subplot(1, 3, 3)
plt.scatter(range(len(final_predictions)), final_predictions, alpha=0.5, s=1)
plt.axhline(y=final_predictions.mean(), color='r', linestyle='--', label=f'Media: {final_predictions.mean():.0f}')
plt.xlabel('√çndice')
plt.ylabel('Production')
plt.title('Predicciones por producto')
plt.legend()

plt.tight_layout()
plt.savefig('predictions_analysis_v2.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n‚úÖ PROCESO COMPLETADO")

## 12. An√°lisis de Feature Importance

In [None]:
# Feature importance del mejor modelo (Random Forest)
if 'RF' in final_models:
    rf_model = final_models['RF']
    feature_names = numerical_features + train_only_features + categorical_features
    
    # Obtener importancias
    importances = rf_model.named_steps['model'].feature_importances_
    
    feature_importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False)
    
    print("\nüéØ TOP 20 Features m√°s importantes:")
    print(feature_importance_df.head(20))
    
    # Visualizar
    plt.figure(figsize=(10, 8))
    top_features = feature_importance_df.head(20)
    plt.barh(range(len(top_features)), top_features['importance'])
    plt.yticks(range(len(top_features)), top_features['feature'])
    plt.xlabel('Importancia')
    plt.title('Top 20 Features M√°s Importantes')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.savefig('feature_importance_v2.png', dpi=150, bbox_inches='tight')
    plt.show()