In [None]:
# Instalar LightGBM y Optuna
!pip install lightgbm
!pip install optuna

# Importar las librerías necesarias
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
import optuna
import warnings
import re

# Notebook-wide configuration.
# Print every column and up to 100 rows of a DataFrame.
for option_name, option_value in (('display.max_columns', None), ('display.max_rows', 100)):
    pd.set_option(option_name, option_value)
# Suppress every warning for the rest of the session.
warnings.filterwarnings(action='ignore')



In [None]:
import pandas as pd

# Base directory of the competition CSV files (defined once to avoid repetition).
base_path = 'data/'

try:
    # --- INPUT files (semicolon-delimited) ---
    # NOTE(review): on_bad_lines='skip' silently discards malformed rows —
    # confirm that losing those rows is acceptable.
    print("Cargando train.csv (con delimiter=';')...")
    train_df = pd.read_csv(
        base_path + 'train.csv',
        delimiter=';',
        on_bad_lines='skip'
    )

    print("Cargando test.csv (con delimiter=';')...")
    test_df = pd.read_csv(
        base_path + 'test.csv',
        delimiter=';',
        on_bad_lines='skip'
    )

    # --- SAMPLE OUTPUT file (comma-delimited, the read_csv default) ---
    print("Cargando sample_submission.csv (con delimiter=',')...")
    sample_submission = pd.read_csv(
        base_path + 'sample_submission.csv'
    )

    print("\n--- ¡TODOS LOS DATOS CARGADOS! ---")
    print(train_df.head())

except FileNotFoundError:
    print(f"Error: No se encontraron los archivos CSV en la ruta: {base_path}")
    # Re-raise so execution stops here; otherwise later cells fail with a
    # NameError on train_df/test_df that hides the real cause.
    raise
except Exception as e:
    print(f"Ocurrió un error inesperado: {e}")
    # Same reasoning: report the problem, then let it propagate.
    raise

Cargando train.csv (con delimiter=';')...
Cargando test.csv (con delimiter=';')...
Cargando sample_submission.csv (con delimiter=',')...

--- ¡TODOS LOS DATOS CARGADOS! ---
   ID  id_season      aggregated_family   family  \
0   1         86  Dresses and jumpsuits  Dresses   
1   1         86  Dresses and jumpsuits  Dresses   
2   1         86  Dresses and jumpsuits  Dresses   
3   1         86  Dresses and jumpsuits  Dresses   
4   1         86  Dresses and jumpsuits  Dresses   

                              category fabric color_name  color_rgb  \
0  Dresses, jumpsuits and Complete set  WOVEN   AMARILLO  255,215,0   
1  Dresses, jumpsuits and Complete set  WOVEN   AMARILLO  255,215,0   
2  Dresses, jumpsuits and Complete set  WOVEN   AMARILLO  255,215,0   
3  Dresses, jumpsuits and Complete set  WOVEN   AMARILLO  255,215,0   
4  Dresses, jumpsuits and Complete set  WOVEN   AMARILLO  255,215,0   

                                     image_embedding length_type  \
0  0.072266474,-0.1

In [None]:

print("Agregando el set de entrenamiento...")

# 1. Target (y): the weekly demand summed per ID. The Series is renamed so the
#    join below produces a column called 'total_demand'.
y_train = (
    train_df
    .groupby('ID')['weekly_demand']
    .sum()
    .rename('total_demand')
)

# 2. Features (X): one row per ID (the first one encountered), indexed by ID.
#    The weekly columns and 'Production' are removed; per the original note,
#    'Production' is left out to avoid target leakage.
static_cols_to_drop = ['num_week_iso', 'year', 'weekly_sales', 'weekly_demand', 'Production']
X_train_static = (
    train_df
    .drop(columns=static_cols_to_drop)
    .drop_duplicates(subset=['ID'])
    .set_index('ID')
)

# 3. Attach the target to the features (index-aligned on ID).
train_agg_df = X_train_static.join(y_train)

# 4. The test frame is only re-indexed by ID.
X_test = test_df.set_index('ID')

print("¡Agregación completada!")
print(f"Nuevo set de entrenamiento (train_agg_df): {train_agg_df.shape}")
print(train_agg_df.head())

Agregando el set de entrenamiento...
¡Agregación completada!
Nuevo set de entrenamiento (train_agg_df): (9843, 28)
    id_season       aggregated_family     family  \
ID                                                 
1          86   Dresses and jumpsuits    Dresses   
2          88                  Shirts      Shirt   
3          89                   Jeans      Jeans   
4          89                  Shirts      Shirt   
6          86  Sweaters and Cardigans  Cardigans   

                               category  fabric    color_name    color_rgb  \
ID                                                                           
1   Dresses, jumpsuits and Complete set   WOVEN      AMARILLO    255,215,0   
2                                  Tops   WOVEN  VERDE PASTEL  178,211,178   
3                               Bottoms   JEANS         ARENA  219,207,189   
4                                  Tops   WOVEN       CELESTE    0,130,195   
6                                  Tops  TRICOT     

In [None]:

print("Iniciando Ingeniería de Características...")

# --- 4.1: IMAGE EMBEDDINGS ---
print("Procesando Image Embeddings...")
# The embedding width is read from the first non-null training string:
# number of comma-separated fields = number of commas + 1.
sample_embed_str = train_agg_df['image_embedding'].dropna().iloc[0]
EMBED_DIM = sample_embed_str.count(',') + 1
print(f"Dimensión del Embedding detectada: {EMBED_DIM}")
# One output column name per embedding component: embed_0 ... embed_{EMBED_DIM-1}.
embed_cols = ['embed_' + str(i) for i in range(EMBED_DIM)]

def process_embeddings(df, embed_dim=None):
    """Expand the comma-separated ``image_embedding`` strings into float columns.

    Args:
        df: DataFrame with an ``image_embedding`` column of comma-separated
            numbers (missing values allowed).
        embed_dim: Optional number of components per embedding. When omitted,
            the module-level ``EMBED_DIM`` and ``embed_cols`` are used, exactly
            as before; when given, columns are named ``embed_0`` ...
            ``embed_{embed_dim-1}``.

    Returns:
        A new DataFrame: ``df`` without ``image_embedding``, followed by one
        float column per embedding component. Rows whose embedding is missing,
        unparsable or of the wrong length get an all-zero vector.
    """
    if embed_dim is None:
        dim, columns = EMBED_DIM, embed_cols
    else:
        dim = embed_dim
        columns = [f'embed_{i}' for i in range(dim)]

    def parse_embed(embed_str):
        if pd.isna(embed_str):
            return [0.0] * dim
        try:
            values = [float(x) for x in embed_str.split(',')]
        except (ValueError, AttributeError):
            # Non-numeric field, or a value that is not a string at all.
            return [0.0] * dim
        # A vector of the wrong width cannot be placed in the fixed columns.
        return values if len(values) == dim else [0.0] * dim

    # .tolist() is essential: passing the named Series itself together with
    # `columns=` makes pandas look the requested column names up in a
    # one-column frame, which yields nothing but NaN.
    vectors = df['image_embedding'].apply(parse_embed).tolist()
    embed_df = pd.DataFrame(vectors, columns=columns, index=df.index)
    return df.drop(columns=['image_embedding']).join(embed_df)

# Expand the embedding strings of both sets (train first, then test).
train_processed, test_processed = (
    process_embeddings(frame) for frame in (train_agg_df, X_test)
)

# --- 4.2: CATEGORICAL, RGB AND DATE COLUMNS ---
print("Procesando columnas categóricas, RGB y fechas...")

# Categorical candidates: the object/category columns of the training frame
# plus 'id_season'; the raw RGB string is excluded because it is parsed
# into channels separately.
categorical_cols = [
    *train_processed.select_dtypes(include=['object', 'category']).columns,
    'id_season',
]
if 'color_rgb' in categorical_cols:
    categorical_cols.remove('color_rgb')

# Stack train (without the target) and test under the keys 'train' / 'test'
# so both receive identical transformations and can be separated again.
combined_df = pd.concat(
    [train_processed.drop(columns=['total_demand']), test_processed],
    keys=['train', 'test'],
)

# Contextual price: each price relative to the mean price of its family
# (the small constant avoids a division by zero).
print("Creando features de precio contextual...")
family_avg_price = combined_df.groupby('family')['price'].transform('mean')
combined_df['price_vs_family_avg'] = combined_df['price'].div(family_avg_price + 1e-6)

# Procesar RGB
def process_rgb(df):
    """Split the ``color_rgb`` string column into integer channel columns.

    Args:
        df: DataFrame with a ``color_rgb`` column holding strings such as
            ``"255,215,0"`` (missing values allowed).

    Returns:
        A new DataFrame without ``color_rgb`` and with ``color_R``,
        ``color_G`` and ``color_B`` appended. Values that are missing,
        non-numeric or do not have exactly three components become
        ``(0, 0, 0)``. The input frame is not modified.
    """
    def parse_rgb(rgb_str):
        if pd.isna(rgb_str):
            return 0, 0, 0
        try:
            parts = [int(c) for c in rgb_str.split(',')]
        except (ValueError, AttributeError):
            return 0, 0, 0
        # The length check must guard the whole triple; as a trailing
        # conditional expression it only applied to the third element.
        if len(parts) != 3:
            return 0, 0, 0
        return parts[0], parts[1], parts[2]

    rgb_tuples = df['color_rgb'].apply(parse_rgb).tolist()
    # drop() returns a new frame, so the caller's DataFrame stays untouched.
    result = df.drop(columns=['color_rgb'])
    # Lists are assigned positionally, independent of the index labels.
    for position, channel in enumerate(('color_R', 'color_G', 'color_B')):
        result[channel] = [rgb[position] for rgb in rgb_tuples]
    return result

combined_df = process_rgb(combined_df)

# Dates: parse (unparsable values become NaT) and derive month / day-of-year.
date_cols = ['phase_in', 'phase_out']
for col in date_cols:
    combined_df[col] = pd.to_datetime(combined_df[col], errors='coerce')
    combined_df[f'{col}_month'] = combined_df[col].dt.month
    combined_df[f'{col}_dayofyear'] = combined_df[col].dt.dayofyear

# Integer-encode the categorical columns.
# sort=True assigns codes in sorted order of the values rather than in order
# of first appearance. This matters for 'id_season': later cells compare the
# codes with `<` / `== max()` to hold out the highest season, which is only
# meaningful when a larger season id gets a larger code.
# Indexing [0] keeps the codes only and avoids rebinding `_`.
for col in categorical_cols:
    if col in combined_df.columns:
        combined_df[col] = pd.factorize(combined_df[col], sort=True)[0]

# Remaining missing values are encoded as -1.
combined_df = combined_df.fillna(-1)

# Split back into train / test, keeping numeric columns only.
# .copy() gives independent frames, so renaming their columns later does not
# act on a view of combined_df.
numeric_cols = combined_df.select_dtypes(include=np.number).columns
X_train_final = combined_df.loc['train'][numeric_cols].copy()
X_test_final = combined_df.loc['test'][numeric_cols].copy()
y_train_final = train_processed['total_demand'].fillna(0)

# Limpiar nombres de columnas (para error LGBM)
def sanitize_col_name(col):
    """Return ``col`` with each run of characters outside ``[A-Za-z0-9_]`` replaced by one underscore."""
    invalid_run = re.compile(r'[^A-Za-z0-9_]+')
    return invalid_run.sub('_', col)
X_train_final.columns = [sanitize_col_name(col) for col in X_train_final.columns]
X_test_final.columns = [sanitize_col_name(col) for col in X_test_final.columns]
# Sanitise each categorical name BEFORE testing membership: the frame's columns
# are already sanitised at this point, so checking the raw name would drop
# any categorical whose original name contained a special character.
categorical_cols = [
    clean_name
    for clean_name in (sanitize_col_name(col) for col in categorical_cols)
    if clean_name in X_train_final.columns
]


# --- ¡BLOQUE DE ELIMINACIÓN DE RUIDO (LISTA CORREGIDA)! ---
print("Iniciando eliminación de ruido (features con importancia 0)...")

# La lista que obtuviste de la Celda 7 (AHORA SÍ, CORREGIDA)
useless_features_list = ['embed_337', 'embed_329', 'embed_330', 'embed_331', 'embed_332', 'embed_333', 'embed_334', 'embed_335', 'embed_336', 'embed_328', 'embed_338', 'embed_339', 'embed_340', 'embed_341', 'embed_342', 'embed_343', 'embed_344', 'embed_320', 'embed_312', 'embed_313', 'embed_314', 'embed_315', 'embed_316', 'embed_317', 'embed_318', 'embed_319', 'embed_345', 'embed_321', 'embed_322', 'embed_323', 'embed_324', 'embed_325', 'embed_326', 'embed_327', 'embed_370', 'embed_362', 'embed_363', 'embed_364', 'embed_365', 'embed_366', 'embed_367', 'embed_368', 'embed_369', 'embed_361', 'embed_371', 'embed_372', 'embed_373', 'embed_374', 'embed_375', 'embed_376', 'embed_377', 'embed_311', 'embed_360', 'embed_359', 'embed_358', 'embed_357', 'embed_356', 'embed_355', 'embed_354', 'embed_353', 'embed_352', 'embed_351', 'embed_350', 'embed_349', 'embed_348', 'embed_347', 'embed_346', 'embed_270', 'embed_262', 'embed_263', 'embed_264', 'embed_265', 'embed_266', 'embed_267', 'embed_268', 'embed_269', 'embed_261', 'embed_271', 'embed_272', 'embed_273', 'embed_274', 'embed_275', 'embed_276', 'embed_277', 'embed_253', 'embed_245', 'embed_246', 'embed_247', 'embed_248', 'embed_249', 'embed_250', 'embed_251', 'embed_252', 'embed_278', 'embed_254', 'embed_255', 'embed_256', 'embed_257', 'embed_258', 'embed_259', 'embed_260', 'embed_303', 'embed_295', 'embed_296', 'embed_297', 'embed_298', 'embed_299', 'embed_300', 'embed_301', 'embed_302', 'embed_294', 'embed_304', 'embed_305', 'embed_306', 'embed_307', 'embed_308', 'embed_309', 'embed_310', 'embed_378', 'embed_293', 'embed_292', 'embed_291', 'embed_290', 'embed_289', 'embed_288', 'embed_287', 'embed_286', 'embed_285', 'embed_284', 'embed_283', 'embed_282', 'embed_281', 'embed_280', 'embed_279', 'embed_470', 'embed_462', 'embed_463', 'embed_464', 'embed_465', 'embed_466', 'embed_467', 'embed_468', 'embed_469', 'embed_461', 'embed_471', 'embed_472', 'embed_473', 'embed_474', 'embed_475', 'embed_476', 'embed_477', 
'embed_453', 'embed_445', 'embed_446', 'embed_447', 'embed_448', 'embed_449', 'embed_450', 'embed_451', 'embed_452', 'embed_478', 'embed_454', 'embed_455', 'embed_456', 'embed_457', 'embed_458', 'embed_459', 'embed_460', 'embed_503', 'embed_495', 'embed_496', 'embed_497', 'embed_498', 'embed_499', 'embed_500', 'embed_501', 'embed_502', 'embed_494', 'embed_505', 'embed_506', 'embed_507', 'embed_508', 'embed_509', 'embed_510', 'embed_511', 'embed_444', 'embed_493', 'embed_492', 'embed_491', 'embed_490', 'embed_489', 'embed_488', 'embed_487', 'embed_486', 'embed_485', 'embed_484', 'embed_483', 'embed_482', 'embed_481', 'embed_480', 'embed_479', 'embed_403', 'embed_395', 'embed_396', 'embed_397', 'embed_398', 'embed_399', 'embed_400', 'embed_401', 'embed_402', 'embed_394', 'embed_404', 'embed_405', 'embed_406', 'embed_407', 'embed_408', 'embed_409', 'embed_410', 'embed_411', 'embed_393', 'embed_392', 'embed_391', 'embed_390', 'embed_389', 'embed_388', 'embed_387', 'embed_386', 'embed_385', 'embed_384', 'embed_383', 'embed_382', 'embed_381', 'embed_380', 'embed_379', 'embed_436', 'embed_428', 'embed_429', 'embed_430', 'embed_431', 'embed_432', 'embed_433', 'embed_434', 'embed_435', 'embed_427', 'embed_437', 'embed_438', 'embed_439', 'embed_440', 'embed_441', 'embed_442', 'embed_443', 'embed_244', 'embed_426', 'embed_425', 'embed_424', 'embed_423', 'embed_422', 'embed_421', 'embed_420', 'embed_419', 'embed_418', 'embed_417', 'embed_416', 'embed_415', 'embed_414', 'embed_413', 'embed_412', 'embed_70', 'embed_85', 'embed_84', 'embed_83', 'embed_82', 'embed_81', 'embed_80', 'embed_79', 'embed_78', 'embed_77', 'embed_76', 'embed_75', 'embed_74', 'embed_73', 'embed_72', 'embed_71', 'embed_86', 'embed_69', 'embed_68', 'embed_67', 'embed_66', 'embed_65', 'embed_64', 'embed_63', 'embed_62', 'embed_61', 'embed_60', 'embed_59', 'embed_58', 'embed_57', 'embed_56', 'embed_55', 'embed_102', 'embed_117', 'embed_116', 'embed_115', 'embed_114', 'embed_113', 'embed_112', 'embed_111', 
'embed_110', 'embed_109', 'embed_108', 'embed_107', 'embed_106', 'embed_105', 'embed_104', 'embed_103', 'embed_54', 'embed_101', 'embed_100', 'embed_99', 'embed_98', 'embed_97', 'embed_96', 'embed_95', 'embed_94', 'embed_93', 'embed_92', 'embed_91', 'embed_90', 'embed_89', 'embed_88', 'embed_87', 'Unnamed_28', 'embed_22', 'embed_21', 'embed_20', 'embed_19', 'embed_18', 'embed_17', 'embed_16', 'embed_15', 'embed_14', 'embed_13', 'embed_12', 'embed_11', 'embed_10', 'embed_9', 'embed_8', 'embed_23', 'embed_6', 'embed_5', 'embed_4', 'embed_3', 'embed_2', 'embed_1', 'embed_0', 'Unnamed_29', 'Unnamed_30', 'Unnamed_31', 'Unnamed_32', 'embed_7', 'toecap_type', 'heel_shape_type', 'embed_504', 'embed_38', 'embed_53', 'embed_52', 'embed_51', 'embed_50', 'embed_49', 'embed_48', 'embed_47', 'embed_46', 'embed_45', 'embed_44', 'embed_43', 'embed_42', 'embed_41', 'embed_40', 'embed_39', 'embed_243', 'embed_37', 'embed_36', 'embed_35', 'embed_34', 'embed_33', 'embed_32', 'embed_31', 'embed_30', 'embed_29', 'embed_28', 'embed_27', 'embed_26', 'embed_25', 'embed_24', 'embed_196', 'embed_211', 'embed_210', 'embed_209', 'embed_208', 'embed_207', 'embed_206', 'embed_205', 'embed_204', 'embed_203', 'embed_202', 'embed_201', 'embed_200', 'embed_199', 'embed_198', 'embed_197', 'embed_212', 'embed_195', 'embed_194', 'embed_193', 'embed_192', 'embed_191', 'embed_190', 'embed_189', 'embed_188', 'embed_187', 'embed_186', 'embed_185', 'embed_184', 'embed_183', 'embed_182', 'embed_181', 'embed_227', 'embed_242', 'embed_241', 'embed_240', 'embed_239', 'embed_238', 'embed_237', 'embed_236', 'embed_235', 'embed_234', 'embed_233', 'embed_232', 'embed_231', 'embed_230', 'embed_229', 'embed_228', 'embed_119', 'embed_226', 'embed_225', 'embed_224', 'embed_223', 'embed_222', 'embed_221', 'embed_220', 'embed_219', 'embed_218', 'embed_217', 'embed_216', 'embed_215', 'embed_214', 'embed_213', 'embed_133', 'embed_148', 'embed_147', 'embed_146', 'embed_145', 'embed_144', 'embed_143', 'embed_142', 
'embed_141', 'embed_140', 'embed_139', 'embed_138', 'embed_137', 'embed_136', 'embed_135', 'embed_134', 'embed_179', 'embed_132', 'embed_131', 'embed_130', 'embed_129', 'embed_128', 'embed_127', 'embed_126', 'embed_125', 'embed_124', 'embed_123', 'embed_122', 'embed_121', 'embed_120', 'embed_118', 'embed_150', 'embed_180', 'embed_178', 'embed_177', 'embed_176', 'embed_175', 'embed_174', 'embed_173', 'embed_172', 'embed_171', 'embed_170', 'embed_169', 'embed_168', 'embed_167', 'embed_166', 'embed_165', 'embed_163', 'embed_149', 'embed_151', 'embed_152', 'embed_153', 'embed_154', 'embed_155', 'embed_156', 'embed_158', 'embed_164', 'embed_162', 'embed_161', 'embed_160', 'embed_159', 'embed_157']

# Normalise the listed names the same way the feature columns were normalised.
features_to_drop = [sanitize_col_name(col) for col in useless_features_list]

# Keep only names that are real columns, so drop() cannot raise a KeyError.
listed_and_present = [col for col in features_to_drop if col in X_train_final.columns]

# The hard-coded list comes from an earlier run and names the embed_* columns.
# An embedding column that takes more than one value in the training data is
# kept: a zero-importance verdict recorded while the column was constant says
# nothing about it once it carries real values.
# NOTE(review): regenerate useless_features_list from a fresh importance run
# whenever the embedding features change.
stale_embed_features = {
    col for col in listed_and_present
    if col.startswith('embed_') and X_train_final[col].nunique(dropna=False) > 1
}
features_to_drop_safe = [col for col in listed_and_present if col not in stale_embed_features]

X_train_final = X_train_final.drop(columns=features_to_drop_safe)
X_test_final = X_test_final.drop(columns=features_to_drop_safe)

print(f"¡Ruido eliminado! Se quitaron {len(features_to_drop_safe)} features.")
if stale_embed_features:
    print(f"Aviso: se conservaron {len(stale_embed_features)} columnas de embedding no constantes "
          "(la lista de importancia 0 está desactualizada para ellas).")

# Keep the categorical list in sync with the surviving columns (important!).
categorical_cols = [col for col in categorical_cols if col in X_train_final.columns]
# --- END OF THE REMOVAL BLOCK ---


print("¡Ingeniería de Características completada! (con 'price_vs_family_avg' y limpieza de ruido)")


Iniciando Ingeniería de Características...
Procesando Image Embeddings...
Dimensión del Embedding detectada: 512
Procesando columnas categóricas, RGB y fechas...
Creando features de precio contextual...
Iniciando eliminación de ruido (features con importancia 0)...
¡Ruido eliminado! Se quitaron 519 features.
¡Ingeniería de Características completada! (con 'price_vs_family_avg' y limpieza de ruido)


In [None]:



# CELDA 5 (MEJORADA - CORREGIDA) - OPTUNA CON CV TEMPORAL

from sklearn.model_selection import TimeSeriesSplit

print("Iniciando búsqueda de hiperparámetros (con CV Temporal)...")

# Tuning data: every row except those of the highest id_season, which stays
# out of the search.
is_tuning_row = X_train_final['id_season'] < X_train_final['id_season'].max()

# TimeSeriesSplit cuts folds purely by row position (earlier rows train, later
# rows validate), so the rows are stably ordered by id_season first; in plain
# index order each fold would mix seasons.
# NOTE(review): assumes a larger id_season value means a later season — confirm.
chronological_order = np.argsort(
    X_train_final.loc[is_tuning_row, 'id_season'].to_numpy(), kind='stable'
)
X_train_split = X_train_final[is_tuning_row].iloc[chronological_order]
y_train_split = y_train_final[is_tuning_row].iloc[chronological_order]
categorical_cols_clean = [col for col in categorical_cols if col in X_train_final.columns]

# --- Temporal cross-validation: 3 forward-chaining folds ---
tscv = TimeSeriesSplit(n_splits=3)

# --- MEJORA: Métrica de Quantile Loss real ---
def quantile_loss(y_true, y_pred, alpha):
    """Mean pinball (quantile) loss of ``y_pred`` against ``y_true``.

    For each residual ``y_true - y_pred`` the larger of ``alpha * residual``
    and ``(alpha - 1) * residual`` is taken; the mean of those values is
    returned.
    """
    residual = y_true - y_pred
    weighted_by_alpha = alpha * residual
    weighted_by_complement = (alpha - 1) * residual
    return np.mean(np.maximum(weighted_by_alpha, weighted_by_complement))

def objective(trial):
    """Optuna objective: mean validation quantile loss over the ``tscv`` folds.

    Samples one LightGBM configuration from ``trial``, trains it on each
    training fold of ``X_train_split`` / ``y_train_split`` with early stopping
    on the matching validation fold, and returns the mean of the per-fold
    ``quantile_loss`` values.

    NOTE(review): the loss is evaluated at the trial's own sampled ``alpha``,
    so trials with different ``alpha`` are scored on different loss functions.
    Confirm that comparing them directly is intended.
    """
    # Sample the search space, in the same order as the keys of `params` below.
    alpha = trial.suggest_float('alpha', 0.6, 0.95)  # wide range
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1)
    num_leaves = trial.suggest_int('num_leaves', 20, 100)
    feature_fraction = trial.suggest_float('feature_fraction', 0.6, 1.0)
    bagging_fraction = trial.suggest_float('bagging_fraction', 0.6, 1.0)
    bagging_freq = trial.suggest_int('bagging_freq', 1, 7)
    lambda_l1 = trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True)
    lambda_l2 = trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True)

    params = dict(
        objective='quantile', metric='quantile',
        alpha=alpha,
        n_estimators=3000,
        learning_rate=learning_rate,
        num_leaves=num_leaves,
        feature_fraction=feature_fraction,
        bagging_fraction=bagging_fraction,
        bagging_freq=bagging_freq,
        lambda_l1=lambda_l1,
        lambda_l2=lambda_l2,
        verbose=-1, n_jobs=-1, seed=48,
    )

    def fold_loss(train_idx, val_idx):
        """Train on one fold and return the quantile loss on its validation part."""
        X_train_fold = X_train_split.iloc[train_idx]
        y_train_fold = y_train_split.iloc[train_idx]
        X_val_fold = X_train_split.iloc[val_idx]
        y_val_fold = y_train_split.iloc[val_idx]

        lgb_train_fold = lgb.Dataset(X_train_fold, y_train_fold,
                                     categorical_feature=categorical_cols_clean)
        # The validation Dataset references the training one.
        lgb_val_fold = lgb.Dataset(X_val_fold, y_val_fold, reference=lgb_train_fold,
                                   categorical_feature=categorical_cols_clean)

        model_fold = lgb.train(params, lgb_train_fold,
                               valid_sets=[lgb_val_fold],  # the fold's own validation part
                               valid_names=['validation'],
                               callbacks=[lgb.early_stopping(50, verbose=False)])

        preds_fold = model_fold.predict(X_val_fold, num_iteration=model_fold.best_iteration)
        return quantile_loss(y_val_fold, preds_fold, alpha)

    # One score per temporal fold, averaged into the trial value.
    cv_scores = [fold_loss(train_idx, val_idx)
                 for train_idx, val_idx in tscv.split(X_train_split)]
    return np.mean(cv_scores)

# --- Hyper-parameter search: 50 trials (raised from 30) ---
# The sampler is seeded so that re-running the notebook explores the same
# sequence of configurations. TPESampler is the sampler create_study uses by
# default for a single objective; only the seed is new.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Best sampled values, completed with the fixed settings for the final models.
best_params = dict(study.best_params)
best_params.update({
    'objective': 'quantile', 'metric': 'quantile', 'n_estimators': 2000,
    'verbose': -1, 'n_jobs': -1, 'seed': 42, 'boosting_type': 'gbdt',
})

print("¡Búsqueda finalizada!")
print(f"Mejor Quantile Loss en CV: {study.best_value}")
print("Mejores parámetros:", best_params)

[I 2025-11-16 10:00:56,950] A new study created in memory with name: no-name-3367e74c-68c8-4daf-99cf-671f9da732f7


Iniciando búsqueda de hiperparámetros (con CV Temporal)...


[I 2025-11-16 10:00:58,861] Trial 0 finished with value: 1872.5148030057487 and parameters: {'alpha': 0.6757688266914754, 'learning_rate': 0.07686094861478975, 'num_leaves': 57, 'feature_fraction': 0.8274728772035531, 'bagging_fraction': 0.8004506026899156, 'bagging_freq': 4, 'lambda_l1': 0.016030141655983498, 'lambda_l2': 8.133891581869063e-06}. Best is trial 0 with value: 1872.5148030057487.
[I 2025-11-16 10:01:02,579] Trial 1 finished with value: 1721.4385252797465 and parameters: {'alpha': 0.741299577121876, 'learning_rate': 0.039728932870990975, 'num_leaves': 91, 'feature_fraction': 0.7223922951124473, 'bagging_fraction': 0.7484101480825631, 'bagging_freq': 4, 'lambda_l1': 0.00024111325534900954, 'lambda_l2': 2.979807224901025e-08}. Best is trial 1 with value: 1721.4385252797465.
[I 2025-11-16 10:01:04,675] Trial 2 finished with value: 1767.650809737199 and parameters: {'alpha': 0.7264333647188868, 'learning_rate': 0.06390271635625798, 'num_leaves': 40, 'feature_fraction': 0.97534

¡Búsqueda finalizada!
Mejor Quantile Loss en CV: 782.8255783453677
Mejores parámetros: {'alpha': 0.9495553009609264, 'learning_rate': 0.0888931777639407, 'num_leaves': 50, 'feature_fraction': 0.6916482588849088, 'bagging_fraction': 0.9602975447671267, 'bagging_freq': 2, 'lambda_l1': 1.779030299481771, 'lambda_l2': 0.7427925206991387, 'objective': 'quantile', 'metric': 'quantile', 'n_estimators': 2000, 'verbose': -1, 'n_jobs': -1, 'seed': 42, 'boosting_type': 'gbdt'}


In [None]:


# CELDA 6 (MEJORADA) - ENTRENAMIENTO CON ENSEMBLE

print("Iniciando entrenamiento del modelo FINAL (Ensemble de 3 alphas)...")

# Split once: rows of the highest id_season form the validation set used for
# early stopping; every earlier row is training data.
season = X_train_final['id_season']
is_train_row = season < season.max()
is_val_row = season == season.max()
X_train_split = X_train_final[is_train_row]
y_train_split = y_train_final[is_train_row]
X_val_split = X_train_final[is_val_row]
y_val_split = y_train_final[is_val_row]
categorical_cols_clean = [col for col in categorical_cols if col in X_train_final.columns]

lgb_train = lgb.Dataset(X_train_split, y_train_split, categorical_feature=categorical_cols_clean)
lgb_val = lgb.Dataset(X_val_split, y_val_split, reference=lgb_train, categorical_feature=categorical_cols_clean)

# --- ENSEMBLE OVER ALPHAS ---
# One quantile model per alpha; their predictions are averaged below.
alphas_to_train = [0.7, 0.8, 0.9]
models = []

# Start from the tuned parameters, without the tuned alpha. pop() with a
# default does not fail when best_params carries no 'alpha' entry.
base_params = best_params.copy()
base_params.pop('alpha', None)

for alpha in alphas_to_train:
    print(f"\n--- Entrenando modelo para alpha = {alpha} ---")
    params_alpha = base_params.copy()
    params_alpha['alpha'] = alpha  # the ensemble member's own quantile

    model = lgb.train(
        params_alpha,
        lgb_train,
        valid_sets=[lgb_train, lgb_val],
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)]
    )
    models.append(model)

# --- SUBMISSION: average of the ensemble predictions ---
print("Generando predicciones (promedio del ensemble)...")
predictions = np.zeros(len(X_test_final))
for model in models:
    # Accumulate each model's prediction at its best iteration.
    predictions += model.predict(X_test_final, num_iteration=model.best_iteration)

predictions = predictions / len(models)
# --- END ENSEMBLE ---

predictions[predictions < 0] = 0  # negative predictions are clipped to zero

# Submission frame: one integer 'Production' value per test ID.
submission_df = pd.DataFrame({'ID': X_test_final.index, 'Production': predictions})
submission_df['Production'] = np.round(submission_df['Production']).astype(int)

submission_df.to_csv('submission.csv', index=False)

print("¡Archivo 'submission.csv' creado!")
# The message is built from the actual ensemble; the earlier wording mentioned
# PCA and similarity features that this pipeline does not create.
print(f"Se usó un ensemble de {len(models)} modelos (alphas {', '.join(str(a) for a in alphas_to_train)}).")
print(submission_df.head())


Iniciando entrenamiento del modelo FINAL (Ensemble de 3 alphas)...

--- Entrenando modelo para alpha = 0.7 ---
Training until validation scores don't improve for 100 rounds
[100]	training's quantile: 1223.53	valid_1's quantile: 2068.11
[200]	training's quantile: 1080.46	valid_1's quantile: 2057.99
[300]	training's quantile: 1000.23	valid_1's quantile: 2050.53
[400]	training's quantile: 950.369	valid_1's quantile: 2042.81
[500]	training's quantile: 915.286	valid_1's quantile: 2044.34
[600]	training's quantile: 883.376	valid_1's quantile: 2038.88
[700]	training's quantile: 861.651	valid_1's quantile: 2040.83
Early stopping, best iteration is:
[610]	training's quantile: 880.449	valid_1's quantile: 2037.89

--- Entrenando modelo para alpha = 0.8 ---
Training until validation scores don't improve for 100 rounds
[100]	training's quantile: 970.503	valid_1's quantile: 1898.82
[200]	training's quantile: 866.587	valid_1's quantile: 1901.69
Early stopping, best iteration is:
[105]	training's quan

In [None]:
# Instalar LightGBM y Optuna
!pip install lightgbm
!pip install optuna


# Importar las librerías necesarias
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
import optuna
import warnings
import re


# Notebook-wide configuration.
# Print every column and up to 100 rows of a DataFrame.
for option_name, option_value in (('display.max_columns', None), ('display.max_rows', 100)):
    pd.set_option(option_name, option_value)
# Suppress every warning for the rest of the session.
warnings.filterwarnings(action='ignore')
import pandas as pd


# Base directory of the competition CSV files (defined once to avoid repetition).
# NOTE(review): this is an absolute path, unlike the relative 'data/' used by
# the earlier loading cell — confirm which location is intended.
base_path = '/content/datathon-fme-2025-mango/'


try:
    # --- INPUT files (semicolon-delimited) ---
    # NOTE(review): on_bad_lines='skip' silently discards malformed rows —
    # confirm that losing those rows is acceptable.
    print("Cargando train.csv (con delimiter=';')...")
    train_df = pd.read_csv(
        base_path + 'train.csv',
        delimiter=';',
        on_bad_lines='skip'
    )


    print("Cargando test.csv (con delimiter=';')...")
    test_df = pd.read_csv(
        base_path + 'test.csv',
        delimiter=';',
        on_bad_lines='skip'
    )


    # --- SAMPLE OUTPUT file (comma-delimited, the read_csv default) ---
    print("Cargando sample_submission.csv (con delimiter=',')...")
    sample_submission = pd.read_csv(
        base_path + 'sample_submission.csv'
    )


    print("\n--- ¡TODOS LOS DATOS CARGADOS! ---")
    print(train_df.head())


except FileNotFoundError:
    print(f"Error: No se encontraron los archivos CSV en la ruta: {base_path}")
    # Re-raise so execution stops here; otherwise the statements below fail
    # with a NameError on train_df/test_df that hides the real cause.
    raise
except Exception as e:
    print(f"Ocurrió un error inesperado: {e}")
    # Same reasoning: report the problem, then let it propagate.
    raise


print("Agregando el set de entrenamiento...")


# 1. Target (y): the weekly demand summed per ID. The Series is renamed so the
#    join below produces a column called 'total_demand'.
y_train = (
    train_df
    .groupby('ID')['weekly_demand']
    .sum()
    .rename('total_demand')
)


# 2. Features (X): one row per ID (the first one encountered), indexed by ID.
#    The weekly columns and 'Production' are removed; per the original note,
#    'Production' is left out to avoid target leakage.
static_cols_to_drop = ['num_week_iso', 'year', 'weekly_sales', 'weekly_demand', 'Production']
X_train_static = (
    train_df
    .drop(columns=static_cols_to_drop)
    .drop_duplicates(subset=['ID'])
    .set_index('ID')
)


# 3. Attach the target to the features (index-aligned on ID).
train_agg_df = X_train_static.join(y_train)


# 4. The test frame is only re-indexed by ID.
X_test = test_df.set_index('ID')


print("¡Agregación completada!")
print(f"Nuevo set de entrenamiento (train_agg_df): {train_agg_df.shape}")
print(train_agg_df.head())


print("Iniciando Ingeniería de Características...")


# --- 4.1: IMAGE EMBEDDINGS ---
print("Procesando Image Embeddings...")
# The embedding width is read from the first non-null training string:
# number of comma-separated fields = number of commas + 1.
sample_embed_str = train_agg_df['image_embedding'].dropna().iloc[0]
EMBED_DIM = sample_embed_str.count(',') + 1
print(f"Dimensión del Embedding detectada: {EMBED_DIM}")
# One output column name per embedding component: embed_0 ... embed_{EMBED_DIM-1}.
embed_cols = ['embed_' + str(i) for i in range(EMBED_DIM)]

def process_embeddings(df, embed_dim=None):
    """Expand the comma-separated ``image_embedding`` strings into float columns.

    Args:
        df: DataFrame with an ``image_embedding`` column of comma-separated
            numbers (missing values allowed).
        embed_dim: Optional number of components per embedding. When omitted,
            the module-level ``EMBED_DIM`` and ``embed_cols`` are used, exactly
            as before; when given, columns are named ``embed_0`` ...
            ``embed_{embed_dim-1}``.

    Returns:
        A new DataFrame: ``df`` without ``image_embedding``, followed by one
        float column per embedding component. Rows whose embedding is missing,
        unparsable or of the wrong length get an all-zero vector.
    """
    if embed_dim is None:
        dim, columns = EMBED_DIM, embed_cols
    else:
        dim = embed_dim
        columns = [f'embed_{i}' for i in range(dim)]

    def parse_embed(embed_str):
        if pd.isna(embed_str):
            return [0.0] * dim
        try:
            values = [float(x) for x in embed_str.split(',')]
        except (ValueError, AttributeError):
            # Non-numeric field, or a value that is not a string at all.
            return [0.0] * dim
        # A vector of the wrong width cannot be placed in the fixed columns.
        return values if len(values) == dim else [0.0] * dim

    # .tolist() is essential: passing the named Series itself together with
    # `columns=` makes pandas look the requested column names up in a
    # one-column frame, which yields nothing but NaN.
    vectors = df['image_embedding'].apply(parse_embed).tolist()
    embed_df = pd.DataFrame(vectors, columns=columns, index=df.index)
    return df.drop(columns=['image_embedding']).join(embed_df)

# Expand the embedding strings of both sets (train first, then test).
train_processed, test_processed = (
    process_embeddings(frame) for frame in (train_agg_df, X_test)
)


# --- 4.2: CATEGORICAL, RGB AND DATE COLUMNS ---
print("Procesando columnas categóricas, RGB y fechas...")

# Categorical candidates: the object/category columns of the training frame
# plus 'id_season'; the raw RGB string is excluded because it is parsed
# into channels separately.
categorical_cols = [
    *train_processed.select_dtypes(include=['object', 'category']).columns,
    'id_season',
]
if 'color_rgb' in categorical_cols:
    categorical_cols.remove('color_rgb')


# Stack train (without the target) and test under the keys 'train' / 'test'
# so both receive identical transformations and can be separated again.
combined_df = pd.concat(
    [train_processed.drop(columns=['total_demand']), test_processed],
    keys=['train', 'test'],
)


# Contextual price: each price relative to the mean price of its family
# (the small constant avoids a division by zero).
print("Creando features de precio contextual...")
family_avg_price = combined_df.groupby('family')['price'].transform('mean')
combined_df['price_vs_family_avg'] = combined_df['price'].div(family_avg_price + 1e-6)


# Procesar RGB
def process_rgb(df):
    """Split the ``color_rgb`` string column into integer channel columns.

    Args:
        df: DataFrame with a ``color_rgb`` column holding strings such as
            ``"255,215,0"`` (missing values allowed).

    Returns:
        A new DataFrame without ``color_rgb`` and with ``color_R``,
        ``color_G`` and ``color_B`` appended. Values that are missing,
        non-numeric or do not have exactly three components become
        ``(0, 0, 0)``. The input frame is not modified.
    """
    def parse_rgb(rgb_str):
        if pd.isna(rgb_str):
            return 0, 0, 0
        try:
            parts = [int(c) for c in rgb_str.split(',')]
        except (ValueError, AttributeError):
            return 0, 0, 0
        # The length check must guard the whole triple; as a trailing
        # conditional expression it only applied to the third element.
        if len(parts) != 3:
            return 0, 0, 0
        return parts[0], parts[1], parts[2]

    rgb_tuples = df['color_rgb'].apply(parse_rgb).tolist()
    # drop() returns a new frame, so the caller's DataFrame stays untouched.
    result = df.drop(columns=['color_rgb'])
    # Lists are assigned positionally, independent of the index labels.
    for position, channel in enumerate(('color_R', 'color_G', 'color_B')):
        result[channel] = [rgb[position] for rgb in rgb_tuples]
    return result


combined_df = process_rgb(combined_df)


# Dates: parse (unparsable values become NaT) and derive month / day-of-year.
date_cols = ['phase_in', 'phase_out']
for col in date_cols:
    combined_df[col] = pd.to_datetime(combined_df[col], errors='coerce')
    combined_df[f'{col}_month'] = combined_df[col].dt.month
    combined_df[f'{col}_dayofyear'] = combined_df[col].dt.dayofyear


# Integer-encode the categorical columns.
# sort=True assigns codes in sorted order of the values rather than in order
# of first appearance. This matters for 'id_season': the training cells compare
# the codes with `<` / `== max()` to hold out the highest season, which is only
# meaningful when a larger season id gets a larger code.
# Indexing [0] keeps the codes only and avoids rebinding `_`.
for col in categorical_cols:
    if col in combined_df.columns:
        combined_df[col] = pd.factorize(combined_df[col], sort=True)[0]


# Remaining missing values are encoded as -1.
combined_df = combined_df.fillna(-1)


# Split back into train / test, keeping numeric columns only.
# .copy() gives independent frames, so renaming their columns later does not
# act on a view of combined_df.
numeric_cols = combined_df.select_dtypes(include=np.number).columns
X_train_final = combined_df.loc['train'][numeric_cols].copy()
X_test_final = combined_df.loc['test'][numeric_cols].copy()
y_train_final = train_processed['total_demand'].fillna(0)


# Clean column names (works around the LightGBM feature-name error)
def sanitize_col_name(col):
    """Return `col` with every run of characters outside [A-Za-z0-9_] replaced by one '_'.

    The label is converted with str() first, so non-string column labels (which pandas
    allows, e.g. integers) are handled instead of raising TypeError inside re.sub.
    """
    return re.sub(r'[^A-Za-z0-9_]+', '_', str(col))
X_train_final.columns = [sanitize_col_name(col) for col in X_train_final.columns]
X_test_final.columns = [sanitize_col_name(col) for col in X_test_final.columns]
# Fix: the frames now carry sanitized names, so membership has to be tested with the
# sanitized name. Testing the raw name silently dropped every categorical column whose
# name was altered by sanitization. dict.fromkeys removes duplicates while keeping
# order (e.g. if 'id_season' was already in the list before being appended).
sanitized_categoricals = dict.fromkeys(sanitize_col_name(col) for col in categorical_cols)
categorical_cols = [col for col in sanitized_categoricals if col in X_train_final.columns]




# --- NOISE-REMOVAL BLOCK (CORRECTED LIST) ---
print("Iniciando eliminación de ruido (features con importancia 0)...")


# Features reported with zero importance (taken from Cell 7).
# The previous hand-pasted literal contained exactly embed_0 ... embed_511 plus the seven
# columns named below (519 entries), so it is generated here rather than spelled out.
# Only set membership matters: the list is used solely to drop columns.
# NOTE(review): this discards all 512 listed embed_* columns — confirm that dropping
# the whole embedding is intended.
useless_features_list = [f'embed_{i}' for i in range(512)] + [
    'Unnamed_28', 'Unnamed_29', 'Unnamed_30', 'Unnamed_31', 'Unnamed_32',
    'toecap_type', 'heel_shape_type',
]


# Sanitize the list so it matches the sanitized column names
features_to_drop = [sanitize_col_name(col) for col in useless_features_list]


# Only drop columns that actually exist in the training frame
train_columns = set(X_train_final.columns)
features_to_drop_safe = [col for col in features_to_drop if col in train_columns]

X_train_final = X_train_final.drop(columns=features_to_drop_safe)
X_test_final = X_test_final.drop(columns=features_to_drop_safe)


print(f"¡Ruido eliminado! Se quitaron {len(features_to_drop_safe)} features.")


# Keep the categorical list in sync with the surviving columns (important!)
categorical_cols = [col for col in categorical_cols if col in X_train_final.columns]
# --- END OF NOISE-REMOVAL BLOCK ---




print("¡Ingeniería de Características completada! (con 'price_vs_family_avg' y limpieza de ruido)")






# CELL 5 (IMPROVED - FIXED) - OPTUNA WITH TEMPORAL CV


from sklearn.model_selection import TimeSeriesSplit


print("Iniciando búsqueda de hiperparámetros (con CV Temporal)...")


# Prepare data: the hyper-parameter search only sees rows whose 'id_season' value is
# below the maximum one; the rows with the maximum value are held out.
season_codes = X_train_final['id_season']
before_last_season = season_codes < season_codes.max()
X_train_split = X_train_final[before_last_season]
y_train_split = y_train_final[before_last_season]
categorical_cols_clean = [name for name in categorical_cols if name in X_train_final.columns]


# --- IMPROVEMENT: temporal cross-validation ---
# NOTE(review): TimeSeriesSplit cuts by row position, so the folds are chronological
# only if the rows of X_train_split are already ordered in time — confirm.
tscv = TimeSeriesSplit(n_splits=3)


# --- IMPROVEMENT: real quantile (pinball) loss metric ---
def quantile_loss(y_true, y_pred, alpha):
    """Mean pinball loss of `y_pred` against `y_true` at quantile level `alpha`.

    Each residual e = y_true - y_pred contributes max(alpha * e, (alpha - 1) * e);
    the function returns the mean of those contributions.
    """
    residual = y_true - y_pred
    per_sample = np.maximum(alpha * residual, (alpha - 1) * residual)
    return np.mean(per_sample)


def objective(trial):
    """Optuna objective: mean validation quantile loss over the `tscv` folds.

    Samples one LightGBM configuration from `trial`, trains it on every fold of
    `tscv.split(X_train_split)` with early stopping (patience 50) on that fold's
    validation part, and scores the best-iteration predictions with `quantile_loss`
    at the trial's own `alpha`.

    NOTE(review): `alpha` is itself a tuned parameter, so the values returned for
    trials with different `alpha` are losses at different quantile levels — confirm
    that comparing them directly is intended.

    Returns
    -------
    float
        Mean of the per-fold losses (lower is better).
    """
    trial_params = {
        'objective': 'quantile',
        'metric': 'quantile',
        'alpha': trial.suggest_float('alpha', 0.90, 0.97),  # wide range
        'n_estimators': 1500,
        'learning_rate': trial.suggest_float('learning_rate', 0.03, 0.7),
        'num_leaves': trial.suggest_int('num_leaves', 60, 120),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.55, 0.75),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.55, 0.75),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 2),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 1e-2, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 1e-2, log=True),
        'verbose': -1,
        'n_jobs': -1,
        'seed': 42,
    }

    fold_losses = []
    # One pass per temporal fold
    for fit_rows, holdout_rows in tscv.split(X_train_split):
        X_fit = X_train_split.iloc[fit_rows]
        y_fit = y_train_split.iloc[fit_rows]
        X_holdout = X_train_split.iloc[holdout_rows]
        y_holdout = y_train_split.iloc[holdout_rows]

        fit_set = lgb.Dataset(X_fit, y_fit, categorical_feature=categorical_cols_clean)
        # The validation Dataset references the training one.
        holdout_set = lgb.Dataset(
            X_holdout,
            y_holdout,
            reference=fit_set,
            categorical_feature=categorical_cols_clean,
        )

        booster = lgb.train(
            trial_params,
            fit_set,
            valid_sets=[holdout_set],  # early stopping watches the fold's own validation part
            valid_names=['validation'],
            callbacks=[lgb.early_stopping(50, verbose=False)],
        )

        holdout_preds = booster.predict(X_holdout, num_iteration=booster.best_iteration)
        # Score with the explicit pinball loss
        fold_losses.append(quantile_loss(y_holdout, holdout_preds, trial_params['alpha']))

    return np.mean(fold_losses)


# --- IMPROVEMENT: more trials ---
# Fix: the study used an unseeded sampler, so each run drew a different sequence of
# trials. TPE is Optuna's default sampler; it is instantiated explicitly here only to
# pin its seed (same value as the LightGBM 'seed') and make its random draws repeatable.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)  # raised from 30 to 50


# Start from the tuned values and add the fixed settings used for the final training.
best_params = study.best_params
best_params.update({
    'objective': 'quantile', 'metric': 'quantile', 'n_estimators': 2000,
    'verbose': -1, 'n_jobs': -1, 'seed': 42, 'boosting_type': 'gbdt',
})


print("¡Búsqueda finalizada!")
print(f"Mejor Quantile Loss en CV: {study.best_value}")
print("Mejores parámetros:", best_params)




# CELL 6 (IMPROVED) - ENSEMBLE TRAINING


print("Iniciando entrenamiento del modelo FINAL (Ensemble de 3 alphas)...")


# Build the train / validation frames: rows with the maximum 'id_season' value form the
# validation set, all rows below it form the training set.
season_codes = X_train_final['id_season']
holdout_season = season_codes.max()
is_fit_row = season_codes < holdout_season
is_holdout_row = season_codes == holdout_season

X_train_split = X_train_final[is_fit_row]
y_train_split = y_train_final[is_fit_row]
X_val_split = X_train_final[is_holdout_row]
y_val_split = y_train_final[is_holdout_row]
categorical_cols_clean = [name for name in categorical_cols if name in X_train_final.columns]


lgb_train = lgb.Dataset(
    X_train_split,
    y_train_split,
    categorical_feature=categorical_cols_clean,
)
# The validation Dataset references the training one.
lgb_val = lgb.Dataset(
    X_val_split,
    y_val_split,
    reference=lgb_train,
    categorical_feature=categorical_cols_clean,
)


# --- IMPROVEMENT: ENSEMBLE OF ALPHAS ---
# One quantile model is trained per alpha below; their predictions are averaged later.
alphas_to_train = [0.7, 0.8, 0.9]
models = []


# Reuse Optuna's best parameters, minus the 'alpha' it selected
base_params = best_params.copy()
base_params.pop('alpha')  # raises KeyError if 'alpha' is missing


for alpha in alphas_to_train:
    print(f"\n--- Entrenando modelo para alpha = {alpha} ---")
    # Same configuration for every member; only the quantile level changes.
    params_alpha = {**base_params, 'alpha': alpha}

    model = lgb.train(
        params_alpha,
        lgb_train,
        valid_sets=[lgb_train, lgb_val],
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)],
    )
    models.append(model)


# --- BUILD THE SUBMISSION FROM THE ENSEMBLE AVERAGE ---
print("Generando predicciones (promedio del ensemble)...")
# Predict with every ensemble member at its early-stopped best iteration, then
# accumulate onto a zero vector so the result has one entry per test row.
per_model_preds = (
    fitted.predict(X_test_final, num_iteration=fitted.best_iteration)
    for fitted in models
)
predictions = sum(per_model_preds, np.zeros(len(X_test_final)))


# Average across models
predictions = predictions / len(models)
# --- END ENSEMBLE ---


# Clip negative predictions to zero
predictions = np.where(predictions < 0, 0, predictions)


# Assemble the submission frame with predictions rounded to whole units.
# NOTE(review): 'ID' is taken from the index of X_test_final — confirm that index
# really holds the product IDs expected in the submission.
submission_df = pd.DataFrame({
    'ID': X_test_final.index,
    'Production': np.round(predictions).astype(int),
})


# Write the file
submission_df.to_csv('submission.csv', index=False)


print("¡Archivo 'submission.csv' creado!")
print("Se usó un ensemble de 3 modelos (alphas 0.7, 0.8, 0.9) con features PCA y Similitud.")
print(submission_df.head())

Cargando train.csv (con delimiter=';')...
Cargando test.csv (con delimiter=';')...
Cargando sample_submission.csv (con delimiter=',')...

--- ¡TODOS LOS DATOS CARGADOS! ---
   ID  id_season      aggregated_family   family  \
0   1         86  Dresses and jumpsuits  Dresses   
1   1         86  Dresses and jumpsuits  Dresses   
2   1         86  Dresses and jumpsuits  Dresses   
3   1         86  Dresses and jumpsuits  Dresses   
4   1         86  Dresses and jumpsuits  Dresses   

                              category fabric color_name  color_rgb  \
0  Dresses, jumpsuits and Complete set  WOVEN   AMARILLO  255,215,0   
1  Dresses, jumpsuits and Complete set  WOVEN   AMARILLO  255,215,0   
2  Dresses, jumpsuits and Complete set  WOVEN   AMARILLO  255,215,0   
3  Dresses, jumpsuits and Complete set  WOVEN   AMARILLO  255,215,0   
4  Dresses, jumpsuits and Complete set  WOVEN   AMARILLO  255,215,0   

                                     image_embedding length_type  \
0  0.072266474,-0.1

[I 2025-11-16 10:03:15,125] A new study created in memory with name: no-name-014a6e3f-2049-4837-909b-c45e79cda674


Iniciando eliminación de ruido (features con importancia 0)...
¡Ruido eliminado! Se quitaron 519 features.
¡Ingeniería de Características completada! (con 'price_vs_family_avg' y limpieza de ruido)
Iniciando búsqueda de hiperparámetros (con CV Temporal)...


[I 2025-11-16 10:03:15,652] Trial 0 finished with value: 694.6923869233069 and parameters: {'alpha': 0.9655116290355074, 'learning_rate': 0.12272038309575277, 'num_leaves': 80, 'feature_fraction': 0.592104233058228, 'bagging_fraction': 0.5661350744231922, 'bagging_freq': 1, 'lambda_l1': 0.0036102424054675424, 'lambda_l2': 0.0018485542696568859}. Best is trial 0 with value: 694.6923869233069.
[I 2025-11-16 10:03:16,172] Trial 1 finished with value: 1109.7203749035953 and parameters: {'alpha': 0.9378291506288102, 'learning_rate': 0.6637323056095102, 'num_leaves': 109, 'feature_fraction': 0.6974128236686459, 'bagging_fraction': 0.6425939684388595, 'bagging_freq': 1, 'lambda_l1': 4.951260879171578e-06, 'lambda_l2': 5.629875969459524e-05}. Best is trial 0 with value: 694.6923869233069.
[I 2025-11-16 10:03:16,672] Trial 2 finished with value: 1017.24725703285 and parameters: {'alpha': 0.9406389143059463, 'learning_rate': 0.35882797429209035, 'num_leaves': 105, 'feature_fraction': 0.646419035

¡Búsqueda finalizada!
Mejor Quantile Loss en CV: 626.637251299977
Mejores parámetros: {'alpha': 0.9698218421893332, 'learning_rate': 0.08672985006709003, 'num_leaves': 71, 'feature_fraction': 0.619169834150951, 'bagging_fraction': 0.6276654879000183, 'bagging_freq': 1, 'lambda_l1': 1.953492344935628e-05, 'lambda_l2': 1.5123844393590924e-07, 'objective': 'quantile', 'metric': 'quantile', 'n_estimators': 2000, 'verbose': -1, 'n_jobs': -1, 'seed': 42, 'boosting_type': 'gbdt'}
Iniciando entrenamiento del modelo FINAL (Ensemble de 3 alphas)...

--- Entrenando modelo para alpha = 0.7 ---
Training until validation scores don't improve for 100 rounds
[100]	training's quantile: 1197.22	valid_1's quantile: 2206.16
[200]	training's quantile: 1027.43	valid_1's quantile: 2190.77
[300]	training's quantile: 929.106	valid_1's quantile: 2187.85
[400]	training's quantile: 866.829	valid_1's quantile: 2177.96
[500]	training's quantile: 820.871	valid_1's quantile: 2178.26
[600]	training's quantile: 782.414