In [7]:
from pathlib import Path
import os

# Establece la raíz del proyecto manualmente
project_root = Path("F:/JCMDataCenter/Cursos/Evolve Academy/Data Scientist IA/Futpeak") # sobremesa
#project_root = Path("C:/Users/juanm/Desktop/FUTPEAK/Futpeak") # portátil

# Cambia el directorio de trabajo actual a esa raíz
os.chdir(project_root)

print("📁 Directorio de trabajo actual:", Path.cwd())

📁 Directorio de trabajo actual: F:\JCMDataCenter\Cursos\Evolve Academy\Data Scientist IA\Futpeak


# ETAPA 1 — Carga y exploración de los datos

En esta primera etapa, cargamos los archivos `matchlogs` y `metadata`, que contienen información por partido y por jugador, respectivamente.

Nuestro objetivo aquí no es solo cargar los datos, sino **entender qué columnas hay, qué tipo de variables contiene cada una, y cuáles vamos a usar o descartar**.

### ¿Qué buscamos al explorar?
- Qué columnas tienen valores nulos
- Qué columnas son categóricas, numéricas o de fecha
- Si hay identificadores únicos (`Player_ID`)
- Qué variables nos pueden servir para el modelo

Vamos a centrarnos en **variables relacionadas con el rendimiento en el campo**, y **descartar muchas otras que no aportan directamente al objetivo de predecir la evolución del jugador**.



In [8]:
import pandas as pd

# Load datasets
df_matchlogs = pd.read_csv("data/processed/cleaned_matchlogs.csv")
df_metadata = pd.read_csv("data/processed/cleaned_metadata.csv")

# Show general info for structure
print("🔍 Matchlogs structure:")
df_matchlogs.info()

print("\n🧾 Metadata structure:")
df_metadata.info()

# Preview a few rows
df_matchlogs.head()


🔍 Matchlogs structure:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 368257 entries, 0 to 368256
Data columns (total 46 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Player_name            368257 non-null  object 
 1   Player_ID              368257 non-null  object 
 2   Seasons                368257 non-null  object 
 3   Date                   368257 non-null  object 
 4   Day                    368257 non-null  object 
 5   Competition            368257 non-null  object 
 6   Round                  368257 non-null  object 
 7   Home_Away              368257 non-null  object 
 8   Result                 368257 non-null  object 
 9   Player_team            368257 non-null  object 
 10  Rival_team             368257 non-null  object 
 11  Start                  368257 non-null  object 
 12  Position               283886 non-null  object 
 13  Minutes                367852 non-null  float64
 14  Goals        

Unnamed: 0,Player_name,Player_ID,Seasons,Date,Day,Competition,Round,Home_Away,Result,Player_team,...,x_assisted_G,Shot_creating_actions,Goal_creating_actions,Passes_completed,Passes_att,Percent_passes,Progressive_passes,Feet_control,Progressive_control,Dribling_suc
0,Luciano Abecasis,6c510f2d,2016,2016-02-05,Fri,Primera Div,Matchweek 1,Away,L 0–1,Godoy Cruz,...,,,,,,,,,,
1,Luciano Abecasis,6c510f2d,2016,2016-02-12,Fri,Primera Div,Matchweek 2,Home,D 1–1,Godoy Cruz,...,,,,,,,,,,
2,Luciano Abecasis,6c510f2d,2016,2016-02-18,Thu,Primera Div,Matchweek 3,Away,W 2–1,Godoy Cruz,...,,,,,,,,,,
3,Luciano Abecasis,6c510f2d,2016,2016-02-21,Sun,Primera Div,Matchweek 4,Home,W 4–1,Godoy Cruz,...,,,,,,,,,,
4,Luciano Abecasis,6c510f2d,2016,2016-02-26,Fri,Primera Div,Matchweek 5,Away,W 3–2,Godoy Cruz,...,,,,,,,,,,


In [9]:
df = df_matchlogs.merge(df_metadata, on='Player_ID', how='left', suffixes=('', '_meta'))
# Eliminar columnas duplicadas de metadata si ya existen en matchlogs
columns_to_drop = [col for col in df.columns if col.endswith('_meta') and col[:-5] in df.columns]
df.drop(columns=columns_to_drop, inplace=True)



In [10]:
df.columns

Index(['Player_name', 'Player_ID', 'Seasons', 'Date', 'Day', 'Competition',
       'Round', 'Home_Away', 'Result', 'Player_team', 'Rival_team', 'Start',
       'Position', 'Minutes', 'Goals', 'Assists', 'Penalty_kick',
       'Penalty_kick_att', 'Shots', 'Shots_on_target', 'Yellow_cards',
       'Red_cards', 'Fouls_committed', 'Fouls_drawn', 'Offsides', 'Crosses',
       'Tackles_won', 'Interceptions', 'Own_goals', 'Penaltys_won',
       'Penaltys_conceded', 'Touches', 'Tackles', 'Blocks', 'xG',
       'non_penalty_xG', 'x_assisted_G', 'Shot_creating_actions',
       'Goal_creating_actions', 'Passes_completed', 'Passes_att',
       'Percent_passes', 'Progressive_passes', 'Feet_control',
       'Progressive_control', 'Dribling_suc', 'Full_name', 'Url_template',
       'Birth_date', 'Age', 'Footed', 'Birth_place', 'Nationality', 'Club',
       'Gender'],
      dtype='object')

In [11]:
print("🔹 Después del merge inicial:")
print("  Total jugadores únicos:", df['Player_ID'].nunique())


🔹 Después del merge inicial:
  Total jugadores únicos: 2223


In [12]:
performance_cols = [
    'Goals', 'Assists', 'Shots', 'Shots_on_target',
    'Yellow_cards', 'Red_cards', 'Minutes'
]

df[performance_cols] = df[performance_cols].fillna(0)


# ETAPA 2 — Filtrado de jugadores ofensivos y cálculo del debut

No todos los jugadores tienen el mismo tipo de rendimiento ni las mismas estadísticas.  
En este proyecto, **nos centramos en jugadores ofensivos y mediocampistas**, porque son los que aportan goles, asistencias y tiros (las métricas que usaremos para valorar el rendimiento).

Además, para poder analizar la progresión desde el inicio de la carrera, necesitamos:
- Detectar el **primer partido profesional (debut)**.
- Calcular desde entonces en qué año de carrera está cada jugador en cada partido.

También calcularemos la edad del jugador en cada partido, porque **la edad es una variable predictiva clave** para el tipo de evolución que tendrá.


In [13]:
POSITION_GROUPS = {
    'GOALKEEPER': ['GK'],

    'DEFENSIVE': [
        'CB',    # Center Back
        'LB',    # Left Back
        'RB',    # Right Back
        'FB',    # Fullback
        'LWB',   # Left Wing Back
        'RWB',   # Right Wing Back
        'SW',    # Sweeper
        'D'      # Generic Defender
    ],

    'MIDFIELD': [
        'CM',    # Center Midfielder
        'DM',    # Defensive Midfielder
        'MF',     # Generic Midfielder
        'AM'     # Attacking Midfielder (can also be attacking, but here consider control of juego)
    ],

    'ATTACKING': [
        'CF',    # Center Forward
        'ST',    # Striker
        'F',     # Generic Forward
        'FW',    # Forward (alternative code)
        'LW',    # Left Wing
        'RW',    # Right Wing
        'WF',    # Wing Forward
        'IF',    # Inside Forward
        'OL',    # Outside Left
        'OR'     # Outside Right
    ]
}


In [14]:
# Define relevant positions for attacking/midfield players
target_positions = POSITION_GROUPS['MIDFIELD'] + POSITION_GROUPS['ATTACKING']

# Create a function to check if position is relevant
def is_target_position(pos):
    if pd.isna(pos):
        return False
    return any(p in pos.split('-') for p in target_positions)

# Filter metadata to only include target players
df_metadata['Is_Target'] = df_metadata['Position'].apply(is_target_position)
df_metadata_target = df_metadata[df_metadata['Is_Target']]

# Filter matchlogs for selected players
target_ids = df_metadata_target['Player_ID'].unique()
df_matchlogs_target = df_matchlogs[df_matchlogs['Player_ID'].isin(target_ids)].copy()


In [15]:
# Convert Date columns to datetime
df_matchlogs_target['Date'] = pd.to_datetime(df_matchlogs_target['Date'], errors='coerce')
df_metadata['Birth_date'] = pd.to_datetime(df_metadata['Birth_date'], errors='coerce')

# Merge Birth_date into matchlogs
df_matchlogs_target = df_matchlogs_target.merge(
    df_metadata[['Player_ID', 'Birth_date']],
    on='Player_ID', how='left'
)

# Calcular debut por jugador (primer partido con minutos > 0)
debut_dates = df_matchlogs_target[df_matchlogs_target['Minutes'] > 0].groupby('Player_ID')['Date'].min()
df_matchlogs_target['Debut_date'] = df_matchlogs_target['Player_ID'].map(debut_dates)

# Calcular año relativo desde el debut
df_matchlogs_target['year_since_debut'] = (
    (df_matchlogs_target['Date'] - df_matchlogs_target['Debut_date']).dt.days // 365 + 1
)

# Calcular edad en el partido
df_matchlogs_target['age'] = (
    (df_matchlogs_target['Date'] - df_matchlogs_target['Birth_date']).dt.days / 365.25
)


# ETAPA 3 — Cálculo del rendimiento ofensivo por partido

Queremos transformar las estadísticas individuales (goles, asistencias, tiros, tarjetas...) en **una única métrica de rendimiento**.  
Esta métrica nos permitirá comparar partidos entre sí de manera más objetiva.

Además, vamos a normalizar esta métrica por 90 minutos.  
¿Por qué? Porque jugar 15 minutos y marcar un gol no equivale a jugar 90 y marcar uno. **Necesitamos ajustar el rendimiento al tiempo jugado.**

### Fórmula propuesta:
- +5 por cada gol
- +4 por asistencia
- +0.5 por tiro a puerta
- +0.1 por tiro fuera
- -1 por amarilla
- -2 por roja

El resultado se divide por los minutos jugados y se multiplica por 90 → rendimiento por 90 minutos.


In [16]:
# Fill missing values with 0
cols = ['Goals', 'Assists', 'Shots', 'Shots_on_target', 'Yellow_cards', 'Red_cards', 'Minutes']
df_matchlogs_target[cols] = df_matchlogs_target[cols].fillna(0)

# Function to compute performance rating
def calculate_rating(row):
    score = (
        row['Goals'] * 5 +
        row['Assists'] * 4 +
        row['Shots_on_target'] * 0.5 +
        (row['Shots'] - row['Shots_on_target']) * 0.1 -
        row['Yellow_cards'] * 1 -
        row['Red_cards'] * 2
    )
    return score / (row['Minutes'] / 90) if row['Minutes'] > 0 else 0

# Apply to dataset
df_matchlogs_target['rating_per_90'] = df_matchlogs_target.apply(calculate_rating, axis=1)


In [17]:
print("🔹 Jugadores con rating_per_90:")
print(df_matchlogs_target[df_matchlogs_target['rating_per_90'].notna()]['Player_ID'].nunique())


🔹 Jugadores con rating_per_90:
1423


# ETAPA 4 — Agregación por año y creación de la trayectoria del jugador

Ahora que ya tenemos el rendimiento de cada partido (`rating_per_90`), necesitamos pasar de datos por partido a **una vista por año** desde el debut.

### ¿Por qué agregamos por año?
Porque queremos ver cómo evoluciona el jugador en su carrera. Si analizamos partido por partido, hay demasiado ruido.

### ¿Qué vamos a calcular por jugador y año?
- Minutos totales jugados
- Goles totales
- Asistencias totales
- Media de `rating_per_90` del año
- Edad promedio ese año

Este dataset será nuestra "foto" anual del jugador, y servirá para modelar su evolución.


In [18]:
# Hacemos el groupby como tú tienes
career_df = df_matchlogs_target.groupby(['Player_ID', 'year_since_debut']).agg({
    'Minutes': 'sum',
    'Goals': 'sum',
    'Assists': 'sum',
    'rating_per_90': 'mean',
    'age': 'mean'
}).reset_index()

# Añadir columna 'Position' y 'Player_name' mapeando desde metadata
career_df = career_df.merge(
    df_metadata[['Player_ID', 'Player_name', 'Position']],
    on='Player_ID', how='left'
)



# ETAPA 5 — Detección del año de pico y clasificación de tipo de carrera

Para predecir cómo será la evolución de un jugador, primero tenemos que saber **cómo fue la de los jugadores históricos**.

Aquí vamos a:
1. Detectar el **año de pico** de cada jugador, es decir, el año en el que tuvo su mejor rendimiento (`rating_per_90`).
2. Clasificar ese año como:
   - `temprano` si ocurrió en los primeros 2 años
   - `medio` si ocurrió entre los años 3 y 4
   - `tardío` si ocurrió en el año 5 o más

Esto se convertirá en nuestra **variable objetivo (`target`)**. Es lo que el modelo intentará predecir.

Además, filtraremos los años con muy pocos minutos jugados, ya que pueden dar lugar a "falsos picos".


In [19]:
# Filtrar años con suficiente tiempo jugado (evita picos engañosos)
valid_seasons = career_df[career_df['Minutes'] >= 300]

# Buscar el año con mejor rating por jugador
peak_info = (
    valid_seasons.sort_values(['Player_ID', 'rating_per_90'], ascending=[True, False])
    .groupby('Player_ID')
    .first()
    .reset_index()
    .rename(columns={'year_since_debut': 'peak_year', 'rating_per_90': 'peak_rating'})
)

# Unir esta info a la trayectoria completa
career_with_peak = career_df.merge(
    peak_info[['Player_ID', 'peak_year', 'peak_rating']],
    on='Player_ID', how='left'
)

# Clasificar el tipo de carrera según el año de pico
def assign_peak_group(y):
    if y <= 3:
        return 'temprano'
    elif y <= 6:
        return 'medio'
    else:
        return 'tardío'

# Aplicar clasificación
career_with_peak['peak_group'] = career_with_peak['peak_year'].apply(assign_peak_group)


In [20]:
print("🔹 Jugadores después de agrupar por año desde debut:")
print(career_df['Player_ID'].nunique())


🔹 Jugadores después de agrupar por año desde debut:
1423


In [21]:
print("🔹 Jugadores con temporada válida para detectar el pico:")
print(valid_seasons['Player_ID'].nunique())


🔹 Jugadores con temporada válida para detectar el pico:
1286


In [22]:
print("🔹 Jugadores con peak_year:")
print(peak_info['Player_ID'].nunique())


🔹 Jugadores con peak_year:
1286


# ETAPA 6 — Preparación del dataset de entrenamiento

En esta etapa vamos a construir el dataset que usaremos para entrenar el modelo.

### ¿Qué haremos?
- Seleccionar los primeros **5 años** de cada jugador desde su debut
- Crear una tabla donde cada fila es un jugador y cada columna representa su rendimiento, edad y minutos en cada uno de esos años
- Calcular también el **crecimiento de rendimiento** entre años

Esto nos dará las variables de entrada (`X`). Ya tenemos el grupo (`peak_group`) como variable objetivo (`y`).


In [33]:
# Tomamos los primeros 5 años de cada jugador
early_years = career_with_peak[career_with_peak['year_since_debut'] <= 5]

# Pivot por año para convertir en columnas
pivot_rating = early_years.pivot(index='Player_ID', columns='year_since_debut', values='rating_per_90')
pivot_age = early_years.pivot(index='Player_ID', columns='year_since_debut', values='age')
pivot_minutes = early_years.pivot(index='Player_ID', columns='year_since_debut', values='Minutes')

# Renombramos las columnas
pivot_rating.columns = [f'rating_year_{i}' for i in pivot_rating.columns]
pivot_age.columns = [f'age_year_{i}' for i in pivot_age.columns]
pivot_minutes.columns = [f'minutes_year_{i}' for i in pivot_minutes.columns]

# Combinamos todas las variables
pivot_combined = pd.concat([pivot_rating, pivot_age, pivot_minutes], axis=1).fillna(0)

# -----------------------------------------------
# NUEVAS VARIABLES PARA MEJORAR EL MODELO
# -----------------------------------------------

# 1. Variabilidad del rating → mide si es regular o inconsistente
# Por ejemplo: rating_std = 0.1 (estable) vs 2.0 (altibajos)
pivot_combined['rating_std'] = pivot_combined[[f'rating_year_{i}' for i in range(1, 6)]].std(axis=1)

# 2. Mejor y peor año de rendimiento (máximo y mínimo)
pivot_combined['rating_max'] = pivot_combined[[f'rating_year_{i}' for i in range(1, 6)]].max(axis=1)
pivot_combined['rating_min'] = pivot_combined[[f'rating_year_{i}' for i in range(1, 6)]].min(axis=1)

# 3. Diferencia entre mejor y peor año → ¿es un jugador con pico claro o plano?
pivot_combined['rating_range'] = pivot_combined['rating_max'] - pivot_combined['rating_min']

# 4. Ratio de minutos jugados entre año 5 y año 1
# Si es >1, significa que fue ganando minutos (más importante en el equipo)
# Si es <1, significa que fue perdiendo protagonismo
pivot_combined['minutes_ratio_5_1'] = pivot_combined['minutes_year_5'] / (pivot_combined['minutes_year_1'] + 1)

# 5. Diferencia de edad entre el año 5 y el 1 (aunque normalmente serán 4 años)
pivot_combined['age_diff'] = pivot_combined['age_year_5'] - pivot_combined['age_year_1']


# Calculamos crecimiento en rating entre años
pivot_combined['growth_2_1'] = pivot_combined['rating_year_2'] - pivot_combined['rating_year_1']
pivot_combined['growth_3_2'] = pivot_combined['rating_year_3'] - pivot_combined['rating_year_2']
pivot_combined['growth_4_3'] = pivot_combined['rating_year_4'] - pivot_combined['rating_year_3']
pivot_combined['growth_5_4'] = pivot_combined['rating_year_5'] - pivot_combined['rating_year_4']
pivot_combined['avg_rating'] = pivot_combined[[f'rating_year_{i}' for i in range(1, 6)]].mean(axis=1)
pivot_combined['sum_minutes'] = pivot_combined[[f'minutes_year_{i}' for i in range(1, 6)]].sum(axis=1)
pivot_combined['rating_trend'] = pivot_combined['rating_year_5'] - pivot_combined['rating_year_1']
pivot_combined['minutes_trend'] = pivot_combined['minutes_year_5'] - pivot_combined['minutes_year_1']

pivot_combined = pivot_combined.reset_index()

model_df = pivot_combined.merge(
    peak_info[['Player_ID', 'peak_year']],
    on='Player_ID',
    how='left'
)
model_df = model_df[model_df['peak_year'].notna()]
model_df['peak_group'] = model_df['peak_year'].apply(assign_peak_group)

def assign_peak_group(y):
    if y <= 3:
        return 'temprano'
    elif y <= 6:
        return 'medio'
    else:
        return 'tardío'

In [24]:
print("Total players after pivot:", pivot_combined.shape[0])
print("Players in model_df:", model_df.shape[0])


Total players after pivot: 1423
Players in model_df: 1286


In [25]:
print("Distribution of peak_years:")
model_df['peak_year'].value_counts()


Distribution of peak_years:


peak_year
7.0     157
5.0     138
9.0     135
6.0     129
8.0     129
4.0     125
1.0     123
3.0     115
2.0      96
10.0     94
11.0     42
12.0      3
Name: count, dtype: int64

In [26]:
print("🎯 peak_info:", peak_info['Player_ID'].nunique())


🎯 peak_info: 1286


In [36]:
# FUTPEAK - Model Comparator Notebook
# Objetivo: Comparar XGBoost, LightGBM y CatBoost sobre el dataset ya procesado

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score, matthews_corrcoef
import optuna
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# ----------------------------
# CARGA Y PREPARACIÓN DE DATOS
# ----------------------------

# Suponemos que vienes desde el bloque de procesamiento que genera 'model_df'
# Ya contiene Player_ID como índice, y las columnas: peak_year, peak_group, y todas las features numéricas

# Asegura que el índice es Player_ID
model_df = model_df.reset_index().set_index('Player_ID')

# Features y target
X = model_df.drop(columns=['peak_year', 'peak_group'])
y = model_df['peak_group']

columns_to_drop = [col for col in X.columns if '_year_0' in col or col == 'index']
X = X.drop(columns=columns_to_drop)

# Codificamos target para modelos
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split por jugador
player_ids = model_df.index.unique()
train_ids, test_ids = train_test_split(
    player_ids, test_size=0.2, random_state=42,
    stratify=y.loc[player_ids]
)

X_train = X.loc[train_ids]
X_test = X.loc[test_ids]
y_train = y_encoded[np.isin(model_df.index, train_ids)]
y_test = y_encoded[np.isin(model_df.index, test_ids)]

# ----------------------------
# FUNCIÓN DE EVALUACIÓN
# ----------------------------

def evaluate_model(name, model):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average='weighted')
    kappa = cohen_kappa_score(y_test, preds)
    mcc = matthews_corrcoef(y_test, preds)
    return {
        'Model': name,
        'Accuracy': acc,
        'F1': f1,
        'Kappa': kappa,
        'MCC': mcc
    }

# ----------------------------
# OPTUNA: TUNEO PARA CADA MODELO
# ----------------------------

def optuna_objective(trial, model_type):
    if model_type == 'xgboost':
        params = {
            'objective': 'multi:softmax',
            'num_class': len(np.unique(y_train)),
            'eval_metric': 'mlogloss',
            'use_label_encoder': False,
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'n_estimators': trial.suggest_int('n_estimators', 100, 600),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0)
        }
        model = XGBClassifier(**params)

    elif model_type == 'lightgbm':
        params = {
            'objective': 'multiclass',
            'num_class': len(np.unique(y_train)),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'n_estimators': trial.suggest_int('n_estimators', 100, 600),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0)
        }
        model = LGBMClassifier(**params)

    elif model_type == 'catboost':
        params = {
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'depth': trial.suggest_int('depth', 3, 10),
            'iterations': trial.suggest_int('iterations', 100, 600),
            'verbose': 0,
            'loss_function': 'MultiClass'
        }
        model = CatBoostClassifier(**params)

    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return f1_score(y_test, preds, average='weighted')

# ----------------------------
# CORRER OPTUNA PARA CADA MODELO
# ----------------------------

def run_optuna(model_type, n_trials=30):
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: optuna_objective(trial, model_type), n_trials=n_trials)
    return study.best_trial.params

# ----------------------------
# COMPARACIÓN FINAL
# ----------------------------

results = []

# XGBoost
xgb_params = run_optuna('xgboost')
xgb_model = XGBClassifier(**xgb_params, objective='multi:softmax', num_class=len(np.unique(y_train)), eval_metric='mlogloss', use_label_encoder=False)
results.append(evaluate_model('XGBoost', xgb_model))

# LightGBM
lgbm_params = run_optuna('lightgbm')
lgbm_model = LGBMClassifier(**lgbm_params, objective='multiclass', num_class=len(np.unique(y_train)))
results.append(evaluate_model('LightGBM', lgbm_model))

# CatBoost
cat_params = run_optuna('catboost')
cat_model = CatBoostClassifier(**cat_params, loss_function='MultiClass', verbose=0)
results.append(evaluate_model('CatBoost', cat_model))

# Mostrar resultados ordenados
result_df = pd.DataFrame(results)
print("\n🔍 COMPARACIÓN FINAL DE MODELOS:")
print(result_df.sort_values(by='F1', ascending=False).reset_index(drop=True))

[I 2025-05-14 13:00:04,674] A new study created in memory with name: no-name-2639c70c-a0bb-4470-8497-ee8f9dfff66d
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-05-14 13:00:05,592] Trial 0 finished with value: 0.2984304201762688 and parameters: {'learning_rate': 0.2586284371271574, 'max_depth': 9, 'n_estimators': 310, 'subsample': 0.961173824378593, 'colsample_bytree': 0.8563079108310478}. Best is trial 0 with value: 0.2984304201762688.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-05-14 13:00:06,104] Trial 1 finished with value: 0.3060906750552014 and parameters: {'learning_rate': 0.13464594304593547, 'max_depth': 9, 'n_estimators': 133, 'subsample': 0.8034704638412176, 'colsample_bytree': 0.925486305003372}. Best is trial 1 with value: 0.3060906750552014.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-05-14 13:00:07,3

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000478 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:33,090] Trial 0 finished with value: 0.3341038531971479 and parameters: {'learning_rate': 0.11957857604392821, 'max_depth': 8, 'n_estimators': 383, 'subsample': 0.9318517445184966, 'colsample_bytree': 0.8138053833718768}. Best is trial 0 with value: 0.3341038531971479.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000553 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:33,450] Trial 1 finished with value: 0.35531495486484227 and parameters: {'learning_rate': 0.26967043544585917, 'max_depth': 6, 'n_estimators': 512, 'subsample': 0.671169885454898, 'colsample_bytree': 0.7860003410631817}. Best is trial 1 with value: 0.35531495486484227.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000494 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:33,644] Trial 2 finished with value: 0.34702068629904453 and parameters: {'learning_rate': 0.27767099726414063, 'max_depth': 3, 'n_estimators': 422, 'subsample': 0.8858074849154509, 'colsample_bytree': 0.7657442346547806}. Best is trial 1 with value: 0.35531495486484227.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000531 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:33,971] Trial 3 finished with value: 0.33643593608535405 and parameters: {'learning_rate': 0.1820418935288147, 'max_depth': 8, 'n_estimators': 342, 'subsample': 0.9867397650721814, 'colsample_bytree': 0.8935149863754244}. Best is trial 1 with value: 0.35531495486484227.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000592 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:34,217] Trial 4 finished with value: 0.3348916691939948 and parameters: {'learning_rate': 0.1559690324473188, 'max_depth': 8, 'n_estimators': 248, 'subsample': 0.6571766537248864, 'colsample_bytree': 0.9774367164555018}. Best is trial 1 with value: 0.35531495486484227.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000633 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:34,649] Trial 5 finished with value: 0.3045252754297825 and parameters: {'learning_rate': 0.24366188149136545, 'max_depth': 10, 'n_estimators': 529, 'subsample': 0.6400070358802645, 'colsample_bytree': 0.8090076470800246}. Best is trial 1 with value: 0.35531495486484227.
[I 2025-05-14 13:00:34,709] Trial 6 finished with value: 0.31121919380184365 and parameters: {'learning_rate': 0.1744811519970551, 'max_depth': 4, 'n_estimators': 109, 'subsample': 0.6777494381597219, 'colsample_bytree': 0.8633644492952691}. Best is trial 1 with value: 0.35531495486484227.
[I 2025-05-14 13:00:34,791] Trial 7 finished with value: 0.2924781725509195 and parameters: {'learning_rate': 0.056776578403747696, 'max_depth': 3, 'n_estimators': 221, 'subsample': 0.7533720097130643, 'colsample_bytree': 0.6399861726198275}. Best is trial 1 with value: 0.35531495486484227.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000430 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000525 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000561 secon

[I 2025-05-14 13:00:35,110] Trial 8 finished with value: 0.32491413964270616 and parameters: {'learning_rate': 0.2884613236054375, 'max_depth': 5, 'n_estimators': 507, 'subsample': 0.7994533745997681, 'colsample_bytree': 0.7251086474659375}. Best is trial 1 with value: 0.35531495486484227.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000406 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:35,535] Trial 9 finished with value: 0.30730310234860286 and parameters: {'learning_rate': 0.05779609763202569, 'max_depth': 10, 'n_estimators': 380, 'subsample': 0.9154257009250797, 'colsample_bytree': 0.7304740209041745}. Best is trial 1 with value: 0.35531495486484227.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000516 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:35,965] Trial 10 finished with value: 0.3287443015524369 and parameters: {'learning_rate': 0.2257891458125105, 'max_depth': 6, 'n_estimators': 584, 'subsample': 0.7324065757667214, 'colsample_bytree': 0.6683852499677291}. Best is trial 1 with value: 0.35531495486484227.
[I 2025-05-14 13:00:36,162] Trial 11 finished with value: 0.34083251790741126 and parameters: {'learning_rate': 0.2983598622994115, 'max_depth': 3, 'n_estimators': 466, 'subsample': 0.8703189748152088, 'colsample_bytree': 0.7510238973729352}. Best is trial 1 with value: 0.35531495486484227.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000586 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000566 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:36,483] Trial 12 finished with value: 0.3259022727045983 and parameters: {'learning_rate': 0.24579225873854155, 'max_depth': 6, 'n_estimators': 439, 'subsample': 0.8423892820318309, 'colsample_bytree': 0.8766437788346773}. Best is trial 1 with value: 0.35531495486484227.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000451 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:36,848] Trial 13 finished with value: 0.35456723613137714 and parameters: {'learning_rate': 0.22772008247698122, 'max_depth': 5, 'n_estimators': 580, 'subsample': 0.6093709412224204, 'colsample_bytree': 0.7810832719843206}. Best is trial 1 with value: 0.35531495486484227.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000533 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:37,212] Trial 14 finished with value: 0.3240032580195424 and parameters: {'learning_rate': 0.20334239791054315, 'max_depth': 5, 'n_estimators': 599, 'subsample': 0.6002633494303786, 'colsample_bytree': 0.6041770944361725}. Best is trial 1 with value: 0.35531495486484227.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000478 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:37,656] Trial 15 finished with value: 0.35223967291241903 and parameters: {'learning_rate': 0.13265469068398827, 'max_depth': 7, 'n_estimators': 537, 'subsample': 0.7116849619508517, 'colsample_bytree': 0.9443941042273405}. Best is trial 1 with value: 0.35531495486484227.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000590 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:37,900] Trial 16 finished with value: 0.30949934486014186 and parameters: {'learning_rate': 0.2574985399896302, 'max_depth': 5, 'n_estimators': 315, 'subsample': 0.6000028715034086, 'colsample_bytree': 0.8354673058312033}. Best is trial 1 with value: 0.35531495486484227.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000454 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:38,279] Trial 17 finished with value: 0.3123406941934232 and parameters: {'learning_rate': 0.21423308714475026, 'max_depth': 7, 'n_estimators': 479, 'subsample': 0.695190264665995, 'colsample_bytree': 0.6824281528435181}. Best is trial 1 with value: 0.35531495486484227.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000448 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:38,564] Trial 18 finished with value: 0.338690588661222 and parameters: {'learning_rate': 0.26618139845258026, 'max_depth': 4, 'n_estimators': 560, 'subsample': 0.7609883055706648, 'colsample_bytree': 0.7848440673484632}. Best is trial 1 with value: 0.35531495486484227.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000585 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:38,966] Trial 19 finished with value: 0.3161472151377424 and parameters: {'learning_rate': 0.01164691958075359, 'max_depth': 6, 'n_estimators': 503, 'subsample': 0.6411228992061384, 'colsample_bytree': 0.7081320583648936}. Best is trial 1 with value: 0.35531495486484227.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000461 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:39,278] Trial 20 finished with value: 0.36120390782096673 and parameters: {'learning_rate': 0.1994121222573198, 'max_depth': 4, 'n_estimators': 595, 'subsample': 0.8137345160425768, 'colsample_bytree': 0.7757042591525638}. Best is trial 20 with value: 0.36120390782096673.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000592 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:39,599] Trial 21 finished with value: 0.31721732937726793 and parameters: {'learning_rate': 0.20942994077174656, 'max_depth': 4, 'n_estimators': 599, 'subsample': 0.7918422010363773, 'colsample_bytree': 0.7724225919879024}. Best is trial 20 with value: 0.36120390782096673.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000538 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:39,939] Trial 22 finished with value: 0.31827298632567746 and parameters: {'learning_rate': 0.23002316824399743, 'max_depth': 5, 'n_estimators': 554, 'subsample': 0.6312398993654458, 'colsample_bytree': 0.8279230561740476}. Best is trial 20 with value: 0.36120390782096673.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000549 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:40,241] Trial 23 finished with value: 0.331320594632983 and parameters: {'learning_rate': 0.1733458271987071, 'max_depth': 4, 'n_estimators': 547, 'subsample': 0.8344327726809793, 'colsample_bytree': 0.9053304708252052}. Best is trial 20 with value: 0.36120390782096673.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000585 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:40,595] Trial 24 finished with value: 0.32744107269383316 and parameters: {'learning_rate': 0.1946645707940269, 'max_depth': 6, 'n_estimators': 472, 'subsample': 0.6827455985906506, 'colsample_bytree': 0.7977567115916921}. Best is trial 20 with value: 0.36120390782096673.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000513 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:40,885] Trial 25 finished with value: 0.31235311988553394 and parameters: {'learning_rate': 0.11824630897613377, 'max_depth': 5, 'n_estimators': 427, 'subsample': 0.7269934192967424, 'colsample_bytree': 0.8416155544734165}. Best is trial 20 with value: 0.36120390782096673.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000544 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:41,291] Trial 26 finished with value: 0.3306703572783162 and parameters: {'learning_rate': 0.23954396210794995, 'max_depth': 7, 'n_estimators': 507, 'subsample': 0.7704186026887061, 'colsample_bytree': 0.6923918390750041}. Best is trial 20 with value: 0.36120390782096673.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000536 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:41,579] Trial 27 finished with value: 0.33655669900142593 and parameters: {'learning_rate': 0.2600829318131746, 'max_depth': 4, 'n_estimators': 562, 'subsample': 0.8331396599727422, 'colsample_bytree': 0.7476875180682188}. Best is trial 20 with value: 0.36120390782096673.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000525 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:42,131] Trial 28 finished with value: 0.3368029741439974 and parameters: {'learning_rate': 0.1572957059555064, 'max_depth': 9, 'n_estimators': 577, 'subsample': 0.6159005836412108, 'colsample_bytree': 0.8536523047264702}. Best is trial 20 with value: 0.36120390782096673.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000630 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:42,481] Trial 29 finished with value: 0.3090751856837688 and parameters: {'learning_rate': 0.12066877850484962, 'max_depth': 8, 'n_estimators': 390, 'subsample': 0.6597715104715342, 'colsample_bytree': 0.7992546361060332}. Best is trial 20 with value: 0.36120390782096673.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000445 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 29
[LightGBM] [Info] Start training from score -1.189167
[LightGBM] [Info] Start training from score -0.830577
[LightGBM] [Info] Start training from score -1.348122


[I 2025-05-14 13:00:42,797] A new study created in memory with name: no-name-9b1d3c34-e0ca-477d-b4e2-adbad7195220




[I 2025-05-14 13:00:44,789] Trial 0 finished with value: 0.3258704440874983 and parameters: {'learning_rate': 0.12709352938390053, 'depth': 5, 'iterations': 398}. Best is trial 0 with value: 0.3258704440874983.
[I 2025-05-14 13:00:45,799] Trial 1 finished with value: 0.3096581105729532 and parameters: {'learning_rate': 0.16978708912648466, 'depth': 7, 'iterations': 106}. Best is trial 0 with value: 0.3258704440874983.
[I 2025-05-14 13:00:46,096] Trial 2 finished with value: 0.31444000003986877 and parameters: {'learning_rate': 0.13378714147693796, 'depth': 3, 'iterations': 137}. Best is trial 0 with value: 0.3258704440874983.
[I 2025-05-14 13:00:46,639] Trial 3 finished with value: 0.33523145721036124 and parameters: {'learning_rate': 0.21007664464955564, 'depth': 3, 'iterations': 375}. Best is trial 3 with value: 0.33523145721036124.
[I 2025-05-14 13:00:47,031] Trial 4 finished with value: 0.2727280181318312 and parameters: {'learning_rate': 0.1128625683531815, 'depth': 4, 'iterations


🔍 COMPARACIÓN FINAL DE MODELOS:
      Model  Accuracy        F1     Kappa       MCC
0  LightGBM  0.368217  0.361204  0.012400  0.012473
1   XGBoost  0.375969  0.356392  0.002378  0.002460
2  CatBoost  0.352713  0.341094 -0.016059 -0.016204


In [28]:
print("Columns in X:")
print(X.columns.tolist())

Columns in X:
['index', 'rating_year_0', 'rating_year_1', 'rating_year_2', 'rating_year_3', 'rating_year_4', 'rating_year_5', 'age_year_0', 'age_year_1', 'age_year_2', 'age_year_3', 'age_year_4', 'age_year_5', 'minutes_year_0', 'minutes_year_1', 'minutes_year_2', 'minutes_year_3', 'minutes_year_4', 'minutes_year_5', 'rating_std', 'rating_max', 'rating_min', 'rating_range', 'minutes_ratio_5_1', 'age_diff', 'growth_2_1', 'growth_3_2', 'growth_4_3', 'growth_5_4', 'avg_rating', 'sum_minutes', 'rating_trend', 'minutes_trend']


In [29]:
print(model_df['peak_group'].value_counts(normalize=True))


peak_group
tardío      0.435459
medio       0.304821
temprano    0.259720
Name: proportion, dtype: float64


In [30]:
print(X_train.shape, y_train.shape)


(1028, 33) (1028,)
