In [1]:
import pandas as pd

# Carga de datasets
df_limpio = pd.read_csv('../../../dataset/cleaned_final_for_training/df_final.csv', sep=',')

# Verificación rápida
print("Dataset cargado correctamente:")
print(f"universo → {df_limpio.shape}")


Dataset cargado correctamente:
universo → (85216, 31)


In [3]:
variables_seleccionadas = [c for c in df_limpio.columns if c != 'target']

RandomForestClassifier

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# === VARIABLES SELECCIONADAS (TOP 1 DE CADA GRUPO) ===
# variables_seleccionadas : numéricas seleccionadas previamente
# cat_selected : categóricas seleccionadas por ANOVA
features = list(variables_seleccionadas) # combinar numéricas y categóricas seleccionadas

# Separar X e y
X = df_limpio[features]
y = df_limpio['target']

# Dividir en train (70%) y temp (30%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Dividir temp en validation (20%) y test (10%)
# Nota: validation = 20/30 = 2/3 del temp, test = 1/3 del temp
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=1/3, random_state=42, stratify=y_temp
)

print(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")

# Entrenar Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf.fit(X_train, y_train)

# Evaluar en validation
y_val_pred = rf.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))

# Evaluar en test
y_test_pred = rf.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))


Train: (59651, 30), Validation: (17043, 30), Test: (8522, 30)
Validation Accuracy: 0.8623481781376519
              precision    recall  f1-score   support

           0       0.86      1.00      0.93     14705
           1       0.41      0.01      0.03      1747
           2       0.00      0.00      0.00        97
           3       1.00      0.01      0.01       153
           4       0.00      0.00      0.00        62
           5       0.00      0.00      0.00       279

    accuracy                           0.86     17043
   macro avg       0.38      0.17      0.16     17043
weighted avg       0.80      0.86      0.80     17043

Test Accuracy: 0.8628256277869045
              precision    recall  f1-score   support

           0       0.86      1.00      0.93      7353
           1       0.49      0.02      0.04       873
           2       0.00      0.00      0.00        48
           3       0.00      0.00      0.00        77
           4       0.00      0.00      0.00       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


XGBClassifier

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

# === VARIABLES SELECCIONADAS (TOP 1 DE CADA GRUPO) ===
features = list(variables_seleccionadas)

# Separar X e y
X = df_limpio[features]
y = df_limpio['target']

# Dividir en train (70%) y temp (30%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Dividir temp en validation (20%) y test (10%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=1/3, random_state=42, stratify=y_temp
)

print(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")

# ===== CÁLCULO DE PESO BALANCEADO =====
n_negative = sum(y_train == 0)
n_positive = sum(y_train == 1)
scale_pos_weight = n_negative / n_positive

print("scale_pos_weight =", scale_pos_weight)

# ===== ENTRENAR XGBOOST =====
xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",         # evita warnings
    scale_pos_weight=scale_pos_weight,
    random_state=42
)

xgb.fit(X_train, y_train)

# ===== VALIDACIÓN =====
y_val_pred = xgb.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))

# ===== TEST FINAL =====
y_test_pred = xgb.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))


Train: (59651, 30), Validation: (17043, 30), Test: (8522, 30)
scale_pos_weight = 8.419433993129397


Parameters: { "scale_pos_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Validation Accuracy: 0.863697705802969
              precision    recall  f1-score   support

           0       0.87      0.99      0.93     14705
           1       0.50      0.06      0.11      1747
           2       0.00      0.00      0.00        97
           3       0.00      0.00      0.00       153
           4       1.00      0.08      0.15        62
           5       0.40      0.01      0.01       279

    accuracy                           0.86     17043
   macro avg       0.46      0.19      0.20     17043
weighted avg       0.81      0.86      0.81     17043

Test Accuracy: 0.8637643745599625
              precision    recall  f1-score   support

           0       0.87      0.99      0.93      7353
           1       0.51      0.06      0.10       873
           2       0.00      0.00      0.00        48
           3       1.00      0.01      0.03        77
           4       1.00      0.06      0.12        31
           5       0.33      0.01      0.01       140

    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


LightGBM

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import lightgbm as lgb
import os
os.environ["LOKY_MAX_CPU_COUNT"] = "4"  # pon el número correcto de cores lógicos


# === VARIABLES SELECCIONADAS (TOP 1 DE CADA GRUPO) ===
features = list(variables_seleccionadas)

# Separar X e y
X = df_limpio[features]
y = df_limpio['target']

# Dividir en train (70%) y temp (30%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Dividir temp en validation (20%) y test (10%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=1/3, random_state=42, stratify=y_temp
)

print(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")

# ===== ENTRENAR LIGHTGBM =====
lgbm = lgb.LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=-1,           # sin límite
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    class_weight='balanced'  # <--- balance automático de clases
)

lgbm.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='binary_logloss')

# ===== VALIDACIÓN =====
y_val_pred = lgbm.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))

# ===== TEST FINAL =====
y_test_pred = lgbm.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))


Train: (59651, 30), Validation: (17043, 30), Test: (8522, 30)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002625 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7052
[LightGBM] [Info] Number of data points in the train set: 59651, number of used features: 30
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
Validation Accuracy: 0.6377398345361732
              precision    recall  f1-score   support

           0       0.94      0.64      0.76     14705
           1       0.26      0.76      0.39      1747
           2       0.08      0.23      0.11        97
    

BalancedBaggingClassifier

In [8]:
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# ===== CREAR BALANCEDBAGGINGCLASSIFIER =====
bbc = BalancedBaggingClassifier(
    estimator=DecisionTreeClassifier(),  # <-- cambio aquí
    n_estimators=50,                     
    sampling_strategy='auto',
    replacement=False,
    random_state=42,
    n_jobs=-1
)

# ===== ENTRENAR =====
bbc.fit(X_train, y_train)

# ===== VALIDACIÓN =====
y_val_pred = bbc.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))

# ===== TEST FINAL =====
y_test_pred = bbc.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))


Validation Accuracy: 0.4492166872029572
              precision    recall  f1-score   support

           0       0.95      0.42      0.58     14705
           1       0.23      0.67      0.35      1747
           2       0.06      0.69      0.11        97
           3       0.04      0.56      0.08       153
           4       0.03      0.39      0.06        62
           5       0.08      0.46      0.14       279

    accuracy                           0.45     17043
   macro avg       0.23      0.53      0.22     17043
weighted avg       0.85      0.45      0.54     17043

Confusion Matrix:
 [[6183 3797 1008 1847  439 1431]
 [ 267 1167   15   57  233    8]
 [   0    6   67    7    3   14]
 [  16   17    7   86   11   16]
 [   5   19   13    1   24    0]
 [  25   11   56   50    8  129]]
Test Accuracy: 0.4602206054916686
              precision    recall  f1-score   support

           0       0.96      0.43      0.60      7353
           1       0.24      0.69      0.36       873
  