In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
import shap
from sklearn.preprocessing import OneHotEncoder
import joblib

In [21]:
X_train.columns

Index(['P220_5', 'P220_9', 'P220_2', 'P220_3', 'P220_1_CANT_2', 'P219_UM_COD',
       'P220_2_VAL', 'P220_1_PREC_2', 'FACTOR', 'P220_2_PRE_KG', 'P220_4',
       'P219_CANT_2', 'P220_2_PREC_1', 'P209_ANIO', 'P220_1_PRE_KG', 'P220_8',
       'P217_SUP_ha', 'P218', 'P217_SUP_1', 'P220_7', 'P217_SUP_2', 'P220_6',
       'P220_10', 'duracion_cosecha_meses', 'produccion_area_ratio',
       'precio_valor_ratio', 'REGION', 'P204_NOM', 'P204_TIPO', 'P208', 'P212',
       'P222_1', 'P222_2', 'P222_6', 'P223_2', 'P223_3', 'P223_5'],
      dtype='object')

In [22]:
# Cargar conjuntos de datos
X_train = pd.read_csv("../01_Data/X_train.csv")
X_test = pd.read_csv("../01_Data/X_test.csv")
y_train = pd.read_csv("../01_Data/y_train.csv").squeeze()
y_test = pd.read_csv("../01_Data/y_test.csv").squeeze()

# Identificar columnas categóricas
categorical_columns = X_train.select_dtypes(include=['object', 'category']).columns

# Codificar columnas categóricas con One-Hot Encoding
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
X_train_encoded = pd.DataFrame(encoder.fit_transform(X_train[categorical_columns]))
X_test_encoded = pd.DataFrame(encoder.transform(X_test[categorical_columns]))

# Mantener columnas numéricas
numerical_columns = X_train.select_dtypes(include=['int64', 'float64']).columns
X_train_encoded[numerical_columns] = X_train[numerical_columns].reset_index(drop=True)
X_test_encoded[numerical_columns] = X_test[numerical_columns].reset_index(drop=True)

# Asegurar que los nombres de columnas coincidan
X_train_encoded.columns = list(encoder.get_feature_names_out(categorical_columns)) + list(numerical_columns)
X_test_encoded.columns = list(encoder.get_feature_names_out(categorical_columns)) + list(numerical_columns)

# Crear modelo XGBoost
model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='auc',
    random_state=42
)

# Entrenar el modelo
model.fit(X_train_encoded, y_train)

# Predicciones
y_pred = model.predict(X_test_encoded)
y_proba = model.predict_proba(X_test_encoded)[:, 1]

# Métricas
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
print("Accuracy en testeo:", accuracy)
print("ROC AUC en testeo:", roc_auc)
print("Reporte de Clasificación:\n", classification_report(y_test, y_pred))

# Matriz de confusión
cm = confusion_matrix(y_test, y_pred)
print("Matriz de Confusión:\n", cm)

# Curva ROC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label=f'Curva ROC (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], 'k--', label='Aleatorio')
plt.title('Curva ROC - XGBoost')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Interpretación con SHAP
explainer = shap.Explainer(model, X_train_encoded)
shap_values = explainer(X_test_encoded)

# Visualizar importancia de características
shap.summary_plot(shap_values, X_test_encoded, plot_type="bar")

# Visualizar SHAP values para observaciones individuales
shap.summary_plot(shap_values, X_test_encoded)

# SHAP Force Plot para una observación
shap.force_plot(explainer.expected_value, shap_values[0].values, X_test_encoded.iloc[0, :], matplotlib=True)

# Guardar modelo
joblib.dump(model, "modelo_xgboost_basico.joblib")
print("Modelo guardado exitosamente.")

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:P204_NOM: object