# 1. Setup

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV
import dalex as dx


from sklearn.model_selection import train_test_split, StratifiedKFold
import shap
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
import lightgbm as lgb
import xgboost as xgb
import shap
import lime
import lime.lime_tabular
from xgboost import XGBClassifier
from sklearn.inspection import permutation_importance
from alibi.explainers import Counterfactual
from sklearn.inspection import PartialDependenceDisplay


# Apply seaborn's "whitegrid" theme to the plots drawn after this point.
sns.set_theme(style="whitegrid")

# 2. Load Dataset

In [None]:
# Read the raw training table from disk.
df_train = pd.read_csv('./dataset.csv')

# Label vector: the column the models below learn to predict.
y_train_full = df_train['target_variable']

# Feature matrix: skip the first column by position (presumably an id/index
# column -- confirm against the CSV), then remove the label column.
X_train_full = df_train.iloc[:, 1:].drop(columns='target_variable')

In [None]:
# Hold out 20% of the rows as a test set. The split is stratified on the
# label so both partitions keep the same class proportions, and it is seeded
# so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_full, y_train_full,
    test_size=0.2,
    stratify=y_train_full,
    random_state=42,
)

# 3. Model

## LightGBM

### Métodos locales

In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

# Base LightGBM binary classifier that the search clones and tunes.
# subsample_freq=1 is needed for the `subsample` values searched below to have
# any effect: LightGBM only performs bagging when the bagging frequency is
# non-zero, and with the default (0) `subsample` is silently ignored.
lgbm = LGBMClassifier(
    objective='binary',
    random_state=42,
    subsample_freq=1,
    verbosity=-1  # silence training logs
)

# Candidate values sampled by the randomized search.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0],
    'min_child_samples': [10, 20]
}

# Deliberately small search budget to keep the notebook quick to re-run.
search = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=param_grid,
    n_iter=5,        # only 5 sampled combinations
    scoring='f1',
    cv=3,            # only 3 folds (not 5)
    random_state=42,
    n_jobs=-1,
    verbose=1
)

# Run the cross-validated search on the training partition.
search.fit(X_train, y_train)

# Best configuration; with the default refit=True it has already been
# refitted on all of X_train / y_train.
best_lgbm = search.best_estimator_

In [None]:
# Display the estimator selected by the randomized search (cell output).
search.best_estimator_

In [None]:
Parameters
boosting_type 	'gbdt'
num_leaves 	31
max_depth 	8
learning_rate 	0.1
n_estimators 	200
subsample_for_bin 	200000
objective 	'binary'
class_weight 	None
min_split_gain 	0.0
min_child_weight 	0.001
min_child_samples 	10
subsample 	1.0
subsample_freq 	0
colsample_bytree 	1.0
reg_alpha 	0.0
reg_lambda 	0.0
random_state 	42
n_jobs 	None
importance_type 	'split'
verbosity 	-1

In [None]:
# `best_lgbm` is the estimator RandomizedSearchCV already refitted on
# X_train / y_train (refit=True is the default), so it is not fitted again here.

# Predict the held-out test set once; `preds` is kept as an alias of the same
# array for any code that still uses the shorter name.
preds_best_lgbm = best_lgbm.predict(X_test)
preds = preds_best_lgbm

# Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, preds_best_lgbm))

# Confusion matrix (rows: true class, columns: predicted class) as a heatmap.
cm_best = confusion_matrix(y_test, preds_best_lgbm)
ax = sns.heatmap(cm_best, annot=True, fmt='d', cmap='Blues')
ax.set_title('Matriz de confusión - Mejor modelo')
ax.set_ylabel('Real')
ax.set_xlabel('Predicho')
plt.show()

#### SHAP

In [None]:
import shap

# Tree-specific SHAP explainer for the tuned LightGBM model.
explainer = shap.TreeExplainer(best_lgbm)

# SHAP values for every row of the test set.
shap_values = explainer.shap_values(X_test)

# For a binary classifier the result may be per-class: a list with one array
# per class, or (in some shap releases) a 3-D array whose last axis indexes the
# class. In both cases keep the positive class (1); a plain 2-D array is used
# as is.
if isinstance(shap_values, list):
    shap_values = shap_values[1]
elif getattr(shap_values, "ndim", 2) == 3:
    shap_values = shap_values[:, :, 1]

# Column names taken from the data being explained, so this cell does not
# depend on a `feature_names` variable defined in some other cell.
feature_names = list(X_test.columns)

# Summary (beeswarm) plot of the SHAP values over the whole test set.
shap.summary_plot(shap_values, X_test, feature_names=feature_names)

#### LIME

In [None]:
import lime
import lime.lime_tabular

# Column names of the training frame, so this cell does not depend on a
# `feature_names` variable defined in some other cell.
feature_names = list(X_train.columns)


def _predict_proba_with_names(rows):
    """Return class probabilities from best_lgbm for LIME's perturbed samples.

    LIME passes bare NumPy arrays, while the model was fitted on a DataFrame;
    the column names are restored before predicting so the input matches what
    the model was trained on.
    """
    return best_lgbm.predict_proba(pd.DataFrame(rows, columns=feature_names))


# LIME tabular explainer built from the training data. random_state makes the
# sampled neighbourhood (and therefore the explanation) reproducible.
explainer_lime = lime.lime_tabular.LimeTabularExplainer(
    X_train.values,                   # training data as a NumPy array
    feature_names=feature_names,      # column names
    class_names=['Lost', 'Won'],      # assumes class 0 = Lost, 1 = Won -- TODO confirm
    mode='classification',
    random_state=42
)

# Explain one test instance (positional index 0), reporting every feature.
i = 0
exp = explainer_lime.explain_instance(
    X_test.iloc[i].values,            # instance to explain
    _predict_proba_with_names,        # probability function of the LightGBM model
    num_features=X_test.shape[1]      # include all features in the explanation
)

# Save the explanation as a standalone HTML file.
exp.save_to_file('lime_explanation.html')


### Métodos globales

In [None]:
import PyALE
import matplotlib.pyplot as plt

# Numeric features to analyse (categorical ones are excluded).
numeric_features = [
    'product_A_sold_in_the_past',
    'product_B_sold_in_the_past',
    'product_A_recommended',
    'product_A',
    'product_C',
    'product_D',
    'cust_hitrate',
    'cust_interactions',
    'cust_contracts',
    'opp_month',
    'opp_old'
]


# Accumulated Local Effects for each feature, computed on the test set with the
# LightGBM model from this section. (The previous version referenced
# `best_bst`, which is not defined in this notebook; the resulting NameError
# was swallowed by the except clause, so no plot was ever produced.)
# NOTE(review): PyALE presumably calls model.predict, which for a classifier
# returns hard labels rather than probabilities -- confirm this is intended.
for feature in numeric_features:
    try:
        ale_eff = PyALE.ale(
            X=X_test,
            model=best_lgbm,
            feature=[feature],
            grid_size=50,
            include_CI=False
        )
        ale_eff.plot()
        plt.title(f"ALE for {feature} (LightGBM)")
        plt.show()
    except Exception as e:
        # Best effort: report the failure and continue with the next feature.
        print(f"Error generando ALE para {feature}: {e}")