# Intro
En este notebook ya dejo presentado el modelo final a usar que luego ira a parar a .py _train.py_ y _predict.py_

# Imports

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import uniform, randint
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier, plot_importance
from joblib import dump
import shap
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    average_precision_score,
    classification_report,
    RocCurveDisplay,
    PrecisionRecallDisplay
)

# Funciones

In [19]:
def get_train_test(X,y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    return X_train, X_test, y_train, y_test

def get_pos_weight(y_train):
    # Calculate class imbalance ratio
    neg, pos = np.bincount(y_train)
    scale_pos_weight = neg / pos
    print(f"Class ratio: {neg}:{pos}, scale_pos_weight = {scale_pos_weight:.2f}")
    
    return scale_pos_weight

In [25]:
def generate_train_test_report(name,model,params,X_train,X_test,y_train,y_test,cv):
    print(f"\n>>> Training {name} ...")

    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        scoring='average_precision',
        n_iter=25,              
        cv=cv,
        n_jobs=-1,
        random_state=42,
        verbose=3
    )

    search.fit(X_train, y_train)


    # Evaluate best model
    best_model = search.best_estimator_
    best_parameter = search.best_params_
    y_pred = best_model.predict(X_test)
    y_prob = best_model.predict_proba(X_test)[:, 1]

    metrics = {
        'model': name,
        'best_params': search.best_params_,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, zero_division=0),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_prob),
        'pr_auc': average_precision_score(y_test,y_prob)
    }

    print(f"Classification report for {name}:\n", classification_report(y_test, y_pred))
    
    return best_model,best_parameter,metrics

# Load

In [26]:
df_train = pd.read_csv("../data/raw/cs-training.csv")
X = df_train.drop(['SeriousDlqin2yrs','Unnamed: 0','age','NumberOfDependents','DebtRatio','MonthlyIncome','NumberOfOpenCreditLinesAndLoans','NumberRealEstateLoansOrLines'],axis = 1)
y = df_train['SeriousDlqin2yrs']

In [27]:
X_train, X_test, y_train, y_test = get_train_test(X,y)

In [28]:
y_train

57836     0
132895    0
27981     0
37852     0
103813    0
         ..
18048     0
3895      0
109980    0
74354     0
80530     0
Name: SeriousDlqin2yrs, Length: 120000, dtype: int64

# Train model

## Model initialization

In [29]:
model = XGBClassifier(
                eval_metric='logloss',
                random_state=42,
                scale_pos_weight=get_pos_weight(y_train)
                                        )

Class ratio: 111979:8021, scale_pos_weight = 13.96


## Hyperparameters

In [30]:
param_distributions = {
                'n_estimators': randint(100, 600),
                'max_depth': randint(3, 10),
                'learning_rate': uniform(0.01, 0.3),
                'subsample': uniform(0.6, 0.4),
                'colsample_bytree': uniform(0.6, 0.4),
                'min_child_weight': randint(1, 10),
                'gamma': uniform(0, 5),
                'reg_lambda': uniform(0, 5),
                'reg_alpha': uniform(0, 5),
                'base_score': uniform(0.1,0.8)
            }

## Train setup

In [31]:
name = 'final_xgboost_model'
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
best_model,best_params,metrics = generate_train_test_report(name,model,param_distributions,X_train, X_test, y_train, y_test,cv)    


>>> Training final_xgboost_model ...
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Classification report for final_xgboost_model:
               precision    recall  f1-score   support

           0       0.98      0.76      0.86     27995
           1       0.19      0.79      0.31      2005

    accuracy                           0.76     30000
   macro avg       0.59      0.78      0.58     30000
weighted avg       0.93      0.76      0.82     30000



In [32]:
metrics

{'model': 'final_xgboost_model',
 'best_params': {'base_score': np.float64(0.7793787283953424),
  'colsample_bytree': np.float64(0.8630451569201374),
  'gamma': np.float64(2.841543016677358),
  'learning_rate': np.float64(0.038102430348427745),
  'max_depth': 5,
  'min_child_weight': 6,
  'n_estimators': 474,
  'reg_alpha': np.float64(1.2523090930279208),
  'reg_lambda': np.float64(2.9493542378027193),
  'subsample': np.float64(0.9915571433100037)},
 'accuracy': 0.7638666666666667,
 'precision': 0.1921445023639229,
 'recall': 0.7905236907730673,
 'f1': 0.30914764969767894,
 'roc_auc': 0.8559003901213211,
 'pr_auc': 0.38764139104925116}

In [33]:
explainer = shap.TreeExplainer(best_model,data=X_train)

In [42]:
import joblib
import pandas as pd
import numpy as np
import shap
from scipy.stats import uniform, randint

from xgboost import XGBClassifier
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    RandomizedSearchCV
)
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    average_precision_score,
    classification_report,
)


class ModelTrainer:

    def __init__(
        self,
        data_path: str,
        drop_columns: str,
        target: str,
        model_params: dict,
        model_path: str,
        explainer_path: str,
        model_name: str
    ):
        self.data_path = data_path
        self.drop_columns = drop_columns
        self.target = target
        self.model_params = model_params
        self.model_path = model_path
        self.explainer_path = explainer_path
        self.model_name = model_name

        # internal variables
        self.df = None
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.best_model = None
        self.best_params = None
        self.metrics = None


    # ----------------------------------------------------------
    # LOAD + PREPARE DATA
    # ----------------------------------------------------------
    def load_data(self):
        self.df = pd.read_csv(self.data_path)
        self.X = self.df.drop(columns=self.drop_columns)
        self.y = self.df[self.target]
        print(f"Loaded dataset: {self.df.shape} rows")


    def prepare_data(self):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=0.2,
            stratify=self.y,
            random_state=42
        )
        print("Data prepared:")
        print(f" - Train shape: {self.X_train.shape}")
        print(f" - Test shape: {self.X_test.shape}")
        print(f" - Y Train shape: {self.y_train.shape}")
        print(f" - Y Test shape: {self.y_test.shape}")


    # ----------------------------------------------------------
    # UTILS
    # ----------------------------------------------------------
    def get_pos_weight(self):
        y_arr = np.asarray(self.y_train).astype(int).reshape(-1)
        neg, pos = np.bincount(y_arr)
        w = neg / pos
        print(f"Class ratio {neg}:{pos}, scale_pos_weight = {w:.2f}")
        return w


    # ----------------------------------------------------------
    # TRAIN
    # ----------------------------------------------------------
    def train(self):
        print(f"\n>>> Training {self.model_name} ...")

        model = XGBClassifier(
            eval_metric='logloss',
            random_state=42,
            scale_pos_weight=self.get_pos_weight()
        )

        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        search = RandomizedSearchCV(
            estimator=model,
            param_distributions=self.model_params,
            scoring='average_precision',
            n_iter=25,
            cv=cv,
            n_jobs=-1,
            random_state=42,
            verbose=3,
        )

        search.fit(self.X_train, self.y_train)

        self.best_model = search.best_estimator_
        self.best_params = search.best_params_
        print("Best parameters:", self.best_params)


    # ----------------------------------------------------------
    # EVALUATE
    # ----------------------------------------------------------
    def evaluate(self):
        y_pred = self.best_model.predict(self.X_test)
        y_prob = self.best_model.predict_proba(self.X_test)[:, 1]

        self.metrics = {
            "model": self.model_name,
            "best_params": self.best_params,
            "accuracy": accuracy_score(self.y_test, y_pred),
            "precision": precision_score(self.y_test, y_pred, zero_division=0),
            "recall": recall_score(self.y_test, y_pred),
            "f1": f1_score(self.y_test, y_pred),
            "roc_auc": roc_auc_score(self.y_test, y_prob),
            "pr_auc": average_precision_score(self.y_test, y_prob),
        }

        print("\nClassification Report:\n")
        print(classification_report(self.y_test, y_pred))

        print("\nMetrics:")
        for k, v in self.metrics.items():
            print(f"{k}: {v}")

        return self.metrics


    # ----------------------------------------------------------
    # SAVE ARTIFACTS
    # ----------------------------------------------------------
    def save(self):
        # SHAP Explainer
        explainer = shap.TreeExplainer(self.best_model)

        # Save full training artifact
        artifact = {
            "model": self.best_model,
            "features": list(self.X_train.columns),
            "params": self.best_params,
            "metrics": self.metrics
        }

        joblib.dump(artifact, self.model_path)
        joblib.dump(explainer, self.explainer_path)

        print(f"\nModel saved to {self.model_path}")
        print(f"Explainer saved to {self.explainer_path}")


    # ----------------------------------------------------------
    # FULL PIPELINE
    # ----------------------------------------------------------
    def run(self):
        self.load_data()
        self.prepare_data()
        self.train()
        self.evaluate()
        #self.save()


# ----------------------------------------------------------
# RUN SCRIPT
# ----------------------------------------------------------
if __name__ == "__main__":
    param_distributions = {
                'n_estimators': randint(100, 600),
                'max_depth': randint(3, 10),
                'learning_rate': uniform(0.01, 0.3),
                'subsample': uniform(0.6, 0.4),
                'colsample_bytree': uniform(0.6, 0.4),
                'min_child_weight': randint(1, 10),
                'gamma': uniform(0, 5),
                'reg_lambda': uniform(0, 5),
                'reg_alpha': uniform(0, 5),
                'base_score': uniform(0.1,0.8)
            }

    trainer = ModelTrainer(
        data_path="../data/raw/cs-training.csv",
        drop_columns=['SeriousDlqin2yrs','Unnamed: 0'],
        target=['SeriousDlqin2yrs'],
        model_params=param_distributions,
        model_path="../models/xgb_model.joblib",
        explainer_path="../models/xgb_shap_explainer.joblib",
        model_name="XGBoost_Model"
    )



In [43]:
trainer.run()

Loaded dataset: (150000, 12) rows
Data prepared:
 - Train shape: (120000, 10)
 - Test shape: (30000, 10)
 - Y Train shape: (120000, 1)
 - Y Test shape: (30000, 1)

>>> Training XGBoost_Model ...
Class ratio 111979:8021, scale_pos_weight = 13.96
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best parameters: {'base_score': np.float64(0.7682419964713904), 'colsample_bytree': np.float64(0.7283120259886944), 'gamma': np.float64(0.9325925519992712), 'learning_rate': np.float64(0.022232542466429174), 'max_depth': 6, 'min_child_weight': 4, 'n_estimators': 502, 'reg_alpha': np.float64(3.555747662190089), 'reg_lambda': np.float64(4.047505230698577), 'subsample': np.float64(0.7394663949166917)}

Classification Report:

              precision    recall  f1-score   support

           0       0.98      0.81      0.89     27995
           1       0.23      0.76      0.35      2005

    accuracy                           0.81     30000
   macro avg       0.60      0.79      0.62     

In [44]:
trainer.save()


Model saved to ../models/xgb_model.joblib
Explainer saved to ../models/xgb_shap_explainer.joblib
