In [40]:
# 02_optimizacion_modelos_sms_spam.ipynb

import pandas as pd
import numpy as np

# Scikit-learn y métricas
from sklearn.model_selection       import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline              import Pipeline
from sklearn.preprocessing         import FunctionTransformer, StandardScaler
from sklearn.metrics               import classification_report, confusion_matrix, roc_curve, auc, f1_score

# Clasificadores
from sklearn.linear_model          import LogisticRegression
from sklearn.svm                   import SVC
from sklearn.ensemble              import RandomForestClassifier, VotingClassifier
from sklearn.neighbors             import KNeighborsClassifier
from sklearn.naive_bayes           import MultinomialNB
from sklearn.neural_network        import MLPClassifier
from sklearn.ensemble              import StackingClassifier

# Boosting libraries
import xgboost as xgb
import lightgbm as lgb

import matplotlib.pyplot as plt

# Cargar datos
ruta = "../datos_raw/SMSSpamCollection"
df = pd.read_csv(ruta, sep="\t", header=None, names=["label", "text"], encoding="utf-8")
df["label_bin"] = df["label"].map({"ham": 0, "spam": 1})

# División inicial en train/test (mantener proporción)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df["text"], df["label_bin"],
    test_size=0.2,
    random_state=42,
    stratify=df["label_bin"]
)

print("Datos cargados. Tamaño train:", X_train_raw.shape, "tamaño test:", X_test_raw.shape)

Datos cargados. Tamaño train: (4457,) tamaño test: (1115,)


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

def get_vectorizers():
    """
    Devuelve un diccionario con distintos vectorizadores para texto.
    Clave: nombre breve; valor: instancia del vectorizador.
    """
    return {
        # TF-IDF unigrama (mínimo freq=2)
        "tfidf_uni": TfidfVectorizer(
            lowercase=True,
            strip_accents="unicode",
            stop_words="english",
            ngram_range=(1, 1),
            min_df=2
        ),
        # TF-IDF uni+bi+tri-grama
        "tfidf_1_3": TfidfVectorizer(
            lowercase=True,
            strip_accents="unicode",
            stop_words="english",
            ngram_range=(1, 3),
            min_df=2
        ),
        # CountVectorizer uni+bi-grama
        "count_1_2": CountVectorizer(
            lowercase=True,
            strip_accents="unicode",
            stop_words="english",
            ngram_range=(1, 2),
            min_df=2
        ),
        # TF-IDF de caracteres (n-gramas de longitud 2 a 5)
        "tfidf_char_2_5": TfidfVectorizer(
            lowercase=True,
            analyzer="char_wb",
            ngram_range=(2, 5),
            min_df=5
        )
    }

# Prueba rápida: mostrar shapes de cada vectorización sobre train
vectorizadores = get_vectorizers()
for nombre, vec in vectorizadores.items():
    X_vec = vec.fit_transform(X_train_raw)
    print(f"{nombre}: {X_vec.shape}")

tfidf_uni: (4457, 3376)
tfidf_1_3: (4457, 10231)
count_1_2: (4457, 7370)
tfidf_char_2_5: (4457, 17067)


In [52]:
import re

def extra_features(text_series):
    """
    Recibe una Series de strings y retorna un DataFrame con columnas:
      - longitud_texto
      - num_palabras
      - num_puntuacion
      - prop_numerico
    """
    longitud = text_series.str.len()
    num_palabras = text_series.str.split().str.len()
    num_puntuacion = text_series.apply(lambda s: len(re.findall(r"[^\w\s]", s)))  # caracteres que no son letra, digito o espacio
    prop_numerico = text_series.apply(
        lambda s: sum(c.isdigit() for c in s) / max(len(s), 1)
    )
    return pd.DataFrame({
        "longitud_texto": longitud,
        "num_palabras": num_palabras,
        "num_puntuacion": num_puntuacion,
        "prop_numerico": prop_numerico
    })

# Ejemplo rápido
df_feats = extra_features(pd.Series(["Hola, ¿cómo estás? 123", "Esta es otra línea."]))
print(df_feats)

In [53]:
import pandas as pd
import re

def extra_features(text_series: pd.Series) -> pd.DataFrame:
    """
    Recibe una Series de strings y retorna un DataFrame con columnas:
      - longitud_texto: longitud total de cada texto
      - num_palabras: número de palabras (separadas por espacios)
      - num_puntuacion: cantidad de caracteres de puntuación (cualquier cosa que no sea letra, dígito o espacio)
      - prop_numerico: proporción de caracteres numéricos sobre la longitud total
    """
    # Longitud de cada texto (número de caracteres)
    longitud = text_series.str.len()
    # Número de palabras en cada texto
    num_palabras = text_series.str.split().str.len()
    # Cantidad de caracteres que no son palabra ni espacio (puntuación, símbolos, tildes, etc.)
    num_puntuacion = text_series.apply(lambda s: len(re.findall(r"[^\w\s]", s)))
    # Proporción de caracteres numéricos en el texto (evitando división por cero)
    prop_numerico = text_series.apply(
        lambda s: sum(c.isdigit() for c in s) / max(len(s), 1)
    )

    return pd.DataFrame({
        "longitud_texto": longitud,
        "num_palabras": num_palabras,
        "num_puntuacion": num_puntuacion,
        "prop_numerico": prop_numerico
    })

def extract_numeric_features(X):
    """
    Función diseñada para usar en FunctionTransformer de sklearn.
    Recibe un arreglo-like X (una columna de texto), lo convierte a Series de pandas
    y devuelve las mismas columnas numéricas que produce extra_features.
    """
    # X vendrá como un array de strings; para usar extra_features convertimos a Series
    return extra_features(pd.Series(X))

# ------------------------------------------
# Ejemplo rápido de uso:
if __name__ == "__main__":
    ejemplos = pd.Series([
        "Hola, ¿cómo estás? 123",
        "Esta es otra línea.",
        "¡Prueba 4 U! #spam",
        ""
    ])

    df_feats = extra_features(ejemplos)
    print("Características extraídas:\n")
    print(df_feats)

Características extraídas:

   longitud_texto  num_palabras  num_puntuacion  prop_numerico
0              22             4               3       0.136364
1              19             4               1       0.000000
2              18             4               3       0.055556
3               0             0               0       0.000000


In [27]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Preprocesador que aplica vectorización al campo “text” y extrae features numéricos
def build_pipeline(vectorizer):
    # 1) Transformer para aplicar vectorizador sobre X["text"]
    text_transformer = Pipeline([
        ("vect", vectorizer)
    ])

    # 2) Transformer para aplicar extra_features y luego scaling
    numeric_transformer = Pipeline([
        ("feat", FunctionTransformer(lambda X: extra_features(pd.Series(X)), validate=False)),
        ("scale", StandardScaler())
    ])

    # ColumnTransformer: 
    preprocessor = ColumnTransformer(
        transformers=[
            ("text", text_transformer, "text"),       # sobre la columna “text”
            ("num", numeric_transformer, "text")      # de nuevo “text” (la función extra_features lee la serie)
        ],
        remainder="drop"
    )

    # Pipeline final: preprocesador + clasificador
    pipe = Pipeline([
        ("pre", preprocessor),
        ("clf", LogisticRegression(max_iter=1000, random_state=42))
    ])
    return pipe

# Prueba rápida con uno de los vectorizadores
pipe_ejemplo = build_pipeline(vectorizadores["tfidf_1_3"])
pipe_ejemplo.fit(X_train_raw.to_frame(name="text"), y_train)
print("Entrenado pipeline de ejemplo, score en train:", pipe_ejemplo.score(X_train_raw.to_frame(name="text"), y_train))

Entrenado pipeline de ejemplo, score en train: 0.9807045097599282


In [28]:
clasificadores = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "SVM_Linear": SVC(kernel="linear", probability=True, random_state=42),
    "SVM_RBF": SVC(kernel="rbf", probability=True, random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "KNeighbors": KNeighborsClassifier(n_neighbors=5),
    "MultinomialNB": MultinomialNB(),
    "MLP": MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42),
    "LightGBM": lgb.LGBMClassifier(random_state=42)
}

In [29]:
vectorizador_base = vectorizadores["tfidf_1_3"]

In [31]:
# —————————————————————————————————————————————————————————————
# Probar distintos clasificadores básicos (con corrección para MultinomialNB)
# —————————————————————————————————————————————————————————————

from sklearn.model_selection import cross_val_score

resultados_cv = {}

for nombre_clf, clf in clasificadores.items():
    if nombre_clf == "MultinomialNB":
        # Pipeline reducido: únicamente vectorizador → MultinomialNB
        pipe_nb = Pipeline([
            ("vect", vectorizador_base),   # vectorizador solo para texto
            ("clf", clf)                   # MultinomialNB
        ])
        # Para este pipeline, X debe ser una Serie de texto (no DataFrame con "text")
        X_para_cv = X_train_raw
        print(f"Evaluando {nombre_clf} con pipeline simplificado (solo texto)...")
        # cross_val_score espera X en forma de array o Series; MultinomialNB no acepta DataFrame
        scores = cross_val_score(
            pipe_nb,
            X_para_cv,
            y_train,
            cv=5,
            scoring="f1",     # f1 para la etiqueta positiva ("spam")
            n_jobs=-1
        )
        resultados_cv[nombre_clf] = {
            "mean_f1": np.mean(scores),
            "std_f1":  np.std(scores)
        }
        print(f"{nombre_clf}: F1 promedio = {np.mean(scores):.4f} ± {np.std(scores):.4f}\n")

    else:
        # Pipeline completo: vectorizador + features numéricas → clasificador
        pipe = build_pipeline(vectorizador_base)
        pipe.set_params(clf=clf)
        # Aquí X_train_raw.to_frame(name="text") es un DataFrame con columna "text"
        X_para_cv = X_train_raw.to_frame(name="text")
        print(f"Evaluando {nombre_clf} con pipeline completo (texto + features numéricas)...")
        # cross_val_score
        scores = cross_val_score(
            pipe,
            X_para_cv,
            y_train,
            cv=5,
            scoring="f1",
            n_jobs=-1
        )
        resultados_cv[nombre_clf] = {
            "mean_f1": np.mean(scores),
            "std_f1":  np.std(scores)
        }
        print(f"{nombre_clf}: F1 promedio = {np.mean(scores):.4f} ± {np.std(scores):.4f}\n")

# Convertir resultados a DataFrame y ordenar
df_resultados_cv = pd.DataFrame(resultados_cv).T
df_resultados_cv = df_resultados_cv.sort_values(by="mean_f1", ascending=False)
df_resultados_cv

Evaluando LogisticRegression con pipeline completo (texto + features numéricas)...
LogisticRegression: F1 promedio = 0.9049 ± 0.0209

Evaluando SVM_Linear con pipeline completo (texto + features numéricas)...
SVM_Linear: F1 promedio = 0.9555 ± 0.0091

Evaluando SVM_RBF con pipeline completo (texto + features numéricas)...
SVM_RBF: F1 promedio = 0.9322 ± 0.0158

Evaluando RandomForest con pipeline completo (texto + features numéricas)...
RandomForest: F1 promedio = 0.9371 ± 0.0094

Evaluando KNeighbors con pipeline completo (texto + features numéricas)...
KNeighbors: F1 promedio = 0.9168 ± 0.0102

Evaluando MultinomialNB con pipeline simplificado (solo texto)...
MultinomialNB: F1 promedio = 0.8463 ± 0.0208

Evaluando MLP con pipeline completo (texto + features numéricas)...
MLP: F1 promedio = 0.9651 ± 0.0063

Evaluando XGBoost con pipeline completo (texto + features numéricas)...
XGBoost: F1 promedio = 0.9339 ± 0.0174

Evaluando LightGBM con pipeline completo (texto + features numéricas

Unnamed: 0,mean_f1,std_f1
MLP,0.965099,0.00629
SVM_Linear,0.955509,0.00907
RandomForest,0.937089,0.009404
XGBoost,0.933884,0.01737
SVM_RBF,0.932218,0.015762
LightGBM,0.92867,0.013416
KNeighbors,0.916759,0.010203
LogisticRegression,0.904882,0.020857
MultinomialNB,0.84631,0.020751


In [32]:
# 1) Definir pipeline base con RandomForestClassifier
pipe_rf = build_pipeline(vectorizadores["tfidf_1_3"])
pipe_rf.set_params(clf=RandomForestClassifier(random_state=42))

# 2) Definir espacio de búsqueda
param_grid_rf = {
    "pre__text__vect__ngram_range": [(1,1), (1,2)],                 # probar unigramas vs. bi-gramas
    "pre__text__vect__min_df": [1, 2, 5],                            # frecuencia mínima de término
    "clf__n_estimators": [100, 200],
    "clf__max_depth": [None, 10, 20],
    "clf__min_samples_split": [2, 5]
}

# 3) Crear GridSearchCV
grid_rf = GridSearchCV(
    pipe_rf,
    param_grid=param_grid_rf,
    cv=3,
    scoring="f1",
    n_jobs=-1,
    verbose=1
)

# 4) Ejecutar búsqueda (tardará un poco)
grid_rf.fit(X_train_raw.to_frame(name="text"), y_train)

print("Mejores parámetros para Random Forest:")
print(grid_rf.best_params_)
print("Mejor F1 (cv):", grid_rf.best_score_)

Fitting 3 folds for each of 72 candidates, totalling 216 fits
Mejores parámetros para Random Forest:
{'clf__max_depth': None, 'clf__min_samples_split': 2, 'clf__n_estimators': 200, 'pre__text__vect__min_df': 5, 'pre__text__vect__ngram_range': (1, 2)}
Mejor F1 (cv): 0.946731187847467


In [34]:
from scipy.stats import randint

rand_dist_rf = {
    "pre__text__vect__ngram_range": [(1,1), (1,2)],
    "pre__text__vect__min_df": [1, 2, 5],
    "clf__n_estimators": randint(50, 300),
    "clf__max_depth": [None, 10, 20, 30],
    "clf__min_samples_split": [2, 5, 10]
}

rand_search_rf = RandomizedSearchCV(
    pipe_rf,
    param_distributions=rand_dist_rf,
    n_iter=20,
    cv=3,
    scoring="f1",
    n_jobs=-1,
    random_state=42,
    verbose=1
)

rand_search_rf.fit(X_train_raw.to_frame(name="text"), y_train)
print("Mejores parámetros (Randomized RF):", rand_search_rf.best_params_)
print("Mejor F1 (cv):", rand_search_rf.best_score_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Mejores parámetros (Randomized RF): {'clf__max_depth': None, 'clf__min_samples_split': 10, 'clf__n_estimators': 84, 'pre__text__vect__min_df': 2, 'pre__text__vect__ngram_range': (1, 1)}
Mejor F1 (cv): 0.9421997307849429


In [36]:
from sklearn.ensemble import VotingClassifier

# 1) Definir los clasificadores ya ajustados
clf_lr = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
clf_xgb = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    eval_metric="logloss",
    random_state=42
)

# 2) Construir el pipeline con TF-IDF (1–3) + features numéricas
pipe_voting = build_pipeline(vectorizadores["tfidf_1_3"])
pipe_voting.set_params(
    clf=VotingClassifier(
        estimators=[("lr", clf_lr), ("xgb", clf_xgb)],
        voting="soft",
        weights=[1, 2]
    )
)

# 3) Validación cruzada (5‐fold) midiendo F1
scores_voting = cross_val_score(
    pipe_voting,
    X_train_raw.to_frame(name="text"),
    y_train,
    cv=5,
    scoring="f1",
    n_jobs=-1
)

print("VotingClassifier (LR + XGB) F1 promedio: ",
      f"{np.mean(scores_voting):.4f} ± {np.std(scores_voting):.4f}")

VotingClassifier (LR + XGB) F1 promedio:  0.9437 ± 0.0150


In [37]:
# 1) Entrenar el pipeline completo sobre TODO el train set
pipe_voting.fit(X_train_raw.to_frame(name="text"), y_train)

# 2) Predecir en X_test
y_test_pred_voting = pipe_voting.predict(X_test_raw.to_frame(name="text"))

# 3) Métricas en test
from sklearn.metrics import classification_report, confusion_matrix

print("Reporte en test para VotingClassifier:\n")
print(classification_report(y_test, y_test_pred_voting, target_names=["ham", "spam"]))

cm_voting = confusion_matrix(y_test, y_test_pred_voting)
print("Matriz de confusión (test):\n", cm_voting)

Reporte en test para VotingClassifier:

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       966
        spam       0.98      0.91      0.94       149

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115

Matriz de confusión (test):
 [[963   3]
 [ 14 135]]


In [39]:
from sklearn.linear_model import LogisticRegression as LR_base

# Definir base learners y meta-learner
estimators_stack = [
    ("rf", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("svc", SVC(kernel="linear", probability=True, random_state=42)),
    ("xgb", xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42))
]
meta_learner = LR_base()

stacking_clf = StackingClassifier(
    estimators=estimators_stack,
    final_estimator=meta_learner,
    cv=5,
    n_jobs=-1,
    passthrough=True  # incluir predicciones originales como características
)

pipe_stacking = build_pipeline(vectorizadores["tfidf_1_3"])
pipe_stacking.set_params(clf=stacking_clf)

scores_stack = cross_val_score(
    pipe_stacking,
    X_train_raw.to_frame(name="text"),
    y_train,
    cv=5,
    scoring="f1"
)
print("StackingClassifier F1 promedio:", np.mean(scores_stack))

StackingClassifier F1 promedio: 0.9598418961417335


In [42]:
# df_resultados_cv (logística, SVM, RF, …)
# puedes agregar Voting y Stacking manualmente:

df_resultados_cv.loc["Voting_LR_XGB", "mean_f1"] = np.mean(scores_voting)
df_resultados_cv.loc["Voting_LR_XGB", "std_f1"]  = np.std(scores_voting)

df_resultados_cv.loc["Stacking", "mean_f1"] = np.mean(scores_stack)
df_resultados_cv.loc["Stacking", "std_f1"]  = np.std(scores_stack)

# Ordenar de nuevo
df_resultados_cv = df_resultados_cv.sort_values(by="mean_f1", ascending=False)
df_resultados_cv

Unnamed: 0,mean_f1,std_f1
MLP,0.965099,0.00629
Stacking,0.959842,0.005913
SVM_Linear,0.955509,0.00907
Voting_LR_XGB,0.943697,0.015011
RandomForest,0.937089,0.009404
XGBoost,0.933884,0.01737
SVM_RBF,0.932218,0.015762
LightGBM,0.92867,0.013416
KNeighbors,0.916759,0.010203
LogisticRegression,0.904882,0.020857


In [47]:
# Recuperar best_params_ de grid_xgb
mejores_params_xgb = grid_xgb.best_params_

# Crear pipeline definitivo
pipeline_final = build_pipeline(vectorizadores["tfidf_1_3"])
pipeline_final.set_params(
    pre__text__vect__ngram_range=mejores_params_xgb["pre__text__vect__ngram_range"],
    pre__text__vect__min_df=mejores_params_xgb["pre__text__vect__min_df"],
    clf=xgb.XGBClassifier(
        n_estimators=mejores_params_xgb["clf__n_estimators"],
        max_depth=mejores_params_xgb["clf__max_depth"],
        learning_rate=mejores_params_xgb["clf__learning_rate"],
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=42
    )
)

# Entrenar sobre todo el conjunto de entrenamiento
pipeline_final.fit(X_train_raw.to_frame(name="text"), y_train)

# Evaluar en test (X_test_raw)
y_test_pred = pipeline_final.predict(X_test_raw.to_frame(name="text"))
print("Reporte en conjunto test:\n")
print(classification_report(y_test, y_test_pred, target_names=["ham", "spam"]))

# Matriz de confusión en test
cm_test = confusion_matrix(y_test, y_test_pred)
print("Matriz de confusión (test):\n", cm_test)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Reporte en conjunto test:

              precision    recall  f1-score   support

         ham       0.99      0.99      0.99       966
        spam       0.96      0.91      0.93       149

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Matriz de confusión (test):
 [[960   6]
 [ 13 136]]


In [48]:
# Supongamos que pipe_xgb es tu pipeline final entrenado
y_proba = pipeline_final.predict_proba(X_test_raw.to_frame(name="text"))[:, 1]  # probabilidad de spam
umbral = 0.45
y_test_pred_adjusted = (y_proba >= umbral).astype(int)

from sklearn.metrics import classification_report, confusion_matrix
print("Reporte con umbral 0.45:\n")
print(classification_report(y_test, y_test_pred_adjusted, target_names=["ham", "spam"]))
print("Matriz de confusión:\n", confusion_matrix(y_test, y_test_pred_adjusted))

Reporte con umbral 0.45:

              precision    recall  f1-score   support

         ham       0.99      0.99      0.99       966
        spam       0.95      0.91      0.93       149

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Matriz de confusión:
 [[959   7]
 [ 13 136]]


In [49]:
import numpy as np

# Índices de falsos negativos (spam→ham)
mask_fn = (y_test == 1) & (y_test_pred == 0)
sms_fn = X_test_raw[mask_fn]
print("Ejemplos de spam clasificados como ham:\n")
for sms in sms_fn.sample(5, random_state=42):
    print("-", sms)

Ejemplos de spam clasificados como ham:

- RCT' THNQ Adrian for U text. Rgds Vatian
- Do you realize that in about 40 years, we'll have thousands of old ladies running around with tattoos?
- FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
- ROMCAPspam Everyone around should be responding well to your presence since you are so warm and outgoing. You are bringing in a real breath of sunshine.
- Email AlertFrom: Jeri StewartSize: 2KBSubject: Low-cost prescripiton drvgsTo listen to email call 123


In [50]:
y_test_prob = pipeline_final.predict_proba(X_test_raw.to_frame(name="text"))[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_test_prob)
roc_auc_test = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, label=f"ROC (AUC = {roc_auc_test:.3f})")
plt.plot([0,1],[0,1],"k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Curva ROC - Modelo final (XGBoost)")
plt.legend(loc="lower right")
plt.grid(True)
plt.savefig("../resultados/roc_final_xgb.png", dpi=150)
plt.close()

In [55]:
import pandas as pd
import re

def extra_features(text_series: pd.Series) -> pd.DataFrame:
    """
    Toma una Series de strings y devuelve un DataFrame con columnas:
      - longitud_texto
      - num_palabras
      - num_puntuacion
      - prop_numerico
    """
    longitud = text_series.str.len()
    num_palabras = text_series.str.split().str.len()
    num_puntuacion = text_series.apply(lambda s: len(re.findall(r"[^\w\s]", s)))
    prop_numerico = text_series.apply(
        lambda s: sum(c.isdigit() for c in s) / max(len(s), 1)
    )
    return pd.DataFrame({
        "longitud_texto": longitud,
        "num_palabras": num_palabras,
        "num_puntuacion": num_puntuacion,
        "prop_numerico": prop_numerico
    })

def extract_numeric_features(X):
    """
    Función que recibe un arreglo-like X (una columna de texto) y devuelve las features numéricas.
    Se usa en FunctionTransformer, con X como array de strings.
    """
    # X vendrá como un array de strings; para usar extra_features convertimos a Series
    return extra_features(pd.Series(X))

In [56]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def build_pipeline(vectorizer):
    # Transformer para el texto
    text_transformer = Pipeline([
        ("vect", vectorizer)
    ])

    # Transformer para las features numéricas usando la función definida arriba
    numeric_transformer = Pipeline([
        ("feat", FunctionTransformer(extract_numeric_features, validate=False)),
        ("scale", StandardScaler())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("text", text_transformer, "text"),   # aplica vectorizer a la columna “text”
            ("num", numeric_transformer, "text")  # aplica extract_numeric_features a la misma columna
        ],
        remainder="drop"
    )

    pipe = Pipeline([
        ("pre", preprocessor),
        ("clf", None)  # luego se reemplaza con el clasificador que quieras
    ])
    return pipe

In [57]:
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

# Supongamos que vectorizadores["tfidf_1_3"] ya existe
vectorizador = vectorizadores["tfidf_1_3"]

# Configura el clasificador XGBoost sin use_label_encoder
clf_xgb = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    eval_metric="logloss",
    random_state=42
)

# Crea el pipeline final y ajusta
pipeline_final = build_pipeline(vectorizador)
pipeline_final.set_params(clf=clf_xgb)

# Entrena sobre el conjunto de entrenamiento completo
pipeline_final.fit(X_train_raw.to_frame(name="text"), y_train)

0,1,2
,steps,"[('pre', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('text', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,'unicode'
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,func,<function ext...0025F0F25CD60>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [58]:
import joblib

joblib.dump(pipeline_final, "../modelos/pipeline_final_sms_spam_xgb.joblib")

['../modelos/pipeline_final_sms_spam_xgb.joblib']

In [59]:
report_final = classification_report(y_test, y_test_pred, target_names=["ham", "spam"])
with open("../resultados/reporte_final_xgb.txt", "w", encoding="utf-8") as f:
    f.write(report_final)

In [60]:
df_resultados_cv.to_csv("../resultados/comparacion_algoritmos_cv.csv")