# Preprocesamiento

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTETomek
from sklearn.metrics import f1_score, precision_score, recall_score
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, roc_auc_score, classification_report, confusion_matrix

In [1]:
# Mount Google Drive inside the Colab runtime so files under it can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Read the two raw CSV tables from the mounted Drive folder.
IDENTITY_CSV = "/content/drive/MyDrive/train_identity.csv"
TRANSACTION_CSV = "/content/drive/MyDrive/train_transaction.csv"
df_identity = pd.read_csv(IDENTITY_CSV)
df_transaction = pd.read_csv(TRANSACTION_CSV)

# Combine both tables on TransactionID. The identity table is the left side of
# a left join, so every identity row is kept and transactions without an
# identity record are not part of the result.
df_merged = df_identity.merge(df_transaction, on="TransactionID", how="left")

In [5]:
# Build a surrogate user_id so records can be tracked per customer: the string
# form of each column below, joined with "_" in exactly this order.
USER_ID_COLUMNS = [
    "card1", "card2", "card3", "card5", "card4", "card6",
    "addr1", "dist1",
    "P_emaildomain", "R_emaildomain",
    "id_02", "id_05", "id_06", "id_15", "id_30", "id_31",
    "DeviceInfo",
]

# Every part is cast to str before joining, so each column contributes text
# regardless of its original dtype.
user_id_parts = df_merged[USER_ID_COLUMNS].astype(str)
df_merged["user_id"] = user_id_parts.iloc[:, 0].str.cat(
    user_id_parts.iloc[:, 1:], sep="_"
)

In [10]:
# Keep only the first row found for each user_id, then make user_id the index.
df_merged = (
    df_merged
    .drop_duplicates(subset="user_id", keep='first')
    .set_index('user_id')
)

In [11]:
# Sanity check on the index: keep=False flags every entry whose user_id occurs
# more than once, so the sum is the number of rows involved in a duplicate.
duplicated_user_ids = df_merged.index.duplicated(keep=False)
n_duplicated = duplicated_user_ids.sum()
print(f"Cantidad de user_id duplicados: {n_duplicated}")

Cantidad de user_id duplicados: 0


In [12]:
# Remove every column whose share of nulls is above 40%, then remove the rows
# that still contain a null. df_merged itself is not modified.
threshold = 0.40
null_ratio = df_merged.isnull().mean()  # fraction of nulls per column
cols_to_drop = null_ratio.loc[null_ratio.gt(threshold)].index
print(f"Columnas eliminadas por nulos: {len(cols_to_drop)}")

df = df_merged.drop(columns=cols_to_drop).dropna()
df.shape

Columnas eliminadas por nulos: 187


(75975, 247)

# Encoding / Escalar

In [13]:
# Columns treated as categorical and label-encoded below
categorical_columns = [
    'DeviceType', 'DeviceInfo', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
    'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain',
    'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20',
    'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30',
    'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38',
    'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9'
]

# Label-encode each listed column that is still present in df (some may have
# been removed by the null filtering). Values are cast to str first, so the
# codes are assigned to the string form of each value.
# A separate encoder is fitted per column and stored in label_encoders, so the
# code -> original value mapping of every column stays available afterwards
# (a single shared encoder would only remember the last column it was fit on).
label_encoders = {}
for col in categorical_columns:
    if col not in df.columns:
        continue
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

In [14]:
# Numeric columns = every column that is neither categorical, nor the target,
# nor the TransactionID identifier. The identifier is excluded because it is
# not a magnitude and is removed from the feature matrix before modelling.
target_col = ["isFraud"]
id_cols = ["TransactionID"]
non_numeric = set(categorical_columns) | set(target_col) | set(id_cols)
numeric_columns = [col for col in df.columns if col not in non_numeric]

# Standardise the numeric columns with StandardScaler.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/validation split further down, so validation rows contribute to the
# fitted statistics. Fit on the training rows only if that matters.
scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# Modelado

In [15]:
# Target vector and feature matrix (target and transaction identifier removed)
y = df["isFraud"]
X = df.drop(columns=["isFraud","TransactionID"])

# Hold out 20% of the rows for validation. The fixed seed makes the split
# reproducible; stratifying on y keeps the fraud proportion approximately the
# same in both subsets (it does not balance the classes).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Report the size and fraud proportion of each subset
split_summary = (
    ("Tamaño del set de entrenamiento:", X_train.shape),
    ("Tamaño del set de validación:", X_val.shape),
    ("Proporción de fraudes en entrenamiento:", y_train.mean()),
    ("Proporción de fraudes en validación:", y_val.mean()),
)
for label, value in split_summary:
    print(label, value)

Tamaño del set de entrenamiento: (60780, 245)
Tamaño del set de validación: (15195, 245)
Proporción de fraudes en entrenamiento: 0.08084896347482724
Proporción de fraudes en validación: 0.08081605791378743


In [16]:
# Rebalance the classes with SMOTETomek (seeded for reproducibility).
# Only the training split is resampled; the validation split is left as is.
smt = SMOTETomek(random_state=42)
resampled = smt.fit_resample(X_train, y_train)
X_train_sm, y_train_sm = resampled

# Check the size and fraud proportion of the resampled training set
print("Tamaño del set de entrenamiento balanceado:", X_train_sm.shape)
print("Proporción de fraudes después del balanceo:", y_train_sm.mean())

Tamaño del set de entrenamiento balanceado: (111456, 245)
Proporción de fraudes después del balanceo: 0.5


In [17]:
def evaluate_thresholds(y_true, y_probs, metric='f1', thresholds=None):
    """
    Find the decision threshold that maximises the chosen metric.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_probs : array-like
        Predicted scores for the positive class; a sample is predicted
        positive when its score is >= the threshold. Lists are accepted.
    metric : {'f1', 'precision', 'recall'}, default 'f1'
        Metric to maximise.
    thresholds : array-like of float, optional
        Candidate thresholds. Defaults to 100 evenly spaced values from 0.1
        to 0.9 (both included).

    Returns
    -------
    float
        The candidate with the highest score (the first one on ties).

    Raises
    ------
    ValueError
        If `metric` is not supported or `thresholds` is empty.
    """
    # Validate the arguments before doing any work.
    if metric not in ('f1', 'precision', 'recall'):
        raise ValueError("Metric must be 'f1', 'precision' or 'recall'")

    if thresholds is None:
        thresholds = np.linspace(0.1, 0.9, 100)
    thresholds = np.asarray(thresholds, dtype=float)
    if thresholds.size == 0:
        raise ValueError("thresholds must not be empty")

    scorer = {
        'f1': f1_score,
        'precision': precision_score,
        'recall': recall_score,
    }[metric]

    # np.asarray lets plain Python lists be compared against a scalar threshold.
    y_probs = np.asarray(y_probs)

    # zero_division=0 keeps the score at 0 (without emitting a warning) when a
    # threshold is so high that no sample is predicted positive.
    scores = [
        scorer(y_true, (y_probs >= thresh).astype(int), zero_division=0)
        for thresh in thresholds
    ]

    best_idx = int(np.argmax(scores))
    return thresholds[best_idx]


In [18]:
# ⚙️ Final hyperparameters: model settings first, then run settings
model_params = {
    "n_estimators": 443,
    "max_depth": 7,
    "min_child_weight": 5,
    "gamma": 0.07695212100160928,
    "subsample": 0.865237466656049,
    "colsample_bytree": 0.7377072170135907,
    "reg_alpha": 0.8922143189961961,
    "reg_lambda": 0.9920686938647577,
    "learning_rate": 0.15753081640792435,
}
best_params = {
    **model_params,
    "eval_metric": "logloss",
    "random_state": 42,
    "n_jobs": -1,
}

# 🏗️ Train the final model on the resampled training set
final_model = XGBClassifier(**best_params).fit(X_train_sm, y_train_sm)

# 🔮 Positive-class probabilities for the validation set
y_proba_final = final_model.predict_proba(X_val)[:, 1]

# 🔧 Turn probabilities into labels with a fixed cut-off.
# NOTE(review): the threshold is a hard-coded constant; evaluate_thresholds is
# not called in this cell, so the value does not follow a retrained model.
optimal_threshold = 0.3020
y_pred_final = (y_proba_final >= optimal_threshold).astype(int)

# 📊 Final evaluation on the validation set
print("\n--- 📊 Evaluación FINAL del modelo optimizado ---")
final_metrics = [
    ("Threshold    ", optimal_threshold),
    ("Accuracy     ", accuracy_score(y_val, y_pred_final)),
    ("F1-score     ", f1_score(y_val, y_pred_final)),
    ("Recall       ", recall_score(y_val, y_pred_final)),
    ("Precision    ", precision_score(y_val, y_pred_final)),
    ("ROC AUC      ", roc_auc_score(y_val, y_proba_final)),  # uses scores, not labels
]
for label, value in final_metrics:
    print(f"{label}: {value:.4f}")

# 📄 Per-class report and confusion matrix
report = classification_report(y_val, y_pred_final, digits=4)
print("\n" + report)
print(confusion_matrix(y_val, y_pred_final))


--- 📊 Evaluación FINAL del modelo optimizado ---
Threshold    : 0.3020
Accuracy     : 0.9782
F1-score     : 0.8621
Recall       : 0.8428
Precision    : 0.8824
ROC AUC      : 0.9786

              precision    recall  f1-score   support

           0     0.9862    0.9901    0.9882     13967
           1     0.8824    0.8428    0.8621      1228

    accuracy                         0.9782     15195
   macro avg     0.9343    0.9165    0.9252     15195
weighted avg     0.9778    0.9782    0.9780     15195

[[13829   138]
 [  193  1035]]


In [None]:
# # 🔍 Predicciones en el set de entrenamiento (Chequeo de Overfitting / Underfitting)
# y_proba_train = final_model.predict_proba(X_train_sm)[:, 1]
# threshold_train = evaluate_thresholds(y_train_sm, y_proba_train)
# y_pred_train = (y_proba_train >= threshold_train).astype(int)

# # 📊 Evaluación en entrenamiento
# print("\n--- Evaluación en entrenamiento ---")
# print(f"Accuracy     : {accuracy_score(y_train_sm, y_pred_train):.4f}")
# print(f"F1-score     : {f1_score(y_train_sm, y_pred_train):.4f}")
# print(f"Recall       : {recall_score(y_train_sm, y_pred_train):.4f}")
# print(f"Precision    : {precision_score(y_train_sm, y_pred_train):.4f}")
# print(f"ROC AUC      : {roc_auc_score(y_train_sm, y_proba_train):.4f}")


# Resultados

In [19]:
# --- 1. Predict positive-class probabilities on the validation set ---
y_val_proba = final_model.predict_proba(X_val)[:, 1]

# --- 2. Build df_scores with user_id, fraud_score and amount ---
# X_val["TransactionAmt"] holds standardised values at this point, so the
# amount is looked up by user_id in df_merged, which was never scaled. The
# column is built once, directly from that source.
df_scores = pd.DataFrame({
    "user_id": X_val.index,  # the index already is user_id
    "fraud_score": y_val_proba,
    "TransactionAmt": X_val.index.map(df_merged["TransactionAmt"]),
})

# --- 3. Classify into risk groups ---
def assign_risk_group(score):
    """Return the risk-group label for a fraud score.

    Scores below 0.3 map to "Bajo riesgo", below 0.6 to "Riesgo medio",
    below 0.9 to "Riesgo alto"; any other score maps to "Fraude".
    """
    # (exclusive upper bound, label), checked in ascending order
    bands = (
        (0.3, "Bajo riesgo"),
        (0.6, "Riesgo medio"),
        (0.9, "Riesgo alto"),
    )
    for upper_bound, group in bands:
        if score < upper_bound:
            return group
    return "Fraude"

# Label every row with the risk group of its fraud score.
df_scores["risk_group"] = df_scores["fraud_score"].map(assign_risk_group)

# --- 4. Assign a service package according to the risk group ---
def assign_service_package(risk_group):
    """Return the service package offered to a risk group.

    The three non-fraud groups each have their own package; any other value
    (including "Fraude") gets "Sin Paquete de Servicios".
    """
    packages = (
        ("Bajo riesgo", "Paquete de Servicios completo"),
        ("Riesgo medio", "Paquete de Servicios medio"),
        ("Riesgo alto", "Paquete de Servicios simple"),
    )
    return next(
        (package for group, package in packages if risk_group == group),
        "Sin Paquete de Servicios",
    )

# Attach the service package that corresponds to each row's risk group.
df_scores["service_assignment"] = df_scores["risk_group"].map(assign_service_package)

# --- 5. Reorder columns ---
column_order = ["user_id", "fraud_score", "risk_group", "TransactionAmt", "service_assignment"]
df_scores = df_scores[column_order]
df_scores

Unnamed: 0,user_id,fraud_score,risk_group,TransactionAmt,service_assignment
0,13832_375.0_185.0_224.0_mastercard_debit_nan_n...,0.009481,Bajo riesgo,16.790,Paquete de Servicios completo
1,11408_555.0_185.0_226.0_visa_credit_nan_nan_ho...,0.000769,Bajo riesgo,41.043,Paquete de Servicios completo
2,2261_111.0_150.0_226.0_visa_debit_143.0_nan_gm...,0.000178,Bajo riesgo,25.000,Paquete de Servicios completo
3,13307_428.0_150.0_166.0_visa_debit_184.0_nan_g...,0.000023,Bajo riesgo,150.000,Paquete de Servicios completo
4,4621_555.0_162.0_226.0_visa_debit_nan_nan_anon...,0.001809,Bajo riesgo,1.041,Paquete de Servicios completo
...,...,...,...,...,...
15190,15063_514.0_150.0_226.0_visa_credit_204.0_nan_...,0.000468,Bajo riesgo,25.000,Paquete de Servicios completo
15191,13844_583.0_150.0_226.0_visa_credit_299.0_nan_...,0.000551,Bajo riesgo,150.000,Paquete de Servicios completo
15192,4766_555.0_150.0_226.0_visa_debit_428.0_nan_gm...,0.020563,Bajo riesgo,100.000,Paquete de Servicios completo
15193,12426_555.0_150.0_226.0_visa_debit_308.0_nan_g...,0.000062,Bajo riesgo,50.000,Paquete de Servicios completo


In [21]:
# 1️⃣ Save df_scores as CSV (relative path; the index is not written)
SCORES_CSV = "df_scores.csv"
df_scores.to_csv(SCORES_CSV, index=False)

In [28]:
import os
# Display the current working directory; relative paths such as
# "df_scores.csv" are resolved against it.
os.getcwd()

'/content'



DeltaGenerator()

SyntaxError: invalid syntax (ipython-input-3737097518.py, line 1)

# Prueba de Modelos

In [None]:
# BÚSQUEDA DEL MEJOR MODELO

# from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
# from catboost import CatBoostClassifier
# from lightgbm import LGBMClassifier

# from sklearn.metrics import f1_score, recall_score, precision_score, roc_auc_score
# import pandas as pd
# import time

# # Lista de modelos a comparar
# models = {
#     "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
#     "XGBoost": XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1),
#     "CatBoost": CatBoostClassifier(verbose=0, random_state=42),
#     "LightGBM": LGBMClassifier(n_estimators=100, random_state=42, n_jobs=-1)
# }

# results = []

# # Entrenamiento y evaluación
# for name, model in models.items():
#     start = time.time()
#     model.fit(X_train_sm, y_train_sm)
#     y_pred = model.predict(X_val)
#     y_proba = model.predict_proba(X_val)[:, 1]  # Para ROC AUC

#     f1 = f1_score(y_val, y_pred)
#     recall = recall_score(y_val, y_pred)
#     precision = precision_score(y_val, y_pred)
#     roc_auc = roc_auc_score(y_val, y_proba)
#     duration = time.time() - start

#     results.append({
#         "Model": name,
#         "F1-score": f1,
#         "Recall": recall,
#         "Precision": precision,
#         "ROC AUC": roc_auc,
#         "Train Time (s)": round(duration, 2)
#     })

# # Mostrar resultados ordenados por F1-score
# results_df = pd.DataFrame(results).sort_values(by="F1-score", ascending=False)
# print(results_df)

In [None]:
# BÚSQUEDA DE MEJORES HIPERPARÁMETROS CON RandomizedSearchCV (XGBoost, CatBoost)

# from sklearn.model_selection import RandomizedSearchCV
# from xgboost import XGBClassifier
# from catboost import CatBoostClassifier

# import pandas as pd
# import time

# # Definir modelos base
# models = {
#     "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1),
#     "CatBoost": CatBoostClassifier(verbose=0, random_state=42)
# }

# # Espacios de búsqueda (parámetros más importantes)
# params = {
#     "XGBoost": {
#         'n_estimators': [100, 200, 300],
#         'max_depth': [3, 6, 10],
#         'learning_rate': [0.01, 0.1, 0.2],
#         'subsample': [0.6, 0.8, 1],
#         'colsample_bytree': [0.6, 0.8, 1],
#         'gamma': [0, 0.1, 0.2]
#     },
#     "CatBoost": {
#         'iterations': [100, 200, 300],
#         'depth': [4, 6, 10],
#         'learning_rate': [0.01, 0.1, 0.2],
#         'l2_leaf_reg': [1, 3, 5]
#     }
# }

# from sklearn.metrics import f1_score

# results = []

# for name in models:
#     print(f"\n--- Fine tuning {name} ---")
#     model = models[name]
#     param_dist = params[name]

#     # RandomizedSearchCV con 3 folds, 20 iteraciones, scoring f1
#     search = RandomizedSearchCV(
#         estimator=model,
#         param_distributions=param_dist,
#         n_iter=20,
#         scoring='f1',
#         cv=3,
#         verbose=1,
#         random_state=42,
#         n_jobs=-1
#     )

#     start = time.time()
#     search.fit(X_train_sm, y_train_sm)
#     duration = time.time() - start

#     best_model = search.best_estimator_
#     print(f"Mejores params: {search.best_params_}")
#     print(f"Tiempo de búsqueda: {duration:.1f} seg")

#     # Evaluar en set de validación
#     y_pred = best_model.predict(X_val)
#     y_proba = best_model.predict_proba(X_val)[:, 1]

#     f1 = f1_score(y_val, y_pred)
#     recall = recall_score(y_val, y_pred)
#     precision = precision_score(y_val, y_pred)
#     roc_auc = roc_auc_score(y_val, y_proba)

#     results.append({
#         "Model": name,
#         "F1-score": f1,
#         "Recall": recall,
#         "Precision": precision,
#         "ROC AUC": roc_auc,
#         "Train Time (s)": round(duration, 1)
#     })

# # Mostrar resultados ordenados por F1-score
# results_df = pd.DataFrame(results).sort_values(by="F1-score", ascending=False)
# print("\n--- Resultados Fine Tuning ---")
# print(results_df)

In [None]:
# !pip install optuna
# import optuna
# from xgboost import XGBClassifier
# from sklearn.metrics import f1_score
# from sklearn.model_selection import train_test_split

# # ✅ Callback personalizado para imprimir resultados bonitos
# def print_callback(study, trial):
#     print(f"\nTrial {trial.number} terminado")
#     print(f"🔎 F1 Score : {trial.value:.5f}")
#     print(f"📌 Params   : {trial.params}")

# # 🎯 Función objetivo para Optuna
# def objective(trial):
#     # Hiperparámetros a optimizar
#     params = {
#         "n_estimators": trial.suggest_int("n_estimators", 100, 500),
#         "max_depth": trial.suggest_int("max_depth", 3, 10),
#         "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
#         "gamma": trial.suggest_float("gamma", 0.0, 1.0),
#         "subsample": trial.suggest_float("subsample", 0.6, 0.9),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 0.9),
#         "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
#         "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
#         "random_state": 42,
#         "n_jobs": -1,
#         "eval_metric": "logloss"
#     }

#     # Entrenar modelo
#     model = XGBClassifier(**params)
#     model.fit(X_train_sm, y_train_sm)

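#     # Predecir y ajustar threshold
#     # NOTA: hiperparámetros y threshold se eligen sobre X_val, el mismo set que se usa
#     # en la evaluación final, por lo que las métricas reportadas son optimistas.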
#     y_proba = model.predict_proba(X_val)[:, 1]
#     threshold = evaluate_thresholds(y_val, y_proba, metric='f1')
#     y_pred = (y_proba >= threshold).astype(int)

#     return f1_score(y_val, y_pred)

# # 🧠 Crear estudio y optimizar
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=50, callbacks=[print_callback])

# # ✅ Mostrar mejores hiperparámetros
# print("\n🧠 Mejores hiperparámetros encontrados:")
# for k, v in study.best_params.items():
#     print(f"{k:<20}: {v}")
# print(f"\n🔝 Mejor F1 Score: {study.best_value:.5f}")

# # ✅ Entrenamiento final con mejores hiperparámetros
# best_params = {
#     **study.best_params,
#     "eval_metric": "logloss",
#     "n_jobs": -1,
#     "random_state": 42
# }
# final_model = XGBClassifier(**best_params)
# final_model.fit(X_train_sm, y_train_sm)

# # ✅ Evaluación final
# y_proba_best = final_model.predict_proba(X_val)[:, 1]
# best_threshold = evaluate_thresholds(y_val, y_proba_best)
# y_pred_best = (y_proba_best >= best_threshold).astype(int)

# from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score

# print("\n--- 📊 Evaluación FINAL del modelo optimizado ---")
# print(f"Threshold    : {best_threshold:.4f}")
# print(f"Accuracy     : {accuracy_score(y_val, y_pred_best):.4f}")
# print(f"F1-score     : {f1_score(y_val, y_pred_best):.4f}")
# print(f"Recall       : {recall_score(y_val, y_pred_best):.4f}")
# print(f"Precision    : {precision_score(y_val, y_pred_best):.4f}")
# print(f"ROC AUC      : {roc_auc_score(y_val, y_proba_best):.4f}")
# print("\n" + classification_report(y_val, y_pred_best, digits=4))
# print(confusion_matrix(y_val, y_pred_best))

# # ✅ Mostrar los mejores trials
# print("\n🔢 Top 5 mejores trials:")
# trials_df = study.trials_dataframe()
# display(trials_df.sort_values("value", ascending=False).head(5))


In [None]:
# # PRIMER FINE TUNING SIN THRESHOLD

# from xgboost import XGBClassifier
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix

# # ✅ Entrenamos el modelo con los mejores hiperparámetros
# xgb_best = XGBClassifier(
#     n_estimators=400,
#     max_depth=10,
#     learning_rate=0.2,
#     subsample=0.8,
#     colsample_bytree=1,
#     gamma=0,
#     use_label_encoder=False,
#     eval_metric='logloss',
#     random_state=42,
#     n_jobs=-1
# )

# xgb_best.fit(X_train_sm, y_train_sm)

# # ✅ Predicciones
# y_pred = xgb_best.predict(X_val)
# y_proba = xgb_best.predict_proba(X_val)[:, 1]

# # ✅ Evaluación
# f1 = f1_score(y_val, y_pred)
# recall = recall_score(y_val, y_pred)
# precision = precision_score(y_val, y_pred)
# roc_auc = roc_auc_score(y_val, y_proba)
# accuracy = accuracy_score(y_val, y_pred)

# # ✅ Mostrar resultados
# print("\n--- Evaluación modelo final XGBoost ---")
# print(f"Accuracy     : {accuracy:.4f}")
# print(f"F1-score     : {f1:.4f}")
# print(f"Recall       : {recall:.4f}")
# print(f"Precision    : {precision:.4f}")
# print(f"ROC AUC      : {roc_auc:.4f}")

# print(classification_report(y_val, y_pred, digits=4))
# print(confusion_matrix(y_val, y_pred))