In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb

# Paso 1: Cargar los datos de entrenamiento y prueba
train_data = pd.read_csv("data/data_format2/data_format2/train_format2.csv")
test_data = pd.read_csv("data/data_format2/data_format2/test_format2.csv")


In [3]:

# Paso 2: Preprocesamiento de datos
# Rellenar valores nulos en 'age_range' y 'gender'
train_data.fillna({'age_range': 0, 'gender': 2}, inplace=True)
test_data.fillna({'age_range': 0, 'gender': 2}, inplace=True)


In [5]:
# Paso 3: Extraer características útiles del campo 'activity_log'
def process_activity_log(activity_log):
    # Verificar si el valor no es nulo
    if pd.isna(activity_log):
        return pd.Series({"click": 0, "add_to_cart": 0, "purchase": 0, "add_to_favorite": 0})
    
    actions = activity_log.split('#')
    counts = {"click": 0, "add_to_cart": 0, "purchase": 0, "add_to_favorite": 0}
    
    for action in actions:
        _, _, _, _, action_type = action.split(':')
        if action_type == '0':
            counts["click"] += 1
        elif action_type == '1':
            counts["add_to_cart"] += 1
        elif action_type == '2':
            counts["purchase"] += 1
        elif action_type == '3':
            counts["add_to_favorite"] += 1
    
    return pd.Series(counts)

# Aplicar la función a cada registro de activity_log
train_data = train_data.join(train_data['activity_log'].apply(process_activity_log))
test_data = test_data.join(test_data['activity_log'].apply(process_activity_log))

MemoryError: 

In [None]:
# Paso 4: Eliminar columnas no necesarias para el modelo
train_data.drop(['user_id', 'merchant_id', 'activity_log'], axis=1, inplace=True)
test_data.drop(['user_id', 'merchant_id', 'activity_log'], axis=1, inplace=True)

# Paso 5: Preparar los datos para entrenamiento
X = train_data.drop('label', axis=1)
y = train_data['label']

# Dividir en datos de entrenamiento y validación
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Paso 6: Configurar y entrenar el modelo XGBoost
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'learning_rate': 0.1,
    'max_depth': 6,
    'n_estimators': 100,
    'use_label_encoder': False
}

model = xgb.XGBClassifier(**xgb_params)

# Entrenar el modelo
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=10, verbose=True)

# Paso 7: Evaluar el modelo
y_val_pred = model.predict_proba(X_val)[:, 1]
auc_score = roc_auc_score(y_val, y_val_pred)
print(f"AUC en el conjunto de validación: {auc_score}")

# Paso 8: Generar predicciones para el conjunto de prueba
test_predictions = model.predict_proba(test_data)[:, 1]

# Paso 9: Guardar los resultados en prediction.csv
submission = test_data[['user_id', 'merchant_id']].copy()
submission['prob'] = test_predictions
submission.to_csv("prediction.csv", index=False)
print("Predicciones guardadas en prediction.csv")