In [12]:
%pip install pandas scikit-learn xgboost tensorflow joblib

Note: you may need to restart the kernel to use updated packages.


In [13]:
# train_models.py
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from joblib import dump
import tensorflow as tf

# ------------------------------------------------------------------
# 1. Carga de datos
# ------------------------------------------------------------------
# Folder that holds the three CSV splits; change it if the data lives elsewhere.
DATA_DIR = Path('.')

# One DataFrame per split.
train_df = pd.read_csv(DATA_DIR.joinpath('train.csv'))
val_df = pd.read_csv(DATA_DIR.joinpath('val.csv'))
test_df = pd.read_csv(DATA_DIR.joinpath('test.csv'))

# 'result' is the label. Only the two columns listed below are used as
# features (NOT every non-target column of the CSV files).
target_col = 'result'
feature_cols = ["simhash", "astsimilarity"]

# Feature matrix (X_*) and label vector (y_*) for each split.
X_train = train_df[feature_cols]
y_train = train_df[target_col]

X_val = val_df[feature_cols]
y_val = val_df[target_col]

X_test = test_df[feature_cols]
y_test = test_df[target_col]

# ------------------------------------------------------------------
# 2. Pre-procesamiento: escalamos numéricos
# ------------------------------------------------------------------
# Numeric branch: a one-step pipeline holding a StandardScaler.
num_processor = Pipeline(steps=[('scaler', StandardScaler())])

# The column transformer applies that branch to the columns in `feature_cols`.
preprocess = ColumnTransformer(
    transformers=[
        ('num', num_processor, feature_cols),
    ],
)

# ------------------------------------------------------------------
# 3. Modelo 0 — Baseline con TensorFlow
# ------------------------------------------------------------------
def build_tf_model(input_dim):
    """Build and compile the dense binary classifier used as the TensorFlow baseline.

    Architecture: Dense(32, relu) -> Dropout(0.3) -> Dense(16, relu)
    -> Dense(1, sigmoid).

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A compiled ``tf.keras.Sequential`` model using the Adam optimizer and
        binary cross-entropy loss, tracking accuracy and an AUC metric that is
        named ``auc``.
    """
    model = tf.keras.Sequential([
        # tf.keras.Input(shape=...) is accepted by both Keras 2 and Keras 3,
        # whereas InputLayer(input_shape=...) is deprecated in Keras 3.
        tf.keras.Input(shape=(input_dim,)),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(16, activation='relu'),
        # Single sigmoid unit: outputs the probability of the positive class.
        tf.keras.layers.Dense(1,  activation='sigmoid')
    ])
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])
    return model

# Seed Python, NumPy and TensorFlow before the model is built so that weight
# initialisation, dropout and batch shuffling are repeatable between runs
# (the random forest and XGBoost models already use random_state=42).
# Note: this also reseeds the global NumPy / Python RNGs.
tf.keras.utils.set_random_seed(42)

# Fit the scaler on the training split only, then reuse it for val / test.
X_tr_scaled = preprocess.fit_transform(X_train)
X_val_scaled = preprocess.transform(X_val)
X_te_scaled  = preprocess.transform(X_test)

tf_model = build_tf_model(X_tr_scaled.shape[1])
tf_model.fit(X_tr_scaled, y_train,
             validation_data=(X_val_scaled, y_val),
             epochs=50, batch_size=128, verbose=0,
             # Stop after 5 epochs without improvement and roll back to the best weights.
             callbacks=[tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)])

print('\n🟢 TensorFlow — Test metrics')
# return_dict=True pairs every value with its own metric name. Zipping
# `metrics_names` with the returned list is fragile: newer Keras versions
# report the compiled metrics under a single aggregated name, which would
# mislabel / truncate the printed values.
tf_results = tf_model.evaluate(X_te_scaled, y_test, verbose=0, return_dict=True)
# Kept as a plain list of values for backward compatibility.
tf_metrics = list(tf_results.values())
for name, val in tf_results.items():
    print(f'{name}: {val:.4f}')
tf_model.save('tf_baseline.keras')

# ------------------------------------------------------------------
# 4. Modelo 1 — Regresión logística
# ------------------------------------------------------------------
# Scaling + logistic regression in one pipeline; Pipeline.fit returns the
# fitted pipeline itself, so construction and training are chained.
logreg = Pipeline(steps=[
    ('prep', preprocess),
    ('clf', LogisticRegression(solver='lbfgs', max_iter=1000)),
]).fit(X_train, y_train)

# Validation accuracy, then persist the whole fitted pipeline to disk.
y_pred = logreg.predict(X_val)
print('\n🟢 LogReg — Val accuracy:', accuracy_score(y_val, y_pred))
dump(logreg, 'logreg.joblib')

# ------------------------------------------------------------------
# 5. Modelo 2 — Random Forest
# ------------------------------------------------------------------
# Scaling + random forest in one pipeline; Pipeline.fit returns the fitted
# pipeline itself, so construction and training are chained.
rf = Pipeline(steps=[
    ('prep', preprocess),
    ('clf', RandomForestClassifier(
        random_state=42,
        n_jobs=-1,
        class_weight='balanced',
        max_depth=None,
        n_estimators=400,
    )),
]).fit(X_train, y_train)

# Validation accuracy, then persist the whole fitted pipeline to disk.
print('\n🟢 RandomForest — Val accuracy:',
      accuracy_score(y_val, rf.predict(X_val)))
dump(rf, 'rf.joblib')

# ------------------------------------------------------------------
# 6. Modelo 3 — XGBoost
# ------------------------------------------------------------------
# Refit the shared scaler on the training split and transform train / val
# up front: XGBoost is trained outside of a sklearn Pipeline here.
X_train_np = preprocess.fit_transform(X_train)
X_val_np = preprocess.transform(X_val)

xgb_clf = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=600,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)

# The validation split is passed as eval_set; no early stopping is configured.
xgb_clf.fit(
    X_train_np,
    y_train,
    eval_set=[(X_val_np, y_val)],
    verbose=False,
)

# The `> 0.5` comparison is applied to the output of predict(), as before.
print('\n🟢 XGBoost — Val accuracy:',
      accuracy_score(y_val, xgb_clf.predict(X_val_np) > 0.5))

# Only the booster is saved here; the fitted scaler is not part of xgb.json.
xgb_clf.save_model('xgb.json')

# ------------------------------------------------------------------
# 7. Comparación final en TEST
# ------------------------------------------------------------------
def evaluate(name, model, X, y):
    """Print accuracy, F1 and ROC-AUC of a fitted binary classifier.

    Args:
        name: Label printed in front of the metrics.
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X: Features passed unchanged to ``model.predict`` / ``model.predict_proba``.
        y: Ground-truth labels.

    Returns:
        None. The three metrics are written to stdout on a single line.
    """
    y_hat = model.predict(X)

    # Predictions that are not integer-typed (floats, booleans) are thresholded
    # at 0.5. np.issubdtype recognises every integer width; comparing the dtype
    # with the builtin `int` only matches the platform default integer, so e.g.
    # int32 labels would wrongly be treated as probabilities.
    dtype = getattr(y_hat, 'dtype', None)
    if dtype is not None and not np.issubdtype(dtype, np.integer):
        y_hat = (y_hat > 0.5).astype(int)

    # ROC-AUC is computed from the second column of predict_proba.
    pos_scores = model.predict_proba(X)[:, 1]

    print(f'🔸 {name}: acc={accuracy_score(y, y_hat):.4f}',
          f'f1={f1_score(y, y_hat):.4f}',
          f'auc={roc_auc_score(y, pos_scores):.4f}')

print('\n📊  Métricas finales en TEST:')
# The two sklearn pipelines scale internally, so they take the raw test frame.
evaluate('LogReg'      , logreg, X_test, y_test)
evaluate('RandomForest', rf    , X_test, y_test)
# XGBoost was trained on already-transformed features: transform the test split first.
X_test_np = preprocess.transform(X_test)
evaluate("XGBoost",      xgb_clf, X_test_np,      y_test)

# TensorFlow (expects the already-scaled test features).
# Run inference once and reuse the result: the positive-class probabilities are
# flattened to 1-D, thresholded for accuracy / F1 and used as-is for ROC-AUC.
tf_proba  = tf_model.predict(X_te_scaled).ravel()
tf_pred   = (tf_proba > 0.5).astype(int)
print(f'🔸 TensorFlow: acc={accuracy_score(y_test, tf_pred):.4f}',
      f'f1={f1_score(y_test, tf_pred):.4f}',
      f'auc={roc_auc_score(y_test, tf_proba):.4f}')



🟢 TensorFlow — Test metrics
loss: 0.5155
accuracy: 0.7884
auc: 0.7925

🟢 LogReg — Val accuracy: 0.6275862068965518

🟢 RandomForest — Val accuracy: 0.6827586206896552

🟢 XGBoost — Val accuracy: 0.6655172413793103

📊  Métricas finales en TEST:
🔸 LogReg: acc=0.7611 f1=0.6903 auc=0.8055
🔸 RandomForest: acc=0.6758 f1=0.5957 auc=0.7030
🔸 XGBoost: acc=0.7406 f1=0.6545 auc=0.7820
🔸 TensorFlow: acc=0.7884 f1=0.7019 auc=0.7935
