In [None]:
# @title 1. Configuración
!pip install gensim --quiet

import pandas as pd
import numpy as np
import tensorflow as tf
import random
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models, callbacks, optimizers

# Para entrenar tus propios vectores
from gensim.models import Word2Vec

# Seeds: fix every random number generator the notebook relies on.
SEED = 42
# Keras helper first; the explicit per-library calls below re-apply the
# same seed to each generator individually.
tf.keras.utils.set_random_seed(SEED)
tf.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [None]:
# @title 2. Carga y Preparación de Datos
# Directory that holds the competition CSV files.
URL_BASE = '/kaggle/input/nlp-getting-started/'
# Training data; later code reads its `text`, `keyword` and `target` columns.
df = pd.read_csv(f"{URL_BASE}train.csv")


def clean_text(text):
    """Normalize a raw tweet or keyword into lowercase alphanumeric text.

    Steps, in order: lowercase, replace ``#`` with a space, remove
    ``http...`` URLs, decode URL-encoded spaces (``%20``) into real spaces,
    drop every character that is not a lowercase letter, a digit or
    whitespace, and strip leading/trailing whitespace.

    Args:
        text: Any value; it is converted with ``str``. Missing values
            (as detected by ``pd.isna``) are accepted.

    Returns:
        str: The cleaned text, or ``""`` when ``text`` is missing.
    """
    if pd.isna(text):
        return ""
    s = str(text).lower()
    s = s.replace("#", " ")
    s = re.sub(r'http\S+', '', s)
    # Decode "%20" AFTER URL removal so encoded URLs are still removed whole.
    # Without this, the character filter below drops only the "%" and glues
    # a literal "20" into the word ("forest%20fire" -> "forest20fire").
    # NOTE(review): the keyword column of this dataset appears to encode
    # spaces this way -- confirm against train.csv.
    s = s.replace("%20", " ")
    s = re.sub(r'[^a-z0-9\s]', '', s)
    return s.strip()

# Clean the tweet body and the keyword with the same normalizer.
df['text_clean'] = df['text'].apply(clean_text)
df['keyword_clean'] = df['keyword'].apply(clean_text)

# Model input = cleaned keyword, one space, then the cleaned tweet;
# outer whitespace is stripped afterwards.
keyword_prefix = df['keyword_clean'].fillna('')
df['final_text'] = (keyword_prefix + " " + df['text_clean']).str.strip()

# First split: 80 % train, 20 % held out (stratified on the target).
X_train, X_temp, y_train, y_temp = train_test_split(
    df["final_text"].values,
    df["target"].values,
    test_size=0.2,
    stratify=df["target"],
    random_state=SEED,
)

# Second split: the held-out part is halved into validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=SEED,
)


In [None]:
# @title 3. Tokenización y Entrenamiento Word2Vec
# Tokenization settings.
VOCAB_SIZE = 20000    # passed to the tokenizer as `num_words`
MAX_LEN = 50          # sequence length used when padding / truncating
OOV_TOKEN = "<UNK>"   # placeholder token for out-of-vocabulary words

# A. Keras tokenizer -- the vocabulary is fitted on the training split only.
tokenizer = Tokenizer(oov_token=OOV_TOKEN, num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(X_train)

# Convert texts to integer sequences and pad them.
def texts_to_padded(x):
    """Encode texts with the fitted tokenizer and pad/truncate to ``MAX_LEN``.

    Args:
        x: Iterable of text strings.

    Returns:
        The result of ``pad_sequences`` with ``maxlen=MAX_LEN``; both padding
        and truncation are applied at the end of each sequence ("post").
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(x),
        maxlen=MAX_LEN,
        padding="post",
        truncating="post",
    )

# Encode the three splits with the same tokenizer / padding settings.
Xtr, Xva, Xte = (
    texts_to_padded(split) for split in (X_train, X_val, X_test)
)

# Embedding input size: capped at VOCAB_SIZE, +1 because the tokenizer's
# word indices start at 1 (index 0 is left for padding).
word_index = tokenizer.word_index
vocab_len = min(VOCAB_SIZE, len(word_index) + 1)

# Whitespace-tokenized training texts: Word2Vec is trained on the training
# split only, so validation/test text never influences the vectors.
train_tokens = [t.split() for t in X_train]

W2V_DIM = 100

# Skip-gram (sg=1) Word2Vec trained from scratch.
# workers=1: gensim only yields reproducible vectors for a given `seed` when
# training runs on a single worker thread; with several workers the OS thread
# scheduling changes the update order from run to run.
w2v = Word2Vec(
    sentences=train_tokens,
    vector_size=W2V_DIM,
    window=5,
    min_count=1,
    workers=1,
    seed=SEED,
    sg=1
)

# Row i holds the vector for the tokenizer's word index i.
# Row 0 is never assigned in the loop and therefore stays all-zero.
embedding_matrix = np.zeros((vocab_len, W2V_DIM))
hits = 0
misses = 0

for word, idx in word_index.items():
    if idx >= vocab_len:
        # Index falls outside the embedding table.
        continue
    if word in w2v.wv:
        embedding_matrix[idx] = w2v.wv[word]
        hits += 1
    else:
        # No trained vector for this token: fall back to random initialization.
        embedding_matrix[idx] = np.random.normal(scale=0.6, size=(W2V_DIM,))
        misses += 1

# Report how much of the vocabulary received a trained vector.
print(f"Embedding coverage: {hits} hits, {misses} misses")


In [None]:
# @title 4. Definición y Entrenamiento
from tensorflow.keras import layers, models, callbacks, optimizers
import matplotlib.pyplot as plt

# Retrain Word2Vec, this time ignoring words seen fewer than 2 times
# (min_count=2). This replaces the `w2v` and `embedding_matrix` objects
# built earlier in the notebook.
# workers=1: gensim only yields reproducible vectors for a given `seed` when
# training runs on a single worker thread.
w2v = Word2Vec(
    sentences=train_tokens,
    vector_size=W2V_DIM,
    window=5,
    min_count=2,
    workers=1,
    seed=SEED,
    sg=1
)

# Row i holds the vector for the tokenizer's word index i; row 0 is never
# assigned and stays all-zero.
embedding_matrix = np.zeros((vocab_len, W2V_DIM))
for word, idx in word_index.items():
    if idx >= vocab_len:
        # Index falls outside the embedding table.
        continue
    if word in w2v.wv:
        embedding_matrix[idx] = w2v.wv[word]
    else:
        # Words dropped by min_count (and any other token without a trained
        # vector) get a random initialization instead.
        embedding_matrix[idx] = np.random.normal(scale=0.6, size=(W2V_DIM,))

# Bidirectional LSTM classifier on top of the Word2Vec-initialized embedding.
modelRNN = models.Sequential([
    # `input_length` is intentionally not passed: Keras 3 deprecates it, and
    # the sequence length is inferred from the data at fit time.
    layers.Embedding(
        input_dim=vocab_len,
        # Take the width from the matrix so the two can never disagree.
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        trainable=True,   # embeddings are fine-tuned during training
        mask_zero=True    # index 0 is treated as padding
    ),

    layers.SpatialDropout1D(0.3),

    # Only the final state is kept (return_sequences=False).
    layers.Bidirectional(layers.LSTM(32, return_sequences=False)),

    layers.Dropout(0.4),

    layers.Dense(32, activation="relu"),
    layers.Dropout(0.4),

    # Single sigmoid unit: probability of the positive class.
    layers.Dense(1, activation="sigmoid"),
])

modelRNN.compile(optimizer=optimizers.Adam(0.0005),
                 loss="binary_crossentropy",
                 metrics=["accuracy"])

# Stop when validation loss has not improved for 4 epochs and keep the
# best weights seen (restore_best_weights=True).
early_stopper = callbacks.EarlyStopping(
    monitor='val_loss',
    restore_best_weights=True,
    patience=4,
)

# Train the LSTM model; the validation split drives early stopping.
hist = modelRNN.fit(
    Xtr,
    y_train,
    validation_data=(Xva, y_val),
    batch_size=32,
    epochs=30,
    callbacks=[early_stopper],
)

Epoch 1/30




[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 135ms/step - accuracy: 0.5938 - loss: 0.6598 - val_accuracy: 0.7201 - val_loss: 0.5441
Epoch 2/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 89ms/step - accuracy: 0.7248 - loss: 0.5526 - val_accuracy: 0.7582 - val_loss: 0.4897
Epoch 3/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 87ms/step - accuracy: 0.7787 - loss: 0.4842 - val_accuracy: 0.7753 - val_loss: 0.4853
Epoch 4/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 86ms/step - accuracy: 0.8191 - loss: 0.4240 - val_accuracy: 0.7832 - val_loss: 0.4750
Epoch 5/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 90ms/step - accuracy: 0.8475 - loss: 0.3718 - val_accuracy: 0.7963 - val_loss: 0.4491
Epoch 6/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 85ms/step - accuracy: 0.8657 - loss: 0.3319 - val_accuracy: 0.7792 - val_loss: 0.5004
Epoch 7/30
[1m191/191[0m

In [None]:
from sklearn.metrics import (
    make_scorer,
    f1_score,
    accuracy_score,
    precision_score,
    recall_score,
    confusion_matrix,
    classification_report
)

# Sweep decision thresholds on the TRAINING predictions and keep the one
# with the highest (binary) F1. The chosen threshold is then applied to the
# validation split below.
y_pred_prob_train = modelRNN.predict(Xtr)
umbrales = np.arange(0.1, 0.9, 0.01)
mejor_umbral = 0.5   # fallback if no threshold yields F1 > 0
best_f1 = 0

for u in umbrales:
    # Compare against the loop variable `u`. The previous code used an
    # undefined name here, which raises NameError on a fresh kernel or, with
    # a stale global, scores every iteration at the same threshold.
    y_pred_t = (y_pred_prob_train >= u).astype(int)
    f1 = f1_score(y_train, y_pred_t)
    if f1 > best_f1:
        best_f1 = f1
        mejor_umbral = u


# Validation metrics at the selected threshold.
y_pred_val = (modelRNN.predict(Xva) >= mejor_umbral).astype(int)

# Note: the reported F1 is the class-weighted average, whereas the sweep
# above optimizes the positive-class F1.
f1 = f1_score(y_val, y_pred_val, average='weighted')
accuracy = accuracy_score(y_val, y_pred_val)
precision_1 = precision_score(y_val, y_pred_val, pos_label=1)
precision_0 = precision_score(y_val, y_pred_val, pos_label=0)
recall_1 = recall_score(y_val, y_pred_val, pos_label=1)
recall_0 = recall_score(y_val, y_pred_val, pos_label=0)

print(f'F1 {f1:.4f}')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision_0: {precision_0:.4f}, Precision_1: {precision_1:.4f}')
print(f'Recall_0: {recall_0:.4f}, Recall_1: {recall_1:.4f}')

[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
F1 0.5088
Accuracy: 0.5611
Precision_0: 0.9098, Precision_1: 0.4945
Recall_0: 0.2558, Recall_1: 0.9664


In [None]:
from tensorflow.keras import layers, models, optimizers, callbacks
from sklearn.metrics import f1_score, classification_report, accuracy_score, precision_score, recall_score
import numpy as np

# --- GRU model ---
# Bidirectional GRU classifier on top of the Word2Vec-initialized embedding.
modelGRU = models.Sequential([
    # `input_length` is intentionally not passed: Keras 3 deprecates it, and
    # the sequence length is inferred from the data at fit time.
    layers.Embedding(
        input_dim=vocab_len,
        # Take the width from the matrix so the two can never disagree.
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        trainable=True,   # embeddings are fine-tuned during training
        mask_zero=True    # index 0 is treated as padding
    ),

    layers.SpatialDropout1D(0.2),  # less dropout than the LSTM model (0.3)
    # Only the final state is kept (return_sequences=False).
    layers.Bidirectional(layers.GRU(64, return_sequences=False)),

    layers.Dropout(0.3),
    layers.Dense(64, activation="relu"),
    layers.Dropout(0.3),

    # Single sigmoid unit: probability of the positive class.
    layers.Dense(1, activation="sigmoid")
])

modelGRU.compile(
    optimizer=optimizers.Adam(0.0005),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

# Stop when validation loss has not improved for 5 epochs and keep the
# best weights seen (restore_best_weights=True).
early_stopper = callbacks.EarlyStopping(
    monitor='val_loss',
    restore_best_weights=True,
    patience=5,
)

# --- Training ---
hist = modelGRU.fit(
    Xtr,
    y_train,
    epochs=30,
    batch_size=32,
    validation_data=(Xva, y_val),
    callbacks=[early_stopper],
)

# Pick the decision threshold that maximizes F1 on the TRAINING predictions.
y_pred_prob_train = modelGRU.predict(Xtr)
thresholds = np.arange(0.1, 0.9, 0.01)

# 0.5 is kept when no candidate reaches an F1 above 0.
best_thresh, best_f1 = 0.5, 0
for t in thresholds:
    y_pred_t = (y_pred_prob_train >= t).astype(int)
    f1 = f1_score(y_train, y_pred_t)
    if f1 <= best_f1:
        # Not a strict improvement: the earliest best threshold wins ties.
        continue
    best_thresh, best_f1 = t, f1

print("Mejor umbral (train):", best_thresh)

# --- Evaluation on VALIDATION ---
# Binarize the validation probabilities with the threshold chosen on train.
y_pred_val = (modelGRU.predict(Xva) >= best_thresh).astype(int)

accuracy = accuracy_score(y_val, y_pred_val)
# Class-weighted F1 (not the positive-class F1 used for threshold selection).
f1 = f1_score(y_val, y_pred_val, average='weighted')

# Per-class precision and recall, computed for label 0 and label 1 in turn.
precision_0, precision_1 = (
    precision_score(y_val, y_pred_val, pos_label=label) for label in (0, 1)
)
recall_0, recall_1 = (
    recall_score(y_val, y_pred_val, pos_label=label) for label in (0, 1)
)

print(f'F1: {f1:.4f}')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision_0: {precision_0:.4f}, Precision_1: {precision_1:.4f}')
print(f'Recall_0: {recall_0:.4f}, Recall_1: {recall_1:.4f}')
print("\nClassification Report:\n", classification_report(y_val, y_pred_val))




Epoch 1/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 149ms/step - accuracy: 0.6222 - loss: 0.6410 - val_accuracy: 0.7385 - val_loss: 0.5183
Epoch 2/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 101ms/step - accuracy: 0.7570 - loss: 0.5050 - val_accuracy: 0.7819 - val_loss: 0.4721
Epoch 3/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 104ms/step - accuracy: 0.8303 - loss: 0.3949 - val_accuracy: 0.7871 - val_loss: 0.5077
Epoch 4/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 100ms/step - accuracy: 0.8955 - loss: 0.2659 - val_accuracy: 0.7582 - val_loss: 0.6364
Epoch 5/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 110ms/step - accuracy: 0.9480 - loss: 0.1552 - val_accuracy: 0.7530 - val_loss: 0.9343
Epoch 6/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 110ms/step - accuracy: 0.9701 - loss: 0.0926 - val_accuracy: 0.7490 - val_loss: 1.0813
Epoch 7/30