<a href="https://colab.research.google.com/github/ITZ-NANO21-MC/fcc_sms_text_classification-Nano/blob/V3/fcc_sms_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

print(f"Versión de TensorFlow: {tf.__version__}")

# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

# Función para cargar y preparar datos
def load_data(file_path):
    df = pd.read_csv(file_path, sep='\t', header=None, names=['label', 'message'])
    df['label'] = df['label'].map({'ham': 0, 'spam': 1})
    return df['message'].values, df['label'].values

# Cargar datos
train_texts, train_labels = load_data(train_file_path)
test_texts, test_labels = load_data(test_file_path)

# Paso 4: Preprocesamiento de texto
# Tokenización: Convertir palabras en números
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

# Convertir textos a secuencias numéricas
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Rellenar secuencias para que tengan la misma longitud
max_length = 100
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')

# Paso 5: Construir el modelo de clasificación
def build_model():
    model = Sequential([
        # Capa de embedding: convierte índices en vectores densos
        Embedding(
            input_dim=10000,  # Tamaño del vocabulario
            output_dim=128,    # Dimensionalidad del embedding
            input_length=max_length  # Longitud de las secuencias
        ),

        # Capa LSTM bidireccional para capturar contexto
        Bidirectional(LSTM(64, return_sequences=True)),
        Dropout(0.5),  # Regularización para prevenir sobreajuste

        # Segunda capa LSTM
        Bidirectional(LSTM(32)),
        Dropout(0.5),

        # Capa densa final con activación sigmoide
        Dense(1, activation='sigmoid')
    ])

    model.compile(
        loss='binary_crossentropy',  # Para clasificación binaria
        optimizer=Adam(learning_rate=0.001),
        metrics=['accuracy']
    )
    return model

# Crear modelo
model = build_model()
model.summary()

# Paso 6: Entrenar el modelo
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

history = model.fit(
    train_padded,
    train_labels,
    epochs=20,
    validation_split=0.2,
    callbacks=[early_stop],
    batch_size=64,
    verbose=1
)

# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])

# Paso 7: Crear función de predicción
def predict_message(pred_text):
    # Preprocesar el texto de entrada
    sequence = tokenizer.texts_to_sequences([pred_text])
    padded = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')

    # Predecir probabilidad
    prediction_prob = model.predict(padded, verbose=0)[0][0]

    # Determinar etiqueta
    prediction_label = 'spam' if prediction_prob > 0.5 else 'ham'

    return [prediction_prob, prediction_label]

# Probar la función
pred_text = "how are you doing today?"
prediction = predict_message(pred_text)
print(prediction)

# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()

Versión de TensorFlow: 2.18.0
--2025-07-02 13:40:32--  https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 104.26.2.33, 172.67.70.149, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/tab-separated-values]
Saving to: ‘train-data.tsv’


2025-07-02 13:40:32 (15.8 MB/s) - ‘train-data.tsv’ saved [358233/358233]

--2025-07-02 13:40:32--  https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 104.26.2.33, 172.67.70.149, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/tab-separated-values]
Saving to: ‘valid-data.tsv’


2025-07-02 13:40:32 (32.7 MB/s) - ‘valid-data.tsv’ saved [118774/118774]





Epoch 1/20
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 45ms/step - accuracy: 0.8208 - loss: 0.4136 - val_accuracy: 0.9665 - val_loss: 0.1232
Epoch 2/20
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.9829 - loss: 0.0846 - val_accuracy: 0.9868 - val_loss: 0.0458
Epoch 3/20
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.9961 - loss: 0.0272 - val_accuracy: 0.9868 - val_loss: 0.0467
Epoch 4/20
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.9985 - loss: 0.0126 - val_accuracy: 0.9833 - val_loss: 0.0542
Epoch 5/20
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.9997 - loss: 0.0080 - val_accuracy: 0.9868 - val_loss: 0.0554
[np.float32(0.006295681), 'ham']
You passed the challenge. Great job!
