In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
import string

nltk.download('stopwords')
nltk.download('punkt')

# Cargar datos
df = pd.read_csv('spam.csv', encoding='latin1')
df.rename(columns={'Category': 'target', 'Message': 'text'}, inplace=True)

styled_df = df.head()
# styled_df = styled_df.style.set_table_styles()
styled_df


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\moten\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\moten\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])
# styled_df = df.head().style
## Verificar valores faltantes
df.isnull().sum()
## Verificar valores duplicados
df.duplicated().sum()
## Eliminar valores duplicados
df = df.drop_duplicates(keep = 'first')

In [9]:
# Preprocesamiento de datos
encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])

# Transformar texto
ps = nltk.stem.porter.PorterStemmer()
def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)
    y = [i for i in text if i.isalnum()]
    y = [i for i in y if i not in stopwords.words('english') and i not in string.punctuation]
    y = [ps.stem(i) for i in y]
    return " ".join(y)

df['transformed_text'] = df['text'].apply(transform_text)

# Crear representaciones numéricas
tokenizer = Tokenizer(num_words=5000, lower=True)
tokenizer.fit_on_texts(df['transformed_text'].values)
X = tokenizer.texts_to_sequences(df['transformed_text'].values)
X = pad_sequences(X)
y = df['target'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Modelo 1: LSTM

In [10]:
# Modelo de aprendizaje profundo (LSTM)
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Entrenamiento
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.1, verbose=2)

Epoch 1/5




58/58 - 6s - 109ms/step - accuracy: 0.9071 - loss: 0.2674 - val_accuracy: 0.9734 - val_loss: 0.1362
Epoch 2/5
58/58 - 3s - 48ms/step - accuracy: 0.9774 - loss: 0.0817 - val_accuracy: 0.9855 - val_loss: 0.0572
Epoch 3/5
58/58 - 3s - 46ms/step - accuracy: 0.9892 - loss: 0.0401 - val_accuracy: 0.9806 - val_loss: 0.0604
Epoch 4/5
58/58 - 3s - 46ms/step - accuracy: 0.9941 - loss: 0.0232 - val_accuracy: 0.9831 - val_loss: 0.0690
Epoch 5/5
58/58 - 3s - 47ms/step - accuracy: 0.9962 - loss: 0.0163 - val_accuracy: 0.9806 - val_loss: 0.0599


Modelo 2: Random Forest

In [11]:
# Modelo tradicional (Random Forest)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

Modelo 3: Decision Tree

In [12]:
# Modelo tradicional (Decision Tree)
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
dt.fit(X_train, y_train)

Evaluación de Modelos

In [13]:
def evaluate_model(model, X_test, y_test, is_deep_learning=False):
    if is_deep_learning:
        y_pred = model.predict(X_test)
        y_pred = np.round(y_pred).astype(int).flatten()
    else:
        y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    return accuracy, precision, recall, f1

# Evaluación
rf_results = evaluate_model(rf, X_test, y_test)
dt_results = evaluate_model(dt, X_test, y_test)
lstm_results = evaluate_model(model, X_test, y_test, is_deep_learning=True)

# Comparar resultados
print("Random Forest Resultado:", rf_results)
print("Decision Tree Resultado:", dt_results)
print("LSTM Resultado:", lstm_results)

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step
Random Forest Resultado: (0.9089147286821705, 0.8888888888888888, 0.35294117647058826, 0.5052631578947369)
Decision Tree Resultado: (0.8643410852713178, 0.4642857142857143, 0.19117647058823528, 0.2708333333333333)
LSTM Resultado: (0.9864341085271318, 0.9621212121212122, 0.9338235294117647, 0.9477611940298507)


In [14]:
# Mejora de los Modelos con Ajuste de Hiperparámetros
# Random Forest
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
rf.fit(X_train, y_train)
# Decision Tree
dt = DecisionTreeClassifier(max_depth=8, min_samples_split=5, random_state=42)
dt.fit(X_train, y_train)
# LSTM
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.1, verbose=2)


Epoch 1/10




58/58 - 7s - 119ms/step - accuracy: 0.9108 - loss: 0.2584 - val_accuracy: 0.9685 - val_loss: 0.1177
Epoch 2/10
58/58 - 3s - 59ms/step - accuracy: 0.9795 - loss: 0.0761 - val_accuracy: 0.9903 - val_loss: 0.0506
Epoch 3/10
58/58 - 3s - 58ms/step - accuracy: 0.9903 - loss: 0.0397 - val_accuracy: 0.9831 - val_loss: 0.0520
Epoch 4/10
58/58 - 3s - 58ms/step - accuracy: 0.9935 - loss: 0.0265 - val_accuracy: 0.9782 - val_loss: 0.0577
Epoch 5/10
58/58 - 3s - 58ms/step - accuracy: 0.9960 - loss: 0.0156 - val_accuracy: 0.9855 - val_loss: 0.0547
Epoch 6/10
58/58 - 3s - 59ms/step - accuracy: 0.9987 - loss: 0.0082 - val_accuracy: 0.9782 - val_loss: 0.0698
Epoch 7/10
58/58 - 3s - 59ms/step - accuracy: 0.9987 - loss: 0.0072 - val_accuracy: 0.9758 - val_loss: 0.0903
Epoch 8/10
58/58 - 3s - 59ms/step - accuracy: 0.9987 - loss: 0.0066 - val_accuracy: 0.9806 - val_loss: 0.0802
Epoch 9/10
58/58 - 3s - 57ms/step - accuracy: 0.9992 - loss: 0.0048 - val_accuracy: 0.9831 - val_loss: 0.0840
Epoch 10/10
58/58 - 

Evaluación de los Modelos Mejorados

In [15]:
rf_results = evaluate_model(rf, X_test, y_test)
dt_results = evaluate_model(dt, X_test, y_test)
lstm_results = evaluate_model(model, X_test, y_test, is_deep_learning=True)

print("Random Forest Resultados después del ajuste de hiperparámetros:", rf_results)
print("Decision Tree Resultados después del ajuste de hiperparámetros:", dt_results)
print("LSTM Resultados después del ajuste de hiperparámetros:", lstm_results)

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step
Random Forest Resultados después del ajuste de hiperparámetros: (0.8972868217054264, 0.875, 0.25735294117647056, 0.3977272727272727)
Decision Tree Resultados después del ajuste de hiperparámetros: (0.875, 0.5339805825242718, 0.40441176470588236, 0.4602510460251046)
LSTM Resultados después del ajuste de hiperparámetros: (0.9815891472868217, 0.9090909090909091, 0.9558823529411765, 0.931899641577061)


Definir y Entrenar el Modelo GAN

In [16]:
# Configuración del GAN
latent_dim = 100

# Generador
generator = tf.keras.Sequential([
    tf.keras.layers.Dense(128, input_dim=latent_dim),
    tf.keras.layers.LeakyReLU(alpha=0.2),
    tf.keras.layers.Dense(256),
    tf.keras.layers.LeakyReLU(alpha=0.2),
    tf.keras.layers.Dense(np.prod(X.shape[1:])),
    tf.keras.layers.Reshape((X.shape[1],))
])

# Discriminador
discriminator = tf.keras.Sequential([
    tf.keras.layers.Dense(128, input_dim=X.shape[1]),
    tf.keras.layers.LeakyReLU(alpha=0.2),
    tf.keras.layers.Dense(256),
    tf.keras.layers.LeakyReLU(alpha=0.2),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compilación del discriminador
discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Combinación del GAN
gan = tf.keras.Sequential([generator, discriminator])
discriminator.trainable = False
gan.compile(optimizer='adam', loss='binary_crossentropy')

# Entrenamiento del GAN
epochs = 100
batch_size = 32

for epoch in range(epochs):
    # Generar ruido aleatorio
    noise = np.random.normal(0, 1, (batch_size, latent_dim))
    generated_data = generator.predict(noise)

    # Obtener un lote de datos reales
    idx = np.random.randint(0, X_train.shape[0], batch_size)
    real_data = X_train[idx]

    # Crear los conjuntos de etiquetas
    real_labels = np.ones((batch_size, 1))
    fake_labels = np.zeros((batch_size, 1))

    # Entrenar el discriminador
    d_loss_real = discriminator.train_on_batch(real_data, real_labels)
    d_loss_fake = discriminator.train_on_batch(generated_data, fake_labels)

    # Entrenar el generador
    noise = np.random.normal(0, 1, (batch_size, latent_dim))
    g_loss = gan.train_on_batch(noise, real_labels)

    if epoch % 1000 == 0:
        print(f"{epoch} [D loss: {0.5 * np.add(d_loss_real, d_loss_fake)}] [G loss: {g_loss}]")

# Generar nuevos datos
noise = np.random.normal(0, 1, (5000, latent_dim))
generated_data = generator.predict(noise)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step




0 [D loss: [47.29131    0.4765625]] [G loss: [array(31.74048, dtype=float32), array(31.74048, dtype=float32), array(0.609375, dtype=float32)]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17m


Aumentar el Conjunto de Datos

In [29]:
from sklearn.utils import shuffle


X_augmented = np.clip(X_augmented, 0, 4999)

# Realizar padding si es necesario
X_augmented = pad_sequences(X_augmented, maxlen=X_train.shape[1])

# Mezclar y dividir los datos aumentados
X_augmented, y_augmented = shuffle(X_augmented, y_augmented)
X_train_aug, X_val_aug, y_train_aug, y_val_aug = train_test_split(X_augmented, y_augmented, test_size=0.1, random_state=42)

# Verificar formas
print("Forma original X_train:", X_train.shape)
print("Forma Aumentada o  X_train:", X_augmented.shape)
assert X_train.shape[1] == X_augmented.shape[1], "Dimensiones incompatibles entre X_train y X_augmented"



Forma original X_train: (4125, 75)
Forma Aumentada o  X_train: (9125, 75)


Entrenar y Evaluar los Modelos con Datos Aumentados

In [30]:
# Random Forest
rf_aug = RandomForestClassifier(n_estimators=100, random_state=42)
rf_aug.fit(X_train_aug, y_train_aug)

# Decision Tree
dt_aug = DecisionTreeClassifier(max_depth=5, random_state=42)
dt_aug.fit(X_train_aug, y_train_aug)

Modelo LSTM

In [31]:
# Modelo LSTM
model_aug = Sequential()
model_aug.add(Embedding(input_dim=5000, output_dim=128, input_length=X_train.shape[1]))
model_aug.add(SpatialDropout1D(0.2))
model_aug.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model_aug.add(Dense(1, activation='sigmoid'))
model_aug.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Entrenamiento
history_aug = model_aug.fit(X_train_aug, y_train_aug, epochs=5, batch_size=64, validation_data=(X_val_aug, y_val_aug), verbose=2)


Epoch 1/5




129/129 - 7s - 53ms/step - accuracy: 0.9375 - loss: 0.1520 - val_accuracy: 0.9869 - val_loss: 0.0572
Epoch 2/5
129/129 - 3s - 25ms/step - accuracy: 0.9889 - loss: 0.0378 - val_accuracy: 0.9869 - val_loss: 0.0334
Epoch 3/5
129/129 - 3s - 25ms/step - accuracy: 0.9938 - loss: 0.0263 - val_accuracy: 0.9912 - val_loss: 0.0315
Epoch 4/5
129/129 - 3s - 26ms/step - accuracy: 0.9973 - loss: 0.0116 - val_accuracy: 0.9923 - val_loss: 0.0257
Epoch 5/5
129/129 - 3s - 25ms/step - accuracy: 0.9978 - loss: 0.0084 - val_accuracy: 0.9945 - val_loss: 0.0262


Evaluación de los Modelos

In [32]:
rf_results_aug = evaluate_model(rf, X_test, y_test)
dt_results_aug = evaluate_model(dt, X_test, y_test)
lstm_results_aug = evaluate_model(model_aug, X_test, y_test, is_deep_learning=True)


# Comparar resultados
print("Random Forest Resultados con datos aumentados:", rf_results_aug)
print("Decision Tree Resultados con datos aumentados:", dt_results_aug)
print("LSTM Resultados con datos aumentados:", lstm_results_aug)


[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step
Random Forest Resultados con datos aumentados: (0.8972868217054264, 0.875, 0.25735294117647056, 0.3977272727272727)
Decision Tree Resultados con datos aumentados: (0.875, 0.5339805825242718, 0.40441176470588236, 0.4602510460251046)
LSTM Resultados con datos aumentados: (0.9883720930232558, 0.9696969696969697, 0.9411764705882353, 0.9552238805970149)
