### Se lee el conjunto de datos como dataframe

In [1]:
#Se lee el conjunto de datos como dataframe
import pandas as pd

ruta = 'bbc_data.csv'
# Leer un archivo CSV (si ya lo tienes subido)
df = pd.read_csv(ruta)

# Ver las primeras filas
df.head()

Unnamed: 0,data,labels
0,Musicians to tackle US red tape Musicians gro...,entertainment
1,"U2s desire to be number one U2, who have won ...",entertainment
2,Rocker Doherty in on-stage fight Rock singer ...,entertainment
3,Snicket tops US box office chart The film ada...,entertainment
4,"Oceans Twelve raids box office Oceans Twelve,...",entertainment


### Se realiza un preprocesamiento de las noticias

In [2]:
#Se realiza un preprocesamiento de las noticias

import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    # Minúsculas
    text = text.lower()
    # Quitar signos de puntuación
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    # Tokenizar
    tokens = nltk.word_tokenize(text)
    # Quitar stopwords y lematizar
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

# Aplicar al dataframe
df['clean_text'] = df['data'].apply(preprocess)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Usuario/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Usuario/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Usuario/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Entrenamiento de dos Modelos LTSM. El primero solo tiene en cuenta el contexto por un lado, y el segundo tiene en cuenta el contexto por los dos lados, lo que mejora mucho el accuracy.  

In [11]:
# Entrenamiento de dos Modelos LTSM. El primero solo tiene en cuenta el contexto por un lado, y el segundo tiene en cuenta el contexto por los dos lados, lo que mejora mucho el accuracy.  

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Parámetros
MAX_WORDS = 5000  # tamaño del vocabulario
MAX_LEN = 200     # longitud máxima de cada secuencia

# Tokenizar texto
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(df['clean_text'])

# Convertir textos a secuencias de enteros
sequences = tokenizer.texts_to_sequences(df['clean_text'])

# Rellenar secuencias a longitud fija
X = pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')


In [12]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Codificar etiquetas a enteros
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['labels'])

# Convertir a one-hot
y = to_categorical(y_encoded)


In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)


In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

model = Sequential()
model.add(Embedding(input_dim=MAX_WORDS, output_dim=128, input_length=MAX_LEN))
model.add(LSTM(64, return_sequences=True))     # Primera capa LSTM (devuelve secuencia)
model.add(Dropout(0.3))
model.add(LSTM(32))                            # Segunda capa LSTM (finaliza secuencia)
model.add(Dropout(0.3))
model.add(Dense(y.shape[1], activation='softmax'))


model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 200, 128)          640000    
                                                                 
 lstm_8 (LSTM)               (None, 200, 64)           49408     
                                                                 
 dropout_8 (Dropout)         (None, 200, 64)           0         
                                                                 
 lstm_9 (LSTM)               (None, 32)                12416     
                                                                 
 dropout_9 (Dropout)         (None, 32)                0         
                                                                 
 dense_4 (Dense)             (None, 5)                 165       
                                                                 
Total params: 701989 (2.68 MB)
Trainable params: 70198

In [23]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    ModelCheckpoint('best_model_1.h5', save_best_only=True)  # ← CAMBIO a .h5
]


history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=30,
    batch_size=32,
    callbacks = callbacks
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30


In [24]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional


model2 = Sequential()

# Capa de embeddings
model2.add(Embedding(input_dim=MAX_WORDS, output_dim=128, input_length=MAX_LEN))

# Primera capa Bidirectional LSTM (devuelve secuencias para la siguiente LSTM)
model2.add(Bidirectional(LSTM(64, return_sequences=True)))
model2.add(Dropout(0.3))

# Segunda capa Bidirectional LSTM (no necesita devolver secuencia)
model2.add(Bidirectional(LSTM(32)))
model2.add(Dropout(0.3))

# Capa de salida (tantas neuronas como clases)
model2.add(Dense(y.shape[1], activation='softmax'))

# Compilar el modelo
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Resumen del modelo
model2.summary()


Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 200, 128)          640000    
                                                                 
 bidirectional (Bidirection  (None, 200, 128)          98816     
 al)                                                             
                                                                 
 dropout_10 (Dropout)        (None, 200, 128)          0         
                                                                 
 bidirectional_1 (Bidirecti  (None, 64)                41216     
 onal)                                                           
                                                                 
 dropout_11 (Dropout)        (None, 64)                0         
                                                                 
 dense_5 (Dense)             (None, 5)                

In [25]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

callbacks2 = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    ModelCheckpoint('best_model_2.h5', save_best_only=True)  # ← CAMBIO a .h5
]



history = model2.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=15,
    batch_size=32,
    callbacks = callbacks2
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


### Visualización de una predicción del conjunto de datos

In [27]:
### Visualización de una predicción del conjunto de datos

import numpy as np
import pandas as pd



# Lista de nombres de las clases (ajusta según tu problema)
clases = ["entretenimiento", "deportes", "negocios", "tecnología", "política"]

# Índice de la noticia a predecir
i = 10  # Cambia este número para ver otras noticias

# Extraer datos
noticia_elegida = X_test[i]
real = y_test[i]
texto_original = df["data"].iloc[i] if "data" in df.columns else "Texto no disponible"

# Añadir dimensión para el modelo
noticia_input = np.expand_dims(noticia_elegida, axis=0)

# Hacer predicción
pred = model2.predict(noticia_input)

# Interpretar predicción
if pred.shape[1] == 1:
    pred_clase = 1 if pred[0][0] > 0.5 else 0
    real_clase_idx = int(real)
else:
    pred_clase = np.argmax(pred)
    real_clase_idx = np.argmax(real)

# Mostrar resultados
print("📰 Texto original:\n")
print(texto_original.strip())

print(f"\n✅ Clase real: {clases[real_clase_idx]}")
print(f"🤖 Clase predicha: {clases[pred_clase]}")


📰 Texto original:

Actress Roberts takes spider role  Actress Julia Roberts will play the part of a spider in a new film version of childrens classic Charlottes Web.  She will voice Charlotte, who teams up with a girl to save their friend Wilbur the pig, in the story by EB White. The film - a mix of live action and animation - will be Roberts first project since the birth of her twins, Hazel and Phinnaeus, two months ago. Oprah Winfrey will voice a goose, John Cleese will voice a sheep and Steve Buscemi a rat in the 2006 film.  Ten-year-old Dakota Fanning will play Fern, the girl at the centre of the story, in the film to be directed by 13 Going on 30 film-maker Gary Winick. Filming is due to begin in Melbourne, Australia, later this month. Charlottes Web has sold 45 million copies since it was published in 1952. An animated version was made in 1973 but this will be the first live action film. The actor who will voice Wilbur the pig has yet to be revealed. "

✅ Clase real: entretenimie