In [1]:
import os
import re

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

# Load the AG News subset as (text, label) pairs, together with its metadata.
dataset, info = tfds.load('ag_news_subset', with_info=True, as_supervised=True)
train_dataset, test_dataset = dataset['train'], dataset['test']

# Decode every training article from bytes to str. The label is discarded, and
# the per-article variable is deliberately not called `text` so it cannot be
# confused with the corpus string assigned below.
train_texts = [
    raw_article.decode('utf-8')
    for raw_article, _ in tfds.as_numpy(train_dataset)
]

# Join all articles into one corpus string, separated by single spaces.
text = ' '.join(train_texts)
print(f'Texto total: {len(text)} caracteres')

  from .autonotebook import tqdm as notebook_tqdm


[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to C:\Users\garci\tensorflow_datasets\ag_news_subset\1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:03<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:06<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:06<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:07<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:08<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:09<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:10<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:11<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:12<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:13<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:13<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:14<?, ? url/s]
Dl Completed...: 100%|██████████| 1/1 [00:14<00:00, 14.92s/ url]
Dl Completed...: 100%|██████████| 1/1 [00:15<00:00, 14.92s/ url]
Dl Completed...: 100%|██████████| 1/1 [00:15<00:00, 14.92s/ url]
Dl Completed...: 100%|██

[1mDataset ag_news_subset downloaded and prepared to C:\Users\garci\tensorflow_datasets\ag_news_subset\1.0.0. Subsequent calls will reuse this data.[0m
Texto total: 23328241 caracteres


In [2]:
# Character vocabulary: every distinct character of the corpus, sorted.
vocab = sorted(set(text))
# Lookup tables in both directions (character -> id and id -> character).
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# Encode the whole corpus as an array of character ids.
text_as_int = np.array([char2idx[symbol] for symbol in text])

# Each chunk holds seq_length + 1 ids so that an input of seq_length ids and
# a target shifted by one position can both be cut from it.
seq_length = 100
examples_per_epoch = len(text) // seq_length

char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(
    seq_length + 1,
    drop_remainder=True,  # discard the incomplete final chunk
)

def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is every element except the last and the target is every
    element except the first, so ``target[i]`` is the element that follows
    ``input[i]`` in the chunk.
    """
    return chunk[:-1], chunk[1:]

# Batching parameters: sequences per batch and size of the shuffle buffer.
BATCH_SIZE = 64
BUFFER_SIZE = 10000

# Turn every chunk into an (input, target) pair, shuffle the pairs and group
# them into fixed-size batches; the incomplete final batch is dropped.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [7]:
# Hyperparameters of the character-level model.
vocab_size = len(vocab)
embedding_dim = 256
rnn_units = 1024

# Embedding -> GRU -> Dense stack that emits one logit per vocabulary entry at
# every time step. The GRU is stateful, so the batch size is declared
# explicitly through the Input; the sequence length is left open (None).
# The model is kept under its own name, `model_char`, so that building another
# model later does not make this one unreachable.
model_char = tf.keras.Sequential([
    tf.keras.Input(batch_shape=(BATCH_SIZE, None), dtype='int32'),
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    ),
    tf.keras.layers.Dense(vocab_size),
])

# `model` is kept as an alias because the following cells refer to it by
# that name.
model = model_char

# Loss function
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and logits.

    ``from_logits=True`` tells Keras that ``logits`` are unnormalised scores
    rather than probabilities.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

# Number of passes over the training batches.
EPOCHS = 10

# Configure training with the Adam optimizer and the custom loss, then fit.
model.compile(loss=loss, optimizer='adam')
history = model.fit(dataset, epochs=EPOCHS)

Epoch 1/10
[1m 122/3608[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:46:56[0m 2s/step - loss: 3.3813

KeyboardInterrupt: 

In [None]:
import re

# Tokenize the lower-cased corpus into words (runs of word characters).
words = re.findall(r'\b\w+\b', text.lower())

# Build the vocabulary once, in sorted order, and derive both lookup tables
# from that single list. Iterating two separate `set(words)` objects gives no
# ordering guarantee, whereas a shared sorted list keeps ids and words aligned
# and yields the same numbering on every run.
word_vocab = sorted(set(words))
word2idx = {word: index for index, word in enumerate(word_vocab)}
idx2word = np.array(word_vocab)

# Encode the corpus as an array of word ids.
text_as_int = np.array([word2idx[word] for word in words])

# Each chunk holds seq_length + 1 ids so that an input of seq_length ids and
# a target shifted by one position can both be cut from it.
seq_length = 20
examples_per_epoch = len(words) // seq_length

word_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = word_dataset.batch(seq_length + 1, drop_remainder=True)

def split_input_target(chunk):
    """Return ``(input, target)`` where target is input shifted by one.

    Both pieces are slices of ``chunk``: the input drops the last element and
    the target drops the first.
    """
    inputs, targets = chunk[:-1], chunk[1:]
    return inputs, targets

# Turn every chunk into an (input, target) pair, shuffle the pairs and group
# them into fixed-size training batches; the incomplete final batch is dropped.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [None]:
# Hyperparameters of the word-level model.
vocab_size = len(word2idx)
embedding_dim = 256
rnn_units = 1024

# Embedding -> GRU -> Dense stack that emits one logit per vocabulary entry at
# every time step. The fixed batch size needed by the stateful GRU is declared
# through an explicit Input instead of the `batch_input_shape` keyword on the
# Embedding layer, which newer Keras releases no longer accept; the sequence
# length is left open (None).
# The model is kept under its own name, `model_word`, so it stays reachable
# alongside any other model built in this notebook.
model_word = tf.keras.Sequential([
    tf.keras.Input(batch_shape=(BATCH_SIZE, None), dtype='int32'),
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    ),
    tf.keras.layers.Dense(vocab_size),
])

# `model` is kept as an alias because the following cells refer to it by
# that name.
model = model_word

# Configure training: Adam optimizer with the `loss` function defined earlier
# in the notebook.
model.compile(loss=loss, optimizer='adam')

# Train for EPOCHS passes over the batched dataset.
history = model.fit(dataset, epochs=EPOCHS)


In [None]:
def _reset_recurrent_state(model):
    """Clear any stored recurrent state on `model` before sampling."""
    reset_model = getattr(model, 'reset_states', None)
    if callable(reset_model):
        reset_model()
        return
    # Fallback for models that do not offer a model-level reset: ask each
    # layer that has a reset method to clear its own state.
    for layer in getattr(model, 'layers', ()):
        reset_layer = (getattr(layer, 'reset_states', None)
                       or getattr(layer, 'reset_state', None))
        if callable(reset_layer):
            reset_layer()


def _encode_prompt(start_string, token2idx, word_level):
    """Convert the prompt into a non-empty list of vocabulary ids."""
    if word_level:
        # Same normalisation and pattern used to build the word vocabulary.
        tokens = re.findall(r'\b\w+\b', start_string.lower())
    else:
        tokens = list(start_string)
    if not tokens:
        raise ValueError('start_string must contain at least one token')
    unknown = [token for token in tokens if token not in token2idx]
    if unknown:
        raise KeyError(
            f'start_string contains tokens missing from the vocabulary: {unknown!r}'
        )
    return [token2idx[token] for token in tokens]


def generate_text(model, start_string, char2idx, idx2char, num_generate=1000,
                  temperature=1.0):
    r"""Sample text from a trained next-token model, starting from a prompt.

    The function serves both vocabularies built in this notebook. When every
    key of ``char2idx`` is a single character, the prompt is encoded character
    by character and generated tokens are concatenated directly. When the
    mapping contains multi-character keys (a word vocabulary), the prompt is
    lower-cased and split with the ``\b\w+\b`` pattern, and generated tokens
    are joined to the prompt and to each other with single spaces.

    Args:
        model: Callable Keras model mapping a ``(1, sequence)`` tensor of ids
            to per-step logits. After the first step only the newly sampled
            id is fed back, so the model is expected to carry its recurrent
            state between calls.
            NOTE(review): the models in this notebook are stateful and trained
            on batches of BATCH_SIZE sequences, while this function feeds a
            single sequence; confirm the model accepts batch size 1.
        start_string: Prompt text; must yield at least one known token.
        char2idx: Mapping from token (character or word) to integer id.
        idx2char: Sequence mapping integer id back to its token.
        num_generate: Number of tokens to sample. Zero or a negative value
            returns the prompt unchanged without calling the model.
        temperature: Positive divisor applied to the logits before sampling;
            values below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        The prompt followed by the generated tokens.

    Raises:
        ValueError: If ``temperature`` is not positive or the prompt is empty.
        KeyError: If the prompt contains a token absent from ``char2idx``.
    """
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    # A vocabulary with any multi-character key is treated as word-level.
    word_level = any(len(token) > 1 for token in char2idx)
    separator = ' ' if word_level else ''

    input_eval = _encode_prompt(start_string, char2idx, word_level)
    if num_generate <= 0:
        return start_string

    # Add the batch dimension expected by the model.
    input_eval = tf.expand_dims(input_eval, 0)
    text_generated = []

    _reset_recurrent_state(model)
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch dimension, leaving one row of logits per time step.
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / temperature
        # Sample from every row but keep only the id drawn for the last step.
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
        # Feed just the sampled id back in as the next input.
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(str(idx2char[predicted_id]))

    return start_string + separator + separator.join(text_generated)

# Prompt that seeds generation.
start_string = "The"
# NOTE(review): `model` is whichever model was assigned most recently in this
# notebook; the mappings passed here must belong to that model's vocabulary.
print(generate_text(model, start_string, char2idx=char2idx, idx2char=idx2char))


In [None]:
# Sample from both models using the same prompt.
generated_text_char = generate_text(model_char, start_string, char2idx, idx2char)
generated_text_word = generate_text(model_word, start_string, word2idx, idx2word)

# Show the two samples one after the other, each under its own heading.
print(
    "Texto generado a nivel de caracteres:",
    generated_text_char,
    "\nTexto generado a nivel de palabras:",
    generated_text_word,
    sep="\n",
)
