In [None]:
"""import requests

url = "https://raw.githubusercontent.com/silentrob/superscript-friends/master/transcripts/friends.txt"
response = requests.get(url)

with open("friends_script.txt", "w", encoding="utf-8") as f:
    f.write(response.text)

print("Archivo descargado como 'friends_script.txt'")


Archivo descargado como 'friends_script.txt'


In [None]:
# Leer el corpus
with open("friends_script.txt", "r", encoding="utf-8") as f:
    text = f.read()

print(f"Longitud total del corpus: {len(text)} caracteres")
print(f"Primeros 1000 caracteres:\n{text[:500]}")

Longitud total del corpus: 3247629 caracteres
Primeros 1000 caracteres:
Written by: Marta Kauffman & David Crane
Monica: There's nothing to tell! He's just some guy I work with!
Joey: C'mon, you're going out with the guy! There's gotta be something wrong with him!
Chandler: All right Joey, be nice.  So does he have a hump? A hump and a hairpiece?
Phoebe: Wait, does he eat chalk?
Phoebe: Just, 'cause, I don't want her to go through what I went through with Carl- oh!
Monica: Okay, everybody relax. This is not even a date. It's just two people going out to dinner and- 


In [30]:
import re
import unicodedata

def clean_raw_text(txt):
    txt = re.sub(r" +", " ", txt)
    txt = re.sub(r"\[.*?\]", "", txt)
    txt = re.sub(r"\(.*?\)", "", txt)
    txt = "\n".join([line for line in txt.splitlines() if line.strip() != ""])
    txt = re.sub(r"^(\w+):", lambda m: m.group(1).lower() + ":", txt, flags=re.M)
    return txt

def normalize_text(txt):
    txt = txt.lower()
    txt = unicodedata.normalize("NFD", txt)
    txt = txt.encode("ascii", "ignore").decode("utf-8")
    txt = re.sub(r"[^a-z0-9.,!?'\\n:;()\" \\-]+", '', txt)
    return txt

In [31]:
text = clean_raw_text(text)
text = normalize_text(text)

In [32]:
# Crear vocabulario y mappings
chars = sorted(set(text))
vocab_size = len(chars)
print(f"Vocabulario: {vocab_size} caracteres")

# Mapeo de caracteres a índices y viceversa
char_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_char = {i: ch for i, ch in enumerate(chars)}

# Vectorizar todo el texto
encoded_text = [char_to_idx[c] for c in text]

Vocabulario: 48 caracteres


In [33]:
# Longitud de cada secuencia
seq_length = 100

# Crear pares de secuencia (entrada y salida esperada)
X = []
y = []

for i in range(0, len(text) - seq_length):
    input_seq = text[i:i+seq_length]
    output_char = text[i+seq_length]
    X.append([char_to_idx[c] for c in input_seq])
    y.append(char_to_idx[output_char])             

print(f"Número de secuencias: {len(X)}")

Número de secuencias: 3151889


In [34]:
import numpy as np

# Parámetros
max_context_size = 100  # longitud de secuencia
p_val = 0.1  # proporción para validación

# Calcular cantidad de secuencias de validación
num_val = int(np.ceil(len(encoded_text) * p_val / max_context_size))

# Separar texto tokenizado en entrenamiento y validación
train_text = encoded_text[:-num_val * max_context_size]
val_text   = encoded_text[-num_val * max_context_size:]

# Crear secuencias
tokenized_sentences_train = [train_text[i:i + max_context_size] for i in range(len(train_text) - max_context_size + 1)]
tokenized_sentences_val   = [val_text[i:i + max_context_size] for i in range(num_val)]

# Dataset de entrenamiento: X e y (shift 1)
X_train = np.array(tokenized_sentences_train[:-1])
y_train = np.array(tokenized_sentences_train[1:])

# Dataset de validación
X_val = np.array(tokenized_sentences_val[:-1])
y_val = np.array(tokenized_sentences_val[1:])

print(f"Train: {X_train.shape}, {y_train.shape}")
print(f"Val: {X_val.shape}, {y_val.shape}")


Train: (2836689, 100), (2836689, 100)
Val: (3151, 100), (3151, 100)


SimpleRNN

In [35]:
from keras.models import Sequential
from keras.layers import TimeDistributed, CategoryEncoding, SimpleRNN, Dense

model = Sequential()
model.add(TimeDistributed(
    CategoryEncoding(num_tokens=vocab_size, output_mode="one_hot"),
    input_shape=(None, 1))
)
model.add(SimpleRNN(200, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop')
model.summary()


  super().__init__(**kwargs)


In [36]:
from keras.callbacks import Callback
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

class PplCallback(Callback):
    def __init__(self, val_data, history_ppl, patience=5):
        self.val_data = val_data
        self.target = []
        self.padded = []
        self.info = []
        self.min_score = np.inf
        self.patience = patience
        self.patience_counter = 0
        count = 0

        for seq in self.val_data:
            len_seq = len(seq)
            subseq = [seq[:i] for i in range(1, len_seq)]
            self.target.extend([seq[i] for i in range(1, len_seq)])
            if len(subseq) > 0:
                self.padded.append(pad_sequences(subseq, maxlen=max_context_size, padding='pre'))
                self.info.append((count, count + len_seq))
                count += len_seq

        self.padded = np.vstack(self.padded)

    def on_epoch_end(self, epoch, logs=None):
        scores = []
        predictions = self.model.predict(self.padded, verbose=0)
        for start, end in self.info:
            probs = [predictions[i, -1, self.target[i]] for i in range(start, end)]
            scores.append(np.exp(-np.sum(np.log(probs)) / (end - start)))

        current_score = np.mean(scores)
        history_ppl.append(current_score)
        print(f'\n Perplejidad media: {current_score:.4f}')

        if current_score < self.min_score:
            self.min_score = current_score
            self.model.save("my_model.keras")
            print("Nuevo modelo guardado.")
            self.patience_counter = 0
        else:
            self.patience_counter += 1
            if self.patience_counter >= self.patience:
                print("Early stopping activado.")
                self.model.stop_training = True


In [37]:
history_ppl = []

hist = model.fit(
    X_train[..., np.newaxis],  # expandir dimensión
    y_train[..., np.newaxis],
    epochs=20,
    batch_size=256,
    callbacks=[PplCallback(tokenized_sentences_val, history_ppl)]
)

Epoch 1/20
[1m11081/11081[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step - loss: 1.8761

: 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 5))
sns.lineplot(x=range(1, len(history_ppl)+1), y=history_ppl)
plt.xlabel("Épocas")
plt.ylabel("Perplejidad")
plt.title("Evolución de perplejidad")
plt.show()
