# GloVe model on The Odyssey

In [None]:
# Download and clean The Odyssey text
import requests
from bs4 import BeautifulSoup

url = "https://www.gutenberg.org/cache/epub/1727/pg1727-images.html"
# Bound the wait and fail loudly on HTTP errors so that an error page is never
# parsed as if it were the book.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Extracting the main text content
text = soup.get_text()
start_marker = "Tell me, O Muse"
end_marker = "End of the Project Gutenberg EBook"

# str.find returns -1 when a marker is missing, and -1 is a valid slice bound,
# so a missing marker would silently produce the wrong slice. Check explicitly.
start_idx = text.find(start_marker)
if start_idx == -1:
    raise ValueError(f"Start marker {start_marker!r} not found in downloaded text")

# The footer wording/capitalisation may differ from `end_marker` (assumption:
# an upper-case "*** END OF ..." variant is also in use), so a few spellings
# are tried, always searching after the start of the poem.
end_idx = -1
for candidate in (end_marker, "*** END OF THE PROJECT GUTENBERG EBOOK", end_marker.upper()):
    end_idx = text.find(candidate, start_idx)
    if end_idx != -1:
        break
if end_idx == -1:
    # Best effort: keep the rest of the page but make the situation visible.
    print("Warning: end marker not found; keeping all text after the start marker.")
    end_idx = len(text)

odyssey_text = text[start_idx:end_idx].strip()

print(odyssey_text[:1000])


Tell me, O Muse, of that ingenious hero who travelled far and wide after he had
sacked the famous town of Troy. Many cities did he visit, and many were the
nations with whose manners and customs he was acquainted; moreover he suffered
much by sea while trying to save his own life and bring his men safely home;
but do what he might he could not save his men, for they perished through their
own sheer folly in eating the cattle of the Sun-god Hyperion; so the god
prevented them from ever reaching home. Tell me, too, about all these things,
oh daughter of Jove, from whatsoever source you may know them.


So now all who escaped death in battle or by shipwreck had got safely home
except Ulysses, and he, though he was longing to return to his wife and
country, was detained by the goddess Calypso, who had got him into a large cave
and wanted to marry him. But as years went by, there came a time when the gods
settled that he should go back to Ithaca; even then, however, when he was


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import numpy as np

# Fit the vocabulary on the whole text. word_index starts at 1, and 0 is the
# value pad_sequences pads with, hence the +1 when sizing the vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([odyssey_text])
total_words = len(tokenizer.word_index) + 1

# Creating the input sequences: for every '.'-delimited chunk, emit each
# prefix of length >= 2 so the last token of a prefix can serve as the label.
input_sequences = []
for line in odyssey_text.split('.'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        input_sequences.append(token_list[:i+1])

if not input_sequences:
    # max() on an empty list would raise an opaque error further down.
    raise ValueError("No training sequences were produced; is odyssey_text empty?")

# Pad sequences (on the left) so every row has the length of the longest prefix
max_seq_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')

# Features and labels: everything but the last token predicts the last token
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# One-hot encode the labels directly into a float32 array. This avoids
# materialising a (total_words x total_words) float64 identity matrix and
# halves the size of the label array; shape and values are unchanged.
one_hot_labels = np.zeros((len(y), total_words), dtype='float32')
one_hot_labels[np.arange(len(y)), y] = 1.0
y = one_hot_labels


In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Baseline next-word model: an embedding layer with no pre-trained weights,
# one LSTM layer, and a softmax over the whole vocabulary.
model_baseline = Sequential()
model_baseline.add(Embedding(total_words, 100, input_length=max_seq_len-1))
model_baseline.add(LSTM(150))
model_baseline.add(Dense(total_words, activation='softmax'))

model_baseline.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model_baseline.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 218, 100)          979800    
                                                                 
 lstm_1 (LSTM)               (None, 150)               150600    
                                                                 
 dense_1 (Dense)             (None, 9798)              1479498   
                                                                 
Total params: 2609898 (9.96 MB)
Trainable params: 2609898 (9.96 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [5]:
import zipfile

import os

# Unpack the GloVe archive into ./data. Members that already exist there with
# the size recorded in the archive are skipped, so re-running the notebook
# does not rewrite the files every time.
with zipfile.ZipFile('glove.6B.zip', 'r') as zip_file:
    pending_members = [
        info.filename
        for info in zip_file.infolist()
        if not (
            os.path.isfile(os.path.join('data', info.filename))
            and os.path.getsize(os.path.join('data', info.filename)) == info.file_size
        )
    ]
    if pending_members:
        zip_file.extractall('data', members=pending_members)


In [6]:
import numpy as np



# Parse the extracted glove.6B.100d.txt file into a word -> vector lookup.
embedding_index = {}

with open("data/glove.6B.100d.txt", encoding='utf8') as glove_file:
    for record in glove_file:
        # Each record is a token followed by whitespace-separated coefficients.
        fields = record.split()
        embedding_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [None]:
# Location of the GloVe vectors file. NOTE(review): "yourname" looks like a
# template value rather than a real account; glove_path is assigned again
# further down the notebook before any file is opened with it.
glove_path = "C:/Users/yourname/Downloads/glove.6B.100d.txt"


In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Rebuild the baseline network from scratch; assigning to model_baseline
# replaces whatever model object the name referred to before this cell ran.
baseline_layers = [
    Embedding(total_words, 100, input_length=max_seq_len-1),
    LSTM(150),
    Dense(total_words, activation='softmax'),
]
model_baseline = Sequential(baseline_layers)

model_baseline.compile(
    metrics=['accuracy'],
    optimizer='adam',
    loss='categorical_crossentropy',
)
model_baseline.summary()


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 218, 100)          979800    
                                                                 
 lstm_2 (LSTM)               (None, 150)               150600    
                                                                 
 dense_2 (Dense)             (None, 9798)              1479498   
                                                                 
Total params: 2609898 (9.96 MB)
Trainable params: 2609898 (9.96 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
import os

# Read the vectors from the copy this notebook extracts into ./data instead of
# a machine-specific absolute path. Set the GLOVE_PATH environment variable to
# point at a different location without editing the notebook.
glove_path = os.environ.get("GLOVE_PATH", os.path.join("data", "glove.6B.100d.txt"))

# Map each token to its coefficient vector; every line of the file is a token
# followed by whitespace-separated floats.
embeddings_index = {}
with open(glove_path, encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print(f"Loaded {len(embeddings_index)} vectors of words of GloVe.")


Se cargaron 400000 vectores de palabras de GloVe.


In [None]:
# Build the (vocabulary x embedding_dim) weight table for the Embedding layer.
# Row r holds the GloVe vector of the token whose tokenizer index is r; tokens
# without a GloVe entry keep an all-zero row.
embedding_dim = 100
embedding_matrix = np.zeros((total_words, embedding_dim))

for token, row in tokenizer.word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

from tensorflow.keras.initializers import Constant

# Same architecture as the baseline, but the embedding table is filled from
# embedding_matrix and frozen. The matrix is supplied through the documented
# `embeddings_initializer` argument rather than the legacy `weights=`
# constructor keyword, which is not part of the documented Embedding API.
model_glove = Sequential([
    Embedding(input_dim=total_words,
              output_dim=embedding_dim,
              input_length=max_seq_len-1,
              embeddings_initializer=Constant(embedding_matrix),
              trainable=False),  # keep the pre-trained vectors fixed during training
    LSTM(150),
    Dense(total_words, activation='softmax')
])

model_glove.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_glove.summary()


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 218, 100)          979800    
                                                                 
 lstm_3 (LSTM)               (None, 150)               150600    
                                                                 
 dense_3 (Dense)             (None, 9798)              1479498   
                                                                 
Total params: 2609898 (9.96 MB)
Trainable params: 1630098 (6.22 MB)
Non-trainable params: 979800 (3.74 MB)
_________________________________________________________________


The GloVe model:

In [None]:
# Train the GloVe-initialised model for 10 epochs on the padded n-gram
# prefixes (X) and their one-hot next-word labels (y), with per-batch
# progress output (verbose=1).
model_glove.fit(X, y, epochs=10, verbose=1)


Epoch 1/10
 596/4048 [===>..........................] - ETA: 34:32 - loss: 6.9829 - accuracy: 0.0460

In [None]:
def generate_text(model, seed_text, next_words=20):
    """Greedily extend ``seed_text`` with words predicted by ``model``.

    On each step the current text is tokenised with the notebook-level
    ``tokenizer``, left-padded/truncated to ``max_seq_len - 1`` tokens, and
    the highest-scoring vocabulary entry is appended after a single space.

    Args:
        model: Object exposing ``predict(batch, verbose=0)`` that returns one
            score per vocabulary index for the single padded sequence.
        seed_text: Initial text to continue.
        next_words: Number of words to append (default 20).

    Returns:
        ``seed_text`` followed by ``next_words`` space-separated words.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        # Index 0 is the padding value and has no entry in index_word, so a
        # plain argmax that lands on it would raise KeyError. Rank only the
        # real vocabulary indices (1..) and shift the result back by one.
        scores = np.asarray(predicted).reshape(-1)
        best_index = int(np.argmax(scores[1:])) + 1
        seed_text += " " + tokenizer.index_word[best_index]
    return seed_text


Testing

In [None]:
print(generate_text(model_glove, "Tell me, O Muse"))
