In [7]:
import requests
import numpy as np
import spacy
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
import gensim

In [76]:
# Download and load text data
url = "https://ota.bodleian.ox.ac.uk/repository/xmlui/bitstream/handle/20.500.12024/5730/5730.txt"
r = requests.get(url)
raw_text = r.text

# Tokenize text using gensim
tokenized_text = gensim.utils.simple_preprocess(raw_text)

In [77]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# Download NLTK data
nltk.download('punkt')

# Tokenize the raw text
tokens = word_tokenize(raw_text.lower())

# Prepare the training data for Word2Vec
sentences = [tokens]  # Word2Vec expects a list of sentences, where each sentence is a list of words

# Train a Word2Vec model
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)

# Save the model for inspection
word2vec_model_path = "./Week_1/Day_4/word2vec_model.model"
word2vec_model.save(word2vec_model_path)

[nltk_data] Downloading package punkt to /Users/ashish/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [78]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.optimizers import RMSprop

# Function to prepare data for training
def prepare_training_data(tokens, word_vectors, sequence_length=10):
    X = []
    y = []
    for i in range(len(tokens) - sequence_length):
        input_sequence = tokens[i:i + sequence_length]
        target_word = tokens[i + sequence_length]
        
        # Check if all words in the input sequence and target word exist in Word2Vec
        if all(word in word_vectors for word in input_sequence) and target_word in word_vectors:
            X.append([word_vectors[word] for word in input_sequence])
            y.append(word_vectors[target_word])
    
    return np.array(X), np.array(y)

# Prepare training data
sequence_length = 10
tokens = [word.lower() for word in tokens if word.isalpha()]  # Filter non-alphabetic tokens
X, y = prepare_training_data(tokens, word2vec_model.wv, sequence_length)

# Check shapes of X and y
print(f"Shape of X: {X.shape}")  # (num_samples, sequence_length, embedding_dim)
print(f"Shape of y: {y.shape}")  # (num_samples, embedding_dim)

# Build the model
print('Building model...')
model = Sequential()

# First LSTM layer
model.add(LSTM(128, input_shape=(sequence_length, word2vec_model.vector_size), return_sequences=True))

# Second LSTM layer
model.add(LSTM(16, return_sequences=False))

# Output layer
model.add(Dense(word2vec_model.vector_size, activation='linear'))

# Compile the model
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='mean_squared_error', optimizer=optimizer)

# Train the model
print('Training model...')
model.fit(X, y, epochs=100, batch_size=64, validation_split=0.2)

Shape of X: (43419, 10, 100)
Shape of y: (43419, 100)
Building model...
Training model...


  super().__init__(**kwargs)


Epoch 1/100
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 12ms/step - loss: 0.0396 - val_loss: 0.0378
Epoch 2/100
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - loss: 0.0369 - val_loss: 0.0376
Epoch 3/100
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - loss: 0.0368 - val_loss: 0.0366
Epoch 4/100
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - loss: 0.0363 - val_loss: 0.0372
Epoch 5/100
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - loss: 0.0363 - val_loss: 0.0376
Epoch 6/100
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - loss: 0.0364 - val_loss: 0.0364
Epoch 7/100
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - loss: 0.0365 - val_loss: 0.0364
Epoch 8/100
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - loss: 0.0364 - val_loss: 0.0367
Epoch 9/100
[1m543/543

<keras.src.callbacks.history.History at 0x3eea62fe0>

In [79]:
import numpy as np

# Function to prepare input embeddings for the model
def prepare_input_sequence(input_text, word_vectors, vector_size, sequence_length=10):
    tokens = input_text.lower().split()
    
    # Ensure the input has exactly `sequence_length` tokens
    if len(tokens) < sequence_length:
        tokens = ['<pad>'] * (sequence_length - len(tokens)) + tokens
    else:
        tokens = tokens[-sequence_length:]
    
    # Convert words to embeddings
    input_sequence = []
    for word in tokens:
        if word in word_vectors:
            input_sequence.append(word_vectors[word])
        else:
            input_sequence.append(np.zeros(vector_size))  # Unknown words as zero vectors
    
    return np.array(input_sequence).reshape(1, sequence_length, vector_size)

# Function to find the closest word to the predicted embedding
def find_closest_word(embedding, word_vectors):
    closest_word = None
    min_distance = float('inf')
    for word, vector in word_vectors.items():
        distance = np.linalg.norm(vector - embedding)
        if distance < min_distance:
            closest_word = word
            min_distance = distance
    return closest_word

# Function to predict the next 100 words
def predict_next_words(model, initial_text, word_vectors, vector_size, sequence_length=10, num_words=100):
    generated_text = initial_text.split()
    
    for _ in range(num_words):
        # Prepare the input sequence
        input_sequence = prepare_input_sequence(" ".join(generated_text[-sequence_length:]), word_vectors, vector_size, sequence_length)
        
        # Predict the next word embedding
        predicted_embedding = model.predict(input_sequence, verbose=0).flatten()
        
        # Find the closest word
        next_word = find_closest_word(predicted_embedding, word_vectors)
        
        # Append the next word to the generated text
        generated_text.append(next_word)
    
    return " ".join(generated_text)

# Prepare the Word2Vec embeddings dictionary
word_vectors = {word: word2vec_model.wv[word] for word in word2vec_model.wv.index_to_key}
vector_size = word2vec_model.vector_size  # Retrieve the vector size from the Word2Vec model

# Ask the user for input text
initial_text = "There was a great treasure hunt going on this deserted island. There were many hunters roaming"
print("\nGenerating text...\n")

# Generate the next 100 words
generated_text = predict_next_words(model, initial_text, word_vectors, vector_size, sequence_length=10, num_words=100)

# Print the generated text
print(f"Input Text: {initial_text}")
print(f"Generated Text: {generated_text}")



Generating text...

Input Text: There was a great treasure hunt going on this deserted island. There were many hunters roaming
Generated Text: There was a great treasure hunt going on this deserted island. There were many hunters roaming but this dance be so is ! hand us be who be an very be is is be us be hand is be hand be hand is be hand be hand is be hand be hand is be hand be hand is be hand be hand is be hand be hand is be hand be hand is be hand be hand is be hand be hand is be hand be hand is be hand be hand is be hand be hand is be hand be hand is be hand be hand is be hand be hand is be hand be
