# Overview
This script builds a text prediction model using LSTM (Long Short-Term Memory) and Word2Vec embeddings. The model is trained on a dataset from the PersonaHub collection, and it predicts the next word in a sequence based on a given input sentence.

# **Installation**
Before running the code, install the required packages using:

In [None]:
!pip install datasets
!pip install nltk datasets


Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 547.8/547.8 kB 6.9 MB/s eta 0:00:00
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.8/40.8 MB 15.9 MB/s eta 0:00:00
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB

# Imports
The script imports the following libraries:

* `gensim`: For loading pre-trained Word2Vec models.
* `tensorflow.keras`: For building and training the LSTM model.
* `nltk`: For text preprocessing and tokenization.
* `datasets`: For loading the dataset.
* `numpy`: For numerical operations.
* `re`: For regular expression operations.

In [None]:
import gensim.downloader as api
import gensim
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense,SimpleRNN
from nltk.tokenize import word_tokenize
import re
from datasets import load_dataset
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import gensim.downloader as api
import numpy as np
import tensorflow as tf

import nltk
from nltk.corpus import stopwords


# Dataset Loading


In [None]:
from datasets import load_dataset

# Load the "instruction" configuration of the PersonaHub dataset from the
# Hugging Face Hub; its rows are read later through ds['train'].
ds = load_dataset("proj-persona/PersonaHub", "instruction")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/3.91k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/26.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50000 [00:00<?, ? examples/s]

# Text Preprocessing
The `preprocess_text` function processes the text as follows:

1. Converts text to lowercase.

2. Removes special characters, URLs, and mentions.

3. Tokenizes the text into words.

4. Joins the tokens back into a single space-separated string.





In [None]:
# The dataset is expected to be loaded already as `ds` (see the
# "Dataset Loading" cell); this cell only prepares the NLTK resources.

# Fetch the NLTK data packages used for tokenization and for the stop-word list.
# NOTE(review): newer NLTK releases may need 'punkt_tab' rather than 'punkt'
# for word_tokenize — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')
# English stop-word set.
# NOTE(review): not referenced anywhere else in the visible code — confirm
# whether stop-word filtering was intended.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize raw text into a space-separated string of lowercase tokens.

    Steps, in order:
      1. Lowercase the text.
      2. Replace URLs (``scheme://...``) and ``@mentions`` with a space, as
         whole units, so neither their content nor their neighbours get
         glued together.
      3. Drop a leading retweet marker (the standalone word ``rt``).
      4. Delete every remaining character that is not an ASCII lowercase
         letter, a digit, or whitespace. In-word punctuation is deleted, not
         split on, so ``don't`` becomes ``dont``.
      5. Tokenize with NLTK's ``word_tokenize`` and re-join with single spaces.

    Args:
        text: Raw input string.

    Returns:
        The cleaned text as one string with tokens separated by single spaces.
    """
    text = text.lower()  # Convert text to lowercase

    # URLs and mentions must go first, while the punctuation that delimits
    # them ("://", "@") is still present. The lookbehind keeps e-mail-like
    # strings ("name@host") from being treated as a mention.
    text = re.sub(r"\w+://\S+|(?<![0-9a-z])@[0-9a-z_]+", " ", text)

    # Only strip "rt" when it is a whole word at the very start; a bare
    # r"^rt" would also eat the first two letters of e.g. "rtx".
    text = re.sub(r"^rt\b", "", text)

    # Keep all whitespace (including newlines) so that words on adjacent
    # lines stay separate tokens instead of being concatenated.
    text = re.sub(r"[^0-9a-z\s]", "", text)

    words = word_tokenize(text)
    return ' '.join(words)

# Preprocessing and Tokenization
# Build one long training string: every cleaned 'synthesized text' entry,
# each prefixed with a single space, concatenated in dataset order.
all_text = ''.join(
    ' ' + preprocess_text(record['synthesized text'])
    for record in ds['train']
)

# Fit the tokenizer on that single document, then encode the same document
# into its flat list of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([all_text])
(sequences,) = tokenizer.texts_to_sequences([all_text])

# One more slot than there are known words, so that id 0 is also a valid
# index (Keras word ids start at 1).
vocab_size = 1 + len(tokenizer.word_index)

# Preparing Data
sequence_length = 5  # Number of words to consider as input for prediction

# Slide a fixed-size window over the encoded corpus: each run of
# `sequence_length` consecutive ids is one input sample, and the id that
# immediately follows the window is its prediction target.
window_starts = range(len(sequences) - sequence_length)
input_sequences = np.array(
    [sequences[start:start + sequence_length] for start in window_starts]
)
output_sequences = np.array(
    [sequences[start + sequence_length] for start in window_starts]
)

# Report the resulting dataset dimensions.
print(f'Input Sequences Shape: {input_sequences.shape}')
print(f'Output Sequences Shape: {output_sequences.shape}')
print(f'Vocabulary Size: {vocab_size}')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Input Sequences Shape: (2068589, 5)
Output Sequences Shape: (2068589,)
Vocabulary Size: 68888


In [None]:
# Sanity check: display the shape of the target array.
output_sequences.shape

(2068589,)

In [None]:
# Sanity check: display the shape of the input-window array.
input_sequences.shape

(2068589, 5)

# Loading the pre-trained Word2Vec model

In [None]:
# Word2vec Model
# Load the pre-trained "word2vec-google-news-300" vectors through gensim's
# downloader API.
word2vec_model = api.load("word2vec-google-news-300")

# Alternative model id, kept for reference.
# NOTE(review): presumably a smaller, faster-to-load option — confirm.
#word2vec_model =api.load("glove-twitter-25")




# Embedding Matrix
Creates an embedding matrix using Word2Vec embeddings:


*   Each word's vector is stored in the matrix if it exists in the pre-trained model; words that are missing from the model keep an all-zero row.

In [None]:
# Creating Embedding Matrix
# One row per tokenizer id and one column per Word2Vec dimension. Every row
# starts as zeros and is overwritten only for words the pre-trained model knows.
embedding_dim = word2vec_model.vector_size
embedding_matrix = np.zeros(shape=(vocab_size, embedding_dim))

for token, row_id in tokenizer.word_index.items():
    if token not in word2vec_model:
        continue  # no pre-trained vector: leave the all-zero row in place
    embedding_matrix[row_id] = word2vec_model[token]

# LSTM Model

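* Embedding Layer: Uses pre-trained Word2Vec embeddings (frozen, not updated during training).

* LSTM Layer: Processes sequences of words (64 units).

* Dense Layer: Outputs a softmax probability for each word in the vocabulary as the next-word prediction.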






In [None]:
# LSTM Model
# Three stacked layers: a frozen embedding lookup initialised from
# `embedding_matrix`, a 64-unit LSTM over the input window, and a softmax
# over the whole vocabulary for the next word.
model = Sequential([
    Embedding(
        vocab_size,
        embedding_dim,
        weights=[embedding_matrix],
        input_length=sequence_length,
        trainable=False,
    ),
    LSTM(64),
    Dense(vocab_size, activation="softmax"),
])

# The sparse loss is used because the targets are integer word ids rather
# than one-hot vectors.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

# Model Training


In [None]:
# Training
# Hold out the final 20% of the windows for validation; the split is by
# position, with no shuffling beforehand.
train_size = int(0.8 * len(input_sequences))

X_train, X_valid = input_sequences[:train_size], input_sequences[train_size:]
y_train, y_valid = output_sequences[:train_size], output_sequences[train_size:]

model.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    batch_size=126,
    epochs=5,
    verbose=1,
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7ec1c018b670>

In [None]:
# Second call to fit on the same model object, with identical data and settings.
# NOTE(review): Keras continues from the current weights, so this presumably
# trains for 5 more epochs rather than restarting — confirm this is intended.
model.fit(X_train, y_train, validation_data=(X_valid, y_valid), batch_size=126, epochs=5,verbose=1)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7ec1645a0ac0>

In [None]:
def predict_next_words(sentence, top_k=3):
    """Return the model's most likely next words for *sentence*.

    The sentence is encoded with the fitted ``tokenizer``, then padded or
    truncated to ``sequence_length`` ids by ``pad_sequences``, and passed
    through ``model``.

    Args:
        sentence: Text typed so far.
        top_k: Maximum number of candidate words to return.

    Returns:
        Up to ``top_k`` words ordered from most to least probable. An empty
        list is returned when ``top_k`` is not positive.
    """
    # Guard the slice edge case: with top_k == 0 the original
    # ``argsort()[-0:]`` selected the *entire* vocabulary.
    if top_k <= 0:
        return []

    tokens = tokenizer.texts_to_sequences([sentence])[0]
    tokens = pad_sequences([tokens], maxlen=sequence_length)

    # verbose=0: this runs inside an interactive prompt loop, where a
    # progress bar per keystroke is just noise.
    prediction = model.predict(tokens, verbose=0)[0]

    next_words = []
    for idx in prediction.argsort()[::-1]:  # highest probability first
        # Output index 0 has no word behind it (tokenizer ids start at 1),
        # so a direct index_word[idx] lookup would raise KeyError for it.
        word = tokenizer.index_word.get(int(idx))
        if word is None:
            continue
        next_words.append(word)
        if len(next_words) == top_k:
            break
    return next_words

# Prediction Function


In [None]:
def run_sentence_autocomplete():
    """Interactively build a sentence with next-word suggestions.

    Outer loop: the user types a word, which is appended to the sentence;
    typing ``-1`` ends the session. After each typed word an inner loop
    repeatedly shows the predicted next words and asks the user to pick one:

    * a pick that is among the suggestions is appended and new suggestions
      are shown;
    * any other pick is NOT appended; the user is asked a yes/no question,
      and only the answer "yes" (case-insensitive) keeps the suggestion loop
      going; anything else returns to the typed-word prompt.

    The finished sentence is printed, stripped of surrounding whitespace,
    followed by a blank line. Nothing is returned.
    """
    sentence = " "

    # iter(callable, sentinel) keeps prompting until the user enters "-1".
    typed_words = iter(lambda: input("Enter Next word (-1 to terminate): "), "-1")
    for word in typed_words:
        sentence = f"{sentence}{word} "

        while True:
            suggestions = predict_next_words(sentence)
            print("Top predicted words:", ", ".join(suggestions))
            choice = input("Choose the next word from the options above: ")

            if choice in suggestions:
                sentence = f"{sentence}{choice} "
                continue

            print("The word you chose is not in the top predictions. Please choose a word from the options.")
            answer = input("Is this the correct word? (yes/no): ")
            if answer.lower() != "yes":
                break

    print("Your final sentence is:", sentence.strip())
    print()

In [None]:
# Persist the trained model to 'predict_next_word_model.keras'.
# NOTE(review): the fitted tokenizer is not saved here, so reloading the
# model alone is presumably not enough to run predictions — confirm.
model.save('predict_next_word_model.keras')

In [None]:
# Start the interactive autocomplete session (waits for user input).
run_sentence_autocomplete()