In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [7]:
import warnings
warnings.filterwarnings("ignore")

# Build Recurrent Neural Network (RNN)

In [8]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, SimpleRNN, Embedding

# Import the Tokenizer class from the text module in TensorFlow Keras preprocessing
# This class is used to vectorize a text corpus, by turning each text into either a sequence of integers
# (each integer being the index of a token in a dictionary) or into a vector where the coefficient(s)
# for each token could be binary, based on word count, based on tf-idf...
from tensorflow.keras.preprocessing.text import Tokenizer

# Import the pad_sequences function from the sequence module in TensorFlow Keras preprocessing
# This function is used to ensure that all sequences in a list have the same length,
# by padding shorter sequences with a specified value (by default 0) or truncating longer sequences
# to a specified length.
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Sample Data for Training 

In [9]:
 
# Sample data
sentences = [
    "Machine learning algorithms are powerful tools."
    "I enjoy exploring new algorithms."
    "Learning about AI is captivating."
    "Deep learning models can be complex."
    "I love understanding how neural networks work."
    "The field of data science is evolving rapidly."
    "I find artificial intelligence intriguing."
    "Studying computer vision is exciting."
    "Natural language processing is a fascinating domain."
    "I enjoy coding in Python for data science projects."
    "Building models is a creative process."
    "I love experimenting with different machine learning techniques."
    "The potential of AI to transform industries is amazing."
    "I enjoy staying updated with the latest tech trends."
    "Learning about reinforcement learning is interesting."
    "I find predictive modeling to be very useful."
    "I love solving problems with data analysis."
    "Data preprocessing is a crucial step in machine learning."
    "I enjoy reading research papers on deep learning."
    "I find optimization techniques fascinating."
    "Understanding algorithms helps in developing better solutions."
    "I love the challenge of debugging code."
    "Machine learning applications are diverse and impactful."
    "I enjoy collaborating with others on tech projects."
    "Learning new programming languages is fun."
    "I love working with large datasets."
    "I find feature engineering to be an art."
    "Model evaluation is an essential part of machine learning."
    "I enjoy attending tech conferences."
    "Learning about big data technologies is exciting."
    "I love experimenting with neural network architectures."
    "I find the theory behind machine learning algorithms interesting."
    "I enjoy visualizing data insights."
    "Machine learning models can make accurate predictions."
    "I love the creativity involved in data storytelling."
    "I find unsupervised learning techniques intriguing."
    "I enjoy automating tasks with AI."
    "Learning about AI ethics is important."
    "I love the problem-solving aspect of machine learning."
    "I find cloud computing technologies fascinating."
    "I enjoy using machine learning for real-world applications."
    "I love experimenting with different data preprocessing techniques."
    "I find transfer learning to be a powerful approach."
    "I enjoy working on machine learning projects."
    "Learning about data privacy is crucial."
    "I love the innovation happening in the AI field."
    "I find data visualization tools useful."
    "I enjoy testing and validating machine learning models."
    "I love discovering new machine learning applications."
    "I find ensemble methods to be effective."
    "I enjoy learning from data."
    "Machine learning can provide valuable insights."
    "I love the interdisciplinary nature of AI."
    "I find recommendation systems interesting."
    "I enjoy participating in hackathons."
    "Learning about neural networks is fascinating."
    "I love the potential of AI to solve complex problems."
    "I find sentiment analysis intriguing."
    "I enjoy implementing machine learning algorithms."
    "I love the excitement of discovering patterns in data."
    "I find time series analysis challenging."
    "I enjoy exploring different types of data."
    "Machine learning is transforming various industries."
    "I love working on predictive analytics."
    "I find anomaly detection to be useful."
    "I enjoy studying the mathematics behind machine learning."
    "I love the hands-on experience of building models."
    "I find clustering techniques interesting."
    "I enjoy exploring open-source machine learning libraries."
    "Machine learning can automate complex tasks."
    "I love the flexibility of machine learning models."
    "I find computer vision applications fascinating."
    "I enjoy solving real-world problems with AI."
    "I love the continuous learning aspect of AI."
    "I find reinforcement learning to be challenging."
    "I enjoy experimenting with hyperparameter tuning."
    "Machine learning can improve decision-making processes."
    "I love the creativity involved in feature selection."
    "I find generative models to be fascinating."
    "I enjoy reading about the latest AI advancements."
    "Machine learning can enhance user experiences."
    "I love the diversity of machine learning applications."
    "I find natural language generation intriguing."
    "I enjoy working with text data."
    "Machine learning can optimize business processes."
    "I love the innovation in AI research."
    "I find the concept of machine learning interpretability interesting."
    "I enjoy creating machine learning workflows."
    "Machine learning can uncover hidden patterns."
    "I love the impact of AI on society."
    "I find deep reinforcement learning fascinating."
    "I enjoy developing custom machine learning solutions."
    "Machine learning can improve customer experiences."
    "I love the potential of AI in healthcare."
    "I find the scalability of machine learning models intriguing."
    "I enjoy applying machine learning to finance."
    "Machine learning can enhance security measures."
    "I love the possibilities of AI in creative industries."
    "I find the ethical implications of AI important."
    "I enjoy sharing knowledge about machine learning."
    "This Free Advance AI Course that is helping alot of students to learn the concepts of AI and provides the detailed guideline on how to learn AI. This course enables the students to make their own projects. Updates them with the state of the art technologies and provide all the necessary knowlegde so that they should not be dependend on anyone to be able to learn anything."
]

## Tokenization and Preprocessing of the Data 
Text Preprocessing Overview

**Tokenizer**: Converts text into sequences of integers, assigning a unique index to each word.

**fit_on_texts**: Updates vocabulary based on input text.

**texts_to_sequences**: Converts texts to integer sequences.
Sequences and Padding:

**N-gram Sequences**: Created from text sequences for language modeling.

**Padding**: Ensures all sequences are the same length using pad_sequences, with zeros added at the beginning if necessary.

**Total Words**: Represents the number of unique words plus one for padding, useful for setting model output size.



In [10]:
# --------------------------- TEXT TOKENIZATION ---------------------------

# Step 1: Initialize the tokenizer
# The Tokenizer helps convert words into numerical values (word → index)
tokenizer = Tokenizer()

# Step 2: Fit the tokenizer on your list of sentences
# This builds a vocabulary (word index) based on the frequency of words
tokenizer.fit_on_texts(sentences)

# Step 3: Get the total number of unique words
# We add 1 to include a padding token (used later for sequence alignment)
total_words = len(tokenizer.word_index) + 1
print("Total Unique Words (including padding):", total_words)
print("Word Index (word to number mapping):", tokenizer.word_index)

# ------------------------ GENERATE TRAINING SEQUENCES ------------------------

# Step 4: Create a list to hold our input sequences
input_sequences = []

# Step 5: For each sentence, create sequences of word indices
for line in sentences:
    # Convert each sentence into a list of integers based on the tokenizer
    token_list = tokenizer.texts_to_sequences([line])[0]
    
    # Step 6: Create n-gram sequences
    # For a sentence like [2, 4, 6], we create:
    # [2, 4], [2, 4, 6]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]  # slice the list up to the ith token
        input_sequences.append(n_gram_sequence)  # add to our list of sequences

# ------------------------ PADDING SEQUENCES ------------------------

# Step 7: Find the length of the longest sequence
# This helps in standardizing the length of all sequences
max_sequence_len = max([len(x) for x in input_sequences])

# Step 8: Pad all sequences so they have the same length
# 'pre' padding adds zeros to the beginning of shorter sequences
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# Step 9: Print the padded sequences to understand the final input
print("Padded Input Sequences:\n", input_sequences)


Total Unique Words (including padding): 254
Word Index (word to number mapping): {'i': 1, 'learning': 2, 'machine': 3, 'the': 4, 'enjoy': 5, 'love': 6, 'find': 7, 'of': 8, 'ai': 9, 'is': 10, 'to': 11, 'data': 12, 'with': 13, 'in': 14, 'can': 15, 'be': 16, 'about': 17, 'models': 18, 'on': 19, 'fascinating': 20, 'algorithms': 21, 'intriguing': 22, 'techniques': 23, 'interesting': 24, 'applications': 25, 'a': 26, 'projects': 27, 'experimenting': 28, 'and': 29, 'working': 30, 'exploring': 31, 'new': 32, 'deep': 33, 'complex': 34, 'neural': 35, 'different': 36, 'potential': 37, 'industries': 38, 'tech': 39, 'reinforcement': 40, 'useful': 41, 'solving': 42, 'problems': 43, 'analysis': 44, 'technologies': 45, 'learn': 46, 'are': 47, 'powerful': 48, 'tools': 49, 'understanding': 50, 'how': 51, 'networks': 52, 'field': 53, 'science': 54, 'studying': 55, 'computer': 56, 'vision': 57, 'exciting': 58, 'natural': 59, 'language': 60, 'for': 61, 'building': 62, 'creative': 63, 'latest': 64, 'predicti

## Data Preparation 

In [11]:
# ------------------------- SPLITTING INPUTS AND LABELS -------------------------

# Step 1: Inputs (X) — all tokens in the sequence *except* the last one
# For example, if a padded sequence is [0, 2, 4, 6]
# X will be [0, 2, 4] → the words used as input
X = input_sequences[:, :-1]  # All columns except the last one
print("Input Data (X):", X)

# Step 2: Labels (y) — the *last* token in the sequence
# For the same example [0, 2, 4, 6]
# y will be 6 → the word we want the model to predict
y = input_sequences[:, -1]  # Only the last column
print("Labels (y):", y)

# ------------------------- ONE-HOT ENCODING LABELS -------------------------

# Step 3: Convert the label values (word indices) into one-hot vectors
# This step is needed because neural networks don't work directly with labels like 6 or 10
# One-hot encoding turns label 6 into [0, 0, 0, 0, 0, 0, 1, 0, ...] — a vector with a 1 at the 6th index
# total_words is used to ensure the one-hot vector has the correct length (same as vocabulary size)
y = tf.keras.utils.to_categorical(y, num_classes=total_words)
# print("One-hot Encoded Labels (y):", y)


Input Data (X): [[  0   0   0 ...   0   0   3]
 [  0   0   0 ...   0   3   2]
 [  0   0   0 ...   3   2  21]
 ...
 [  0   0   3 ...  11  16 252]
 [  0   3   2 ...  16 252  11]
 [  3   2  21 ... 252  11  46]]
Labels (y): [  2  21  47  48  49   1   5  31  32  21   2  17   9  10  98  33   2  18
  15  16  34   1   6  50  51  35  52  99   4  53   8  12  54  10 100 101
   1   7 102 103  22  55  56  57  10  58  59  60 104  10  26  20 105   1
   5 106  14 107  61  12  54  27  62  18  10  26  63 108   1   6  28  13
  36   3   2  23   4  37   8   9  11 109  38  10 110   1   5 111 112  13
   4  64  39 113   2  17  40   2  10  24   1   7  65 114  11  16 115  41
   1   6  42  43  13  12  44  12  66  10  26  67 116  14   3   2   1   5
  68  69 117  19  33   2   1   7 118  23  20  50  21 119  14  70 120  71
   1   6   4 121   8 122 123   3   2  25  47 124  29 125   1   5 126  13
 127  19  39  27   2  32 128 129  10 130   1   6  30  13 131 132   1   7
  72 133  11  16  73  74 134 135  10  73 136 137  

## **Defining the Model Architecture:**

**Sequential Model:** A linear stack of layers.

**Embedding Layer:** Converts word indices to dense vectors, helping the model understand word relationships.

**SimpleRNN Layer:** Processes the sequence data and captures temporal dependencies.

Dense Layer with Softmax: Outputs a probability distribution over the total words, predicting the next word in the sequence.

In [13]:
# ------------------------- DEFINING THE RNN MODEL -------------------------

# Import necessary modules
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# Step 1: Define a Sequential model
# Sequential means layers are added one after the other in a linear stack
model = Sequential([

    # Step 2: Embedding Layer
    # Converts each word index into a fixed-size dense vector (word embedding)
    # total_words: size of the vocabulary (number of unique words + 1 for padding)
    # 10: size of the embedding vector for each word
    # input_length: length of each input sequence (we use max_sequence_len - 1 
    # because the last word in each sequence is used as the label, not input)
    Embedding(input_dim=total_words, output_dim=10, input_length=max_sequence_len - 1),

    # Step 3: Simple RNN Layer
    # Processes the sequence of embeddings, one timestep at a time
    # 30 units means the RNN will output a 30-dimensional vector after reading the input sequence
    SimpleRNN(units=100),

    # Step 4: Output Layer
    # A dense (fully connected) layer with softmax activation
    # total_words: number of output neurons, equal to vocabulary size
    # softmax: ensures output is a probability distribution across all possible next words
    Dense(units=total_words, activation='softmax')
])


In [14]:
# ------------------------- COMPILE THE MODEL -------------------------

# Compile the model before training
# This step configures the model’s learning process

model.compile(
    loss='categorical_crossentropy',  # Loss function for multi-class classification
    optimizer='adam',                 # Adam optimizer adjusts the weights efficiently during training
    metrics=['accuracy']              # Track accuracy as a metric while training
)


In [15]:
# ------------------------- TRAIN THE MODEL -------------------------

# Train the model on the input data (X) and labels (y)
# This is where the model learns patterns in the word sequences

model.fit(
    X,          # Input data: sequences of words (excluding the last word)
    y,          # Labels: the next word to predict (in one-hot format)
    epochs=100,  # Number of times the model sees the entire training dataset
    verbose=1   # Display progress bar and training info while training
)


Epoch 1/100


I0000 00:00:1750707108.575339      98 service.cc:148] XLA service 0x7bbf20006260 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1750707108.575876      98 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1750707108.919813      98 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m 3/24[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 58ms/step - accuracy: 0.0191 - loss: 5.5418

I0000 00:00:1750707110.009240      98 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 94ms/step - accuracy: 0.0100 - loss: 5.5473
Epoch 2/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 57ms/step - accuracy: 0.0145 - loss: 5.5211
Epoch 3/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 58ms/step - accuracy: 0.0456 - loss: 5.3114
Epoch 4/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 57ms/step - accuracy: 0.0699 - loss: 4.8456
Epoch 5/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 57ms/step - accuracy: 0.0940 - loss: 4.6337
Epoch 6/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 57ms/step - accuracy: 0.0995 - loss: 4.6551
Epoch 7/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 57ms/step - accuracy: 0.0925 - loss: 4.6539
Epoch 8/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 57ms/step - accuracy: 0.0984 - loss: 4.5767
Epoch 9/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7bbffeff5390>

## **Next Word Prediction:**

**Seed Text Conversion:** The input text (seed_text) is converted to a sequence of integers using the tokenizer.

**Padding:** The integer sequence is padded to ensure it matches the model's expected input length.

**Prediction:** The model predicts the probability of each word in the vocabulary as the next word.

**Selection and Update:** The word with the highest probability is selected, converted back to a word, and appended to the seed text.

**Iterative Prediction:** This process can be repeated for a specified number of words (next_words).

In [16]:
# ------------------------- PREDICT NEXT WORD(S) FUNCTION -------------------------

# Define a function to predict the next word(s) given some seed text
# seed_text: starting words provided by the user
# next_words: number of words to predict and append

def predict_next_word(seed_text, next_words=1):
    for _ in range(next_words):
        # Step 1: Convert the seed text (string) into a list of integers (tokens)
        # This is necessary because our model was trained on sequences of numbers, not raw text
        token_list = tokenizer.texts_to_sequences([seed_text])[0]

        # Step 2: Pad the token list to ensure it matches the model's input size
        # This pads the sequence from the front with zeros if it's shorter than required length
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # Step 3: Predict the probabilities of the next word using the trained model
        # The output is a probability distribution over all words in the vocabulary
        predicted = model.predict(token_list, verbose=0)

        # Step 4: Find the index of the word with the highest probability
        # This is the model's best guess for the next word
        predicted_word_index = np.argmax(predicted, axis=-1)[0]

        # Step 5: Convert the predicted index back to the actual word using the tokenizer's index
        predicted_word = tokenizer.index_word[predicted_word_index]

        # Step 6: Add the predicted word to the original seed text
        # This will help predict the next word in the next loop (if next_words > 1)
        seed_text += " " + predicted_word

    # Step 7: Return the final generated text
    return seed_text


In [24]:
# Test the prediction function with a sample input
print(predict_next_word("machine learning i enjoy about machine learning"))

machine learning i enjoy about machine learning to


# Building a Long Short-Term Memory (LSTM)

In [25]:
# Import necessary libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Sample data


In [26]:
# Sample data
sentences = [
    "Machine learning algorithms are powerful tools."
    "I enjoy exploring new algorithms."
    "Learning about AI is captivating."
    "Deep learning models can be complex."
    "I love understanding how neural networks work."
    "The field of data science is evolving rapidly."
    "I find artificial intelligence intriguing."
    "Studying computer vision is exciting."
    "Natural language processing is a fascinating domain."
    "I enjoy coding in Python for data science projects."
    "Building models is a creative process."
    "I love experimenting with different machine learning techniques."
    "The potential of AI to transform industries is amazing."
    "I enjoy staying updated with the latest tech trends."
    "Learning about reinforcement learning is interesting."
    "I find predictive modeling to be very useful."
    "I love solving problems with data analysis."
    "Data preprocessing is a crucial step in machine learning."
    "I enjoy reading research papers on deep learning."
    "I find optimization techniques fascinating."
    "Understanding algorithms helps in developing better solutions."
    "I love the challenge of debugging code."
    "Machine learning applications are diverse and impactful."
    "I enjoy collaborating with others on tech projects."
    "Learning new programming languages is fun."
    "I love working with large datasets."
    "I find feature engineering to be an art."
    "Model evaluation is an essential part of machine learning."
    "I enjoy attending tech conferences."
    "Learning about big data technologies is exciting."
    "I love experimenting with neural network architectures."
    "I find the theory behind machine learning algorithms interesting."
    "I enjoy visualizing data insights."
    "Machine learning models can make accurate predictions."
    "I love the creativity involved in data storytelling."
    "I find unsupervised learning techniques intriguing."
    "I enjoy automating tasks with AI."
    "Learning about AI ethics is important."
    "I love the problem-solving aspect of machine learning."
    "I find cloud computing technologies fascinating."
    "I enjoy using machine learning for real-world applications."
    "I love experimenting with different data preprocessing techniques."
    "I find transfer learning to be a powerful approach."
    "I enjoy working on machine learning projects."
    "Learning about data privacy is crucial."
    "I love the innovation happening in the AI field."
    "I find data visualization tools useful."
    "I enjoy testing and validating machine learning models."
    "I love discovering new machine learning applications."
    "I find ensemble methods to be effective."
    "I enjoy learning from data."
    "Machine learning can provide valuable insights."
    "I love the interdisciplinary nature of AI."
    "I find recommendation systems interesting."
    "I enjoy participating in hackathons."
    "Learning about neural networks is fascinating."
    "I love the potential of AI to solve complex problems."
    "I find sentiment analysis intriguing."
    "I enjoy implementing machine learning algorithms."
    "I love the excitement of discovering patterns in data."
    "I find time series analysis challenging."
    "I enjoy exploring different types of data."
    "Machine learning is transforming various industries."
    "I love working on predictive analytics."
    "I find anomaly detection to be useful."
    "I enjoy studying the mathematics behind machine learning."
    "I love the hands-on experience of building models."
    "I find clustering techniques interesting."
    "I enjoy exploring open-source machine learning libraries."
    "Machine learning can automate complex tasks."
    "I love the flexibility of machine learning models."
    "I find computer vision applications fascinating."
    "I enjoy solving real-world problems with AI."
    "I love the continuous learning aspect of AI."
    "I find reinforcement learning to be challenging."
    "I enjoy experimenting with hyperparameter tuning."
    "Machine learning can improve decision-making processes."
    "I love the creativity involved in feature selection."
    "I find generative models to be fascinating."
    "I enjoy reading about the latest AI advancements."
    "Machine learning can enhance user experiences."
    "I love the diversity of machine learning applications."
    "I find natural language generation intriguing."
    "I enjoy working with text data."
    "Machine learning can optimize business processes."
    "I love the innovation in AI research."
    "I find the concept of machine learning interpretability interesting."
    "I enjoy creating machine learning workflows."
    "Machine learning can uncover hidden patterns."
    "I love the impact of AI on society."
    "I find deep reinforcement learning fascinating."
    "I enjoy developing custom machine learning solutions."
    "Machine learning can improve customer experiences."
    "I love the potential of AI in healthcare."
    "I find the scalability of machine learning models intriguing."
    "I enjoy applying machine learning to finance."
    "Machine learning can enhance security measures."
    "I love the possibilities of AI in creative industries."
    "I find the ethical implications of AI important."
    "I enjoy sharing knowledge about machine learning."
]

## Tokenization and preprocessing


In [27]:
# Tokenization and preprocessing

# Initialize the Tokenizer to convert text into sequences of integers
tokenizer = Tokenizer()

# Fit the tokenizer on the provided sentences to build the vocabulary
# Each unique word is assigned a unique integer index
tokenizer.fit_on_texts(sentences)

# Total number of unique words in the vocabulary plus one for padding
total_words = len(tokenizer.word_index) + 1

# Initialize a list to hold sequences of tokenized words
input_sequences = []

# Process each sentence
for line in sentences:
    # Convert the sentence to a sequence of integers based on the tokenizer
    token_list = tokenizer.texts_to_sequences([line])[0]

    # Generate n-gram sequences from the integer sequence
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]  # Create a sequence including up to the i-th word
        input_sequences.append(n_gram_sequence)  # Add the sequence to the list

# Find the maximum length of sequences
max_sequence_len = max([len(x) for x in input_sequences])

# Pad sequences to ensure they are all the same length
# Padding is added at the beginning of the sequences (pre-padding)
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))


## Data Prepration

In [28]:
# Split data into inputs and labels

# Extract input sequences: all elements except the last one
X = input_sequences[:, :-1]

# Extract labels: the last element of each sequence
y = input_sequences[:, -1]

# Convert labels to one-hot encoded format
# This creates a binary matrix representation of the labels with one-hot encoding
# num_classes is the total number of unique words in the vocabulary
y = tf.keras.utils.to_categorical(y, num_classes=total_words)


## Define the LSTM model


In [29]:
# Define the LSTM model

# Sequential model allows stacking layers in a linear fashion
model = Sequential([
    # Embedding layer: Converts word indices to dense vectors
    # Input dimension: total number of words, Output dimension: size of embedding vectors
    # Input length: length of input sequences (excluding the last word)
    Embedding(total_words, 10, input_length=max_sequence_len-1),

    # LSTM layer with 30 units
    # LSTM (Long Short-Term Memory) is a type of RNN that can capture long-term dependencies
    LSTM(100),

    # Dense output layer with a softmax activation function
    # Output dimension: total number of words (for multi-class classification)
    # Softmax activation converts the output to probabilities for each word
    Dense(total_words, activation='softmax')
])


## Compile the LSTM model


In [30]:

# Compile the model

# Compile the model specifies the loss function, optimizer, and evaluation metrics
model.compile(
    # Loss function: Categorical cross-entropy, used for multi-class classification problems
    loss='categorical_crossentropy',

    # Optimizer: Adam, an efficient optimization algorithm that adjusts the learning rate during training
    optimizer='adam',

    # Metrics: Accuracy, to evaluate the model's performance during training and testing
    metrics=['accuracy']
)



## # Train the LSTM Model


In [31]:
# Train the model

# Fit the model on the training data
# X: Input sequences
# y: One-hot encoded labels
# epochs: Number of times the model will go through the entire dataset
# verbose: Controls the verbosity of the output during training (1 for progress bar)
model.fit(X, y, epochs=100, verbose=1)


Epoch 1/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 36ms/step - accuracy: 0.0549 - loss: 5.3513
Epoch 2/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.1000 - loss: 4.7341
Epoch 3/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.1078 - loss: 4.5859
Epoch 4/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.1080 - loss: 4.5433
Epoch 5/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.1054 - loss: 4.5477
Epoch 6/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.1015 - loss: 4.5229
Epoch 7/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.0911 - loss: 4.5543
Epoch 8/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.1144 - loss: 4.4147
Epoch 9/100
[1m22/22[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x7bbff44da810>

## Prediction of Next Word

In [32]:
# Function to predict the next word(s) given a seed text
def predict_next_word(seed_text, next_words=2):
    # Repeat the prediction process for the specified number of next words
    for _ in range(next_words):
        # Convert the seed text into a sequence of integers using the tokenizer
        token_list = tokenizer.texts_to_sequences([seed_text])[0]

        # Pad the sequence to ensure it matches the input length expected by the model
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # Predict the probabilities of the next word in the sequence
        predicted = model.predict(token_list, verbose=0)

        # Get the index of the word with the highest probability
        predicted_word_index = np.argmax(predicted, axis=-1)[0]

        # Retrieve the word corresponding to the predicted index
        predicted_word = tokenizer.index_word[predicted_word_index]

        # Append the predicted word to the seed text
        seed_text += " " + predicted_word

    # Return the updated seed text with the predicted word(s)
    return seed_text


In [40]:
# Test prediction
print(predict_next_word("Machine Learning"))

Machine Learning algorithms are
