# Predicting the next word from DataFrame using LSTM in PyTorch
In this exercise we will build and train aLSTM Network to predict the next word based on sample data.

# 1.Data Preparation

## 1.Import necessary Libraries

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
import torch.optim as optim
from gensim.models import Word2Vec

# 2.Create a DataFrame with sample text

In [4]:
# Step 1: Create an example dataset
data = {
    "text": [
        "The movie was fantastic and very engaging",
        "I hated the acting and the storyline",
        "It was boring and lacked depth",
        "Amazing performance by the actors and great direction",
        "Not worth watching at all",
        "One of the best movies I have ever seen"
    ]
}

# Create a DataFrame from the dataset
df = pd.DataFrame(data)


# 3.Preprocess the data

# Step 1: Tokenize the text

In [5]:
# Step 1: Define a function to tokenize the text
def tokenize(text):
    return re.findall(r'\b\w+\b', text.lower())
# Apply the tokenization function to the text column
df['tokens'] = df['text'].apply(tokenize)


# step-2: word embeddings

In [6]:
# Step 2: Train a Word2Vec model on the tokenized sentences
word2vec_model = Word2Vec(sentences=df['tokens'], vector_size=50, window=3, min_count=1)

## Step 3: Prepare the sequences

In [7]:
# Step 3: Prepare dataset for word prediction
def prepare_sequences(tokens, model, context_size=3):
    X, y = [], []
    for i in range(len(tokens) - context_size):
        # Context words
        context = tokens[i:i + context_size]
        # Target word
        target = tokens[i + context_size]
        X.append([model.wv[word] for word in context])
        y.append(model.wv.key_to_index[target])  # Index of the target word in vocabulary
    return np.array(X), np.array(y)

In [8]:
# Create sequences for all rows in the dataset
context_size = 3
X, y = [], []
for tokens in df['tokens']:
    X_seq, y_seq = prepare_sequences(tokens, word2vec_model, context_size)
    X.extend(X_seq)
    y.extend(y_seq)

X, y = np.array(X), np.array(y)

# Step 4: Convert to pytorch tensors

In [9]:
# Step 4: Convert data to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

# 2.LSTM Model Building

# Step 1 Define the LSTM model

In [10]:
# Step 1: Define the LSTM model for word prediction
class WordPredictionLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, vocab_size):
        super(WordPredictionLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        _, (hidden, _) = self.lstm(x)  # Get the hidden state from LSTM
        out = self.fc(hidden[-1])  # Pass hidden state through a fully connected layer
        return out

# step-2:Intialize the model parameters

In [11]:

# Step 2: Initialize the model
input_size = 50  # Size of the word vector
hidden_size = 64  # Number of hidden units in LSTM
vocab_size = len(word2vec_model.wv)  # Vocabulary size


# step-2: Initialize model, loss function and optimizer

In [12]:
# Step 1: Define loss function and optimizer
model = WordPredictionLSTM(input_size, hidden_size, vocab_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# 3. Training the model

# step-1: Train the model over multiple epochs

In [13]:
# Step 2: Training loop
num_epochs = 200  # Number of epochs for training
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    optimizer.zero_grad()  # Reset gradients

    # Forward pass
    outputs = model(X_tensor)  # Get model predictions
    loss = criterion(outputs, y_tensor)  # Calculate loss

    # Backward pass and optimization
    loss.backward()  # Backpropagation
    optimizer.step()  # Update model parameters

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")

[NVSHARE][WARN]: Couldn't open file /var/run/secrets/kubernetes.io/serviceaccount/namespace to read Pod namespace
[NVSHARE][INFO]: Successfully initialized nvshare GPU
[NVSHARE][INFO]: Client ID = 2d65622af6a7abc4


Epoch 1/200, Loss: 3.5035
Epoch 2/200, Loss: 3.4621
Epoch 3/200, Loss: 3.4228
Epoch 4/200, Loss: 3.3779
Epoch 5/200, Loss: 3.3218
Epoch 6/200, Loss: 3.2512
Epoch 7/200, Loss: 3.1680
Epoch 8/200, Loss: 3.0806
Epoch 9/200, Loss: 3.0007
Epoch 10/200, Loss: 2.9395
Epoch 11/200, Loss: 2.9047
Epoch 12/200, Loss: 2.8955
Epoch 13/200, Loss: 2.9036
Epoch 14/200, Loss: 2.9070
Epoch 15/200, Loss: 2.8941
Epoch 16/200, Loss: 2.8847
Epoch 17/200, Loss: 2.8866
Epoch 18/200, Loss: 2.8929
Epoch 19/200, Loss: 2.8910
Epoch 20/200, Loss: 2.8799
Epoch 21/200, Loss: 2.8694
Epoch 22/200, Loss: 2.8674
Epoch 23/200, Loss: 2.8707
Epoch 24/200, Loss: 2.8695
Epoch 25/200, Loss: 2.8613
Epoch 26/200, Loss: 2.8512
Epoch 27/200, Loss: 2.8432
Epoch 28/200, Loss: 2.8372
Epoch 29/200, Loss: 2.8310
Epoch 30/200, Loss: 2.8219
Epoch 31/200, Loss: 2.8084
Epoch 32/200, Loss: 2.7911
Epoch 33/200, Loss: 2.7715
Epoch 34/200, Loss: 2.7498
Epoch 35/200, Loss: 2.7243
Epoch 36/200, Loss: 2.6925
Epoch 37/200, Loss: 2.6519
Epoch 38/2

# 4.Evaluate the model

In [14]:
# Step 1: Prediction function for next word
def predict_next_word(context, model, word2vec_model, context_size=3):
    # Tokenize the input context
    tokens = tokenize(context)
    if len(tokens) < context_size:
        raise ValueError(f"Context must have at least {context_size} words")
    tokens = tokens[-context_size:]  # Use only the last `context_size` words

    # Convert tokens to vectors
    vectors = [word2vec_model.wv[word] for word in tokens]

    # Convert to PyTorch tensor and add batch dimension
    input_tensor = torch.tensor([vectors], dtype=torch.float32)  # Shape: (1, context_size, input_size)

    # Set the model to evaluation mode
    model.eval()

    # Make prediction
    with torch.no_grad():
        output = model(input_tensor)  # Get model predictions
        predicted_index = torch.argmax(output, dim=1).item()  # Get the index of the maximum value (class)

    # Convert index back to word
    predicted_word = word2vec_model.wv.index_to_key[predicted_index]
    return predicted_word

# Interactive testing

In [15]:
# Interactive Testing Function
def interactive_predict(model, word2vec_model, context_size=3):
    print("\nInteractive Word Prediction")
    print("Enter a context sentence to predict the next word.")
    print("Type 'exit' to quit.\n")

    while True:
        context = input("Enter context: ")
        if context.lower() == 'exit':
            print("Exiting interactive testing. Goodbye!")
            break

        try:
            next_word = predict_next_word(context, model, word2vec_model, context_size)
            print(f"Predicted next word: \"{next_word}\"")
        except ValueError as e:
            print(f"Error: {e}. Ensure the context has at least {context_size} words.")


In [None]:
# Run interactive testing
interactive_predict(model, word2vec_model)


Interactive Word Prediction
Enter a context sentence to predict the next word.
Type 'exit' to quit.



Enter context:  the movie was


  input_tensor = torch.tensor([vectors], dtype=torch.float32)  # Shape: (1, context_size, input_size)


Predicted next word: "fantastic"


Enter context:  i hated the


Predicted next word: "acting"


Enter context:  one of the


Predicted next word: "best"
