In [None]:
import nltk
# Fetch the NLTK resource named 'punkt' through NLTK's downloader.
# NOTE(review): `word_tokenize` is imported in the next cell, but the corpus is
# split with a regular expression inside `tokeniser` — confirm this download is
# still needed.
nltk.download('punkt')

In [2]:
import torch
import torch.nn as nn
import torchvision
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import numpy as np
import re
from torch.nn.functional import one_hot
import torch.optim as optim
import os


# NOTE(review): PyTorch documents TORCH_USE_CUDA_DSA as a build-time option;
# confirm that exporting it here, after `import torch`, has any effect.
os.environ['TORCH_USE_CUDA_DSA'] = '1'

# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# This function tokenises the data
def tokeniser(path):
    """Read a UTF-8 text file and split its lower-cased content into word tokens.

    The file is opened read-only, so the corpus on disk is never modified.
    (Earlier revisions opened it in read-write mode and overwrote it with the
    lower-cased text, which silently destroyed the original casing.)

    Args:
        path: Path of the text file to tokenise.

    Returns:
        numpy.ndarray: 1-D array of string tokens in order of appearance. A
        token is a run of word characters, optionally followed by one
        apostrophe-joined suffix (e.g. ``don't``). An empty file yields an
        empty string array.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(path, 'r', encoding='utf-8') as file:
        data = file.read()

    # Lower-case so that "The" and "the" share one vocabulary entry.
    # The literal 'old_text' -> 'new_text' substitution is kept only so the
    # returned tokens stay identical to what this function produced before.
    new_data = data.lower().replace('old_text', 'new_text')

    tokens = re.findall(r"\b\w+(?:'\w+)?\b", new_data)

    # dtype=str keeps the result a string array even when there are no tokens.
    return np.array(tokens, dtype=str)

In [4]:
# Location of the raw text corpus.
# NOTE(review): absolute path that looks like a Colab Google Drive mount —
# confirm it exists before running in another environment.
path = r'/content/drive/MyDrive/Kaggle/conversion.txt'
# Tokenise the corpus file; `tokeniser` returns the tokens as a NumPy array.
tokenized_data = tokeniser(path)

In [14]:
# Create a vocabulary from the tokenized data
vocab = set(tokenized_data)
# Number the words in sorted order. Iteration order of a set of strings changes
# between interpreter sessions (string hashing is randomised), so enumerating
# the raw set would assign different ids on every kernel restart and make
# previously saved model weights meaningless.
word_to_idx = {word: idx for idx, word in enumerate(sorted(vocab))}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}
len(vocab)

8206

In [None]:
# Convert words to indices
indexed_data = [word_to_idx[word] for word in tokenized_data]
# Show only a short preview; echoing the full list would dump one integer per
# corpus token into the cell output.
indexed_data[:10]

In [8]:
class NextWordDataset(Dataset):
    """Sliding-window dataset for next-word prediction.

    Sample ``i`` is the pair ``(data[i : i + sequence_length],
    data[i + sequence_length])``: a window of consecutive token ids and the id
    that immediately follows it.

    Args:
        data: Indexable, sliceable sequence of integer token ids.
        sequence_length: Number of ids in each input window.
    """

    def __init__(self, data, sequence_length):
        self.data = data
        self.sequence_length = sequence_length

    def __len__(self):
        # Clamp at zero: a corpus shorter than the window has no samples, and
        # a negative value would make len() raise ValueError.
        return max(0, len(self.data) - self.sequence_length)

    def __getitem__(self, index):
        """Return ``(input_ids, target_id)`` as int64 tensors.

        Negative indices count from the end, as for a list.

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if index < 0:
            # Without this, a negative index silently produced a mis-sliced
            # (usually empty) window paired with an unrelated target.
            index += size
        if not 0 <= index < size:
            raise IndexError(f"index out of range for dataset of size {size}")

        stop = index + self.sequence_length
        input_seq = self.data[index:stop]
        target = self.data[stop]
        # Explicit int64: nn.Embedding and CrossEntropyLoss expect integer
        # class indices regardless of how `data` stores them.
        return (
            torch.tensor(input_seq, dtype=torch.long),
            torch.tensor(target, dtype=torch.long),
        )

In [9]:
# LSTM-based next-word model
class NextWordPredictor(nn.Module):
    """Embedding -> LSTM -> linear head that scores every vocabulary word.

    Args:
        vocab_size: Number of distinct token ids; also the size of the output.
        embedding_dim: Width of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Return unnormalised scores over the vocabulary for the next token.

        Args:
            x: Integer token ids laid out as (batch, sequence).

        Returns:
            Tensor of shape (batch, vocab_size) computed from the LSTM output
            at the last position of each sequence.
        """
        sequence_states, _ = self.lstm(self.embedding(x))
        # Only the final time step feeds the classifier.
        final_state = sequence_states[:, -1, :]
        return self.fc(final_state)

In [10]:
# Hyperparameters
vocab_size = len(vocab)  # number of distinct tokens; passed to the model as its vocabulary size
embedding_dim = 100  # width of each word-embedding vector
hidden_dim = 150  # size of the LSTM hidden state
sequence_length = 3  # Length of input sequences

In [11]:
# Create dataset and DataLoader
# Each sample pairs `sequence_length` consecutive token ids with the id that follows them.
dataset = NextWordDataset(indexed_data, sequence_length)
batch_size = 1  # Using batch size 1 for simplicity
# shuffle=True asks the DataLoader to visit the samples in random order.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [12]:
# Build the model on the GPU if available, plus the loss and optimiser.
# (Batches are moved to `device` inside the training loop, not here.)

# Seed PyTorch's global RNG before anything random happens so that weight
# initialisation and DataLoader shuffling are repeatable across runs.
SEED = 42
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NextWordPredictor(vocab_size, embedding_dim, hidden_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Training loop
epochs = 10

# Enter training mode explicitly. The inference cell further down calls
# model.eval(); re-running this cell afterwards would otherwise train a model
# that is still in eval mode (PyTorch's cuDNN LSTM rejects backward() there).
model.train()

for epoch in range(epochs):
    running_loss = 0.0
    for inputs, target in tqdm(dataloader, desc=f"Epoch {epoch + 1}/{epochs}", unit="batch"):
        # Batches come off the DataLoader on the CPU; move them next to the model.
        inputs, target = inputs.to(device), target.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean loss per batch; max(..., 1) avoids ZeroDivisionError on an empty loader.
    epoch_loss = running_loss / max(len(dataloader), 1)
    print(f"Epoch [{epoch + 1}/{epochs}] Loss: {epoch_loss:.4f}")

print("Training finished.")


In [24]:
!cd //content/drive/MyDrive/Kaggle

In [25]:
# Saving the model state dictionary
# Store only the parameters (state_dict) rather than pickling the whole module:
# that is what `load_state_dict` consumes, and it does not tie the file to this
# notebook's class definition.
torch.save(model.state_dict(), '/content/drive/MyDrive/Kaggle/model.pt')

In [26]:
# Create an instance of the model
model = NextWordPredictor(vocab_size, embedding_dim, hidden_dim)

# Load the checkpoint this notebook writes in the save cell (previously a
# different, never-written file name was read here).
# map_location='cpu' lets a checkpoint saved from the GPU load on a CPU-only
# machine; the fresh model above lives on the CPU as well.
# NOTE: weights_only=False unpickles arbitrary Python objects. That is only
# acceptable because this file is produced by this notebook — never point it at
# an untrusted file.
checkpoint = torch.load(
    '/content/drive/MyDrive/Kaggle/model.pt',
    map_location='cpu',
    weights_only=False,
)
# Accept either a bare state dict or a whole pickled module.
state_dict = checkpoint.state_dict() if isinstance(checkpoint, nn.Module) else checkpoint
model.load_state_dict(state_dict)

<All keys matched successfully>

In [None]:
# Sample sequence of words
input_sequence = ["good"]

# Convert words to indices using the word_to_idx dictionary.
# Report every out-of-vocabulary word at once instead of failing on the first.
unknown_words = [word for word in input_sequence if word not in word_to_idx]
if unknown_words:
    raise KeyError(f"Words not in vocabulary: {unknown_words}")
indexed_sequence = [word_to_idx[word] for word in input_sequence]

# Convert the indexed sequence to a PyTorch tensor on the same device as the
# model, with a leading batch dimension.
model_device = next(model.parameters()).device
input_tensor = torch.tensor(indexed_sequence, dtype=torch.long, device=model_device).unsqueeze(0)

# Use the model for prediction
model.eval()  # Set the model to evaluation mode
with torch.no_grad():  # Disable gradient calculation
    output = model(input_tensor)

# Apply softmax to get probabilities
softmax = nn.Softmax(dim=1)
probabilities = softmax(output)

# Get the top n words based on probabilities (never more than the vocabulary holds)
top_n = 5  # Number of top words to retrieve
top_probabilities, top_indices = torch.topk(probabilities, min(top_n, probabilities.size(1)))

# Convert indices back to words using idx_to_word dictionary.
# Index the single batch row explicitly: .squeeze() would collapse to a 0-d
# tensor (not iterable) when only one word is requested.
top_words = [idx_to_word[idx] for idx in top_indices[0].tolist()]

# Display the top words and their probabilities
for word, prob in zip(top_words, top_probabilities[0].tolist()):
    print(f"Word: {word}  Probability: {prob:.4f}")
