First, import the libraries

In [None]:
!pip install torch

import torch
import torch.nn as nn
import torch.optim as optim

Load the dataset, tokenize it and build the vocabulary

In [None]:
# Read the dataset from the file.
# The encoding is stated explicitly so the result does not depend on the
# platform's default locale.
with open('data.txt', 'r', encoding='utf-8') as file:
    poems = file.readlines()

# Tokenize the poems: lower-case each line, drop periods and split on
# whitespace. Other punctuation is kept attached to the words.
tokens = []
for poem in poems:
    processed_poem = poem.lower().strip().replace(".", "")
    tokens.extend(processed_poem.split())

# Create a vocabulary.
# sorted() makes the word order (and therefore every word index) identical on
# each run; iterating a bare set of strings changes order between kernel
# restarts because string hashing is randomized.
vocab = sorted(set(tokens))
vocab_size = len(vocab)

# Create word-to-index and index-to-word mappings
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = dict(enumerate(vocab))

Preprocess the data

In [None]:
# Set the window size
window_size = 5

# Generate training examples: every run of `window_size` consecutive tokens is
# an input and the token that immediately follows it is the target.
# The final window of the corpus has no following token, so it yields no
# example; pairing it with its own last word would teach the model to copy its
# input instead of predicting the next word.
input_seqs = []
target_seqs = []
for i in range(len(tokens) - window_size):
    input_seqs.append(tokens[i:i+window_size])
    target_seqs.append(tokens[i+window_size])

# Convert sequences to tensors: one 1-D tensor of word indices per input
# window and one scalar word-index tensor per target.
input_tensors = [
    torch.tensor([word_to_idx[word] for word in input_seq], dtype=torch.long)
    for input_seq in input_seqs
]
target_tensors = [
    torch.tensor(word_to_idx[target_seq], dtype=torch.long)
    for target_seq in target_seqs
]

Define the model

In [None]:
class AutocompleteModel(nn.Module):
    """Next-word predictor: embedding -> LSTM -> linear layer over the vocabulary.

    Args:
        vocab_size: number of distinct words (embedding rows and output logits).
        embedding_dim: size of each word embedding vector.
        hidden_dim: size of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Return next-word logits of shape (batch, vocab_size).

        Args:
            x: integer tensor of word indices with shape (batch, seq_len).
        """
        token_vectors = self.embedding(x)
        hidden_states, _ = self.lstm(token_vectors)
        # Only the hidden state at the last time step feeds the classifier.
        final_step = hidden_states[:, -1, :]
        return self.fc(final_step)

Training (takes most of the time)

In [None]:
# Set the hyperparameters
embedding_dim = 128
hidden_dim = 256
epochs = 100
batch_size = 64
learning_rate = 0.001
log_every = 1  # print the losses every `log_every` epochs

# Create the model
model = AutocompleteModel(vocab_size, embedding_dim, hidden_dim)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Split the data into training and validation sets.
# The split is positional (first 90% / last 10%), with no shuffling.
train_ratio = 0.9
train_size = int(train_ratio * len(input_tensors))
train_inputs, val_inputs = input_tensors[:train_size], input_tensors[train_size:]
train_targets, val_targets = target_tensors[:train_size], target_tensors[train_size:]

# Fail early with a clear message instead of a ZeroDivisionError further down.
if len(train_inputs) == 0 or len(val_inputs) == 0:
    raise ValueError(
        "Not enough training examples to build non-empty training and validation sets; "
        "the dataset must contain more words than the window size."
    )

# Train the model
best_val_loss = float('inf')
best_model_state_dict = None

for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for i in range(0, len(train_inputs), batch_size):
        batch_inputs = torch.stack(train_inputs[i:i+batch_size])
        batch_targets = torch.stack(train_targets[i:i+batch_size])

        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = criterion(outputs, batch_targets)
        loss.backward()
        optimizer.step()

        # The criterion returns the mean over the batch; weight it by the batch
        # size so that dividing by the number of examples gives a true
        # per-example average (the last batch may be smaller).
        total_loss += loss.item() * batch_inputs.size(0)

    avg_train_loss = total_loss / len(train_inputs)

    # Evaluate on the validation set
    model.eval()
    val_loss = 0.0

    with torch.no_grad():
        for i in range(0, len(val_inputs), batch_size):
            batch_inputs = torch.stack(val_inputs[i:i+batch_size])
            batch_targets = torch.stack(val_targets[i:i+batch_size])

            outputs = model(batch_inputs)
            loss = criterion(outputs, batch_targets)
            val_loss += loss.item() * batch_inputs.size(0)

    avg_val_loss = val_loss / len(val_inputs)

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        # state_dict() hands back references to the live parameters, which the
        # optimizer keeps updating in place. Clone them so the snapshot really
        # is the best epoch and not whatever the weights are at the end.
        best_model_state_dict = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }

    if (epoch + 1) % log_every == 0:
        print(f"Epoch: {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

# Load the best model state (absent only if the validation loss never became
# a finite, comparable number, e.g. NaN on every epoch).
if best_model_state_dict is not None:
    model.load_state_dict(best_model_state_dict)

Run the model

In [None]:
#@title Settings
maxLengthForm = 1 #@param {type:"integer"}
seed = 124 #@param {type:"slider", min:1, max:2500, step:1}
suggestFor = "dreams come" #@param {type:"string"}
# Form fields above: `maxLengthForm` is the number of words to append,
# `suggestFor` is the prompt to complete.
# NOTE(review): despite its name, `seed` is not a random seed; it is assigned
# to `beam_width` below, i.e. the number of top candidates considered per step.

# Set the model in evaluation mode
model.eval()

# Generate autocompletions
# The prompt is lower-cased before use.
input_sequence = suggestFor.lower()
max_length = maxLengthForm
beam_width = seed


def score_beam_candidates(beam_candidates):
    """Score each candidate by how likely the model finds its last word.

    A candidate's score is the log-probability the model assigns to the
    candidate's final word given all the words before it. Summing the
    log-softmax over the whole vocabulary, as was done previously, measures
    how flat the following prediction is and says nothing about the candidate
    itself.

    Args:
        beam_candidates: list of candidates, each a list of in-vocabulary words.

    Returns:
        1-D float tensor with one score per candidate, in input order.
        Candidates with fewer than two words have no context to condition on
        and receive ``-inf``.
    """
    scores = []
    with torch.no_grad():
        for candidate in beam_candidates:
            if len(candidate) < 2:
                scores.append(float('-inf'))
                continue
            word_ids = [word_to_idx[word] for word in candidate]
            # Feed the prefix and read off the log-probability of the last word.
            prefix_tensor = torch.tensor(word_ids[:-1], dtype=torch.long).unsqueeze(0)
            log_probs = torch.log_softmax(model(prefix_tensor), dim=1)
            scores.append(log_probs[0, word_ids[-1]].item())
    return torch.tensor(scores)


 
# Tokenize the prompt the same way the training text was tokenized
# (lower-case, periods removed, split on whitespace).
input_tokens = input_sequence.lower().replace(".", "").split()

# Filter out words that are not in the vocabulary (dict lookup, O(1) per word)
input_tokens = [token for token in input_tokens if token in word_to_idx]

if not input_tokens:
    # Report the problem and skip generation. Calling exit() here would stop
    # the notebook kernel/runtime and discard the trained model.
    print("No valid words in the input sequence. Please try again with valid words.")
else:
    with torch.no_grad():
        output_sequence = input_tokens[:]
        input_tensor = torch.tensor(
            [word_to_idx[word] for word in output_sequence], dtype=torch.long
        ).unsqueeze(0)

        # Extend the sequence one word at a time: take the top candidates for
        # the next word, re-score them and keep the best one.
        for _ in range(max_length):
            output = model(input_tensor)

            # torch.topk fails when k exceeds the number of logits, so the
            # requested width is clamped to the vocabulary size.
            num_candidates = max(1, min(int(beam_width), output.size(1)))
            _, topk_indices = torch.topk(output, num_candidates, dim=1)

            beam_candidates = [
                output_sequence + [idx_to_word[idx.item()]]
                for idx in topk_indices[0]
            ]

            scores = score_beam_candidates(beam_candidates)
            best_candidate = torch.argmax(scores).item()

            output_sequence = beam_candidates[best_candidate]
            input_tensor = torch.tensor(
                [word_to_idx[word] for word in output_sequence], dtype=torch.long
            ).unsqueeze(0)

        autocompletion = ' '.join(output_sequence)
        print(f"Suggestion: {autocompletion}")