# Next-word Generator for the Sherlock Holmes dataset

In this assignment, we will build a word-level language model using PyTorch, focusing on improved handling of punctuation and case sensitivity. We will work with the "Sherlock Holmes" dataset and visualize word embeddings using Plotly.


## Imports and Initial Configuration

In [1]:
import torch
import torch.nn.functional as F
from torch import nn
import pandas as pd
import plotly.express as px  # For interactive plotting
import plotly.graph_objects as go
import re
import os
from sklearn.manifold import TSNE

# Set up Plotly for better visualization
import plotly.io as pio
pio.renderers.default = 'notebook_connected'

# Display PyTorch version and set device
print(f"PyTorch Version: {torch.__version__}")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define dataset directory
dataset_dir = os.path.join(os.getcwd(), 'datasets')


PyTorch Version: 1.11.0+cu113
Using device: cuda


## Text Cleaning Function

We define a function to clean the text by handling punctuation more effectively and ensuring case insensitivity.

In [4]:
def clean_text(filename: str):
    """
    Reads and cleans text from a file.
    Handles punctuation by separating them as distinct tokens
    and converts all text to lowercase.
    """
    filepath = os.path.join(dataset_dir, filename)
    with open(filepath, encoding='utf-8') as file:
        text = file.read()
    
    # Separate punctuation by adding spaces around them
    text = re.sub(r'([.!?])', r' \1 ', text)
    # Remove any unwanted characters except specified punctuation
    text = re.sub(r'[^a-zA-Z0-9\s.!?]', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Remove extra spaces
    text = " ".join(text.split())
    return text


## Unique Words Extraction Function

Extracts unique words, including punctuation, and creates mappings between words and their indices.

In [5]:
def unique_words(text: str):
    """
    Extracts unique words and punctuation from the text.
    Creates mappings from string to index and vice versa.
    """
    words = pd.Series(text.split())
    # Filter words by length and ensure they are alphabetic or punctuation
    words = words[((words.str.len() > 0) & (words.str.len() < 20))]
    # Allow punctuation marks as valid tokens
    words = words[words.str.match(r'^[a-zA-Z0-9.!?]+$')]
    words = words.drop_duplicates(ignore_index=True)
    words = words.sort_values(key=lambda x: x.str.len()).reset_index(drop=True)
    vocab = words.sort_values().to_list()
    
    # Create string to index mapping, starting from 1
    stoi = {s: i + 1 for i, s in enumerate(vocab)}
    stoi['.'] = stoi.get('.', len(stoi) + 1)  # Ensure '.' is included
    stoi['!'] = stoi.get('!', len(stoi) + 1)  # Ensure '!' is included
    stoi['?'] = stoi.get('?', len(stoi) + 1)  # Ensure '?' is included
    
    # Create index to string mapping
    itos = {i: s for s, i in stoi.items()}
    return vocab, stoi, itos


## Data Preparation
We prepare the dataset by creating input-output pairs based on a context window.

In [6]:
def prepare_data(text: str, block_size: int, stoi):
    """
    Prepares input-output pairs for training.
    Each input consists of `block_size` words, and the target is the next word.
    """
    words = text.split()
    X, Y = [], []
    for i in range(block_size, len(words)):
        context = words[i-block_size:i]
        target = words[i]
        # Convert context and target to indices, default to '.' if not found
        context_ix = [stoi.get(word, stoi['.']) for word in context]
        target_ix = stoi.get(target, stoi['.'])
        X.append(context_ix)
        Y.append(target_ix)
    # Convert lists to tensors
    X = torch.tensor(X, dtype=torch.long).to(device)
    Y = torch.tensor(Y, dtype=torch.long).to(device)
    return X, Y

## Data Cleaning and Preparation
We clean the text, build the vocabulary, and prepare the data for training.

In [7]:
# Clean the text from the dataset
text = clean_text('sherlock.txt')

# Extract unique words and create mappings
vocab, stoi, itos = unique_words(text)

# Prepare input-output pairs with a context window of 5
block_size = 5
X, Y = prepare_data(text, block_size, stoi)

# Display the shapes of the tensors
print(f"Input shape: {X.shape}, dtype: {X.dtype}")
print(f"Target shape: {Y.shape}, dtype: {Y.dtype}")


TypeError: drop_duplicates() got an unexpected keyword argument 'ignore_index'

## Embedding Initialization and Visualization

We initialize the embedding layer and visualize the embeddings using t-SNE with Plotly.

In [6]:
# %%
# Define hyperparameters
embedding_dim = 128  # Increased embedding size
hidden_dim = 512      # Increased hidden layer size
batch_size = 4096
epochs = 50
learning_rate = 0.001

# Initialize the embedding layer
embedding = nn.Embedding(len(stoi) + 1, embedding_dim).to(device)  # +1 for padding if needed
print(f"Embedding Weights Shape: {embedding.weight.shape}")

# Convert embeddings to NumPy for visualization
embeddings = embedding.weight.detach().cpu().numpy()

# Perform t-SNE to reduce dimensions to 2D
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings)

# Create a DataFrame for Plotly
words = list(stoi.keys())
df = pd.DataFrame({
    'word': words,
    'x': embeddings_2d[:len(words), 0],
    'y': embeddings_2d[:len(words), 1]
})

# Plot using Plotly Express
fig = px.scatter(
    df, 
    x='x', 
    y='y', 
    text='word',
    title='t-SNE Visualization of Word Embeddings',
    hover_data=['word'],
    width=800,
    height=800
)

# Customize the plot
fig.update_traces(textposition='top center', marker=dict(size=5))
fig.update_layout(
    title=dict(x=0.5),
    xaxis_title="t-SNE Dimension 1",
    yaxis_title="t-SNE Dimension 2",
    template='plotly_white'
)

fig.show()


Embedding Weights Shape: torch.Size([8154, 128])


KeyboardInterrupt: 

## MLP for next word prediction
We define the neural network model with an increased hidden layer size.

In [None]:
class NextWord(nn.Module):
    """
    A simple feedforward neural network for next-word prediction.
    """
    def __init__(self, block_size, vocab_size, embedding_dim, hidden_dim):
        super(NextWord, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)  # Embedding layer
        self.lin1 = nn.Linear(embedding_dim * block_size, hidden_dim)  # First hidden layer
        self.relu = nn.ReLU()  # Activation function
        self.lin2 = nn.Linear(hidden_dim, vocab_size)  # Output layer

    def forward(self, x):
        embeds = self.embedding(x)  # Shape: [batch_size, block_size, embedding_dim]
        embeds = embeds.view(x.shape[0], -1)  # Flatten: [batch_size, block_size * embedding_dim]
        out = self.lin1(embeds)  # [batch_size, hidden_dim]
        out = self.relu(out)     # [batch_size, hidden_dim]
        out = self.lin2(out)     # [batch_size, vocab_size]
        return out

# Initialize the model with increased hidden dimension
model = NextWord(block_size, len(stoi) + 1, embedding_dim, hidden_dim).to(device)
print(model)

def generate_sequence(model, itos, stoi, context_words, block_size, max_len=20):
    """
    Generates a sequence of words based on the provided context.
    """
    model.eval()  # Set model to evaluation mode
    context = [stoi.get(word, stoi['.']) for word in context_words]
    if len(context) < block_size:
        context = [stoi['.']] * (block_size - len(context)) + context
    sequence = context_words.copy()
    
    with torch.no_grad():
        for _ in range(max_len):
            x = torch.tensor(context[-block_size:]).unsqueeze(0).to(device)  # Shape: [1, block_size]
            y_pred = model(x)  # [1, vocab_size]
            probs = F.softmax(y_pred, dim=1)
            ix = torch.multinomial(probs, num_samples=1).item()
            word = itos.get(ix, '.')
            if word == '.':
                break
            sequence.append(word)
            context.append(ix)
    
    return ' '.join(sequence)

# Define loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)


In [None]:
for param_name, param in model.named_parameters():
    print(f"{param_name}: {param.shape}")


## Training Loop

In [None]:
# Move data to device
X = X.to(device)
Y = Y.to(device)

# Training loop
for epoch in range(1, epochs + 1):
    model.train()  # Set model to training mode
    optimizer.zero_grad()
    outputs = model(X)
    loss = loss_fn(outputs, Y)
    loss.backward()
    optimizer.step()
    
    if epoch % 5 == 0 or epoch == 1:
        print(f"Epoch {epoch}/{epochs}, Loss: {loss.item():.4f}")


## Generating Text Sequences
We generate sample text sequences using the trained model.

In [None]:
# %%
# Example context
context_words = "and every precaution has to be taken"

# Generate a sequence of 20 words
generated_sequence = generate_sequence(model, itos, stoi, context_words.split(), block_size, max_len=20)

print(f"Context: {context_words}")
print(f"Generated sequence: {generated_sequence}")


## t-SNE Visualization of Trained Embeddings
Finally, we visualize the trained word embeddings using t-SNE with Plotly.

In [None]:
# %%
# Extract trained embeddings
trained_embeddings = model.embedding.weight.detach().cpu().numpy()

# Perform t-SNE
tsne = TSNE(n_components=2, random_state=42)
trained_embeddings_2d = tsne.fit_transform(trained_embeddings)

# Create DataFrame for Plotly
trained_df = pd.DataFrame({
    'word': list(stoi.keys()),
    'x': trained_embeddings_2d[:len(stoi), 0],
    'y': trained_embeddings_2d[:len(stoi), 1]
})

# Plot using Plotly Express
fig = px.scatter(
    trained_df, 
    x='x', 
    y='y', 
    text='word',
    title='t-SNE Visualization of Trained Word Embeddings',
    hover_data=['word'],
    width=800,
    height=800
)

# Customize the plot
fig.update_traces(textposition='top center', marker=dict(size=5))
fig.update_layout(
    title=dict(x=0.5),
    xaxis_title="t-SNE Dimension 1",
    yaxis_title="t-SNE Dimension 2",
    template='plotly_white'
)

fig.show()
