<a href="https://colab.research.google.com/github/MatteoAldovardi92/Datascience-and-Machine-Learning-Sandbox/blob/main/CBOW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import tensorflow as tf
import numpy as np
import torch

import numpy as np
from collections import defaultdict
import re # For more robust tokenization
import matplotlib.pyplot as plt # For visualization

import numpy as np
from collections import defaultdict
import re

# --- Preprocessing Functions (from previous response) ---
def preprocess_text(text):
    """
    Cleans and tokenizes raw text.
    Converts to lowercase, removes most special characters, and splits into words.
    """
    text = text.lower()
    # Remove characters that are not letters, numbers, spaces, or selected punctuation (', -, .)
    text = re.sub(r'[^a-z0-9\s\'-.]', '', text)
    tokens = text.split()
    return tokens

def build_vocabulary(tokens, min_freq=5):
    """
    Builds a word-to-ID and ID-to-word mapping, filtering by minimum frequency.
    Adds special tokens for padding and unknown words.
    """
    word_counts = defaultdict(int)
    for word in tokens:
        word_counts[word] += 1

    # Filter out words that appear less than min_freq times
    filtered_vocab_items = [item for item in word_counts.items() if item[1] >= min_freq]
    # Sort by frequency for consistent ID assignment
    sorted_vocab = sorted(filtered_vocab_items, key=lambda x: x[1], reverse=True)

    word_to_id = {'<PAD>': 0, '<UNK>': 1} # Initialize with special tokens
    id_to_word = {0: '<PAD>', 1: '<UNK>'}

    # Assign IDs to words based on sorted frequency
    for word, _ in sorted_vocab:
        if word not in word_to_id: # Ensure special tokens aren't overwritten
            word_to_id[word] = len(word_to_id)
            id_to_word[len(id_to_word)] = word

    vocab_size = len(word_to_id)
    return word_to_id, id_to_word, vocab_size



In [3]:
## Dowload tinyshakespeare.txt

import requests

url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
file_path = 'tinyshakespeare.txt'

try:
    response = requests.get(url)
    response.raise_for_status() # Raise an exception for bad status codes (4xx or 5xx)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(response.text)
    print(f"Downloaded '{url}' to '{file_path}'")
except requests.exceptions.RequestException as e:
    print(f"Error downloading the file: {e}")
except IOError as e:
    print(f"Error writing the file '{file_path}': {e}")

Downloaded 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt' to 'tinyshakespeare.txt'


In [4]:


file_path = 'tinyshakespeare.txt'

try:
    with open(file_path, 'r', encoding='utf-8') as f:
        raw_text = f.read()
except FileNotFoundError:
    print(f"Error: '{file_path}' not found. Please download it from https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt")
    exit()

print("--- Step 1: Initial Text Loading ---")
print(f"First 500 characters of raw text:\n{raw_text[:500]}...\n")


--- Step 1: Initial Text Loading ---
First 500 characters of raw text:
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor...



In [5]:

# Perform tokenization
tokens = preprocess_text(raw_text)

print("--- Step 2: Preprocessing and Tokenization ---")
print(f"Total tokens after preprocessing: {len(tokens)}")
print(f"First 20 tokens:\n{tokens[:20]}\n")
print(f"Last 20 tokens:\n{tokens[-20:]}\n")



--- Step 2: Preprocessing and Tokenization ---
Total tokens after preprocessing: 202649
First 20 tokens:
['first', 'citizen', 'before', 'we', 'proceed', 'any', 'further,', 'hear', 'me', 'speak.', 'all', 'speak,', 'speak.', 'first', 'citizen', 'you', 'are', 'all', 'resolved', 'rather']

Last 20 tokens:
['moving,', 'and', 'yet', 'so', 'fast', 'asleep.', 'antonio', 'noble', 'sebastian,', 'thou', "let'st", 'thy', 'fortune', 'sleep--die,', 'rather', "wink'st", 'whiles', 'thou', 'art', 'waking.']



In [6]:
# Build vocabulary
min_word_frequency = 10 # Words appearing less than 10 times will be <UNK>
word_to_id, id_to_word, vocab_size = build_vocabulary(tokens, min_freq=min_word_frequency)

print("--- Step 3: Vocabulary Construction ---")
print(f"Vocabulary Size (with min_freq={min_word_frequency}): {vocab_size}")

print("\nTop 10 most frequent words (by ID):")
for i in range(2, 12): # Start from 2 to skip <PAD> and <UNK>
    if i < vocab_size:
        print(f"ID: {i}, Word: '{id_to_word[i]}'")
    else:
        break

print("\nBottom 10 words (least frequent words that met min_freq, by ID):")
# Get the last 10 entries from the sorted vocabulary (before special tokens)
num_to_show = min(10, vocab_size - 2) # Don't show more than available
for i in range(vocab_size - num_to_show, vocab_size):
      print(f"ID: {i}, Word: '{id_to_word[i]}'")


# Convert entire corpus to numerical IDs
indexed_corpus = [word_to_id.get(word, word_to_id['<UNK>']) for word in tokens]

print("\n--- Step 4: Corpus Indexing (Conversion to Numbers) ---")
print(f"Length of indexed corpus: {len(indexed_corpus)}")
print(f"First 20 indexed tokens:\n{indexed_corpus[:20]}\n")
print(f"Last 20 indexed tokens:\n{indexed_corpus[-20:]}\n")

# Verify a few translations
print("--- Step 5: Verification ---")
sample_text = "the king loves his queen, and the queen loves her king."
sample_tokens = preprocess_text(sample_text)
sample_indexed = [word_to_id.get(word, word_to_id['<UNK>']) for word in sample_tokens]

print(f"Sample text: '{sample_text}'")
print(f"Sample tokens: {sample_tokens}")
print(f"Sample indexed: {sample_indexed}")

# Decode back for verification
decoded_sample = [id_to_word.get(idx, '<UNK>') for idx in sample_indexed]
print(f"Decoded sample: {decoded_sample}")

# Check for an unknown word
unknown_word = "xyzzy" # Highly unlikely to be in Shakespeare
unknown_id = word_to_id.get(unknown_word, word_to_id['<UNK>'])
print(f"\nID for unknown word '{unknown_word}': {unknown_id} (which should be {word_to_id['<UNK>']})")
print(f"Word for ID {word_to_id['<UNK>']}: '{id_to_word[word_to_id['<UNK>']]}'")

--- Step 3: Vocabulary Construction ---
Vocabulary Size (with min_freq=10): 2001

Top 10 most frequent words (by ID):
ID: 2, Word: 'the'
ID: 3, Word: 'and'
ID: 4, Word: 'to'
ID: 5, Word: 'i'
ID: 6, Word: 'of'
ID: 7, Word: 'my'
ID: 8, Word: 'a'
ID: 9, Word: 'you'
ID: 10, Word: 'that'
ID: 11, Word: 'in'

Bottom 10 words (least frequent words that met min_freq, by ID):
ID: 1991, Word: 'houses'
ID: 1992, Word: 'clear'
ID: 1993, Word: 'bona'
ID: 1994, Word: 'instruct'
ID: 1995, Word: 'curst'
ID: 1996, Word: 'angelo.'
ID: 1997, Word: 'claudio,'
ID: 1998, Word: 'provost,'
ID: 1999, Word: 'lucentio.'
ID: 2000, Word: 'alonso'

--- Step 4: Corpus Indexing (Conversion to Numbers) ---
Length of indexed corpus: 202649
First 20 indexed tokens:
[86, 250, 143, 33, 1291, 136, 1, 130, 25, 591, 35, 571, 591, 86, 250, 9, 39, 35, 1468, 352]

Last 20 indexed tokens:
[1, 3, 82, 28, 881, 1, 590, 142, 1, 26, 1, 27, 450, 1, 352, 1, 1139, 26, 132, 1]

--- Step 5: Verification ---
Sample text: 'the king loves his

In [7]:
indexed_corpus = torch.tensor(indexed_corpus)
window_size = 2

In [7]:
# --- Step to generate context-target pairs ---

# Initialize an empty list to store context-target pairs
context_target_pairs = []

# Define the context window size (2 words before and 2 words after)
context_length = 2

# Iterate through the indexed corpus to create context-target pairs
# We need to ensure we have enough words for the context window around the target word.
# The target word is at index 'i'. The context is from i-context_length to i+context_length, excluding i.
# We need i to be at least context_length to have words before it.
# We need i to be at most len(indexed_corpus) - context_length - 1 to have words after it.
for i in range(context_length, len(indexed_corpus) - context_length):
    target_id = indexed_corpus[i]

    # Get the indices for the words before the target
    before_context_ids = indexed_corpus[i - context_length:i]

    # Get the indices for the words after the target
    after_context_ids = indexed_corpus[i + 1:i + context_length + 1]

    # Combine the context word IDs
    # We need to handle padding here if a full window isn't available at the beginning or end.
    # For now, we'll assume a full window is available based on the loop range.
    # The padding strategy will be addressed in a later step.
    context_ids = torch.cat((before_context_ids, after_context_ids))

    # Append the context-target pair to the list
    context_target_pairs.append((context_ids, target_id))

print(f"Generated {len(context_target_pairs)} context-target pairs.")


# Task
Generate context-target pairs from "input.txt" where the context is the two words before and two words after the target word, excluding the target word. Implement padding for contexts at the beginning and end of the text.

## Modify context window generation

### Subtask:
Update the code to generate context-target pairs where the context includes two words before and two words after the target word, excluding the target word itself.


**Reasoning**:
The previous attempt to generate context-target pairs failed due to an incorrect loop structure and an attempt to drop elements from a tensor in place. This code will iterate through the `indexed_corpus` with the correct bounds to create context-target pairs as specified in the instructions, ensuring the context includes two words before and two words after the target word, excluding the target word itself.



In [10]:
# Assuming you have generated context_target_pairs in a previous step
# If not, you'll need to run the code to generate them first.

print("\n--- Step 6: Displaying Sample Context-Target Pairs ---")
# Print a few sample pairs
num_samples_to_display = 5

for i in range(min(num_samples_to_display, len(context_target_pairs))):
    context_ids = context_target_pairs[i][0]
    target_id = context_target_pairs[i][1]

    context_words = [id_to_word.get(idx.item() if isinstance(idx, torch.Tensor) else idx, '<UNK>') for idx in context_ids]
    target_word = id_to_word.get(target_id.item() if isinstance(target_id, torch.Tensor) else target_id, '<UNK>')

    print(f"Pair {i+1}:")
    print(f"  Context IDs: {context_ids}")
    print(f"  Context Words: {context_words}")
    print(f"  Target ID: {target_id}")
    print(f"  Target Word: {target_word}\n")


--- Step 6: Displaying Sample Context-Target Pairs ---
Pair 1:
  Context IDs: tensor([  86,  250,   33, 1291])
  Context Words: ['first', 'citizen', 'we', 'proceed']
  Target ID: 143
  Target Word: before

Pair 2:
  Context IDs: tensor([ 250,  143, 1291,  136])
  Context Words: ['citizen', 'before', 'proceed', 'any']
  Target ID: 33
  Target Word: we

Pair 3:
  Context IDs: tensor([143,  33, 136,   1])
  Context Words: ['before', 'we', 'any', '<UNK>']
  Target ID: 1291
  Target Word: proceed

Pair 4:
  Context IDs: tensor([  33, 1291,    1,  130])
  Context Words: ['we', 'proceed', '<UNK>', 'hear']
  Target ID: 136
  Target Word: any

Pair 5:
  Context IDs: tensor([1291,  136,  130,   25])
  Context Words: ['proceed', 'any', 'hear', 'me']
  Target ID: 1
  Target Word: <UNK>



In [12]:
train_ratio = 0.8
train_data = context_target_pairs[:int(train_ratio * len(context_target_pairs))]
test_data = context_target_pairs[int(train_ratio * len(context_target_pairs)):]

In [27]:
import torch
import torch.nn as nn
import torch.optim as optim

# --- Hints for Building Your Model ---

# 1. Define the model architecture:
#    - You'll typically use an Embedding layer to convert word IDs into dense vectors.
#    - A few Linear (dense) layers with activation functions (like ReLU) can process the context embeddings.
#    - The final layer should be a Linear layer with an output size equal to your vocabulary size.
#    - Why vocab_size and not batch_size for the final layer's output?
#      # The model's goal is to predict the probability of each possible word in your vocabulary
#      # being the target word, given the context.
#      # Therefore, the output layer needs to produce a score (or logit) for every word
#      # in your vocabulary, representing how likely that word is to be the target.
#      # The size of this output is directly tied to the total number of unique words
#      # your model knows (the vocabulary size), not the number of examples
#      # being processed in a single step (the batch size).
#    - A Softmax layer (or combine with the loss function) will convert the final layer's outputs into probabilities over the vocabulary.

# Example (replace with your actual model definition):
class LanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, context_window_size):
        super(LanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # linear1 takes the flattened context embeddings.
        # The input dimension here is (context_window_size * embedding_dim) because
        # for each example in the batch, we have 'context_window_size' words,
        # and each word is represented by an 'embedding_dim' vector.
        # The batch size is handled implicitly by PyTorch's linear layer;
        # it operates on each example in the batch independently.
        self.linear1 = nn.Linear(embedding_dim * context_window_size, hidden_dim)
        self.layer_norm = nn.LayerNorm(hidden_dim)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, context_ids):
        # context_ids shape: (batch_size, context_window_size)
        embedded = self.embedding(context_ids) # shape: (batch_size, context_window_size, embedding_dim)
        # Flatten the embedded context for the linear layers
        # The .view() operation preserves the batch dimension implicitly.
        # It reshapes each item in the batch from (context_window_size, embedding_dim)
        # to a single vector of size (context_window_size * embedding_dim).
        flattened_context = embedded.view(embedded.size(0), -1) # shape: (batch_size, context_window_size * embedding_dim)
        hidden = self.layer_norm(self.linear1(flattened_context)) # shape: (batch_size, hidden_dim)
        hidden = self.relu(hidden) # shape: (batch_size, hidden_dim)
        output = self.linear2(hidden) # shape: (batch_size, vocab_size)
        # Note: Softmax is often included in the loss function (e.g., nn.CrossEntropyLoss)
        return output

# 2. Instantiate the model:
embedding_dim = 1000 # Choose an appropriate dimension
hidden_dim = 300   # Choose an appropriate dimension
# # Remember to define context_window_size based on your context (e.g., 4 for 2 before and 2 after)
context_window_size = 4 # Example value, adjust based on your definition
model = LanguageModel(vocab_size, embedding_dim, hidden_dim, context_window_size)

# --- Set device for CUDA ---
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
model.to(device)


# 3. Define the loss function and optimizer:
#    - For multi-class classification like predicting the next word, Cross-Entropy Loss is suitable.
#    - Adam or SGD are common optimizers.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                  weight_decay=0.01, amsgrad=False)
 # Choose a learning rate

# 4. Prepare data for training (create DataLoaders for batching):
#    - You'll need to convert your train_data and test_data lists of tuples into PyTorch Tensors.
#    - Use `torch.utils.data.TensorDataset` and `torch.utils.data.DataLoader` to handle batching and shuffling (for training data).
train_contexts = torch.stack([pair[0] for pair in train_data])
train_targets = torch.tensor([pair[1] for pair in train_data])
train_dataset = torch.utils.data.TensorDataset(train_contexts, train_targets)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True) # Choose a batch size

test_contexts = torch.stack([pair[0] for pair in test_data])
test_targets = torch.tensor([pair[1] for pair in test_data])
test_dataset = torch.utils.data.TensorDataset(test_contexts, test_targets)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False) # No need to shuffle test data

# 5. Training loop:
#    - Iterate over epochs.
#    - In each epoch, iterate over batches from the train_loader.
#    - For each batch:
#        - context_batch, target_batch = batch # Get context and target tensors for the batch
#        - Zero the gradients: optimizer.zero_grad()
#        - Forward pass: outputs = model(context_batch)
#        - Calculate loss: loss = criterion(outputs, target_batch)
#        - Backward pass: loss.backward()
#        - Update weights: optimizer.step()
#        - Print loss periodically to monitor training progress.

number_of_epochs = 100 # Choose an appropriate number of epochs

for epoch in range(number_of_epochs):
    # Set the model to training mode
    model.train()
    total_train_loss = 0

    for batch_idx, batch in enumerate(train_loader):
        # Move batch to the chosen device
        context_batch, target_batch = batch
        context_batch, target_batch = context_batch.to(device), target_batch.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(context_batch)

        # Calculate loss
        # Use the criterion instance you defined earlier
        loss = criterion(outputs, target_batch)

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()

        total_train_loss += loss.item()

        # Print loss periodically
        if (batch_idx + 1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{number_of_epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

    average_train_loss = total_train_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{number_of_epochs}] Average Training Loss: {average_train_loss:.4f}')


# 6. Evaluation loop (after training):
#    - Use the test_loader.
#    - In each batch:
#        - context_batch, target_batch = batch # Get context and target tensors for the batch
#        - Forward pass: outputs = model(context_batch)
#        - Calculate loss: test_loss = criterion(outputs, target_batch)
#        - Calculate accuracy or other relevant metrics.
#    - Report the average test loss and metrics.

# Remember to adjust the 'context_window_size' in the model definition based on your actual context size (2 words before + 2 words after = 4).
# The padding strategy will affect how you handle the input to the embedding layer, potentially requiring masks or special padding tokens.

Using device: cuda
Epoch [1/100], Step [100/2534], Loss: 5.7524
Epoch [1/100], Step [200/2534], Loss: 5.9729
Epoch [1/100], Step [300/2534], Loss: 4.3181
Epoch [1/100], Step [400/2534], Loss: 4.7428
Epoch [1/100], Step [500/2534], Loss: 5.6846
Epoch [1/100], Step [600/2534], Loss: 4.0967
Epoch [1/100], Step [700/2534], Loss: 5.3944
Epoch [1/100], Step [800/2534], Loss: 5.3077
Epoch [1/100], Step [900/2534], Loss: 4.4231
Epoch [1/100], Step [1000/2534], Loss: 4.8612
Epoch [1/100], Step [1100/2534], Loss: 5.0123
Epoch [1/100], Step [1200/2534], Loss: 4.1624
Epoch [1/100], Step [1300/2534], Loss: 4.9190
Epoch [1/100], Step [1400/2534], Loss: 5.0924
Epoch [1/100], Step [1500/2534], Loss: 4.2574
Epoch [1/100], Step [1600/2534], Loss: 5.2324
Epoch [1/100], Step [1700/2534], Loss: 3.7018
Epoch [1/100], Step [1800/2534], Loss: 3.8588
Epoch [1/100], Step [1900/2534], Loss: 4.4659
Epoch [1/100], Step [2000/2534], Loss: 4.6094
Epoch [1/100], Step [2100/2534], Loss: 4.3800
Epoch [1/100], Step [220

In [28]:
# --- Step to get a prediction from the model ---

# Set the model to evaluation mode
model.eval()

# Disable gradient calculation for evaluation
with torch.no_grad():
    # Get one batch from the test loader
    # We can iterate through the test_loader or use `next(iter(test_loader))` to get a single batch
    context_batch, target_batch = next(iter(test_loader))

    # Move the batch to the same device as the model
    context_batch, target_batch = context_batch.to(device), target_batch.to(device)

    # Get the model's output for the context batch
    outputs = model(context_batch)

    # The outputs are logits (raw scores). To get probabilities, you'd typically use softmax.
    # However, for finding the predicted class (word ID), we can just find the index
    # with the maximum logit value, as softmax preserves the order of the logits.
    # torch.argmax returns the index of the maximum value along a dimension.
    # Here, dim=1 means we find the max index for each item in the batch (each example's output).
    predicted_ids = torch.argmax(outputs, dim=1)

    # Move the tensors back to CPU and convert to numpy arrays or lists for easier iteration and use with the dictionary
    context_batch_cpu = context_batch.cpu().numpy()
    target_batch_cpu = target_batch.cpu().numpy()
    predicted_ids_cpu = predicted_ids.cpu().numpy()

    # Display predictions for a few samples in the batch
    num_samples_to_display = 5
    print("\n--- Sample Predictions ---")
    for i in range(min(num_samples_to_display, context_batch_cpu.shape[0])):
        context_ids = context_batch_cpu[i]
        real_target_id = target_batch_cpu[i]
        predicted_target_id = predicted_ids_cpu[i]

        # Translate IDs back to words
        context_words = [id_to_word.get(idx, '<UNK>') for idx in context_ids]
        real_target_word = id_to_word.get(real_target_id, '<UNK>')
        predicted_target_word = id_to_word.get(predicted_target_id, '<UNK>')

        print(f"Context: {context_words}")
        print(f"Real Target: {real_target_word}")
        print(f"Predicted Target: {predicted_target_word}")
        print("-" * 20)


--- Sample Predictions ---
Context: ['i', 'do', 'your', 'words.']
Real Target: <UNK>
Predicted Target: beseech
--------------------
Context: ['do', '<UNK>', 'words.', 'be']
Real Target: your
Predicted Target: <UNK>
--------------------
Context: ['<UNK>', 'your', 'be', 'that']
Real Target: words.
Predicted Target: <UNK>
--------------------
Context: ['your', 'words.', 'that', 'you']
Real Target: be
Predicted Target: for
--------------------
Context: ['words.', 'be', 'you', 'are,']
Real Target: that
Predicted Target: <UNK>
--------------------


In [29]:
from sklearn.decomposition import PCA
import torch
import matplotlib.pyplot as plt

embedding_layer = model.embedding
embedding_matrix = embedding_layer.weight.data.cpu().numpy()

pca = PCA(n_components=3)
embedding_3d = pca.fit_transform(embedding_matrix)

# Assuming you have word_to_id and id_to_word dictionaries from previous steps
# Iterate through the vocabulary and print the word and its 3D embedding
print("--- Word Embeddings (3D PCA reduced) ---")
for word, idx in word_to_id.items(): # Use .items() to iterate through key-value pairs
    if idx < len(embedding_3d): # Ensure the index is within the bounds of the reduced embeddings
        word_embedding = embedding_3d[idx]
        print(f"{word}: {word_embedding}")

# Optional: Add code here later to visualize the 3D embeddings using matplotlib or a more interactive library like Plotly
# For example, using matplotlib for a basic 3D scatter plot:
# fig = plt.figure(figsize=(10, 10))
# ax = fig.add_subplot(111, projection='3d')
# ax.scatter(embedding_3d[:, 0], embedding_3d[:, 1], embedding_3d[:, 2])
#
# # Add labels for a few words (optional, can make the plot cluttered)
# for word, idx in word_to_id.items():
#     if idx < len(embedding_3d):
#         ax.text(embedding_3d[idx, 0], embedding_3d[idx, 1], embedding_3d[idx, 2], word)
#
# plt.title("Word Embeddings (3D PCA)")
# plt.show()

--- Word Embeddings (3D PCA reduced) ---
<PAD>: [-0.27103522 -0.03780319  0.23448652]
<UNK>: [-0.20933999 -0.02985658  0.10057826]
the: [-0.28122187 -0.20792985  0.49075952]
and: [-0.46544057 -0.33584952  0.3411419 ]
to: [-0.5463037  -0.20920393  0.3822745 ]
i: [-0.4206137  -0.18306234  0.2040831 ]
of: [-0.44358888 -0.10516313  0.45383886]
my: [-0.2661458  -0.24126875  0.44891065]
a: [-0.05130346 -0.33895633  0.5146702 ]
you: [-0.41408998 -0.3372705   0.13028586]
that: [-0.39657837 -0.20565575  0.3862762 ]
in: [-0.4156024   0.07394608  0.5056672 ]
is: [-0.55725086 -0.09385255  0.25307846]
for: [-0.5408285  -0.29112718  0.47276872]
not: [-0.5379752   0.06596622  0.33327785]
with: [-0.48994714  0.01385933  0.35845754]
your: [-0.08366466 -0.42621967  0.50161755]
be: [-0.73531836  0.28941494 -0.10072008]
his: [-0.11455855 -0.07782812  0.45676172]
it: [-0.399845   -0.18766952  0.2633434 ]
he: [-0.1878209  -0.14499275  0.41226974]
this: [-0.24750787 -0.30679932  0.41946015]
have: [-0.5845023

# The plot everyone wants to play with

In [30]:
import plotly.express as px

# Assuming you have embedding_3d and id_to_word from previous steps
# You can create a DataFrame for easier plotting with Plotly
import pandas as pd

# Create a list of words corresponding to the indices in embedding_3d
words = [id_to_word.get(i, '<UNK>') for i in range(len(embedding_3d))]

# Create a DataFrame with the 3D coordinates and the words
embeddings_df = pd.DataFrame({
    'x': embedding_3d[:, 0],
    'y': embedding_3d[:, 1],
    'z': embedding_3d[:, 2],
    'word': words
})

# Create the interactive 3D scatter plot
fig = px.scatter_3d(embeddings_df, x='x', y='y', z='z', text='word',
                    title='Word Embeddings (3D PCA)')

# Adjust the text position to avoid overlap (optional)
fig.update_traces(textposition='top center')

# Show the plot
fig.show()