In [2]:
import numpy as np
import matplotlib.pyplot as plt
import os
import string
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Path to the book file
FILE_PATH = "../harry_potter/HP1.txt"
CONTEXT_WINDOW_SIZE = 2

# ============ STEP 1: Read only the first paragraph ============
with open(FILE_PATH, 'r', encoding='utf-8') as f:
    full_text = f.read()

# Split into paragraphs (assuming paragraphs are separated by two newlines)
paragraphs = full_text.strip().split('\n\n')
first_paragraph = paragraphs[0]

print("=== First paragraph ===")
print(first_paragraph[:500])  # preview
print()

=== First paragraph ===
Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense. Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount 



## Step 2

In [3]:
# ============ STEP 2: Clean and tokenize ============
clean_text = first_paragraph.lower()
punctuation_list = string.punctuation + '“”’'
translator = str.maketrans('', '', punctuation_list)
clean_text = clean_text.translate(translator)

tokens = clean_text.split()
print("Tokens:", tokens[:50])

Tokens: ['mr', 'and', 'mrs', 'dursley', 'of', 'number', 'four', 'privet', 'drive', 'were', 'proud', 'to', 'say', 'that', 'they', 'were', 'perfectly', 'normal', 'thank', 'you', 'very', 'much', 'they', 'were', 'the', 'last', 'people', 'youd', 'expect', 'to', 'be', 'involved', 'in', 'anything', 'strange', 'or', 'mysterious', 'because', 'they', 'just', 'didnt', 'hold', 'with', 'such', 'nonsense', 'mr', 'dursley', 'was', 'the', 'director']


In [4]:
stopwords = ['the', 'is', 'will', 'be', 'a', 'only', 'can', 'their', 'now', 'and', 'at', 'it']

filtered_data = []
for word in tokens:
    if word not in stopwords:
        filtered_data.append(word)
print(filtered_data[:50])

['mr', 'mrs', 'dursley', 'of', 'number', 'four', 'privet', 'drive', 'were', 'proud', 'to', 'say', 'that', 'they', 'were', 'perfectly', 'normal', 'thank', 'you', 'very', 'much', 'they', 'were', 'last', 'people', 'youd', 'expect', 'to', 'involved', 'in', 'anything', 'strange', 'or', 'mysterious', 'because', 'they', 'just', 'didnt', 'hold', 'with', 'such', 'nonsense', 'mr', 'dursley', 'was', 'director', 'of', 'firm', 'called', 'grunnings']


## STEP 3

In [5]:
# ============ STEP 3: Build vocabulary with Unique words ============
vocabulary = []
seen_words = set()

for word in tokens:
    if word not in seen_words:
        vocabulary.append(word)
        seen_words.add(word)

vocab_size = len(vocabulary)
print("Vocabulary:", vocabulary[:50])
print(f"Vocab size: {vocab_size}")

word_to_idx = {word: i for i, word in enumerate(vocabulary)}
idx_to_word = {i: word for i, word in enumerate(vocabulary)}

# Preview
if 'harry' in word_to_idx:
    print("Index of 'harry':", word_to_idx['harry'])

Vocabulary: ['mr', 'and', 'mrs', 'dursley', 'of', 'number', 'four', 'privet', 'drive', 'were', 'proud', 'to', 'say', 'that', 'they', 'perfectly', 'normal', 'thank', 'you', 'very', 'much', 'the', 'last', 'people', 'youd', 'expect', 'be', 'involved', 'in', 'anything', 'strange', 'or', 'mysterious', 'because', 'just', 'didnt', 'hold', 'with', 'such', 'nonsense', 'was', 'director', 'a', 'firm', 'called', 'grunnings', 'which', 'made', 'drills', 'he']
Vocab size: 6022
Index of 'harry': 397


###  STEP  4

In [6]:
# Assuming you already have vocab and word_to_index from earlier in your code
vocab_size = len(tokens)

# Print one-hot
for i, word in enumerate(vocabulary[:10]):
    one_hot_vector = np.zeros(vocab_size, dtype=int)
    one_hot_vector[i] = 1
    print(f"{word}: {one_hot_vector}")

mr: [1 0 0 ... 0 0 0]
and: [0 1 0 ... 0 0 0]
mrs: [0 0 1 ... 0 0 0]
dursley: [0 0 0 ... 0 0 0]
of: [0 0 0 ... 0 0 0]
number: [0 0 0 ... 0 0 0]
four: [0 0 0 ... 0 0 0]
privet: [0 0 0 ... 0 0 0]
drive: [0 0 0 ... 0 0 0]
were: [0 0 0 ... 0 0 0]


### Step 5

In [7]:
training_pairs = []

for i, focal_word in enumerate(tokens):
    # Define context window indices
    start = max(0, i - CONTEXT_WINDOW_SIZE)
    end = min(len(tokens), i + CONTEXT_WINDOW_SIZE + 1)
    
    for j in range(start, end):
        if i == j:
            continue  # skip the focal word itself
        
        context_word = tokens[j]
        training_pairs.append((word_to_idx[focal_word], word_to_idx[context_word]))

# Print pairs with words for clarity
print("Training pairs (focal_word -> context_word):")
print("\nFirst 5 pairs only:")
for focal_idx, context_idx in training_pairs[:10]:
    print(f"{tokens[focal_idx]} ({focal_idx}) -> {tokens[context_idx]} ({context_idx})")

Training pairs (focal_word -> context_word):

First 5 pairs only:
mr (0) -> and (1)
mr (0) -> mrs (2)
and (1) -> mr (0)
and (1) -> mrs (2)
and (1) -> dursley (3)
mrs (2) -> mr (0)
mrs (2) -> and (1)
mrs (2) -> dursley (3)
mrs (2) -> of (4)
dursley (3) -> and (1)


### STEP 6

In [8]:
# Hyperparameters
embedding_dim = 10
vocab_size = len(vocabulary)
epochs = 100
learning_rate = 0.1
init_std = 0.05
batch_size = 256  # small batches to avoid memory issues

# Convert training pairs to tensors
X_train = torch.tensor([center for center, _ in training_pairs], dtype=torch.long)
y_train = torch.tensor([context for _, context in training_pairs], dtype=torch.long)

# Create DataLoader for mini-batch training
dataset = TensorDataset(X_train, y_train)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Skip-gram model
class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, init_std):
        super(SkipGramModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.output = nn.Linear(embedding_dim, vocab_size)
        self.log_softmax = nn.LogSoftmax(dim=1)
        
        # Small Gaussian initialization
        nn.init.normal_(self.embeddings.weight, mean=0.0, std=init_std)
        nn.init.normal_(self.output.weight, mean=0.0, std=init_std)
        nn.init.zeros_(self.output.bias)

    def forward(self, x):
        x = self.embeddings(x)   # (batch, embed_dim)
        x = self.output(x)       # (batch, vocab_size)
        x = self.log_softmax(x)
        return x

# Initialize model, loss, optimizer
model = SkipGramModel(vocab_size, embedding_dim, init_std)
criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Training loop with mini-batches
for epoch in range(epochs):
    total_loss = 0
    for batch_x, batch_y in dataloader:
        optimizer.zero_grad()
        output = model(batch_x)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(dataloader):.4f}")

# Extract learned embeddings
word_embeddings = model.embeddings.weight.data
print("\nWord embeddings shape:", word_embeddings.shape)


Epoch [1/100], Loss: 8.3031
Epoch [2/100], Loss: 7.7161
Epoch [3/100], Loss: 7.4201
Epoch [4/100], Loss: 7.2602
Epoch [5/100], Loss: 7.1575
Epoch [6/100], Loss: 7.0826
Epoch [7/100], Loss: 7.0230
Epoch [8/100], Loss: 6.9739
Epoch [9/100], Loss: 6.9323
Epoch [10/100], Loss: 6.8967
Epoch [11/100], Loss: 6.8657
Epoch [12/100], Loss: 6.8383
Epoch [13/100], Loss: 6.8140
Epoch [14/100], Loss: 6.7919
Epoch [15/100], Loss: 6.7717
Epoch [16/100], Loss: 6.7532
Epoch [17/100], Loss: 6.7358
Epoch [18/100], Loss: 6.7198
Epoch [19/100], Loss: 6.7046
Epoch [20/100], Loss: 6.6904
Epoch [21/100], Loss: 6.6771
Epoch [22/100], Loss: 6.6644
Epoch [23/100], Loss: 6.6527
Epoch [24/100], Loss: 6.6414
Epoch [25/100], Loss: 6.6308
Epoch [26/100], Loss: 6.6208
Epoch [27/100], Loss: 6.6113
Epoch [28/100], Loss: 6.6022
Epoch [29/100], Loss: 6.5937
Epoch [30/100], Loss: 6.5855
Epoch [31/100], Loss: 6.5779
Epoch [32/100], Loss: 6.5705
Epoch [33/100], Loss: 6.5635
Epoch [34/100], Loss: 6.5568
Epoch [35/100], Loss: 6

# STEP 7

In [9]:

def word_to_one_hot(word, word_to_idx):
    vocab_size = len(word_to_idx)
    one_hot = torch.zeros(vocab_size)
    idx = word_to_idx.get(word)
    if idx is None:
        raise ValueError(f"Word '{word}' not found in vocabulary.")
    one_hot[idx] = 1
    return one_hot

def get_embedding_from_word(model, word, word_to_idx):
    word_index = word_to_idx[word]
    embedding = model.embeddings(torch.tensor([word_index]))
    return embedding.squeeze()

while True:
    user_input = input("Enter a word (or 'quit' to stop): ").strip()
    if user_input.lower() == 'quit':
        break
    
    try:
        one_hot_vector = word_to_one_hot(user_input, word_to_idx)
        print(f"One-hot vector for '{user_input}':\n{one_hot_vector}\n")
        
        embedding_vector = get_embedding_from_word(model, user_input, word_to_idx)
        print(f"Embedding vector for '{user_input}':\n{embedding_vector}\n")
    except ValueError as e:
        print(e)


One-hot vector for 'harry':
tensor([0., 0., 0.,  ..., 0., 0., 0.])

Embedding vector for 'harry':
tensor([ 0.9439, -0.8520,  0.5920,  1.1341, -0.3767, -0.5537,  0.2148, -0.1261,
        -0.1760, -1.7613], grad_fn=<SqueezeBackward0>)

One-hot vector for 'and':
tensor([0., 1., 0.,  ..., 0., 0., 0.])

Embedding vector for 'and':
tensor([-0.2036, -1.2905,  0.8329,  0.3956, -0.6224, -0.4276,  0.5420,  0.3732,
         0.6058, -0.4890], grad_fn=<SqueezeBackward0>)



In [10]:
# Disable truncation for printing tensors
torch.set_printoptions(threshold=10_000)  # set very high threshold to print full tensor

user_input = input("Enter a word (or 'quit' to stop): ").strip()

try:
    one_hot_vector = word_to_one_hot(user_input, word_to_idx)
    print(f"One-hot vector for '{user_input}':\n{one_hot_vector}\n")
        
    embedding_vector = get_embedding_from_word(model, user_input, word_to_idx)
    print(f"Embedding vector for '{user_input}':\n{embedding_vector}\n")
except ValueError as e:
    print(e)


Word 'quit' not found in vocabulary.


# Implementing a mapping between corpora

In [11]:
# Ensure your embedding tensors are on the CPU and converted to NumPy
embeddings_A = word_embeddings_1.cpu().numpy()
embeddings_B = word_embeddings_2.cpu().numpy()
word_to_idx_A = word_to_idx_1
word_to_idx_B = word_to_idx_2

# 1. Find the shared vocabulary
vocab_A = set(word_to_idx_A.keys())
vocab_B = set(word_to_idx_B.keys())
shared_vocab = sorted(list(vocab_A.intersection(vocab_B)))

print(f"Found {len(shared_vocab)} shared words between the two vocabularies.")

# 2. Create alignment matrices for the shared words
# These will hold the embeddings for the words that exist in BOTH texts.
X_A = np.zeros((len(shared_vocab), embeddings_A.shape[1]))
X_B = np.zeros((len(shared_vocab), embeddings_B.shape[1]))

for i, word in enumerate(shared_vocab):
    idx_A = word_to_idx_A[word]
    idx_B = word_to_idx_B[word]
    
    X_A[i] = embeddings_A[idx_A]
    X_B[i] = embeddings_B[idx_B]

# 3. Solve for the transformation matrix T using the pseudo-inverse
# This finds the optimal linear map T such that X_A @ T is as close as possible to X_B
T = np.linalg.pinv(X_A) @ X_B

print(f"Learned transformation matrix T with shape: {T.shape}")

NameError: name 'word_embeddings_1' is not defined

In [None]:
# ## Phase 3: Evaluate the Mapping

# 1. Transform the embedding space A using the learned matrix T
mapped_X_A = X_A @ T

# 2. Calculate the cosine similarity for each word pair
# We will compare each mapped vector from A with its corresponding true vector in B
dot_products = np.sum(mapped_X_A * X_B, axis=1)
norms_A = np.linalg.norm(mapped_X_A, axis=1)
norms_B = np.linalg.norm(X_B, axis=1)

# To avoid division by zero for any potential zero-vectors
valid_indices = (norms_A > 0) & (norms_B > 0)
similarities = dot_products[valid_indices] / (norms_A[valid_indices] * norms_B[valid_indices])

# 3. Display the results
average_similarity = np.mean(similarities)
print(f"--- Evaluation Results ---")
print(f"Average Cosine Similarity between mapped and target vectors: {average_similarity:.4f}")

# 4. Show some examples for qualitative analysis
print("\n--- Example Word-level Similarities ---")
for i in range(10): # Print the similarity for the first 10 shared words
    word = shared_vocab[i]
    similarity = similarities[i]
    print(f"Word: '{word}', Cosine Similarity: {similarity:.4f}")

### Can we map from one embedding space to another where the embeddings are trained on different data?

Yes, you can map one embedding space to another even if they are trained on different data — this is a well-studied problem in NLP and related fields.

Why do this?
Different embedding models (e.g., trained on different corpora, languages, or time periods) live in different vector spaces.

Mapping between spaces enables alignment, transfer learning, or cross-lingual embeddings.

Common approaches
Learn a linear transformation (mapping matrix) W:

from target (for a shared vocabulary or dictionary), find matrix 𝑊 that minimizes:
Usually solved by Procrustes analysis or orthogonal Procrustes if you constrain 
𝑊 to be orthogonal.

Requires a bilingual dictionary or anchor points (shared words) to align.

Non-linear mappings:

Use neural networks (MLPs) to learn more complex mappings.

Often less common for basic embedding alignment but helpful for very different domains.

Iterative refinement:

Start with a seed dictionary, learn 𝑊.

Use 𝑊 to find more pairs, retrain 
𝑊, repeat.

Practical example:
Mapping GloVe embeddings trained on Wikipedia to fastText embeddings trained on Common Crawl.

Cross-lingual embeddings to translate embeddings between languages.

 it's common and practical — with enough shared words or anchor points, you can effectively align and map embedding spaces trained on different data.