# Exercise 4: Word Embeddings

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import os
import glob
from collections import Counter

In [None]:
def sorter(item):
    """Sort key: the leading number of a file name such as ``12_7.txt`` (the part before the first '_')."""
    file_name = os.path.basename(item)
    # Only the text before the first underscore is the numeric file id.
    numeric_id, _, _ = file_name.partition('_')
    return int(numeric_id)

def read_raw_text(path_data):
    """
    Read the raw review texts and their scores from the .txt files.

    Files are looked up under ./aclImdb/<path_data>/pos and
    ./aclImdb/<path_data>/neg and are expected to be named ``<id>_<score>.txt``.

    Parameters
    ----------
    path_data: str
        Name of the folder inside ./aclImdb that contains the data that is
        going to be used. (should be test or train)

    Returns
    ---------
    data, scores: tuple of list
        data is a list with the full text of every review: all 'pos' reviews
        first, then all 'neg' reviews, each group ordered by numeric file id.
        scores is a list of int, aligned with data, holding the score parsed
        from each file name.
    """

    data = []
    scores = []

    for sentiment in ('pos', 'neg'):
        pattern = os.path.join(".", "aclImdb", path_data, sentiment, "*.txt")

        # Numeric sort: a plain string sort would put '10_...' before '2_...'.
        for filename in sorted(glob.glob(pattern), key=sorter):

            with open(filename, encoding='utf8') as f:
                data.append(f.read())

            # splitext removes exactly the extension; str.strip('.txt') would
            # instead strip any of the characters '.', 't', 'x' from both ends.
            stem, _ = os.path.splitext(os.path.basename(filename))
            scores.append(int(stem.split('_')[1]))
    return data, scores

In [None]:
# import the data
# Only the review texts of the 'train' split are kept; the scores returned
# alongside them are discarded via `_`.
corpus, _ = read_raw_text('train')

In [None]:
import re
import string
from collections import Counter

def pre_process(
    reviews,
    tokenize_punct=False,
    lowercase=False,
    remove_punct=False,
    remove_high_freq_terms=False,
    high_freq_threshold=0.5,
    replace_numbers=False
):
    """
    Turn raw review strings into lists of tokens.

    NOTE(review): the original body was only a todo pointing at the previous
    exercise. The option semantics below are this implementation's choices,
    picked to match the parameter names and the tokenization example used
    later in this notebook ("I didn't know this" -> i, didn, t, know, this).
    Confirm against the previous exercise.

    Parameters
    ----------
    reviews: iterable of str
        Raw documents, one string per review.
    tokenize_punct: bool
        If True, every character that is neither a word character nor
        whitespace becomes its own token (so "didn't" -> "didn", "'", "t").
        If False, the text is split on whitespace only.
    lowercase: bool
        If True, the text is lowercased before tokenizing.
    remove_punct: bool
        If True, tokens without any alphanumeric character are dropped.
    remove_high_freq_terms: bool
        If True, terms that occur in more than ``high_freq_threshold`` of all
        documents are removed from every document.
    high_freq_threshold: float
        Document-frequency fraction above which a term counts as high-frequency.
    replace_numbers: bool
        If True, every number (digits, optionally with '.'/',' separators) is
        replaced by the placeholder "NUM".

    Returns
    -------
    list of list of str
        One token list per input review, in input order.
    """
    number_pattern = re.compile(r"\d+(?:[.,]\d+)*")
    token_pattern = re.compile(r"\w+|[^\w\s]")

    tokenized = []
    for review in reviews:
        # Lowercase first so the upper-case "NUM" placeholder stays distinct.
        text = review.lower() if lowercase else review
        if replace_numbers:
            text = number_pattern.sub("NUM", text)

        tokens = token_pattern.findall(text) if tokenize_punct else text.split()

        if remove_punct:
            tokens = [tok for tok in tokens if any(ch.isalnum() for ch in tok)]
        tokenized.append(tokens)

    if remove_high_freq_terms and tokenized:
        # Document frequency: count each term at most once per review.
        doc_freq = Counter(tok for doc in tokenized for tok in set(doc))
        cutoff = high_freq_threshold * len(tokenized)
        too_common = {tok for tok, count in doc_freq.items() if count > cutoff}
        tokenized = [[tok for tok in doc if tok not in too_common] for doc in tokenized]

    return tokenized

# Tokenize every review with punctuation tokenization, lowercasing and
# punctuation removal switched on; the remaining options keep their defaults.
tokenized_corpus = pre_process(corpus, tokenize_punct=True, lowercase=True, remove_punct=True)

In [None]:
# reduce the corpus if you are facing performance issues
# Keeps only the first 10 tokenized reviews; everything below (vocabulary,
# models) is built from this subset.
tokenized_corpus = tokenized_corpus[:10]

## Task 1: CBOW

In [None]:
# Parameters (change these as wanted)
CONTEXT_SIZE = 2  # Window size on each side
EMBEDDING_DIM = 10
PAD_TOKEN = '<PAD>'

# Vocabulary
# Sort the unique tokens so that every word keeps the same index from run to
# run; iterating a plain set of strings yields an order that changes with
# Python's per-process hash randomization.
vocab = sorted({word for sentence in tokenized_corpus for word in sentence})
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = dict(enumerate(vocab))
vocab_size = len(vocab)

print('Vocab Size', vocab_size)
print('Context Size', CONTEXT_SIZE)
print('Embedding Dimension', EMBEDDING_DIM)

In [None]:
# Display the index -> word mapping as the cell output.
idx_to_word

In [None]:
# Add PAD_TOKEN to vocab

# Pad sentences

# Use padded sentences to create training data, i.e., (target, context) pairs, e.g., ('is', ['bromwell', 'high', 'a', 'cartoon'])


In [None]:
# Define CBOW model
class CBOW(nn.Module):
    """
    Continuous bag-of-words model: predicts the center word from its context.

    The context word embeddings are averaged and projected onto the
    vocabulary. The output is raw logits (no softmax), so it can be fed
    directly to nn.CrossEntropyLoss.

    Parameters
    ----------
    vocab_size: int
        Number of distinct word indices.
    embedding_dim: int
        Size of each word embedding.
    """

    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context_idxs):
        """
        Parameters
        ----------
        context_idxs: torch.LongTensor
            Context word indices, either a single context of shape
            (context_len,) or a batch of shape (batch, context_len).

        Returns
        -------
        torch.Tensor
            Logits of shape (batch, vocab_size); a single context yields
            shape (1, vocab_size).
        """
        # Treat a lone context as a batch of one so both input forms share a path.
        if context_idxs.dim() == 1:
            context_idxs = context_idxs.unsqueeze(0)

        embedded = self.embeddings(context_idxs)  # (batch, context_len, embedding_dim)
        averaged = embedded.mean(dim=1)           # (batch, embedding_dim)
        return self.linear(averaged)              # (batch, vocab_size)

In [None]:
# Training
# Instantiate the CBOW network with the vocabulary size and embedding
# dimension defined above; the training loop itself is still to be written.
model = CBOW(vocab_size, EMBEDDING_DIM)
# todo

In [None]:
def evaluate_cbow(model, context_words, top_k=5):
    """
    Print the most probable center words for a list of context words.

    Uses the module-level ``word_to_idx`` and ``idx_to_word`` mappings and
    switches ``model`` to eval mode.

    Parameters
    ----------
    model: nn.Module
        CBOW model; called with a 1-D LongTensor of context indices and
        expected to return logits of shape (1, vocab_size).
    context_words: list of str
        Context words; each one must be a key of ``word_to_idx``.
    top_k: int
        Number of predictions to print (capped at the vocabulary size).

    Raises
    ------
    KeyError
        If any context word is not in the vocabulary.
    """
    model.eval()
    with torch.no_grad():
        unknown = [w for w in context_words if w not in word_to_idx]
        if unknown:
            raise KeyError(f"Context words not in vocabulary: {unknown}")

        context_idxs = torch.tensor([word_to_idx[w] for w in context_words], dtype=torch.long)
        output = model(context_idxs)
        # Accept a model that returns unbatched logits of shape (vocab_size,).
        if output.dim() == 1:
            output = output.unsqueeze(0)
        probs = torch.softmax(output, dim=1)
        # topk raises if k exceeds the vocabulary size, so cap it.
        k = min(top_k, probs.size(1))
        top_prob, top_idx = torch.topk(probs, k)

        print(f"Context: {context_words}")
        print("Top predictions for center word:")
        for prob, idx in zip(top_prob[0], top_idx[0]):
            print(f"  {idx_to_word[idx.item()]}: {prob.item():.4f}")

# Example: I didn't know this -> [i], [didn], [t], [know], [this]
# The center word 't' is left out: the two tokens on each side of it form the
# context. Every context word has to be present in word_to_idx.
context_example = ['i', 'didn', 'know', 'this']
evaluate_cbow(model, context_example)

## Task 2: Skip-Gram

In [None]:
# Prepare training data for SkipGram, i.e. (center_word, context_words), e.g., ('is', ['bromwell', 'high', 'a', 'cartoon'])
# Hint: You might be able to reuse the data from CBOW

In [None]:
class SkipGram(nn.Module):
    """
    Skip-gram model: predicts the words around a given center word.

    One set of vocabulary logits is produced for each of the
    ``2 * context_size`` context positions. The output is raw logits (no
    softmax).

    Parameters
    ----------
    vocab_size: int
        Number of distinct word indices.
    embedding_dim: int
        Size of each word embedding.
    context_size: int
        Window size on each side of the center word.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.num_positions = 2 * context_size
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # A single projection yields the logits of all context positions at once.
        self.output = nn.Linear(embedding_dim, self.num_positions * vocab_size)

    def forward(self, center_word_idx):
        """
        Parameters
        ----------
        center_word_idx: torch.LongTensor
            Center word indices of shape (batch,).

        Returns
        -------
        torch.Tensor
            Logits of shape (batch, 2 * context_size, vocab_size).
        """
        embedded = self.embeddings(center_word_idx)  # (batch, embedding_dim)
        logits = self.output(embedded)               # (batch, positions * vocab_size)
        return logits.view(-1, self.num_positions, self.vocab_size)

In [None]:
# Instantiate the skip-gram network. Note that this rebinds `model`, so the
# CBOW instance created earlier is no longer reachable under that name.
model = SkipGram(vocab_size, EMBEDDING_DIM, CONTEXT_SIZE)
# todo

In [None]:
def evaluate_skipgram(model, center_word, top_k=5):
    """
    Print the most probable words for every context position of a center word.

    Uses the module-level ``word_to_idx`` and ``idx_to_word`` mappings and
    switches ``model`` to eval mode.

    Parameters
    ----------
    model: nn.Module
        Skip-gram model; called with a LongTensor of shape (1,) and expected
        to return logits of shape (1, context_size*2, vocab_size).
    center_word: str
        Center word; must be a key of ``word_to_idx``.
    top_k: int
        Number of predictions to print per position (capped at the
        vocabulary size).

    Raises
    ------
    KeyError
        If the center word is not in the vocabulary.
    """
    model.eval()
    with torch.no_grad():
        if center_word not in word_to_idx:
            raise KeyError(f"Center word not in vocabulary: {center_word!r}")

        input_idx = torch.tensor([word_to_idx[center_word]], dtype=torch.long)  # (1,)
        output = model(input_idx)  # (1, context_size*2, vocab_size)

        # Drop the batch dimension to iterate over the context positions.
        context_preds = output.squeeze(0)  # (context_size*2, vocab_size)
        # topk raises if k exceeds the vocabulary size, so cap it.
        k = min(top_k, context_preds.size(-1))

        print(f"Center word: '{center_word}'")
        print("Top predicted context words per context position:")

        for pos, preds in enumerate(context_preds):
            probs = torch.softmax(preds, dim=0)  # softmax over vocab dimension
            top_prob, top_idx = torch.topk(probs, k)
            print(f" Context position {pos+1}:")
            for prob, idx in zip(top_prob, top_idx):
                print(f"   {idx_to_word[idx.item()]}: {prob.item():.4f}")
            print()

# Example usage
# `model` is the SkipGram instance at this point; the center word has to be
# present in word_to_idx.
center_word_example = 'can'
evaluate_skipgram(model, center_word_example)

## Task 3: Cosine Similarity
Make sure that you have installed the package gensim.

In [None]:
# Install gensim if it is missing (run in a terminal, or with a leading % in a notebook cell):
# conda install -c conda-forge gensim -y

In [None]:
import gensim
from gensim.models import KeyedVectors
import gensim.downloader
from gensim.models import Word2Vec
from gensim.test.utils import datapath, get_tmpfile
import numpy as np
from numpy.linalg import norm
from numpy import dot

### Task 3 (a): Cosine Similarity

In [None]:
def cosine_similarity(x, y):
    """
    Return the cosine of the angle between two vectors.

    Parameters
    ----------
    x, y: array_like
        1-D vectors of the same length.

    Returns
    -------
    float
        dot(x, y) / (||x|| * ||y||), a value in [-1, 1]. If either vector has
        zero norm the angle is undefined and 0.0 is returned.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    denominator = np.linalg.norm(x) * np.linalg.norm(y)
    # Guard against division by zero for all-zero vectors.
    if denominator == 0:
        return 0.0
    return float(np.dot(x, y) / denominator)

### Task 3 (b)

#### Model 1

In [None]:
# Load Model 1 from the file 'word2vec_pre_kv_c' in word2vec text format
# (binary=False). NOTE(review): datapath comes from gensim.test.utils and
# presumably resolves the name inside gensim's bundled test data - confirm.
model1 = KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'), binary=False) 

In [None]:
# Model 1 embeddings of the four words used in the king/queen analogy.
king_vector_m1, queen_vector_m1, man_vector_m1, woman_vector_m1 = [
    model1.get_vector(word) for word in ('king', 'queen', 'man', 'woman')
]

In [None]:
# Display Model 1's key_to_index attribute to inspect which keys it contains.
model1.key_to_index

#### Model 2

In [None]:
# Load Model 2 from the file 'high_precision.kv.bin' in word2vec binary
# format (binary=True). NOTE(review): datapath comes from gensim.test.utils and
# presumably resolves the name inside gensim's bundled test data - confirm.
model2 = KeyedVectors.load_word2vec_format(datapath('high_precision.kv.bin'), binary=True) 

In [None]:
# Model 2 embeddings of the four words used in the king/queen analogy.
king_vector_m2, queen_vector_m2, man_vector_m2, woman_vector_m2 = [
    model2.get_vector(word) for word in ('king', 'queen', 'man', 'woman')
]

In [None]:
# Display Model 2's key_to_index attribute to inspect which keys it contains.
model2.key_to_index

#### Model 3

In [None]:
# Load Model 3 from the file 'euclidean_vectors.bin' in word2vec binary
# format (binary=True). NOTE(review): datapath comes from gensim.test.utils and
# presumably resolves the name inside gensim's bundled test data - confirm.
model3 = KeyedVectors.load_word2vec_format(datapath('euclidean_vectors.bin'), binary=True) 

In [None]:
# Model 3 embeddings of the four words used in the king/queen analogy.
king_vector_m3, queen_vector_m3, man_vector_m3, woman_vector_m3 = [
    model3.get_vector(word) for word in ('king', 'queen', 'man', 'woman')
]

In [None]:
# Display Model 3's key_to_index attribute to inspect which keys it contains.
model3.key_to_index

#### Analogy Example 1

In [None]:
# Analogy vector "king - man + woman" built from the Model 3 embeddings.
king_mins_man_plus_woman_m3 = (king_vector_m3 - man_vector_m3) + woman_vector_m3

# Make sure you have implemented cosine similarity.
# Compares the analogy vector with Model 3's 'queen' vector.
cosine_similarity(king_mins_man_plus_woman_m3, queen_vector_m3)

#### Model 4

In [None]:
# Fetch the pretrained 'word2vec-google-news-300' vectors through gensim's downloader.
word2vec_google = gensim.downloader.load('word2vec-google-news-300')

In [None]:
# Length of the 'king' vector, i.e. the dimensionality of these embeddings.
len(word2vec_google.get_vector('king'))

In [None]:
# Optional alternative: the GloVe vectors ('glove-wiki-gigaword-100') can be tried as well.
glove_google = gensim.downloader.load('glove-wiki-gigaword-100')

In [None]:
# Length of the 'king' vector in the GloVe model.
len(glove_google.get_vector('king'))

In [None]:
# Model 4 is the Google News word2vec model (a second name for the same object).
model4 = word2vec_google

In [None]:
# Model 4 embeddings of the four words used in the king/queen analogy.
king_vector_m4, queen_vector_m4, man_vector_m4, woman_vector_m4 = [
    model4.get_vector(word) for word in ('king', 'queen', 'man', 'woman')
]

#### Analogy Example 2

In [None]:
# Analogy vector "king - man + woman" built from the Model 4 embeddings.
king_mins_man_plus_woman_m4 = (king_vector_m4 - man_vector_m4) + woman_vector_m4

# Make sure you have implemented cosine similarity.
# Compares the analogy vector with Model 4's 'queen' vector.
cosine_similarity(king_mins_man_plus_woman_m4, queen_vector_m4)

In [None]:
# Search for similar words given a word.
# KeyedVectors.most_similar returns (word, cosine similarity) pairs for the
# `topn` nearest keys, best match first.

similar_words = model4.most_similar('phone', topn=10)

for word, similarity in similar_words:
    print(f"{word}: {similarity:.4f}")

In [None]:
# The ten nearest neighbours of 'king' as (word, cosine similarity) pairs,
# found with KeyedVectors.most_similar.
similar_words = model4.most_similar('king', topn=10)

for word, similarity in similar_words:
    print(f"{word}: {similarity:.4f}")

In [None]:
# try to find at least five analogies using the method you found above


## Theoretical Question #8

In [None]:
# Rank the vocabulary by similarity to the raw "king - man + woman" vector.
# similar_by_vector receives only a vector, so the words it was built from
# are not filtered out of the result.
word2vec_google.similar_by_vector(king_mins_man_plus_woman_m4) # First answer will be King