# CSC_583
# Hithesh Shanmugam
# HW5

## Part II: Word Embedding
## Task 1: Pytorch Tutorial

In [1]:
# Import the necessary packages
import os
import string
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from torch.utils.data import Dataset, DataLoader

In [2]:
# Define CBOW model
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear1 = nn.Linear(embedding_size * 4, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = nn.functional.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = nn.functional.log_softmax(out, dim=1)
        return log_probs

# Define custom dataset for CBOW
class CBOWDataset(Dataset):
    def __init__(self, corpus, window_size, word_to_index):
        self.corpus = corpus
        self.window_size = window_size
        self.word_to_index = word_to_index
        self.data = self.prepare_data()

    def prepare_data(self):
        data = []
        for i in range(self.window_size, len(self.corpus) - self.window_size):
            context = []
            for j in range(-self.window_size, self.window_size + 1):
                if j != 0:
                    context.append(self.word_to_index[self.corpus[i + j]])
            target = self.word_to_index[self.corpus[i]]
            data.append((context, target))
        return data

    def __getitem__(self, index):
        context, target = self.data[index]
        context = torch.LongTensor(context)
        target = torch.tensor(target)  # Convert target to a scalar tensor
        return context, target

    def __len__(self):
        return len(self.data)


# Text corpus from the exercise
corpus = "We are about to study the idea of a computational process.Computational processes are abstract beings that inhabit computers.As they evolve, processes manipulate other abstract things called data.The evolution of a process is directed by a pattern of rules called a program. People create programs to direct processes. In effect,we conjure the spirits of the computer with our spells."

# Preprocessing and vocabulary creation
word_to_index = {}
index_to_word = {}
corpus_words = corpus.split()

for word in corpus_words:
    if word not in word_to_index:
        index = len(word_to_index)
        word_to_index[word] = index
        index_to_word[index] = word

# Define hyperparameters
vocab_size = len(word_to_index)
embedding_size = 10
window_size = 2
learning_rate = 0.001
num_epochs = 100

# Training CBOW model
model = CBOW(vocab_size, embedding_size)
criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

dataset = CBOWDataset(corpus_words, window_size, word_to_index)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

for epoch in range(num_epochs):
    total_loss = 0
    for context, target in dataloader:
        context = context.squeeze()
        target = target

        optimizer.zero_grad()
        log_probs = model(context)
        loss = criterion(log_probs, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss}")

# Get vocabulary size
vocabulary_size = len(word_to_index)
print(f"Vocabulary Size: {vocabulary_size}")

# Function to get embedding vector for a word
def get_embedding_vector(word):
    word_index = word_to_index[word]
    embedding_vector = model.embeddings(torch.LongTensor([word_index]))
    return embedding_vector

# Get embedding vector for 'processes'
target_word = 'processes'
embedding_vector = get_embedding_vector(target_word)
print(f"Embedding Vector for '{target_word}': {embedding_vector}")

# Function to compute cosine similarity between two vectors
cosine_similarity = nn.CosineSimilarity(dim=0)

# Find the top three closest words to 'processes'
top_similar_words = []
top_similar_scores = []
for word in word_to_index:
    if word != target_word:
        word_vector = get_embedding_vector(word)
        similarity = cosine_similarity(embedding_vector, word_vector)
        similarity = similarity.detach().numpy()  # Convert to numpy array
        similarity_scalars = [s.item() for s in similarity]  # Convert each element to scalar
        top_similar_words.append(word)
        top_similar_scores.append(similarity_scalars)

# Sort the top similar words based on scores
top_similar_words_scores = zip(top_similar_words, top_similar_scores)
top_similar_words_scores = sorted(top_similar_words_scores, key=lambda x: x[1], reverse=True)

# Display the top three similar words and their scores
print("Top Three Similar Words:")
for word, scores in top_similar_words_scores[:3]:
    print(f"Word: {word}, Similarity Scores: {scores}")

Epoch 1/100, Loss: 207.47762203216553
Epoch 2/100, Loss: 206.18940901756287
Epoch 3/100, Loss: 204.90996527671814
Epoch 4/100, Loss: 203.6384196281433
Epoch 5/100, Loss: 202.3758668899536
Epoch 6/100, Loss: 201.1199986934662
Epoch 7/100, Loss: 199.87089896202087
Epoch 8/100, Loss: 198.6289210319519
Epoch 9/100, Loss: 197.39469480514526
Epoch 10/100, Loss: 196.16647934913635
Epoch 11/100, Loss: 194.94354844093323
Epoch 12/100, Loss: 193.72517561912537
Epoch 13/100, Loss: 192.51147079467773
Epoch 14/100, Loss: 191.30107808113098
Epoch 15/100, Loss: 190.09475803375244
Epoch 16/100, Loss: 188.89232516288757
Epoch 17/100, Loss: 187.69287109375
Epoch 18/100, Loss: 186.49614453315735
Epoch 19/100, Loss: 185.30294060707092
Epoch 20/100, Loss: 184.1133029460907
Epoch 21/100, Loss: 182.92629742622375
Epoch 22/100, Loss: 181.7418074607849
Epoch 23/100, Loss: 180.56015133857727
Epoch 24/100, Loss: 179.37934112548828
Epoch 25/100, Loss: 178.2002182006836
Epoch 26/100, Loss: 177.02402353286743
Epoch

## Task 2: Larger Experiment

In [3]:
# Download stopwords if not already downloaded
nltk.download('stopwords')

# Define the preprocessing function
def preprocess_text(text):
    # Tokenize the text by whitespaces
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text.lower())

    # Remove punctuations
    punctuations = string.punctuation
    tokens = [token for token in tokens if token not in punctuations]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    return tokens

# Set the path to the dataset folder
dataset_folder = "C:/Users/sures/OneDrive - DePaul University/Desktop/pos/"

# Set the list of files to process
files = ['cv199_9629.txt', 'cv261_10954.txt', 'cv315_11629.txt', 'cv368_10466.txt', 'cv401_12605.txt',
         'cv453_10379.txt', 'cv519_14661.txt', 'cv729_10154.txt', 'cv782_19526.txt', 'cv900_10331.txt']

# Preprocess the text in each file and extract the first 200 words
preprocessed_data = []
for file in files:
    file_path = os.path.join(dataset_folder, file)
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
        tokens = preprocess_text(text)
        preprocessed_data.append(tokens[:200])

# Create a vocabulary by taking the most frequent 149 words
word_freq = {}
for tokens in preprocessed_data:
    for word in tokens:
        if word in word_freq:
            word_freq[word] += 1
        else:
            word_freq[word] = 1
sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
vocab_words = [word for word, _ in sorted_words[:149]]
vocab_words.append('<unk>')

# Create word-to-index and index-to-word lookup tables
word_to_index = {word: index for index, word in enumerate(vocab_words)}
index_to_word = {index: word for index, word in enumerate(vocab_words)}

# Define the CBOW dataset
class CBOWDataset(Dataset):
    def __init__(self, data, word_to_index):
        self.data = data
        self.word_to_index = word_to_index

    def __getitem__(self, index):
        context = self.data[index]
        target = context.pop(2)  # Remove the target word from the context
        context_indices = [self.word_to_index.get(word, self.word_to_index['<unk>']) for word in context]
        target_index = self.word_to_index.get(target, self.word_to_index['<unk>'])
        return torch.tensor(context_indices), torch.tensor(target_index)

    def __len__(self):
        return len(self.data)

# Define the CBOW model
class CBOWModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOWModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        embeds = self.embedding(inputs)
        embeds_sum = torch.sum(embeds, dim=1)
        logits = self.fc(embeds_sum)
        log_probs = nn.functional.log_softmax(logits, dim=1)
        return log_probs

# Set the hyperparameters
embedding_dim = 20
learning_rate = 0.001
context_size = 2
batch_size = 64
num_epochs = 100

# Create the CBOW dataset
dataset = CBOWDataset(preprocessed_data, word_to_index)

# Create the data loader
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Create the CBOW model
model = CBOWModel(len(vocab_words), embedding_dim)

# Define the loss function and optimizer
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    total_loss = 0
    for context, target in data_loader:
        optimizer.zero_grad()
        log_probs = model(context)
        loss = criterion(log_probs, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss}")

# Obtain the word embeddings
embeddings = model.embedding.weight.data

# Function to get embedding vector for a word
def get_embedding_vector(word):
    word_index = word_to_index.get(word, word_to_index['<unk>'])
    embedding_vector = embeddings[word_index]
    return embedding_vector

# Get embedding vector for 'titanic'
target_word = 'titanic'
embedding_vector = get_embedding_vector(target_word)
print(f"\nEmbedding Vector for '{target_word}': {embedding_vector}")

# Function to compute cosine similarity between two vectors
def cosine_similarity(vector1, vector2):
    dot_product = torch.dot(vector1, vector2)
    norm1 = torch.norm(vector1)
    norm2 = torch.norm(vector2)
    similarity = dot_product / (norm1 * norm2)
    return similarity

# Find the top 3 most similar words for each target word
target_words = ['titanic', 'acting', 'great', 'poor']
for target_word in target_words:
    embedding_vector = get_embedding_vector(target_word)
    top_similar_words = []
    top_similar_scores = []
    for word, index in word_to_index.items():
        if word != target_word:
            word_vector = embeddings[index]
            similarity = cosine_similarity(embedding_vector, word_vector)
            top_similar_words.append(word)
            top_similar_scores.append(similarity.item())
    top_similar_words_scores = zip(top_similar_words, top_similar_scores)
    top_similar_words_scores = sorted(top_similar_words_scores, key=lambda x: x[1], reverse=True)
    print(f"\nTop 3 Similar Words for '{target_word}':")
    for i, (word, score) in enumerate(top_similar_words_scores[:3]):
        print(f"{i+1}. {word} (Similarity Score: {score:.4f})")
    print()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sures\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/100, Loss: 176.86270141601562
Epoch 2/100, Loss: 155.4148712158203
Epoch 3/100, Loss: 145.27764892578125
Epoch 4/100, Loss: 150.28395080566406
Epoch 5/100, Loss: 126.74754333496094
Epoch 6/100, Loss: 154.43911743164062
Epoch 7/100, Loss: 138.0699920654297
Epoch 8/100, Loss: 130.90719604492188
Epoch 9/100, Loss: 125.19926452636719
Epoch 10/100, Loss: 117.06805419921875
Epoch 11/100, Loss: 109.87776184082031
Epoch 12/100, Loss: 114.70987701416016
Epoch 13/100, Loss: 117.5086669921875
Epoch 14/100, Loss: 105.41197204589844
Epoch 15/100, Loss: 110.51570892333984
Epoch 16/100, Loss: 90.68229675292969
Epoch 17/100, Loss: 86.53245544433594
Epoch 18/100, Loss: 79.31787872314453
Epoch 19/100, Loss: 94.59461212158203
Epoch 20/100, Loss: 78.99835205078125
Epoch 21/100, Loss: 87.28910827636719
Epoch 22/100, Loss: 59.76496124267578
Epoch 23/100, Loss: 75.75611877441406
Epoch 24/100, Loss: 90.05707550048828
Epoch 25/100, Loss: 58.604766845703125
Epoch 26/100, Loss: 76.67857360839844
Epoch 27