ID: V01053626

Name: Newsha Bahardoost

In [None]:
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn.functional as F

# Task 1: Tokenization
# Each line of the input file is treated as one sentence. The encoding is
# stated explicitly so the result does not depend on the platform default.
with open('/content/p1-sentiments.txt', 'r', encoding='utf-8') as f:
    sentences = f.readlines()

processed_sentences = []  # one list of tokens per input line
all_tokens = []           # every token of every sentence, in file order

for line in sentences:
    line_clean = re.sub(r"'s\b", "", line.strip())  # Remove 's suffix
    line_clean = re.sub(r"'", "", line_clean)       # Remove remaining apostrophes
    tokens = line_clean.split()                     # Split on whitespace only
    processed_sentences.append(tokens)
    all_tokens.extend(tokens)

# Sort the unique tokens: iterating a bare set of strings yields an order that
# changes between interpreter runs, which would make every word index (and
# therefore any saved embedding row) irreproducible.
vocab = sorted(set(all_tokens))
vocab_size = len(vocab)
print(vocab_size)

# Print the tokenization of the 7th and 256th sentences (indices 6 and 255),
# followed by the first sentence.
print(processed_sentences[6])
print(processed_sentences[255])
print(processed_sentences[0])

3645
['Secretary', 'manages', 'a', 'neat', 'trick,', 'bundling', 'the', 'flowers', 'of', 'perversity,', 'comedy', 'and', 'romance', 'into', 'a', 'strangely', 'tempting', 'bouquet', 'of', 'a', 'movie.']
['The', 'story', 'is', 'familiar', 'from', 'its', 'many', 'predecessors;', 'like', 'them,', 'it', 'eventually', 'culminates', 'in', 'the', 'not-exactly', '-stunning', 'insight', 'that', 'crime', 'doesnt', 'pay.']
['(An)', 'absorbing', 'documentary.']


In [None]:
# Task 2: Word Embeddings
# Two lookup tables over the vocabulary: token -> row index and back.
idx2word = dict(enumerate(vocab))
word2idx = {token: position for position, token in idx2word.items()}

# Build (target, context) pairs: every word is paired with up to two
# neighbours on each side, left neighbours first, then right neighbours.
training_data = []
for sent in processed_sentences:
    for pos, target in enumerate(sent):
        left = sent[max(0, pos - 2):pos]
        right = sent[pos + 1:pos + 3]  # slicing clips at the sentence end
        training_data.extend((target, neighbour) for neighbour in left + right)

# Translate both sides of every pair into vocabulary indices.
X = []
Y = []
for target_word, context_word in training_data:
    X.append(word2idx[target_word])
    Y.append(word2idx[context_word])

# Wrap the index lists as integer tensors and serve them in shuffled
# mini-batches of 64 pairs.
X_tensor = torch.LongTensor(X)
Y_tensor = torch.LongTensor(Y)
dataset = TensorDataset(X_tensor, Y_tensor)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# Define the model
class Word2Vec(nn.Module):
    """Embedding lookup followed by a bias-free projection to vocabulary logits.

    Args:
        vocab_size: Number of distinct tokens; sets the number of embedding
            rows and the width of the output layer.
        embed_dim: Size of each embedding vector.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.linear = nn.Linear(embed_dim, vocab_size, bias=False)

    def forward(self, x):
        """Map a tensor of token indices to one score per vocabulary entry.

        Args:
            x: Integer tensor of token indices.

        Returns:
            Tensor with a trailing dimension of size ``vocab_size`` holding
            unnormalised scores (logits).
        """
        return self.linear(self.embedding(x))


# Instantiate at module level. This statement previously sat inside
# ``forward`` after the ``return``, where it could never run, leaving
# ``model`` undefined for the optimizer and training loop below.
model = Word2Vec(vocab_size, 2)
# Cross-entropy over the vocabulary logits, optimised with Adam using its
# default hyperparameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# Training loop: 50 passes over the shuffled (target, context) batches,
# reporting the mean batch loss after each pass.
for epoch in range(50):
    total_loss = 0
    for targets, contexts in dataloader:
        optimizer.zero_grad()  # drop gradients left over from the previous step
        batch_loss = criterion(model(targets), contexts)
        batch_loss.backward()
        optimizer.step()
        total_loss += batch_loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}")

# Function to find closest words
def get_closest_words(word, embeddings, word2idx, idx2word, top_k=5):
    """Return the words whose embeddings are most cosine-similar to ``word``.

    Args:
        word: Query token.
        embeddings: 2-D tensor with one embedding row per vocabulary index.
        word2idx: Mapping from token to row index in ``embeddings``.
        idx2word: Mapping from row index back to token.
        top_k: Maximum number of neighbours to return.

    Returns:
        Up to ``top_k`` tokens ordered from most to least similar, never
        including the query's own row. An empty list if ``word`` is not a
        key of ``word2idx``.
    """
    if word not in word2idx:
        return []

    query_idx = word2idx[word]
    query_vec = embeddings[query_idx].unsqueeze(0)

    # Similarity of the query against every row, ranked best-first.
    similarities = F.cosine_similarity(query_vec, embeddings, dim=1)
    ranking = torch.argsort(similarities, descending=True)

    # Mask out the query's own index, then keep the leading top_k entries.
    neighbours = ranking[ranking != query_idx][:top_k].tolist()
    return [idx2word[n] for n in neighbours]

# Learned embedding matrix (one row per vocabulary index), detached from the
# autograd graph so the similarity search does not track gradients.
embeddings = model.embedding.weight.detach()
words = ['actor', 'actress', 'good', 'bad', 'awesome']

# Subset of the query words that actually occur in the vocabulary.
target_words = [word for word in words if word in word2idx]

# Print closest words for each query word. A word that is missing from the
# vocabulary is still reported, with an empty neighbour list.
for word in words:
    closest = get_closest_words(word, embeddings, word2idx, idx2word)
    print(f"{word} => {closest}")

Epoch 1, Loss: 6.712689130805259
Epoch 2, Loss: 6.6981749322063235
Epoch 3, Loss: 6.690814162409583
Epoch 4, Loss: 6.684509566587995
Epoch 5, Loss: 6.680273312930913
Epoch 6, Loss: 6.6742801629295645
Epoch 7, Loss: 6.669197397638661
Epoch 8, Loss: 6.664893476537956
Epoch 9, Loss: 6.6595533115919245
Epoch 10, Loss: 6.6560081834940945
Epoch 11, Loss: 6.651330020076545
Epoch 12, Loss: 6.647077454152957
Epoch 13, Loss: 6.643319750016974
Epoch 14, Loss: 6.6395783932634105
Epoch 15, Loss: 6.63472150647363
Epoch 16, Loss: 6.633213423943335
Epoch 17, Loss: 6.628773003585579
Epoch 18, Loss: 6.626757219780323
Epoch 19, Loss: 6.62289590151735
Epoch 20, Loss: 6.6198351734368375
Epoch 21, Loss: 6.617476792298546
Epoch 22, Loss: 6.615345038184824
Epoch 23, Loss: 6.611674904823303
Epoch 24, Loss: 6.610792574956435
Epoch 25, Loss: 6.607005685798882
Epoch 26, Loss: 6.604462513627932
Epoch 27, Loss: 6.603265482325886
Epoch 28, Loss: 6.60023615341778
Epoch 29, Loss: 6.598652739857519
Epoch 30, Loss: 6.59