In [2]:
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset
from collections import Counter
from utils import train, set_device, compute_accuracy
import nltk
import os

  from .autonotebook import tqdm as notebook_tqdm


# 2.1 Word embedding

Part 1:

In [3]:
# Read txt file and tokenize
def read_tokenize_txt(path):
    with open(path, 'r', encoding="utf8") as f:
        tokens = nltk.tokenize.word_tokenize(f.read())
    return tokens

# Read all txt files in a directory and tokenize
def read_tokenize_dir(path):
    tokens = []
    for file in os.listdir(path):
        tokens += read_tokenize_txt(path + file) # Should use os.path.join
    return tokens


train_data = read_tokenize_dir('../data_train/')
test_data = read_tokenize_dir('../data_test/')
val_data = read_tokenize_dir('../data_val/')

In [4]:
# Part 2

def get_freq_vocab(data, min_freq=100):
    freq = Counter(data)
    vocab = {w:f for (w,f) in freq.items() if freq[w] >= min_freq}
    return freq, vocab

print(f"Number of tokens in training data: {len(train_data):,}")
freq, vocab = get_freq_vocab(train_data, min_freq=100)
print(f"Number of distinct tokens in training data: {len(freq):,}")
print(f"Size of vocabulary: {len(vocab):,}")
print("Comments:\nA little more than 3% of the tokens are in the vocabulary with the threshold of 100 occurences. This seems resonable.")

Number of tokens in training data: 2,757,691
Number of distinct tokens in training data: 60,424
Size of vocabulary: 2,177
Comments:
A little more than 3% of the tokens are in the vocabulary with the threshold of 100 occurences. This seems resonable.


In [5]:
# Part 3
class My_MLP(nn.Module):
    def __init__(self, vocab_size, emb_dim=16, context_size=3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim*context_size)
        # self.embedding.load_state_dict(self.embedding.state_dict())
        self.fc1 = nn.Linear(emb_dim*context_size, 128)
        self.fc2 = nn.Linear(128, 3)

    def forward(self, x):
        out = self.embedding(x)
        out = out.view(out.size(0), -1)
        out = F.relu(self.fc1(out))
        out = self.fc2(out)
        return out


Very much inspired by exercise 6. For this to work we need to convert the tokenized data into a format that the model can understand by creating a context/target dataset. The context is the sequence of words that surround the target word. The target word is the word we are trying to predict given context. We can then create a dataset that pairs the context window with the target word. The context window will be the input to the model, and the target word will be the target for the model to predict.

In [6]:
def create_dataset(text, vocab, context_size=3):
    """Create a PyTorch dataset of context/target pairs from text"""
    # Remove words that are not in the vocabulary
    text = [w for w in text if w in vocab.keys()]

    # Map each word to its index in the vocabulary
    word_to_ix = {word: i for i, word in enumerate(vocab.keys())}
    
    # Transform the text as a list of integers.
    data = [word_to_ix[word] for word in text]

    contexts = []
    targets = []
    for i in range(context_size, len(text) - context_size):
        target = data[i]
        context = data[i - context_size:i] + data[i + 1:i + context_size + 1]
        contexts.append(context)
        targets.append(target)
            
    # Convert context/target lists to PyTorch tensor
    context_tensor = torch.tensor(contexts)
    target_tensor = torch.tensor(targets)

    # Create a PyTorch dataset out of these context / target pairs
    return TensorDataset(context_tensor, target_tensor)

### Training and evaluation

In [7]:
# Part 4

torch.manual_seed(265)
# train_freq and train_vocab are already initialized in Part 2
data_train = create_dataset(train_data, vocab)

val_freq, val_vocab = get_freq_vocab(val_data)
data_val = create_dataset(val_data, val_vocab)

test_freq, test_vocab = get_freq_vocab(test_data)
data_test = create_dataset(test_data, test_vocab)

# embedding = nn.Embedding(len(vocab), 16)
models = [My_MLP(len(vocab)), My_MLP(len(vocab)), My_MLP(len(vocab)), My_MLP(len(vocab))]

for i, model in enumerate(models):
    print(model_name := f"\n------ Model{i+1} ------")
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss()

    # I got this far without using DataLoader, and now I am filled with nothing but regret and shame.
    train(30, optimizer, model, loss_fn, data_train, "cpu")
    # The train function doesnt even use batch_size, but still that is what it crashes on.
    torch.save(model.to(device="cpu"), f"models/{model_name}.pt")
    

    # Compute accuracy on training and validation data sets
    print(f"Training accuracy: {compute_accuracy(model, data_train, 'cpu'):.2f}%")
    print(f"Validation accuracy: {compute_accuracy(model, data_val, 'cpu'):.2f}%")


best_model = None # We dont know, because the code above is borked



------ Model1 ------


ValueError: Expected input batch_size (6) to match target batch_size (0).

In [None]:
# Part 5

# Compute that cosine similarity matrix, if I can get the model to work
embedding = nn.Embedding(len(vocab), 16)
embedding.load_state_dict(torch.load("models/my_404_embedding.pt"))
embedding_weights = embedding.weight.data
cos_sim_matrix = torch.nn.functional.cosine_similarity(embedding_weights, embedding_weights, dim=1)

# Reporting similar words for some random words
word_to_ix = {word: i for i, word in enumerate(vocab.keys())}
ix_to_word = {i: word for i, word in enumerate(vocab.keys())}
words = ["man", "be", "have", "how", "castle"]
word_indices = [word_to_ix[word] for word in words]

for word, word_index in zip(words, word_indices):
    similarity_row = cos_sim_matrix[word_index]
    # Sort the row, without the word itself
    sorted_row = similarity_row.argsort(descending=True)[1:][:10]
    top_words = [ix_to_word[index.item()] for index in sorted_row]
    print(f"\nMost similar to '{word}': {', '.join(top_words)}")

## 2.2 Conjugating *be* and *have*

First we define the architectures, MLP, MLP with attention and RNN, using LSTM.

In [9]:

class MLP(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim=16):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix)
        self.fc1 = nn.Linear(embedding_matrix.shape[1], hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 2)

    def forward(self, x):
        x = self.embedding(x)
        x = x.mean(dim=1)  # average the embeddings across the context
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

class MLPA(nn.Module): # A as in attention
    def __init__(self, embedding_matrix, hidden_dim=16):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix)
        self.attention = nn.Linear(embedding_matrix.shape[1], 1)
        self.fc1 = nn.Linear(embedding_matrix.shape[1], hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 2)

    def forward(self, x):
        x = self.embedding(x)
        alpha = F.softmax(self.attention(x), dim=1)
        x = alpha * x
        x = x.sum(dim=1)  # weighted sum of the embeddings based on attention
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


class RNN(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim=16):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix)
        self.rnn = nn.LSTM(embedding_matrix.shape[1], hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 2)

    def forward(self, x):
        x = self.embedding(x)
        _, (h_n, _) = self.rnn(x)
        x = h_n.squeeze()
        x = self.fc(x)
        return x


In [None]:
# Training and evaluation

models = [MLP(embedding_weights), MLPA(embedding_weights), RNN(embedding_weights)]

for i, model in enumerate(models):
    print(model_name := f"\n------ Model{i+1} ------")
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss()

    train(30, optimizer, model, loss_fn, data_train, "cpu")
    torch.save(model.to(device="cpu"), f"models/{model_name}.pt")

    # TODO: Only use the datasets that have the targets:
    # "be, am, are, is, was, were, been, being, have, has, had, having"
    print(f"Training accuracy: {compute_accuracy(model, data_train, 'cpu'):.2f}%")
    print(f"Validation accuracy: {compute_accuracy(model, data_val, 'cpu'):.2f}%")


## 2.3 Text generation