In [5]:
import torch
from torch import nn, optim
import torch.nn.functional as F
from collections import Counter
import nltk
import os

  from .autonotebook import tqdm as notebook_tqdm


# 2.1 Word embedding

Part 1:

In [2]:
def read_tokenize_txt(path):
    """Read one UTF-8 text file and split its content into word tokens.

    Args:
        path: Location of the text file to read.

    Returns:
        The list of tokens produced by ``nltk.tokenize.word_tokenize`` for the
        full file content.
    """
    with open(path, mode='r', encoding="utf8") as handle:
        text = handle.read()
    # The whole file is already in memory, so tokenizing after the handle is
    # closed gives the same result.
    return nltk.tokenize.word_tokenize(text)

def read_tokenize_dir(path):
    """Tokenize every file directly inside a directory and concatenate the tokens.

    Args:
        path: Directory containing the text files. A trailing path separator
            is accepted but not required.

    Returns:
        A single list with the tokens of all files, in sorted file-name order.
    """
    tokens = []
    # os.listdir returns entries in arbitrary order; sorting keeps the token
    # stream reproducible across machines and runs.
    for name in sorted(os.listdir(path)):
        file_path = os.path.join(path, name)
        # Skip sub-directories (e.g. .ipynb_checkpoints), which open() cannot read.
        if not os.path.isfile(file_path):
            continue
        tokens.extend(read_tokenize_txt(file_path))
    return tokens


# Tokenize the three corpus splits. map() is consumed lazily by the unpacking,
# so the directories are read in the order train -> test -> val.
train_data, test_data, val_data = map(
    read_tokenize_dir,
    ('../data_train/', '../data_test/', '../data_val/'),
)

In [4]:
# Part 2

def get_freq_vocab(data, min_freq=100):
    """Count token occurrences and select the frequent tokens.

    Args:
        data: Iterable of hashable tokens.
        min_freq: Minimum number of occurrences a token needs to be kept.

    Returns:
        A ``(freq, vocab)`` pair: ``freq`` is a ``Counter`` over all tokens and
        ``vocab`` is the list of tokens whose count is at least ``min_freq``,
        in order of first appearance in ``data``.
    """
    counts = Counter(data)
    kept = []
    for token, n_seen in counts.items():
        if n_seen >= min_freq:
            kept.append(token)
    return counts, kept

# Summarise the training corpus and the vocabulary derived from it.
MIN_FREQ = 100  # minimum number of occurrences for a token to enter the vocabulary

print(f"Number of tokens in training data: {len(train_data):,}")
freq, vocab = get_freq_vocab(train_data, min_freq=MIN_FREQ)
print(f"Number of distinct tokens in training data: {len(freq):,}")
print(f"Size of vocabulary: {len(vocab):,}")

# Share of distinct tokens that made it into the vocabulary. It is computed
# rather than hard-coded so the comment stays correct if the data or the
# threshold changes; the guard avoids a division by zero on an empty corpus.
vocab_share = len(vocab) / len(freq) if freq else 0.0
print(
    "Comments:\n"
    f"About {vocab_share:.1%} of the distinct tokens are in the vocabulary "
    f"with the threshold of {MIN_FREQ} occurrences. This seems reasonable."
)

Number of tokens in training data: 2,757,691
Number of distinct tokens in training data: 60,424
Size of vocabulary: 2,177
Comments:
A little more than 3% of the tokens are in the vocabulary with the threshold of 100 occurences. This seems resonable.


In [None]:
# Part 3
class MyMLP(nn.Module):
    """Two-layer MLP on top of a frozen copy of a pretrained embedding layer.

    The embeddings of the ``context_size`` input tokens are concatenated,
    passed through a ReLU hidden layer and projected to ``n_classes`` scores.

    Args:
        embedding: Pretrained ``nn.Embedding`` whose weights are copied into
            this model and frozen. Required; the ``None`` default is kept only
            for signature compatibility.
        context_size: Number of token ids per sample.
        hidden_dim: Width of the hidden layer.
        n_classes: Number of output scores per sample.

    Raises:
        ValueError: If ``embedding`` is not provided.
    """

    def __init__(self, embedding=None, context_size=3, hidden_dim=128, n_classes=3):
        super().__init__()

        if embedding is None:
            # Fail early with a clear message instead of an AttributeError on
            # ``None.weight`` below.
            raise ValueError("MyMLP requires a pretrained nn.Embedding as `embedding`.")

        (vocab_size, embedding_dim) = embedding.weight.shape
        # Instantiate an embedding layer of the same size and copy the
        # pretrained weights into it, so the layer passed in is not shared.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.load_state_dict(embedding.state_dict())
        # Freeze the copied layer: only the MLP weights are trainable.
        for p in self.embedding.parameters():
            p.requires_grad = False

        # Regular MLP on the concatenated context embeddings
        self.fc1 = nn.Linear(embedding_dim * context_size, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, n_classes)

    def forward(self, x):
        """Compute output scores for a batch of token-id contexts.

        Args:
            x: Integer tensor of token ids, expected shape (N, context_size).

        Returns:
            Tensor of shape (N, n_classes) with unnormalised scores.
        """
        # x holds integer ids; the embedding lookup is equivalent to
        # multiplying one-hot vectors of size vocab_size by the weight matrix.
        out = self.embedding(x)
        # out is now of shape (N, context_size, embedding_dim)

        # Concatenate the context embeddings: (N, context_size*embedding_dim)
        out = torch.flatten(out, 1)
        out = F.relu(self.fc1(out))
        # out is now of shape (N, hidden_dim)

        out = self.fc2(out)
        # out is now of shape (N, n_classes)
        return out
