In [1]:
import os
import torch
import re
from torch import nn;
from torch.utils.data import DataLoader, TensorDataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, Vocab

DATA_DIR = './data/'
MIN_WORD_FREQUENCY = 100

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


# Tokenize datasets

In [2]:
TOKENIZER = get_tokenizer('basic_english')

def read_lines(
    dataset: str
) -> list[str]:
    """
    Reads all the lines form all the texts in the given `dataset`.

    Datasets are `train`, `val` and `test`.
    """

    # Scan for all input files
    inDirectoryName = os.path.join(DATA_DIR, 'input', dataset)
    inFileNames = [os.path.join(inDirectoryName, f) for f in os.listdir(inDirectoryName)]

    # Read all the lines from all the files
    lines = []
    for inFileName in inFileNames:
        with open(inFileName, 'r') as file:
            lines += file.readlines()

    print(f"Read {len(lines)} lines from {dataset}")
    return lines

def create_tokens(
    dataset: str
) -> list[str]:
    """
    Creates tokens for all the words in the given `dataset`.

    Datasets are `train`, `val` and `test`.
    """

    outFileName = os.path.join(DATA_DIR, f'words.{dataset}.pt')
    
    # If the file exists, don't create it again.
    if os.path.isfile(outFileName):
        print(f"Loaded tokenized words for {dataset} ({outFileName})")
        return torch.load(outFileName)

    tokens = []
    for line in read_lines(dataset):
        tokens += TOKENIZER(line)

    # Save tokens so we dont have to do this again
    torch.save(tokens, outFileName)
    
    return tokens

def create_vocabulary(
    dataset: str
) -> Vocab:
    """
    Creates a vocabulary for the given `dataset`.

    Datasets are `train`, `val` and `test`.
    """

    outFileName = os.path.join(DATA_DIR, f'vocabulary.pt')

    # If the file exists, don't create it again.
    if os.path.isfile(outFileName):
        print(f"Loaded vocabulary for {dataset} ({outFileName})")
        return torch.load(outFileName)

    def read_sanitize_tokenize():

        for line in read_lines(dataset):

            line = re.sub('\\w*[0-9]+\\w*', ' ', line) # Remove numbers
            line = re.sub('\\w*[A-Z]+\\w*', ' ', line) # Remove uppercase names
            line = re.sub('\\s+', ' ', line) # Remove double spaces

            yield TOKENIZER(line)

    vocabulary = build_vocab_from_iterator(read_sanitize_tokenize(), min_freq=MIN_WORD_FREQUENCY, specials=['<unk>'])

    vocabulary.set_default_index(vocabulary['<unk>'])

    # We removed all uppercase names, this includes 'I'
    vocabulary.append_token('i') 

    # Save vocabulary so we dont have to do this again
    torch.save(vocabulary, outFileName)

    return vocabulary
    


In [3]:
words_train = create_tokens('train')
words_val = create_tokens('val')
words_test = create_tokens('test')

vocabulary = create_vocabulary('train')
VOCABULARY_SIZE = len(vocabulary)

Loaded tokenized words for train (./data/words.train.pt)
Loaded tokenized words for val (./data/words.val.pt)
Loaded tokenized words for test (./data/words.test.pt)
Loaded vocabulary for train (./data/vocabulary.pt)


In [4]:

print("Words in 'train' dataset ........:", len(words_train))
print("Words in 'val' dataset ..........:", len(words_val))
print("Words in 'test' dataset .........:", len(words_test))
print("Distinct words in 'train' dataset:", len(set(words_train)))
print("Words in vocabulary .............:", VOCABULARY_SIZE)

Words in 'train' dataset ........: 2684706
Words in 'val' dataset ..........: 49526
Words in 'test' dataset .........: 124152
Distinct words in 'train' dataset: 52105
Words in vocabulary .............: 1880


In [5]:
EMBEDDINGS_DIM = 32
CONTEXT_SIZE = 5
EMBEDDINGS_BATCH_SIZE = 128
EMBEDDINGS_EPOCHS = 100

class CBOW(nn.Module):
    def __init__(self):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(VOCABULARY_SIZE, EMBEDDINGS_DIM, sparse=True)
        self.linear = nn.Linear(EMBEDDINGS_DIM*CONTEXT_SIZE, VOCABULARY_SIZE)

    def forward(self, x):
        x = self.embeddings(x)
        x = x.view(x.size(0), -1)
        x = self.linear(x)
        x = torch.log_softmax(x, dim=1)
        return x
    
def cbow_create_dataset(
    words: list[str]
):
    """
    Creates a dataset from the given words.
    """
    words_idx = [vocabulary[word] for word in words]

    contexts = []
    targets = []
    for i in range(len(words) - CONTEXT_SIZE):
        context = words_idx[i:i+CONTEXT_SIZE]
        target = words_idx[i+CONTEXT_SIZE]

        contexts.append(torch.tensor(context))
        targets.append(target)

    contexts = torch.stack(contexts).to(device)
    targets = torch.tensor(targets).to(device)

    return TensorDataset(contexts, targets)

def model_nameof(
    model: nn.Module, 
    criterion: object, 
    optimizer: torch.optim.Optimizer
) -> str:
    """
    Creates a good name for the model.
    """

    name = f'{model.__class__.__name__}_{criterion.__class__.__name__}_{optimizer.__class__.__name__}'
    options = optimizer.param_groups[0]

    if 'lr' in options:
        name += f'-lr{options["lr"]:.3f}'

    if 'momentum' in options and options['momentum'] != 0.0:
        name += f'-m{options["momentum"]:.3f}'

    if 'weight_decay' in options and options['weight_decay'] != 0.0:
        name += f'-wd{options["weight_decay"]:.3f}'

    return name

def model_save(model: nn.Module, folder: str | None = None):
    """
    Save the given model to a file.
    """

    folder = '' if folder is None else folder + '/'
    filename = DATA_DIR + f'{folder}{model.name}.pt'

    torch.save(model.state_dict(), filename)
    print(f'Saved {model.name} ({filename})')

def model_load(model: nn.Module, folder: str | None = None) -> bool:
    """
    Save the given model to a file.

    Returns `True` if the model was loaded, `False` otherwise.
    """

    folder = '' if folder is None else folder + '/'
    filename = DATA_DIR + f'{folder}{model.name}.pt'

    if not os.path.exists(filename):
        return False
    
    model.load_state_dict(torch.load(filename))
    print(f'Loaded {model.name} ({filename}')
    return True
    
def cbow_train(
    dataset: TensorDataset,
    model: CBOW,
    criterion: object,
    optimizer: torch.optim.Optimizer,
    force_retrain: bool = False
):
    """
    Trains the given model on the given dataset.

    Returns the trained model.
    """

    criterion.to(device)
    model.to(device)
    model.train()

    model.name = model_nameof(model, criterion, optimizer)

    if model_load(model, 'embeddings') and force_retrain is False:
        return
    
    data_loader = DataLoader(dataset, batch_size=EMBEDDINGS_BATCH_SIZE, shuffle=True)

    print(f'Training {model.name}...')

    losses = []

    for epoch in range(EMBEDDINGS_EPOCHS):

        total_loss = torch.tensor([0.0]).to(device)
        total_size = 0

        for contexts, targets in data_loader:
            optimizer.zero_grad()
            outputs = model(contexts)
            targets = torch.nn.functional.one_hot(targets, num_classes=VOCABULARY_SIZE).float().to(device)

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_loss += loss
            total_size += len(targets)

        total_loss = total_loss.item() / total_size
        losses.append(total_loss)
        print(f'Training | {model.name} | Epoch {epoch} | Loss {total_loss}')

    model_save(model, 'embeddings')

def cbow_eval(
    dataset: TensorDataset,
    model: CBOW
):
    """
    Evaluate the given model on the given dataset.

    Returns the accuracy of the model.
    """

    model.to(device)
    model.eval()

    data_loader = DataLoader(dataset, batch_size=EMBEDDINGS_BATCH_SIZE, shuffle=True)

    correct = 0
    total = 0

    for contexts, targets in data_loader:
        outputs = model(contexts)
        outputs = torch.argmax(outputs, dim=1)

        total += targets.size(0)
        correct += (outputs == targets).sum().item()

    print(f'Validation | {model.name} | Accuracy {correct/total:.4f}')

    return correct / total

def cbow_eval_and_pick_best(
    dataset: TensorDataset,
    models: list[nn.Module]
):
    """
    Evaluate multiple models and pick the best one.
    """

    best_model = None
    best_accuracy = 0.0

    for model in models:
        accuracy = cbow_eval(dataset, model)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model

    print(f'Best model: {best_model.name} | Accuracy {best_accuracy}')
    return best_model

def cbow_create_embeddings() -> torch.Tensor:
    """
    Create multiple embeddings models and pick the best one.  

    Returns the embeddings of the best model.
    """

    train_dataset = cbow_create_dataset(words_train)

    m1 = CBOW()
    cbow_train(
        train_dataset, m1,
        nn.CrossEntropyLoss(),
        torch.optim.SGD(m1.parameters(), lr=0.02)
    )

    m2 = CBOW()
    cbow_train(
        train_dataset, m2,
        nn.CrossEntropyLoss(),
        torch.optim.SGD(m2.parameters(), lr=0.01)
    )
    
    m3 = CBOW()
    cbow_train(
        train_dataset, m3,
        nn.CrossEntropyLoss(),
        torch.optim.SGD(m3.parameters(), lr=0.001)
    )

    val_dataset = cbow_create_dataset(words_val)
    best_model = cbow_eval_and_pick_best(
        val_dataset,
        [m1, m2, m3]
    )

    return best_model.embeddings.weight.detach().to(device)

In [6]:
embeddings = cbow_create_embeddings()

Training CBOW_CrossEntropyLoss_SGD-lr0.020...
Training | CBOW_CrossEntropyLoss_SGD-lr0.020 | Epoch 0 | Loss 0.03887515080822781
Training | CBOW_CrossEntropyLoss_SGD-lr0.020 | Epoch 1 | Loss 0.03571249065538397
Training | CBOW_CrossEntropyLoss_SGD-lr0.020 | Epoch 2 | Loss 0.034859221171184424
Training | CBOW_CrossEntropyLoss_SGD-lr0.020 | Epoch 3 | Loss 0.03433660710727191
Training | CBOW_CrossEntropyLoss_SGD-lr0.020 | Epoch 4 | Loss 0.03396170793321118
Training | CBOW_CrossEntropyLoss_SGD-lr0.020 | Epoch 5 | Loss 0.033673378571766464
Training | CBOW_CrossEntropyLoss_SGD-lr0.020 | Epoch 6 | Loss 0.03343976608102727
Training | CBOW_CrossEntropyLoss_SGD-lr0.020 | Epoch 7 | Loss 0.03324561911177446
Training | CBOW_CrossEntropyLoss_SGD-lr0.020 | Epoch 8 | Loss 0.0330800309559612
Training | CBOW_CrossEntropyLoss_SGD-lr0.020 | Epoch 9 | Loss 0.032937679209900844
Training | CBOW_CrossEntropyLoss_SGD-lr0.020 | Epoch 10 | Loss 0.032812252067362435
Training | CBOW_CrossEntropyLoss_SGD-lr0.020 | E

In [15]:
def word_vector_similarity_cosine(word_a:torch.Tensor, word_b:torch.Tensor):
    return torch.dot(word_a, word_b) / (word_a.norm() * word_b.norm())

def word_vector_similarity_euclidian(word_a:torch.Tensor, word_b:torch.Tensor):
    return (word_a - word_b).norm()

def word_similarity_cosine(word_a:str, word_b:str):
    word_a_idx = vocabulary[word_a]
    word_b_idx = vocabulary[word_b]

    word_a_embedding = embeddings[word_a_idx]
    word_b_embedding = embeddings[word_b_idx]

    return word_vector_similarity_cosine(word_a_embedding, word_b_embedding)

def word_find_top_closest(
    word: str,
    top: int
):
    similarities = []
    for other in vocabulary.lookup_tokens(range(len(vocabulary))):
        similarity = word_similarity_cosine(word, other).item()
        similarities.append((other, similarity))

    similarities.sort(key=lambda x: x[1], reverse=True)

    similarities = similarities[1:top+1]

    return similarities

def word_find_closest(
    word_vector:torch.Tensor,
):
    closest_word = None
    closest_distance = 1_000_000

    for other in vocabulary.lookup_tokens(range(len(vocabulary))):
        other_idx = vocabulary[other]
        other_embedding = embeddings[other_idx]

        distance = word_vector_similarity_euclidian(word_vector, other_embedding)

        if distance < closest_distance:
            closest_distance = distance
            closest_word = other
    
    return closest_word

In [31]:
def print_most_similar_words(words, top = 10):
    print(f"Top {top} most similar words")
    for word in words:
        if vocabulary[word] == vocabulary['<unk>']:
            print(word, ':'. "Not in vocabulary")
        else:
            print(word, ':', [x[0] for x in word_find_top_closest(word, top)])

print_most_similar_words([
    'king', 'queen', 'man', 'woman', 'he', 'she', 'doctor', 'nurse',
    'black', 'white', 'slave', 'master',
    'poor', 'rich', 
    'smart', 'dumb', 
    'strong', 'weak',
    'good', 'bad',
])

Top 10 similar words
king : ['earl', 'prince', 'father', 'spirit', 'building', 'aloud', 'rest', 'wrong', 'bishop', 'feel']
queen : ['fixed', 'can', 'sprang', 'desired', 'trust', 'wound', 'large', 'walls', 'spring', 'crossed']
man : ['wood', 'social', 'together', 'doctor', 'chamber', 'glancing', 'party', 'sun', 'harm', 'ye']
woman : ['fatigue', 'soul', 'paid', 'empty', 'size', 'heat', 'man', 'thick', 'singing', 'le']
he : ['who', 'she', 'growth', 'fully', 'face', 'wrong', 'count', 'they', 'example', 'i']
she : ['he', 'himself', 'everything', 'never', 'excellent', 'brothers', 'key', 'child', 'her', 'i']
doctor : ['smile', 'report', 'torn', 'glancing', 'man', 'really', 'cases', 'otherwise', 'uttered', 'dream']
'nurse' is not in vocabulary
black : ['poor', 'skald', 'words', 'white', 'turned', 'o', 'minutes', 'called', 'years', 'human']
white : ['single', 'poor', 'excellent', 'rain', 'nose', 'personal', 'whose', 'black', 'horrible', 'gaze']
'slave' is not in vocabulary
master : ['begged', '