In [23]:
import os
import torch
import re
from torch import nn;
from torch.utils.data import DataLoader, TensorDataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, Vocab

DATA_DIR = './data/'
MIN_WORD_FREQUENCY = 100

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


# Tokenize datasets

In [24]:
TOKENIZER = get_tokenizer('basic_english')

def read_lines(
    dataset: str
) -> list[str]:
    """
    Reads all the lines form all the texts in the given `dataset`.

    Datasets are `train`, `val` and `test`.
    """

    # Scan for all input files
    inDirectoryName = os.path.join(DATA_DIR, 'input', dataset)
    inFileNames = [os.path.join(inDirectoryName, f) for f in os.listdir(inDirectoryName)]

    # Read all the lines from all the files
    lines = []
    for inFileName in inFileNames:
        with open(inFileName, 'r') as file:
            lines += file.readlines()

    print(f"Read {len(lines)} lines from {dataset}")
    return lines

def create_tokens(
    dataset: str
) -> list[str]:
    """
    Creates tokens for all the words in the given `dataset`.

    Datasets are `train`, `val` and `test`.
    """

    outFileName = os.path.join(DATA_DIR, f'words.{dataset}.pt')
    
    # If the file exists, don't create it again.
    if os.path.isfile(outFileName):
        print(f"Loaded tokenized words for {dataset} ({outFileName})")
        return torch.load(outFileName)

    tokens = []
    for line in read_lines(dataset):
        tokens += TOKENIZER(line)

    # Save tokens so we dont have to do this again
    torch.save(tokens, outFileName)
    
    return tokens

def create_vocabulary(
    dataset: str
) -> Vocab:
    """
    Creates a vocabulary for the given `dataset`.

    Datasets are `train`, `val` and `test`.
    """

    outFileName = os.path.join(DATA_DIR, f'vocabulary.pt')

    # If the file exists, don't create it again.
    if os.path.isfile(outFileName):
        print(f"Loaded vocabulary for {dataset} ({outFileName})")
        return torch.load(outFileName)

    def read_sanitize_tokenize():

        for line in read_lines(dataset):

            line = re.sub('\\w*[0-9]+\\w*', ' ', line) # Remove numbers
            line = re.sub('\\w*[A-Z]+\\w*', ' ', line) # Remove uppercase names
            line = re.sub('\\s+', ' ', line) # Remove double spaces

            yield TOKENIZER(line)

    vocabulary = build_vocab_from_iterator(read_sanitize_tokenize(), min_freq=MIN_WORD_FREQUENCY, specials=['<unk>'])

    vocabulary.set_default_index(vocabulary['<unk>'])

    # We removed all uppercase names, this includes 'I'
    vocabulary.append_token('i') 

    # Save vocabulary so we dont have to do this again
    torch.save(vocabulary, outFileName)

    return vocabulary
    


In [25]:
words_train = create_tokens('train')
words_val = create_tokens('val')
words_test = create_tokens('test')

vocabulary = create_vocabulary('train')
VOCABULARY_SIZE = len(vocabulary)

Loaded tokenized words for train (./data/words.train.pt)
Loaded tokenized words for val (./data/words.val.pt)
Loaded tokenized words for test (./data/words.test.pt)
Loaded vocabulary for train (./data/vocabulary.pt)


In [26]:

print("Words in 'train' dataset ........:", len(words_train))
print("Words in 'val' dataset ..........:", len(words_val))
print("Words in 'test' dataset .........:", len(words_test))
print("Distinct words in 'train' dataset:", len(set(words_train)))
print("Words in vocabulary .............:", VOCABULARY_SIZE)

Words in 'train' dataset ........: 2684706
Words in 'val' dataset ..........: 49526
Words in 'test' dataset .........: 124152
Distinct words in 'train' dataset: 52105
Words in vocabulary .............: 1880


In [28]:
EMBEDDINGS_DIM = 32
CONTEXT_SIZE = 5
EMBEDDINGS_BATCH_SIZE = 128
EMBEDDINGS_EPOCHS = 100

class CBOW(nn.Module):
    def __init__(self):
        super(CBOW, self).__init__()
        self.embeddings = nn.EmbeddingBag(VOCABULARY_SIZE, EMBEDDINGS_DIM, sparse=True)
        self.linear = nn.Linear(EMBEDDINGS_DIM, VOCABULARY_SIZE)

    def forward(self, x):
        x = self.embeddings(x)
        x = self.linear(x)
        x = torch.log_softmax(x, dim=1)
        return x
    
def cbow_create_dataset(
    words: list[str]
):
    """
    Creates a dataset from the given words.
    """
    words_idx = [vocabulary[word] for word in words]

    contexts = []
    targets = []
    for i in range(len(words) - CONTEXT_SIZE):
        context = words_idx[i:i+CONTEXT_SIZE]
        target = words_idx[i+CONTEXT_SIZE]

        contexts.append(torch.tensor(context))
        targets.append(target)

    contexts = torch.stack(contexts).to(device)
    targets = torch.tensor(targets).to(device)

    return TensorDataset(contexts, targets)

def model_nameof(model: nn.Module, criterion: object, optimizer: torch.optim.Optimizer) -> str:
    """
    Creates a good name for the model.
    """

    name = f'{model.__class__.__name__}_{criterion.__class__.__name__}_{optimizer.__class__.__name__}'
    options = optimizer.param_groups[0]

    if 'lr' in options:
        name += f'-lr{options["lr"]:.3f}'

    if 'momentum' in options and options['momentum'] != 0.0:
        name += f'-m{options["momentum"]:.3f}'

    if 'weight_decay' in options and options['weight_decay'] != 0.0:
        name += f'-wd{options["weight_decay"]:.3f}'

    return name
    
def cbow_train(
    dataset: TensorDataset,
    model: CBOW,
    criterion: object,
    optimizer: torch.optim.Optimizer,
) -> CBOW:
    """
    Trains the given model on the given dataset.

    Returns the trained model.
    """

    criterion.to(device)
    model.to(device)
    model.train()

    model.name = model_nameof(model, criterion, optimizer)
    filename = DATA_DIR + f'embeddings/{model.name}.pt'

    if (os.path.exists(filename)):
        print(f'Model {model.name} loaded from file ({filename})')
        return torch.load(filename).to(device)
    
    data_loader = DataLoader(dataset, batch_size=EMBEDDINGS_BATCH_SIZE, shuffle=True)

    print(f'Training {model.name}...')

    losses = []

    for epoch in range(EMBEDDINGS_EPOCHS):

        total_loss = torch.tensor([0.0]).to(device)
        total_size = 0

        for contexts, targets in data_loader:
            optimizer.zero_grad()
            outputs = model(contexts)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_loss += loss
            total_size += len(targets)

        total_loss = total_loss.item() / total_size
        losses.append(total_loss)
        print(f'Training | {model.name} | Epoch {epoch} | Loss {total_loss}')

    torch.save(model, filename)
    return model

def cbow_eval(
    dataset: TensorDataset,
    model: CBOW
):
    """
    Evaluate the given model on the given dataset.

    Returns the accuracy of the model.
    """

    model.to(device)
    model.eval()

    data_loader = DataLoader(dataset, batch_size=EMBEDDINGS_BATCH_SIZE, shuffle=True)

    correct = 0
    total = 0

    for contexts, targets in data_loader:
        outputs = model(contexts)
        _, predicted = torch.max(outputs, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

    print(f'Validation | {model.name} | Accuracy {correct/total:.4f}')

    return correct / total

def cbow_eval_and_pick_best(
    dataset: TensorDataset,
    models: list[nn.Module]
):
    """
    Evaluate multiple models and pick the best one.
    """

    best_model = None
    best_accuracy = 0.0

    for model in models:
        accuracy = cbow_eval(dataset, model)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model

    print(f'Best model: {best_model.name} | Accuracy {best_accuracy}')
    return best_model

def cbow_create_embeddings() -> torch.Tensor:
    """
    Create multiple embeddings models and pick the best one.  

    Returns the embeddings of the best model.
    """

    train_dataset = cbow_create_dataset(words_train)

    m1 = CBOW()
    cbow_train(
        train_dataset, m1,
        nn.CrossEntropyLoss(),
        torch.optim.SGD(m1.parameters(), lr=0.02)
    )

    m2 = CBOW()
    cbow_train(
        train_dataset, m2,
        nn.CrossEntropyLoss(),
        torch.optim.SGD(m2.parameters(), lr=0.01)
    )
    
    m3 = CBOW()
    cbow_train(
        train_dataset, m3,
        nn.CrossEntropyLoss(),
        torch.optim.SGD(m3.parameters(), lr=0.001)
    )

    val_dataset = cbow_create_dataset(words_val)
    best_model = cbow_eval_and_pick_best(
        val_dataset,
        [m1, m2, m3]
    )

    return best_model.embeddings.weights.detach().to(device)

embeddings = cbow_create_embeddings()

Training CBOW_CrossEntropyLoss_SGD-lr0.020...
Training | CBOW_CrossEntropyLoss_SGD-lr0.020 | Epoch 0 | Loss 0.04200461941571892
Training | CBOW_CrossEntropyLoss_SGD-lr0.020 | Epoch 1 | Loss 0.03920111820645949
Training | CBOW_CrossEntropyLoss_SGD-lr0.020 | Epoch 2 | Loss 0.038677689340451696
Training | CBOW_CrossEntropyLoss_SGD-lr0.020 | Epoch 3 | Loss 0.03839519163400319
Training | CBOW_CrossEntropyLoss_SGD-lr0.020 | Epoch 4 | Loss 0.038198763218883595
Training | CBOW_CrossEntropyLoss_SGD-lr0.020 | Epoch 5 | Loss 0.03804670077785199
Training | CBOW_CrossEntropyLoss_SGD-lr0.020 | Epoch 6 | Loss 0.0379212183451714
Training | CBOW_CrossEntropyLoss_SGD-lr0.020 | Epoch 7 | Loss 0.037813812878976095
Training | CBOW_CrossEntropyLoss_SGD-lr0.020 | Epoch 8 | Loss 0.03771976434712841
Training | CBOW_CrossEntropyLoss_SGD-lr0.020 | Epoch 9 | Loss 0.037634661178283915
Training | CBOW_CrossEntropyLoss_SGD-lr0.020 | Epoch 10 | Loss 0.037557615820160234
Training | CBOW_CrossEntropyLoss_SGD-lr0.020 | 

AttributeError: 'EmbeddingBag' object has no attribute 'weights'

In [None]:
def word_vector_cosine_similarity(word_a:torch.Tensor, word_b:torch.Tensor):
    return torch.dot(word_a, word_b) / (word_a.norm() * word_b.norm())

def word_vector_distance_similarity(word_a:torch.Tensor, word_b:torch.Tensor):
    return (word_a - word_b).norm()

def word_similarity(word_a:str, word_b:str, vocabulary:Vocab, embeddings:torch.Tensor):
    word_a_idx = vocabulary[word_a]
    word_b_idx = vocabulary[word_b]

    word_a_embedding = embeddings[word_a_idx]
    word_b_embedding = embeddings[word_b_idx]

    return word_vector_cosine_similarity(word_a_embedding, word_b_embedding)

def word_find_similars(
    word: str,
    top: int,
    vocabulary: Vocab,
    embeddings: torch.Tensor
):
    similarities = []
    for other in vocabulary.lookup_tokens(range(len(vocabulary))):
        similarity = word_similarity(word, other, vocabulary, embeddings)
        similarities.append((other, similarity))

    similarities.sort(key=lambda x: x[1], reverse=True)

    similarities = similarities[1:top+1]

    return similarities

print(word_find_similars('king', 10, vocabulary, embeddings))

print(word_find_similars('queen', 10, vocabulary, embeddings))

def word_closest(word_vector:torch.Tensor, vocabulary:Vocab, embeddings:torch.Tensor):
    closest_word = None
    closest_distance = 1_000_000

    for other in vocabulary.lookup_tokens(range(len(vocabulary))):
        other_idx = vocabulary[other]
        other_embedding = embeddings[other_idx]

        distance = word_vector_distance_similarity(word_vector, other_embedding)

        if distance < closest_distance:
            closest_distance = distance
            closest_word = other
    
    return closest_word

king_vector = embeddings[vocabulary['king']]
he_vector = embeddings[vocabulary['he']]
she_vector = embeddings[vocabulary['she']]

print(word_closest(king_vector - he_vector + she_vector, vocabulary, embeddings))

[('absorbed', tensor(0.8978, device='cuda:0')), ('borne', tensor(0.8677, device='cuda:0')), ('duty', tensor(0.8438, device='cuda:0')), ('gone', tensor(0.8303, device='cuda:0')), ('anything', tensor(0.8259, device='cuda:0')), ('absolute', tensor(0.8175, device='cuda:0')), ('thought', tensor(0.8108, device='cuda:0')), ('people', tensor(0.7782, device='cuda:0')), ('gay', tensor(0.7702, device='cuda:0')), ('actions', tensor(0.7661, device='cuda:0'))]
[('presented', tensor(0.8658, device='cuda:0')), ('bulbs', tensor(0.8037, device='cuda:0')), ('fire', tensor(0.7927, device='cuda:0')), ('instant', tensor(0.7905, device='cuda:0')), ('merely', tensor(0.7626, device='cuda:0')), ('about', tensor(0.7617, device='cuda:0')), ('many', tensor(0.7599, device='cuda:0')), ('post', tensor(0.7515, device='cuda:0')), ('midst', tensor(0.7476, device='cuda:0')), ('keeping', tensor(0.7431, device='cuda:0'))]
pride
