In [120]:
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import math
import os
import re
from datetime import datetime
import string
import copy

seed = 265
torch.manual_seed(seed);

In [121]:
# Helper functions for training:

def set_device(device=None):
    """
    Helper function to set device
    """
    if device is None:
        device = (
            torch.device('cuda') if torch.cuda.is_available()
            else torch.device('cpu'))
        print(f"On device {device}.")
    return device

def train(n_epochs, optimizer, model, loss_fn, train_loader, val_loader=None, device=None):
    device = set_device(device)

    n_batch_train = len(train_loader)
    if val_loader:
        n_batch_val = len(val_loader)
    losses_train = []
    losses_val = []
    
    model.train()
    optimizer.zero_grad(set_to_none=True)

    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0
        for train_contexts, train_targets in train_loader:

            train_contexts = train_contexts.to(device=device)
            train_targets = train_targets.to(device=device)

            train_outputs = model(train_contexts)

            loss = loss_fn(train_outputs, train_targets)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            loss_train += loss.item()

        avg_loss_train = loss_train / n_batch_train
        losses_train.append(avg_loss_train)

        if epoch % 5 == 0 or epoch == 1:
            print('{}  |  Epoch {}  |  Training loss {:.5f}'.format(
                datetime.now().time().strftime("%H:%M:%S"), epoch, avg_loss_train))
        
        if val_loader:
            model.eval()
            loss_val = 0.0
            n_batch_val = len(val_loader)
            with torch.no_grad():
                for val_contexts, val_targets in val_loader:
                    val_contexts = val_contexts.to(device=device)
                    val_targets = val_targets.to(device=device)

                    val_outputs = model(val_contexts)
                    val_loss = loss_fn(val_outputs, val_targets)
                    loss_val += val_loss.item()
            
            avg_loss_val = loss_val / n_batch_val
            losses_val.append(avg_loss_val) 
            
            if epoch % 5 == 0 or epoch == 1:
                print('{}  |  Epoch {}  |  Validation loss {:.5f}'.format(
                    datetime.now().strftime("%H:%M:%S"), epoch, avg_loss_val))
            
            model.train()
            
    return losses_train, losses_val


def compute_accuracy(model, loader, device=None):
    model.eval()
    device = set_device(device)

    correct = 0
    total = 0

    with torch.no_grad():
        for contexts, targets in loader:
            contexts = contexts.to(device=device)
            targets = targets.to(device=device)

            outputs = model(contexts)
            _, predicted = torch.max(outputs, dim=1)
            total += len(targets)
            correct += int((predicted == targets).sum())

    acc =  correct / total
    return acc

In [122]:
tokenizer = get_tokenizer("basic_english")
data_path = 'textdata/'
min_freq = 100
device = set_device()
CONTEXT_SIZE = 2 # two tokens on either side for cbow.

On device cpu.


In [123]:
# Helper functions for text pre-processing:
# Credit to the text data tutorial for this cell:
def read_files(path=data_path + "data_train/"):
    files = os.listdir(path)
    
    lines = []
    
    for file in files:
        with open(path + file) as f:
            lines += f.readlines()
    
    return lines
    
def tokenize(lines, tokenizer=tokenizer):
    list_text = []
    for line in lines:
        list_text += tokenizer(line)
    return list_text

def yield_tokens(lines, tokenizer=tokenizer):
    """
    Yield tokens, ignoring names and digits to build vocabulary
    """
    # Match any word containing digit
    no_digits = r'\w*[0-9]+\w*'
    # Match word containing a uppercase 
    no_names = r'\w*[A-Z]+\w*'
    # Match any sequence containing more than one space
    no_spaces = r'\s+'
    
    for line in lines:
        line = re.sub(no_digits, ' ', line)
        line = re.sub(no_names, ' ', line)
        line = re.sub(no_spaces, ' ', line)
        yield tokenizer(line)

def count_freqs(words, vocab):
    """
    Count occurrences of each word in vocabulary in the data
    
    Useful to get some insight on the data and to compute loss weights
    """
    freqs = torch.zeros(len(vocab), dtype=torch.int)
    for w in words:
        freqs[vocab[w]] += 1
    return freqs

def count_freqs_int(int_targets, vocab):
    """
    Count occurrences of each word index in the data
    
    This function is adjusted to work directly with word indices that are already integers.
    Useful to get some insight on the data and to compute loss weights.

    Parameters:
    - int_targets: a list of integer indices corresponding to words.
    - vocab_size: the size of the vocabulary.

    Returns:
    - freqs: a tensor of frequencies for each word index.
    """
    freqs = torch.zeros(len(vocab), dtype=torch.int)
    for index in int_targets:
        freqs[index] += 1
    return freqs

def create_vocabulary(lines, min_freq=min_freq):
    """
    Create a vocabulary (list of known tokens) from a list of strings
    """
    # vocab contains the vocabulary found in the data, associating an index to each word
    vocab = build_vocab_from_iterator(yield_tokens(lines), min_freq=min_freq, specials=["<unk>"])
    # Since we removed all words with an uppercase when building the vocabulary, we skipped the word "I"
    vocab.append_token("i")
    # Value of default index. This index will be returned when OOV (Out Of Vocabulary) token is queried.
    vocab.set_default_index(vocab["<unk>"])
    return vocab

In [124]:
# ----------------------- Tokenize texts -------------------------------
# Load tokenized versions of texts if you have already generated it
# Otherwise, create it and save it
token_path = 'tokens/'
os.makedirs(token_path, exist_ok=True)

if os.path.isfile(token_path + "words_train.pt"):
    words_train = torch.load(token_path + "words_train.pt")
    words_val = torch.load(token_path + "words_val.pt")
    words_test = torch.load(token_path + "words_test.pt")
else:
    # Get lists of strings, one for each line in each .txt files in 'datapath' 
    lines_books_train = read_files(data_path + 'data_train/')
    lines_books_val = read_files(data_path + 'data_val/')
    lines_books_test = read_files(data_path + 'data_test/')

    # List of words contained in the dataset
    words_train = tokenize(lines_books_train)
    words_val = tokenize(lines_books_val)
    words_test = tokenize(lines_books_test)
    
    torch.save(words_train , token_path + "words_train.pt")
    torch.save(words_val , token_path + "words_val.pt")
    torch.save(words_test , token_path + "words_test.pt")



# ----------------------- Create vocabulary ----------------------------
vocab_path = 'vocabs/'
os.makedirs(vocab_path, exist_ok=True)
vocab_name = "vocabulary.pt"
# Load vocabulary if you have already generated it
# Otherwise, create it and save it
if os.path.isfile(vocab_path + vocab_name):
    vocab = torch.load(vocab_path + vocab_name)
else:
    # Create vocabulary based on the words in the training dataset
    vocab = create_vocabulary(lines_books_train, min_freq=min_freq)
    torch.save(vocab, vocab_path + vocab_name)

In [125]:
# ------------------------ Vocab analysis ------------------------------
VOCAB_SIZE = len(vocab)
print("Total number of words in the training dataset:     ", len(words_train))
print("Total number of words in the validation dataset:   ", len(words_val))
print("Total number of words in the test dataset:         ", len(words_test))
print("Number of distinct words in the training dataset:  ", len(set(words_train)))
print("Number of distinct words kept (vocabulary size):   ", VOCAB_SIZE)

freqs = count_freqs(words_train, vocab)
print("Twenty first words in the vocabulary and their occurences:\n", [(f.item(), w) for (f, w)  in zip(freqs, vocab.lookup_tokens(range(20)))])
print("Twenty last words in the vocabulary and their occurences:\n", [(f.item(), w) for (f, w)  in zip(freqs[-20:], vocab.lookup_tokens(range((len(vocab) - 20), len(vocab))))])

Total number of words in the training dataset:      2684706
Total number of words in the validation dataset:    49526
Total number of words in the test dataset:          124152
Number of distinct words in the training dataset:   52105
Number of distinct words kept (vocabulary size):    1880
Twenty first words in the vocabulary and their occurences:
 [(433907, '<unk>'), (182537, ','), (151278, 'the'), (123727, '.'), (82289, 'and'), (65661, 'of'), (62763, 'to'), (49230, 'a'), (41477, 'in'), (31052, 'that'), (37167, 'he'), (29046, 'was'), (26508, 'his'), (26354, 'it'), (20862, 'with'), (20159, 'had'), (19965, 'is'), (15692, 'not'), (16593, 'as'), (15705, 'on')]
Twenty last words in the vocabulary and their occurences:
 [(102, 'astonished'), (101, 'authority'), (105, 'bourgeois'), (101, 'chain'), (102, 'crossing'), (101, 'divided'), (100, 'eaten'), (116, 'elder'), (101, 'ends'), (108, 'gradually'), (102, 'instinct'), (100, 'mounted'), (100, 'pistol'), (102, 'pot'), (103, 'pride'), (100, 's

## Context/target dataset creation, inspired by tutorial

In [126]:
def create_dataset(text, vocab, context_size=CONTEXT_SIZE):
    """
    Create a PyTorch dataset of context/target pairs from a text
    """

    # Define punctuation symbols
    string_punctuations = string.punctuation
    punctuations = [vocab[p] for p in string_punctuations
               if vocab[p] != 0]
    
    # Transform the text into a list of integers from vocabulary
    txt = [vocab[w] for w in text]
    
    n_text = len(txt)

    contexts = []
    targets = []
    for i in range(n_text - context_size*2):
        # Word directly after the start of context is the target.
        target = txt[i + context_size]
        # Skip targets that are unknown:
        if vocab.lookup_token(target) != '<unk>' and target not in punctuations:
            # Context before target
            context_before = txt[i:i + context_size]

            # Context after the target
            context_behind = txt[i + context_size + 1: i + context_size*2 + 1]

            targets.append(target)
            contexts.append(torch.tensor(context_before + context_behind))

    # Convert contexts and targets into PyTorch tensors and stack for dataset formation.
    contexts = torch.stack(contexts)
    targets = torch.tensor(targets)
    
    # Create a PyTorch dataset from these context/target pairs
    return TensorDataset(contexts, targets)

In [127]:
dataset_path = 'dataset/'
os.makedirs(dataset_path, exist_ok=True)
def load_dataset(words, vocab, fname):
    """
    Load dataset if already generated, otherwise, create it and save it
    """
    # If already generated
    if os.path.isfile(dataset_path + fname):
        dataset = torch.load(dataset_path + fname)
    else:
        # Create context / target dataset based on the list of strings
        dataset = create_dataset(words, vocab)
        torch.save(dataset, dataset_path + fname)
    return dataset

data_train = load_dataset(words_train, vocab, "data_train.pt")
data_val = load_dataset(words_val, vocab, "data_val.pt")
data_test = load_dataset(words_test, vocab, "data_test.pt")

In [128]:
print(f"The training data has {len(data_train)} entries.")
print(f"The validation data has {len(data_val)} entries.")
print(f"The test data has {len(data_test)} entries.")

The training data has 1916842 entries.
The validation data has 36218 entries.
The test data has 83643 entries.


## Simple CBOW model
Inspired by this tutorial: https://www.youtube.com/watch?v=Rqh4SRcZuDA

In [129]:
class SimpleCBOW(nn.Module):
    def __init__(self, emb_dim=16, vocab_size=VOCAB_SIZE):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.fc = nn.Linear(emb_dim, vocab_size)
        
    def forward(self, x):
        embeddings = self.embedding(x).mean(1).squeeze(1) # batch_size x emb_dim
        return self.fc(embeddings)

## Larger CBOW model

An attempt to see if a more complex model performs better, includes dropout, batch normalisation and an extra fully connected layer.

In [130]:
class LargerCBOW(nn.Module):
    def __init__(self, emb_dim=32, vocab_size=VOCAB_SIZE, dropout_rate=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.dropout = nn.Dropout(dropout_rate)
        
        self.fc1 = nn.Linear(emb_dim, 256)
        self.batchnorm256 = nn.BatchNorm1d(256)
        self.out = nn.Linear(256, vocab_size)
    
    def forward(self, x):
        embeddings = self.embedding(x).mean(1).squeeze(1)
        embeddings = F.relu(self.fc1(embeddings))
        embeddings = self.batchnorm256(embeddings)
        embeddings = self.dropout(embeddings)
        return self.out(embeddings)

### Calculating training label frequencies for class weights

In [131]:
training_freqs = count_freqs_int(data_train[:][1], vocab)

# Here we see that the two first tokens in the vocabulary,
# "<unk>" and "," have 0 counts in the target set
print(f'Updated frequencies for new dataset: {training_freqs[:10]}')

training_freqs = torch.tensor(training_freqs, dtype=torch.float)


total_samples = training_freqs.sum()

# Calculate class weights
class_weights = total_samples / training_freqs

# Remove inf's caused by division by zero
class_weights = torch.tensor([w if w != float('inf') else 0 for w in class_weights]).to(device)

# Create logarithmically scaled weights to reduce magnitude
log_weights = torch.log(class_weights)

# Again, remove -inf's caused by logarithm
log_weights = torch.tensor([w if w != float('-inf') else 0 for w in log_weights]).to(device)

print(f'Class weights sample: {class_weights[:10]}')
print(f'Largest class weight: {torch.max(class_weights):.2f}')

print(f'Logarithmically scaled weights sample: {log_weights[:10]}')
print(f'Largest logarithmic weight: {torch.max(log_weights):.2f}')


Updated frequencies for new dataset: tensor([     0,      0, 151278,      0,  82288,  65661,  62763,  49230,  41477,
         31052], dtype=torch.int32)
Class weights sample: tensor([ 0.0000,  0.0000, 12.6710,  0.0000, 23.2943, 29.1930, 30.5410, 38.9365,
        46.2146, 61.7301])
Largest class weight: 19168.42
Logarithmically scaled weights sample: tensor([0.0000, 0.0000, 2.5393, 0.0000, 3.1482, 3.3739, 3.4191, 3.6619, 3.8333,
        4.1228])
Largest logarithmic weight: 9.86


  training_freqs = torch.tensor(training_freqs, dtype=torch.float)


## Training all CBOWS

In [132]:
def run_cbows(first, second, train_loader, val_loader):
    train_losses = []
    val_losses = []
    
    for model in first:
        if not os.path.isfile(cbow_path + model.__class__.__name__ + "no_weights.pt"):
            print(f'Training model: {model.__class__.__name__ + "no_weights"}')
            optim = torch.optim.Adam(model.parameters(), lr=.001)
            loss_fn = nn.CrossEntropyLoss()
            losses_train, losses_val = train(20, optim, model, loss_fn, train_loader, val_loader, device=device)
            train_losses.append(losses_train)
            val_losses.append(losses_val)

            torch.save(model, cbow_path + model.__class__.__name__ + "no_weights.pt" )
        
        
    for model in second:
        if not os.path.isfile(cbow_path + model.__class__.__name__ + "with_weights.pt"):
            print(f'Training model: {model.__class__.__name__ + "with_weights"}')
            optim = torch.optim.Adam(model.parameters(), lr=.001)
            loss_fn = nn.CrossEntropyLoss(weight=log_weights)
            losses_train, losses_val = train(20, optim, model, loss_fn, train_loader, val_loader, device=device)
            train_losses.append(losses_train)
            val_losses.append(losses_val)


            torch.save(model, cbow_path + model.__class__.__name__ + "with_weights.pt")
            
    return train_losses, val_losses

In [133]:
torch.manual_seed(seed)

cbow_path = 'models/cbow/'
    
# Run four experiments, where cbow 1 and 2 will be without class weights,
# while cbow 3 and 4 will be with class weights.
cbow1 = SimpleCBOW(emb_dim=16, vocab_size=VOCAB_SIZE).to(device)
cbow2 = LargerCBOW(emb_dim=32, vocab_size=VOCAB_SIZE).to(device)
cbow3 = SimpleCBOW(emb_dim=16, vocab_size=VOCAB_SIZE).to(device)
cbow4 = LargerCBOW(emb_dim=32, vocab_size=VOCAB_SIZE).to(device)

    
no_weights_cbows = [cbow1, cbow2]
with_weights_cbows = [cbow3, cbow4]


train_loader = DataLoader(data_train, batch_size=128)
val_loader = DataLoader(data_val, batch_size=128)


train_losses, val_losses = run_cbows(no_weights_cbows, with_weights_cbows, train_loader, val_loader)

### Selecting the best CBOW

In [134]:
torch.manual_seed(seed)

def evaluate_and_select_cbow(cbows, cbow_names, model_path, val_loader):
    accs = []

    for cbow in cbows:
        acc = compute_accuracy(cbow, val_loader, device=device)
        accs.append(acc)
    
    model_idx = accs.index(max(accs))
    print(f'Max accuracy: {accs[model_idx]:.3f}, selected model: model: {model_idx + 1}: {cbow_names[model_idx]}', end='\n\n')
    print('All accuracies: \n' + '\n'.join([f'{cbow_name} achieved {round(acc, 3)}' for cbow_name, acc in zip(cbow_names, accs)]))
    return cbows[model_idx]

cbow1 = torch.load(cbow_path + 'SimpleCBOWno_weights.pt', map_location=device)
cbow2 = torch.load(cbow_path + 'LargerCBOWno_weights.pt', map_location=device)
cbow3 = torch.load(cbow_path + 'SimpleCBOWwith_weights.pt', map_location=device)
cbow4 = torch.load(cbow_path + 'LargerCBOWwith_weights.pt', map_location=device)


cbows = [cbow1, cbow2, cbow3, cbow4]
cbow_names = ['SimpleCBOW', 'LargerCBOW', 'SimpleCBOW_weights', 'LargerCBOW_weights']
val_loader = DataLoader(data_val, batch_size=128)
selected_cbow = evaluate_and_select_cbow(cbows, cbow_names, cbow_path, val_loader)

Max accuracy: 0.189, selected model: model: 2: LargerCBOW

All accuracies: 
SimpleCBOW achieved 0.166
LargerCBOW achieved 0.189
SimpleCBOW_weights achieved 0.151
LargerCBOW_weights achieved 0.176


In [135]:
# It seems like the losses aren't stabilising yet - train the two larger cbow models for more epochs
if not os.path.isfile(cbow_path + cbow2.__class__.__name__ + "no_weights50epochs.pt"):
    no_weights = cbow2
    optim = torch.optim.Adam(no_weights.parameters(), lr=0.005)
    loss_fn = nn.CrossEntropyLoss()
    losses = train(30, optim, no_weights, loss_fn, train_loader, val_loader)

    torch.save(no_weights, cbow_path + no_weights.__class__.__name__+ "no_weights50epochs.pt")
else:
    no_weights = torch.load(cbow_path + cbow2.__class__.__name__ + "no_weights50epochs.pt", map_location=device)



if not os.path.isfile(cbow_path + cbow4.__class__.__name__ + "with_weights50epochs.pt"):
    with_weights = cbow4
    optim = torch.optim.Adam(with_weights.parameters(), lr=0.005)
    loss_fn = nn.CrossEntropyLoss(log_weights)
    losses = train(30, optim, with_weights, loss_fn, train_loader, val_loader)

    torch.save(with_weights, cbow_path + with_weights.__class__.__name__+ "with_weights50epochs.pt")
else:
    with_weights = torch.load(cbow_path + cbow4.__class__.__name__ + "with_weights50epochs.pt", map_location=device)

In [136]:
cbows.append(no_weights)
cbows.append(with_weights)
cbow_names.append('LargerCBOW50epochs')
cbow_names.append('LargerCBOW50epochs_weights')

In [137]:
# Now see if performance improved after further training
selected_cbow = evaluate_and_select_cbow(cbows, cbow_names, cbow_path, val_loader)

Max accuracy: 0.189, selected model: model: 2: LargerCBOW

All accuracies: 
SimpleCBOW achieved 0.166
LargerCBOW achieved 0.189
SimpleCBOW_weights achieved 0.151
LargerCBOW_weights achieved 0.176
LargerCBOW50epochs achieved 0.186
LargerCBOW50epochs_weights achieved 0.171


### Getting final metrics of selected CBOW:

In [138]:
torch.manual_seed(seed)
train_loader = DataLoader(data_train, batch_size=128)
val_loader = DataLoader(data_val, batch_size=128)
test_loader = DataLoader(data_test, batch_size=128)

train_acc = compute_accuracy(selected_cbow, train_loader, device=device)
val_acc = compute_accuracy(selected_cbow, val_loader, device=device)
test_acc = compute_accuracy(selected_cbow, test_loader, device=device)

print(f'Final model accuracy on train data: {train_acc:.3f}')
print(f'Final model accuracy on val data: {val_acc:.3f}')
print(f'Final model accuracy on test data: {test_acc:.3f}')

Final model accuracy on train data: 0.197
Final model accuracy on val data: 0.189
Final model accuracy on test data: 0.210


In [139]:
# Create embeddings from selected model
embeddings_path = '/kaggle/working/embedding/'
os.makedirs(embeddings_path, exist_ok=True)
if not os.path.isfile(embeddings_path + "unnormalized_embeddings.pt"):
    word_embeddings = selected_cbow.embedding.weight.data
    torch.save(word_embeddings, embeddings_path + "unnormalized_embeddings.pt")
else:
    word_embeddings = torch.load(embeddings_path + "unnormalized_embeddings.pt", map_location=device)

if not os.path.isfile(embeddings_path + "normalized_embeddings.pt"):
    normalized_embeddings = F.normalize(word_embeddings, p=2, dim=1)
    torch.save(normalized_embeddings, embeddings_path + "normalized_embeddings.pt")
else:
    normalized_embeddings = torch.load(embeddings_path + "normalized_embeddings.pt", map_location=device)

In [140]:
word_ls =  ['me', 'white', 'man', 'have', 'be', 'child', 'yes', 'what',
           'cold', 'wet', 'ran', 'convinced', 'scare']

In [141]:
# Convert selected words to indices
word_indices = [vocab[word] for word in word_ls]
selected_embeddings = normalized_embeddings[word_indices]

# Compute the cosine similarity between the selected words and all words in the vocabulary
cosine_similarities = torch.matmul(selected_embeddings, normalized_embeddings.transpose(0, 1))

# For each word, find the 10 most similar words
top_similar_words = []
for i, word in enumerate(word_ls):
    similarities = cosine_similarities[i]
    # We skip the first one because it will be the word itself with a similarity of 1.
    top_indices = similarities.topk(11).indices[1:]
    similar_words = [vocab.lookup_token(index) for index in top_indices]
    top_similar_words.append(similar_words)

In [142]:
for i in range(len(word_ls)):
    print(f'The top ten similar words to {word_ls[i]} are: {",".join(top_similar_words[i])}')

The top ten similar words to me are: him,us,them,thee,herself,himself,myself,yourself,her,ourselves
The top ten similar words to white are: black,red,golden,gray,soft,big,green,heavy,yellow,beautiful
The top ten similar words to man are: woman,gentleman,lady,soldier,girl,child,lad,person,creature,fellow
The top ten similar words to have are: ve,having,has,had,hast,consider,choose,take,beat,find
The top ten similar words to be are: remain,stand,serve,being,find,exist,enter,get,prove,grow
The top ten similar words to child are: soldier,dog,woman,man,priest,gentleman,lady,lad,boy,porter
The top ten similar words to yes are: ,,sir,?,—,!,),suppose,pray,remember,.
The top ten similar words to what are: why,whether,how,whatever,where,whom,which,that,because,yours
The top ten similar words to cold are: calm,hot,weak,sad,quiet,warm,beautiful,strong,merry,gentle
The top ten similar words to wet are: beautiful,firm,broken,frightened,cold,severe,familiar,mingled,weak,dark
The top ten similar words

## Visualising embeddings

https://projector.tensorflow.org/

In [143]:
vocab_ls = vocab.lookup_tokens(range(VOCAB_SIZE))

embeddings_df = pd.DataFrame(normalized_embeddings.cpu())

# Convert the vocabulary to a DataFrame
vocab_df = pd.DataFrame(vocab_ls)

embeddings_path = 'embedding/'

# Save the embeddings as a TSV file
embeddings_df.to_csv(embeddings_path + 'embeddings.tsv', sep='\t', header=False, index=False)

# Save the vocabulary as a TSV file
vocab_df.to_csv(embeddings_path + '/csv_vocab.tsv', sep='\t', header=False, index=False)

In [144]:
print(vocab_df.head())
print()
print(embeddings_df.head())

       0
0  <unk>
1      ,
2    the
3      .
4    and

         0         1         2         3         4         5         6   \
0  0.073012 -0.206832 -0.252393 -0.019664  0.244478 -0.075594  0.146849   
1  0.141393 -0.005656  0.018705 -0.151012  0.106441  0.091766 -0.069533   
2 -0.375181 -0.211718 -0.171357  0.222586  0.092827 -0.138984 -0.255654   
3 -0.100052  0.096181  0.290298 -0.218915  0.021255  0.127657 -0.137882   
4 -0.137398  0.127038  0.074155 -0.435377 -0.215810 -0.112719  0.196020   

         7         8         9   ...        22        23        24        25  \
0 -0.275250  0.349527 -0.084009  ... -0.127031 -0.174418 -0.077118  0.280887   
1 -0.034630 -0.132091  0.175377  ... -0.320935  0.253002  0.021195 -0.119807   
2  0.152078  0.029543 -0.096057  ...  0.023932  0.279010  0.030458  0.045584   
3  0.084834 -0.583147  0.043893  ... -0.272811  0.326768 -0.145677  0.001816   
4 -0.071211 -0.500548 -0.032921  ...  0.032088  0.027607 -0.315452 -0.107383   

         26  

## Conjugating <em>be</em> and <em>have</em>

In [145]:
# Creating the dataset for be/have conjugation,
# adjusting the create dataset function from earlier

In [146]:
def create_conjugation_dataset(text, vocab, conjugation_words, context_size=CONTEXT_SIZE):
    """
    Create a PyTorch dataset of context/target pairs from a text
    """

    # Define punctuation symbols
    string_punctuations = string.punctuation
    punctuations = [vocab[p] for p in string_punctuations
               if vocab[p] != 0]
    
    # Transform the text into a list of integers from vocabulary
    txt = [vocab[w] for w in text]
    
    n_text = len(txt)

    contexts = []
    targets = []
    for i in range(n_text - context_size*2):
        # Word directly after the start of context is the target.
        target = txt[i + context_size]
        word = vocab.lookup_token(target)
        # Skip targets that are unknown:
        if word != '<unk>' and target not in punctuations:
            if word in conjugation_words:
                # Context before target
                context_before = txt[i:i + context_size]

                # Context after the target
                context_behind = txt[i + context_size + 1: i + context_size*2 + 1]

                # Add context
                contexts.append(torch.tensor(context_before + context_behind))
                
                # Add mapped target
                targets.append(conjugation_words.index(word))

    # Convert contexts and targets into PyTorch tensors and stack for dataset formation.
    contexts = torch.stack(contexts)
    targets = torch.tensor(targets)
    
    # Create a PyTorch dataset from these context/target pairs
    return TensorDataset(contexts, targets)

In [147]:
def load_conjugation(words, vocab, fname):
    """
    Load dataset if already generated, otherwise, create it and save it
    """
    
    dataset_path = 'dataset/'
    
    
    conjugations = ['be', 'am', 'are', 'is', 'was', 'were', 'been',
                    'being', 'have', 'has', 'had', 'having']
    
    # If already generated
    if os.path.isfile(dataset_path + fname):
        dataset = torch.load(dataset_path + fname)
    else:
        # Create context / target dataset based on the list of strings
        dataset = create_conjugation_dataset(words, vocab, conjugations)
        torch.save(dataset, dataset_path + fname)
        
    return dataset

c_train = load_conjugation(words_train, vocab, "c_train.pt")
c_val = load_conjugation(words_val, vocab, "c_val.pt")
c_test = load_conjugation(words_test, vocab, "c_test.pt")

In [148]:
print(f"The training data has {len(c_train)} entries.")
print(f"The validation data has {len(c_val)} entries.")
print(f"The test data has {len(c_test)} entries.")

The training data has 124031 entries.
The validation data has 2590 entries.
The test data has 4765 entries.


In [149]:
# Labels and distribution
torch.unique(c_train.tensors[1], return_counts=True)

(tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]),
 tensor([13255,  2169,  8800, 19965, 29046,  8219,  5446,  1678, 10205,  3817,
         20159,  1272]))

## Models, positional encoding and attention

#### Positional encoding

In [150]:
# https://pytorch.org/tutorials/beginner/transformer_tutorial.html
class PositionalEncoding(nn.Module):
    def __init__(self, emb_dim, max_len=128):
        super().__init__()
    
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, emb_dim, 2) * -(math.log(10000.0) / emb_dim))
        
        pe = torch.zeros(max_len, 1, emb_dim)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return x

In [151]:
# Inspired by: https://spotintelligence.com/2023/01/31/self-attention/
class SingleHeadAttention(nn.Module):
    def __init__(self, p, emb_dim):
        super().__init__()
        self.p = p
        self.Q_W = nn.Linear(emb_dim, p)
        self.K_W = nn.Linear(emb_dim, p)
        self.V_W = nn.Linear(emb_dim, p)
    
    def forward(self, x):
        Q = self.Q_W(x)
        K = self.K_W(x)
        V = self.V_W(x)
        
        a_scores = torch.bmm(Q, K.transpose(1, 2)) / torch.sqrt(torch.tensor(self.p))
        a_weights = F.softmax(a_scores, dim=2)
        return torch.bmm(a_weights, V)

In [152]:
class MultiHeadAttention(nn.Module):
    def __init__(self, emb_dim, n_head):
        super().__init__()
        self.p = emb_dim // n_head
        self.emb_dim = emb_dim

        self.heads = nn.ModuleList([SingleHeadAttention(self.p, emb_dim)
                                   for h in range(n_head)])
        self.W_O = nn.Linear(self.p*n_head, emb_dim)
        
    def forward(self, x):
        head_outputs = [head(x) for head in self.heads]

        # Concatenate the inputs
        output = torch.cat(head_outputs, dim=-1)
        
        return self.W_O(output)

In [153]:
class SimpleMLP(nn.Module):
    def __init__(self, embedding, max_len=4, output_size=12):
        super().__init__()
        
        emb_dim = embedding.weight.shape[1]
        
        # Freeze the embedding
        self.embedding = embedding
        self.embedding.weight.requires_grad=False
        
        self.fc1 = nn.Linear(emb_dim*max_len, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, output_size)
    
    def forward(self, x):
        x = self.embedding(x)
        x = torch.flatten(x, start_dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

In [154]:
class AttentionMLP(nn.Module):
    def __init__(self, embedding, output_size=12, attention_heads=6, 
                 max_len=4, dropout_rate=0.5):
        super().__init__()
        # max_len is implicit
        # takes input: [batch_size, max_len, emb_dim]
        emb_dim = embedding.weight.shape[1]
        self.embedding = embedding
        self.embedding.weight.requires_grad = False
        
        self.attention = MultiHeadAttention(emb_dim, attention_heads) 
        
        self.positional_encoding = PositionalEncoding(emb_dim)
        
        self.fc1 = nn.Linear(emb_dim, 128)
        self.batchnorm128 = nn.BatchNorm1d(128)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(128, 64)
        self.batchnorm64 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, output_size)
        
    
    def forward(self, x):
        x = self.embedding(x)
        x = self.positional_encoding(x)
        x = self.attention(x)
        x = torch.mean(x, dim=1) # pooling over sequence length removes max_len dimension
        x = F.relu(self.fc1(x)) 
        x = self.batchnorm128(x)
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.batchnorm64(x)
        x = self.dropout(x)
        
        return self.fc3(x)

In [155]:
class RNN(nn.Module):
    def __init__(self, embedding, hidden_size=16, num_layers = 2, output_size=12, 
                 dropout_rate=0, max_len=4):
        super().__init__()
        
        emb_dim = embedding.weight.shape[1]
        self.embedding = embedding
        self.embedding.weight.requires_grad = False
        
        self.dropout = nn.Dropout(dropout_rate)
        
        # Creating num_layers RNN layers with size hidden_size
        self.rnn = nn.RNN(emb_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = self.embedding(x)
        st, lt = self.rnn(x)
        x = self.dropout(lt[-1])
        x = F.relu(x)
        return self.fc(x)

In [156]:
embedding = selected_cbow.embedding

In [157]:
torch.unique(c_train.tensors[1])

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [158]:
def run_conjugations(models, model_names, learning_rates, num_epochs):
    total_losses = []
    model_path = 'models/classification/'
    train_loader = DataLoader(c_train, batch_size=64)
    val_loader = DataLoader(c_val, batch_size=64)
    
    for i, model in enumerate(models):
        for lr in learning_rates:
            if not os.path.isfile(model_path + model_names[i] +
                                f"lr_{lr}_e{num_epochs}" + '.pt'):
                training = copy.deepcopy(model) # makes sure we create a new instance for each lr
                optim = torch.optim.Adam(training.parameters(), lr=lr)
                loss_fn = nn.CrossEntropyLoss()
                print(f'Currently training model: {model_names[i]} with learning rate = {lr}.')
                losses = train(num_epochs, optim, training, loss_fn, train_loader, val_loader)
                total_losses.append(losses)

                torch.save(training, model_path + model_names[i] +
                            f"lr_{lr}_e{num_epochs}" + '.pt')
                
    return total_losses

In [159]:
torch.manual_seed(seed)
simple_mlp = SimpleMLP(embedding).to(device)
attention_mlp = AttentionMLP(embedding).to(device)
attention_mlp_no_dropout = AttentionMLP(embedding, dropout_rate=0).to(device)
rnn = RNN(embedding).to(device)
bigger_rnn = RNN(embedding, hidden_size=24, num_layers=4, dropout_rate=0.2).to(device)

models = [simple_mlp, attention_mlp, attention_mlp_no_dropout, rnn, bigger_rnn]
names = ['SimpleMLP', 'AttentionMLP', 'AttentionMLPNoDropout', 'RNN', 'BiggerRNN']

losses = run_conjugations(models, names, [0.001, 0.005, 0.01], 30)

In [160]:
# Run all models with lr = 0.001 for 50 epochs to see if they improve,
# as losses didn't seem to stabilise yet for some of these models

In [161]:
def run_conjugations2(models, model_names):
    total_losses = []
    model_path = 'models/classification/'
    train_loader = DataLoader(c_train, batch_size=64)
    val_loader = DataLoader(c_val, batch_size=64)
    
    for i, model in enumerate(models):
        if not os.path.isfile(model_path + model_names[i] +
                             "lr_0.001_e50" + '.pt'):
            optim = torch.optim.Adam(model.parameters(), lr=0.001)
            loss_fn = nn.CrossEntropyLoss()
            print(f'Currently training model: {model_names[i]} with learning rate = 0.001.')
            losses = train(50, optim, model, loss_fn, train_loader, val_loader)
            total_losses.append(losses)

            torch.save(model, model_path + model_names[i] +
                      "lr_0.001_e50" + '.pt')
                
    return total_losses

In [162]:
simple_mlp = SimpleMLP(embedding).to(device)
attention_mlp = AttentionMLP(embedding).to(device)
attention_mlp_no_dropout = AttentionMLP(embedding, dropout_rate=0).to(device)
rnn = RNN(embedding).to(device)
bigger_rnn = RNN(embedding, hidden_size=24, num_layers=4, dropout_rate=0.2).to(device)

models = [simple_mlp, attention_mlp, attention_mlp_no_dropout, rnn, bigger_rnn]
names = ['SimpleMLP', 'AttentionMLP', 'AttentionMLPNoDropout', 'RNN', 'BiggerRNN']

losses = run_conjugations(models, names, [0.001], 50)

In [163]:
classification_path = 'models/classification/'
models = []
model_names = []
for file in os.listdir(classification_path):
    model = torch.load(classification_path + file, map_location=device)
    models.append(model)
    model_names.append(file[:-3])

In [164]:
print('All models to test:\n')
for name in model_names:
    print(name)

All models to test:

AttentionMLPlr_0.001_e30
AttentionMLPlr_0.001_e50
AttentionMLPlr_0.005_e30
AttentionMLPlr_0.01_e30
AttentionMLPNoDropoutlr_0.001_e30
AttentionMLPNoDropoutlr_0.001_e50
AttentionMLPNoDropoutlr_0.005_e30
AttentionMLPNoDropoutlr_0.01_e30
BiggerRNNlr_0.001_e30
BiggerRNNlr_0.001_e50
BiggerRNNlr_0.005_e30
BiggerRNNlr_0.01_e30
RNNlr_0.001_e30
RNNlr_0.001_e50
RNNlr_0.005_e30
RNNlr_0.01_e30
SimpleMLPlr_0.001_e30
SimpleMLPlr_0.001_e50
SimpleMLPlr_0.005_e30
SimpleMLPlr_0.01_e30


In [165]:
torch.manual_seed(seed)
def evaluate_and_select_conjugator(models, model_names, val_loader):
    accs = []

    for model in models:
        acc = compute_accuracy(model, val_loader, device=device)
        accs.append(acc)
    
    model_idx = accs.index(max(accs))
    print(f'Max accuracy: {accs[model_idx]:.3f}, selected model: model: {model_idx + 1}: {model_names[model_idx]}', end='\n')
    
    all_accs = "\n\n".join([f"{model_name}: {acc:.3f}" for model_name, acc in zip(model_names, accs)])
    print('All accuracies:')
    print(all_accs)
    return models[model_idx]


c_val_loader = DataLoader(c_val, batch_size=64)
selected_conjugator = evaluate_and_select_conjugator(models, model_names, c_val_loader)

Max accuracy: 0.446, selected model: model: 10: BiggerRNNlr_0.001_e50
All accuracies:
AttentionMLPlr_0.001_e30: 0.382

AttentionMLPlr_0.001_e50: 0.415

AttentionMLPlr_0.005_e30: 0.374

AttentionMLPlr_0.01_e30: 0.373

AttentionMLPNoDropoutlr_0.001_e30: 0.375

AttentionMLPNoDropoutlr_0.001_e50: 0.375

AttentionMLPNoDropoutlr_0.005_e30: 0.375

AttentionMLPNoDropoutlr_0.01_e30: 0.364

BiggerRNNlr_0.001_e30: 0.437

BiggerRNNlr_0.001_e50: 0.446

BiggerRNNlr_0.005_e30: 0.383

BiggerRNNlr_0.01_e30: 0.308

RNNlr_0.001_e30: 0.442

RNNlr_0.001_e50: 0.440

RNNlr_0.005_e30: 0.390

RNNlr_0.01_e30: 0.376

SimpleMLPlr_0.001_e30: 0.438

SimpleMLPlr_0.001_e50: 0.443

SimpleMLPlr_0.005_e30: 0.407

SimpleMLPlr_0.01_e30: 0.405


### Final results

In [166]:
selected_conjugator

RNN(
  (embedding): Embedding(1880, 32)
  (dropout): Dropout(p=0.2, inplace=False)
  (rnn): RNN(32, 24, num_layers=4, batch_first=True)
  (fc): Linear(in_features=24, out_features=12, bias=True)
)

In [167]:
torch.manual_seed(seed)
c_train_loader = DataLoader(c_train, batch_size=64)
c_val_loader = DataLoader(c_val, batch_size=64)
c_test_loader = DataLoader(c_test, batch_size=64)

train_acc = compute_accuracy(selected_conjugator, c_train_loader, device=device)
val_acc = compute_accuracy(selected_conjugator, c_val_loader, device=device)
test_acc = compute_accuracy(selected_conjugator, c_test_loader, device=device)

print(f'Final model accuracy on train data: {train_acc:.3f}')
print(f'Final model accuracy on val data: {val_acc:.3f}')
print(f'Final model accuracy on test data: {test_acc:.3f}')

Final model accuracy on train data: 0.482
Final model accuracy on val data: 0.446
Final model accuracy on test data: 0.407


In [168]:
def show_samples(contexts, preds, n):
    selected_contexts = contexts[:n]
    selected_preds = preds[:n]
    mapping = ['be', 'am', 'are', 'is', 'was', 'were', 'been',
                    'being', 'have', 'has', 'had', 'having']
    
    for idx in range(len(selected_contexts)):
        context_before = selected_contexts[idx][:2].tolist()
        context_after = selected_contexts[idx][2:].tolist()
        prediction = mapping[selected_preds[idx]]
        
        print(f'Showing sample: {idx + 1}')
        print(f'{" ".join(vocab.lookup_tokens(context_before))} *{prediction}* {" ".join(vocab.lookup_tokens(context_after))}')
        print()

In [169]:
all_val_logits = selected_conjugator(c_val[:][0].to(device))

In [170]:
predictions = F.softmax(all_val_logits, dim=1)
predicted_conjugations = torch.argmax(predictions, dim=1)

In [171]:
# Sample incorrect predictions
all_incorrect_preds = predicted_conjugations[predicted_conjugations != c_val[:][1].to(device)]
all_incorrect_contexts = c_val[:][0].to(device)[predicted_conjugations != c_val[:][1].to(device)]

In [172]:
# Sample correct predictions
all_correct_preds = predicted_conjugations[predicted_conjugations == c_val[:][1].to(device)]
all_correct_contexts = c_val[:][0].to(device)[predicted_conjugations == c_val[:][1].to(device)]


In [173]:
print('Ten incorrect samples:')
show_samples(all_incorrect_contexts, all_incorrect_preds, 10)

Ten incorrect samples:
Showing sample: 1
. ( *is* a <unk>

Showing sample: 2
army . *are* <unk> my

Showing sample: 3
, i *am* <unk> attached

Showing sample: 4
the regiment *is* <unk> in

Showing sample: 5
<unk> war *is* broken out

Showing sample: 6
my <unk> *is* advanced through

Showing sample: 7
officers who *are* in the

Showing sample: 8
me it *is* nothing but

Showing sample: 9
. i *am* removed from

Showing sample: 10
there i *been* struck on


In [174]:
print('Ten correct samples:')
show_samples(all_correct_contexts, all_correct_preds, 10)

Ten correct samples:
Showing sample: 1
, and *was* already deep

Showing sample: 2
i should *have* fallen into

Showing sample: 3
it not *been* for the

Showing sample: 4
as to *be* able to

Showing sample: 5
day should *be* lost in

Showing sample: 6
man to *be* . under

Showing sample: 7
the <unk> *are* <unk> <unk>

Showing sample: 8
, i *was* standing at

Showing sample: 9
who had *been* a <unk>

Showing sample: 10
of <unk> *is* a pleasant


# Text generation

### Dataset generation

(Here we only use the context before the target word)

In [175]:
g_context_size = 6

In [176]:
def create_generation_dataset(text, vocab, context_size=g_context_size):
    """
    Create a PyTorch dataset of context/target pairs from a text
    """

    # Define punctuation symbols
    string_punctuations = string.punctuation
    punctuations = [vocab[p] for p in string_punctuations
               if vocab[p] != 0]
    
    # Transform the text into a list of integers from vocabulary
    txt = [vocab[w] for w in text]
    
    n_text = len(txt)

    contexts = []
    targets = []
    for i in range(n_text - context_size):
        # Word directly after the start of context is the target.
        target = txt[i + context_size]
        # Skip targets that are unknown:
        if vocab.lookup_token(target) != '<unk>' and target not in punctuations:
            # Context before target
            context = txt[i:i + context_size]
            
            targets.append(target)
            contexts.append(torch.tensor(context))

    # Convert contexts and targets into PyTorch tensors and stack for dataset formation.
    contexts = torch.stack(contexts)
    targets = torch.tensor(targets)
    
    # Create a PyTorch dataset from these context/target pairs
    return TensorDataset(contexts, targets)

In [177]:
def load_generation(words, vocab, fname):
    """
    Load dataset if already generated, otherwise, create it and save it
    """
    
    dataset_path = 'dataset/'
    
    # If already generated
    if os.path.isfile(dataset_path + fname):
        dataset = torch.load(dataset_path + fname)
    else:
        # Create context / target dataset based on the list of strings
        dataset = create_generation_dataset(words, vocab)
        torch.save(dataset, dataset_path + fname)
        
    return dataset

g_train = load_generation(words_train, vocab, "g_train.pt")
g_val = load_generation(words_val, vocab, "g_val.pt")
g_test = load_generation(words_test, vocab, "g_test.pt")

In [178]:
print(g_train.tensors[0].shape)
print(g_train.tensors[1].shape)

torch.Size([1916840, 6])
torch.Size([1916840])


In [179]:
print(f"The training data has {len(g_train)} entries.")
print(f"The validation data has {len(g_val)} entries.")
print(f"The test data has {len(g_test)} entries.")

The training data has 1916840 entries.
The validation data has 36216 entries.
The test data has 83641 entries.


In [180]:
embedding = selected_cbow.embedding

## Word generation model architecture:

In [181]:
class GenerationRNN(nn.Module):
    def __init__(self, embedding, vocab_size=VOCAB_SIZE,
                hidden_layer_size=12, n_layers=2, dropout_rate=0.5):
        super().__init__()
        emb_dim = embedding.weight.shape[1]
        self.embedding = embedding
        self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(emb_dim, hidden_layer_size,
                          n_layers, dropout=dropout_rate, 
                            batch_first=True)
        
        self.out = nn.Linear(hidden_layer_size, vocab_size)
        
    def forward(self, x):
        x = self.embedding(x)
        _, (hidden_state, cell_state) = self.lstm(x)
        x = F.relu(hidden_state[-1])
        return self.out(x)

### Class frequencies for g_train and class weights

In [182]:
g_freqs = count_freqs_int(g_train[:][1], vocab)

print(f'Updated frequencies for new dataset: {g_freqs[:10]}')

g_freqs = torch.tensor(g_freqs, dtype=torch.float)


total_samples = g_freqs.sum()

# Calculate class weights
g_weights = total_samples / g_freqs

g_weights = torch.tensor([w if w != float('inf') else 0 for w in g_weights]).to(device)

print(f'Class weights sample: {g_weights[:10]}')

Updated frequencies for new dataset: tensor([     0,      0, 151278,      0,  82288,  65661,  62763,  49230,  41477,
         31052], dtype=torch.int32)
Class weights sample: tensor([ 0.0000,  0.0000, 12.6710,  0.0000, 23.2943, 29.1930, 30.5409, 38.9364,
        46.2145, 61.7300])


  g_freqs = torch.tensor(g_freqs, dtype=torch.float)


In [183]:
torch.sum(g_weights)

log_g_weights = torch.log1p(g_weights)
print(f'Log weights sample: {log_g_weights[:10]}')

Log weights sample: tensor([0.0000, 0.0000, 2.6153, 0.0000, 3.1902, 3.4076, 3.4513, 3.6873, 3.8547,
        4.1388])


In [184]:
def run_generators(models, model_names, lr, num_epochs):
    total_losses = []
    generator_path = 'models/generators/'
    train_loader = DataLoader(g_train, batch_size=128)
    val_loader = DataLoader(g_val, batch_size=128)
    
    for i in range(len(models)):
        if not os.path.isfile(generator_path + model_names[i] + f'lr_{lr}_e{num_epochs}.pt'):
            model = models[i]
            optim = torch.optim.Adam(model.parameters(), lr=lr)
            
            if model_names[i].endswith('W'):
                loss_fn = nn.CrossEntropyLoss(log_g_weights)
            else:
                loss_fn = nn.CrossEntropyLoss()
            
            print(f'Currently training model: {model_names[i]} with learning_rate = {lr}')
            losses = train(num_epochs, optim, model, loss_fn, train_loader, val_loader)
            total_losses.append(losses)
            torch.save(model, generator_path + model_names[i] + f'lr_{lr}_e{num_epochs}.pt')
    
    return total_losses

In [185]:
torch.manual_seed(seed)
# First trying lr = 0.005 and 40 epochs
generator_path = 'models/generators/'
os.makedirs(generator_path, exist_ok=True)

# Train different variations of the model -
# essentially hyperparameter tuning
basic_gen = GenerationRNN(embedding, dropout_rate=0)
dropout_gen = GenerationRNN(embedding, dropout_rate=0.5)
medium_gen = GenerationRNN(embedding, hidden_layer_size=16, n_layers=3, dropout_rate=0.3)
deeper_gen = GenerationRNN(embedding, hidden_layer_size=24, n_layers=4)

# Test all of these kinds of generators with class weights
basic_gen_weighted = GenerationRNN(embedding, dropout_rate=0)
dropout_gen_weighted = GenerationRNN(embedding, dropout_rate=0.5)
medium_gen_weighted = GenerationRNN(embedding, hidden_layer_size=16, n_layers=3, dropout_rate=0.3)
deeper_gen_weighted = GenerationRNN(embedding, n_layers=4)

generation_models = [basic_gen, dropout_gen, medium_gen, deeper_gen, basic_gen_weighted,
                     dropout_gen_weighted, medium_gen_weighted, deeper_gen_weighted]
generation_models = [model.to(device) for model in generation_models]
generation_names = ['Basic', 'Dropout', 'Medium', 'Deeper',
                    'BasicW', 'DropoutW', 'MediumW', 'DeeperW']


total_losses = run_generators(generation_models, generation_names, 0.005, 40)

In [186]:
torch.manual_seed(seed)
# Trying learning rate 0.001 for 50 epochs, running the same model architectures

# Train different variations of the model -
# essentially hyperparameter tuning
basic_gen = GenerationRNN(embedding, dropout_rate=0)
dropout_gen = GenerationRNN(embedding, dropout_rate=0.5)
medium_gen = GenerationRNN(embedding, hidden_layer_size=16, n_layers=3, dropout_rate=0.3)
deeper_gen = GenerationRNN(embedding, hidden_layer_size=24, n_layers=4)

# Test all of these kinds of generators with class weights
basic_gen_weighted = GenerationRNN(embedding, dropout_rate=0)
dropout_gen_weighted = GenerationRNN(embedding, dropout_rate=0.5)
medium_gen_weighted = GenerationRNN(embedding, hidden_layer_size=16, n_layers=3, dropout_rate=0.2)
deeper_gen_weighted = GenerationRNN(embedding, n_layers=4)

generation_models = [basic_gen, dropout_gen, medium_gen, deeper_gen, basic_gen_weighted,
                     dropout_gen_weighted, medium_gen_weighted, deeper_gen_weighted]
generation_models = [model.to(device) for model in generation_models]
generation_names = ['Basic', 'Dropout', 'Medium', 'Deeper',
                    'BasicW', 'DropoutW', 'MediumW', 'DeeperW']


total_losses = run_generators(generation_models, generation_names, 0.001, 50)

In [187]:
generator_path = 'models/generators/'
generation_models = []
generation_names = []
for file in os.listdir(generator_path):
    model = torch.load(generator_path + file, map_location=device)
    generation_models.append(model)
    generation_names.append(file[:-3])

In [188]:
torch.manual_seed(seed)
def evaluate_and_select_generator(models, model_names, model_path, val_loader):
    accs = []

    for model in models:
        acc = compute_accuracy(model, val_loader, device=device)
        accs.append(acc)
    
    model_idx = accs.index(max(accs))
    print(f'Max accuracy: {accs[model_idx]:.3f}, selected model: model: {model_idx + 1}: {model_names[model_idx]}', end='\n')
    
    all_accs = "\n\n".join([f"{model_name}: {acc:.3f}" for model_name, acc in zip(model_names, accs)])
    print('All accuracies:')
    print(all_accs)
    return models[model_idx]



g_val_loader = DataLoader(g_val, batch_size=128)
selected_generator = evaluate_and_select_generator(generation_models, generation_names, generator_path, g_val_loader)
selected_generator = selected_generator.to(device)

Max accuracy: 0.161, selected model: model: 1: Basiclr_0.001_e50
All accuracies:
Basiclr_0.001_e50: 0.161

Basiclr_0.005_e40: 0.153

BasicWlr_0.001_e50: 0.152

BasicWlr_0.005_e40: 0.138

Deeperlr_0.001_e50: 0.149

Deeperlr_0.005_e40: 0.127

DeeperWlr_0.001_e50: 0.116

DeeperWlr_0.005_e40: 0.086

Dropoutlr_0.001_e50: 0.149

Dropoutlr_0.005_e40: 0.140

DropoutWlr_0.001_e50: 0.139

DropoutWlr_0.005_e40: 0.128

Mediumlr_0.001_e50: 0.156

Mediumlr_0.005_e40: 0.149

MediumWlr_0.001_e50: 0.147

MediumWlr_0.005_e40: 0.133


## Beam Search

In [189]:
def beam_search(text_input, generator, num_predictions=4, beam_width=5):
    words_idx = torch.tensor(vocab.lookup_indices(text_input)).to(device)
    
    # This list of list will store all current sequences,
    # and track their probabilities
    possible_sequences = [[words_idx.tolist(), 1.0]]
    
    for p in range(num_predictions):
        all_candidates = []
        
        for seq, score in possible_sequences:
            seq_tensor = torch.tensor([seq]).to(device)
            
            prediction = generator(seq_tensor)
            probabilities = F.softmax(prediction, dim=-1)
        
            # Explore all possible next words
            for w in range(VOCAB_SIZE): 
                next_score = score * probabilities[0][w].item()
                candidate = [seq + [w], next_score]
                all_candidates.append(candidate)
        
        possible_sequences = sorted(all_candidates, key=lambda tup: tup[1], reverse=True)[:beam_width]

    
    return vocab.lookup_tokens(possible_sequences[0][0])
    

## Performance and beam search testing

In [190]:
torch.manual_seed(seed)
g_train_loader = DataLoader(g_train, batch_size=128)
g_val_loader = DataLoader(g_val, batch_size=128)
g_test_loader = DataLoader(g_test, batch_size=128)

train_acc = compute_accuracy(selected_generator, g_train_loader, device=device)
val_acc = compute_accuracy(selected_generator, g_val_loader, device=device)
test_acc = compute_accuracy(selected_generator, g_test_loader, device=device)

print(f'Final model accuracy on train data: {train_acc:.3f}')
print(f'Final model accuracy on val data: {val_acc:.3f}')
print(f'Final model accuracy on test data: {test_acc:.3f}')

Final model accuracy on train data: 0.166
Final model accuracy on val data: 0.161
Final model accuracy on test data: 0.191


In [191]:
text_input = ['can', 'you', 'feel', 'the']
" ".join(beam_search(text_input, selected_generator))

'can you feel the old man of the'

In [192]:
text_input = ['sometimes', 'i', 'try']
" ".join(beam_search(text_input, selected_generator))

'sometimes i try to be a man'

In [193]:
text_input = ['when', 'i', 'can']
" ".join(beam_search(text_input, selected_generator))

'when i can t come to be'

In [194]:
text_input = ['the', 'man', 'was']
" ".join(beam_search(text_input, selected_generator))

'the man was a man of the'

In [195]:
text_input = ['at', 'the', 'sea']
" ".join(beam_search(text_input, selected_generator, num_predictions=5))

'at the sea of the old man s'

In [196]:
text_input = ['the', 'season']
" ".join(beam_search(text_input, selected_generator, num_predictions=5))

'the season in the midst of the'

In [197]:
text_input = ['the', 'hand']
" ".join(beam_search(text_input, selected_generator, num_predictions=8, beam_width=4))

'the hand of the old man who had been a'

In [198]:
text_input = ['sometimes', 'when', 'i']
" ".join(beam_search(text_input, selected_generator, num_predictions=8, beam_width=3))

'sometimes when i m not to be a man of the'

In [199]:
text_input = ['cold']
" ".join(beam_search(text_input, selected_generator))

'cold old man and the'

In [200]:
text_input = ['it', 'was', 'a', 'dark']
" ".join(beam_search(text_input, selected_generator))

'it was a dark man of a man'

In [201]:
text_input = ['this', 'isn', 't' ,'going']
" ".join(beam_search(text_input, selected_generator))

'this isn t going to be a man'

In [202]:
text_input = ['the', 'woman', 'took', 'the', 'cat']
" ".join(beam_search(text_input, selected_generator))

'the woman took the cat in the depths of'

In [203]:
text_input = ['i', 'cannot', 'believe']
" ".join(beam_search(text_input, selected_generator, 15))

'i cannot believe it is a man of the old man and a man who had been a'

### Experimenting with all models

In [204]:
def test_all_models(models, names, text, num_predictions=7, beam_width=5):
    for idx in range(len(models)):
        print(f'Generating {num_predictions} with model {names[idx]} and beam width {beam_width}')
        words = beam_search(text, models[idx], num_predictions, beam_width)
        print(' '.join(words))
        print()

In [205]:
text = ['the', 'doctor', 'had']
test_all_models(generation_models, generation_names, text)

Generating 7 with model Basiclr_0.001_e50 and beam width 5
the doctor had been in the midst of the other

Generating 7 with model Basiclr_0.005_e40 and beam width 5
the doctor had been in the middle of us and

Generating 7 with model BasicWlr_0.001_e50 and beam width 5
the doctor had been in front of the rear of

Generating 7 with model BasicWlr_0.005_e40 and beam width 5
the doctor had been no more of us in the

Generating 7 with model Deeperlr_0.001_e50 and beam width 5
the doctor had been in the ground of the ground

Generating 7 with model Deeperlr_0.005_e40 and beam width 5
the doctor had the man of the trench of the

Generating 7 with model DeeperWlr_0.001_e50 and beam width 5
the doctor had is a man of the head of

Generating 7 with model DeeperWlr_0.005_e40 and beam width 5
the doctor had are t see that we are t

Generating 7 with model Dropoutlr_0.001_e50 and beam width 5
the doctor had been in the ground of the other

Generating 7 with model Dropoutlr_0.005_e40 and beam width