# Wavenet

This is an implementation of a character-level NLP model, motivated by the WaveNet paper by [van den Oord et al.](https://arxiv.org/pdf/1609.03499.pdf) and based on Andrej Karpathy's [Makemore](https://www.youtube.com/watch?v=t3YJ5hKiMQ0&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&index=6) lectures.

<br>
<br>
<figure align="center">
    <img src="./images/van_den_oord_wavenet.png" width="500">
    <figcaption>van den Oord et al.'s wavenet.</figcaption>
</figure>

<br>
<br>
<figure align="center">
    <img src="./images/van_den_oord_wavenet_building_block.png" width="500">
    <figcaption>Building block of the wavenet.</figcaption>
</figure>

In [None]:
import random
import urllib
import urllib.request
from typing import List, Tuple

import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F

%matplotlib inline

# Fix the seed of Python's built-in `random` module so that repeated runs
# produce the same pseudo-random sequence
random.seed(a=42)

In [None]:
# URL of the file containing over 30k names
url_source = "https://raw.githubusercontent.com/karpathy/makemore/master/names.txt"

# Download the whole file with a single read and decode it once, instead of
# growing a string line by line. The `with` block guarantees that the HTTP
# response is closed, even if reading or decoding fails.
with urllib.request.urlopen(url_source) as response:
    text = response.read().decode('utf-8')

In [None]:
# Break the downloaded text into a list with one entry per line, then preview
# the first 10 entries together with the overall count
words = text.splitlines()
print(
    words[:10],
    f"Total number of words: {len(words)}",
    sep="\n",
)

In [None]:
# Vocabulary: every distinct character that occurs in the words, in sorted order
chars = sorted(set(''.join(words)))

# Character -> integer lookup. The characters found in the words are numbered
# from 1, and index 0 is assigned to the '.' character.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0

# Integer -> character lookup (the inverse of stoi)
itos = {idx: ch for ch, idx in stoi.items()}
vocab_size = len(itos)

# Show the vocabulary and both lookup tables
print(
    f"chars:{chars}\n",
    f"stoi: {stoi}\n",
    f"itos: {itos}\n",
    f"vocabulary size: {vocab_size} letters",
    sep="\n",
)

In [None]:
# Shuffle the words in place, so that the training/validation/test slices taken
# from this list further down are randomly mixed. The resulting order is
# repeatable because Python's `random` module was seeded earlier in the notebook.
random.shuffle(words)

In [None]:
# Build the dataset

# Context length -> how many preceding characters are used to predict the next one
block_size = 3

def build_dataset(words: List[str], block_size: int) -> Tuple[torch.Tensor, torch.Tensor]:
    """Builds a character dataset for NLP. Outputs are the context (the `block_size` previous
    characters) and the target character, both encoded as integers.

    Every word is terminated with '.', and one (context, target) sample is produced for each
    character of the terminated word. At the start of a word the context is padded with 0,
    which is the index assigned to '.'. Characters are encoded with the module-level `stoi`
    mapping.

    Parameters
    ----------
    words : List[str]
        A list of words used in learning.
    block_size : int
        Context length: the number of previous characters used to predict the next one.
        Must be at least 1.

    Returns
    -------
    Tuple[torch.Tensor, torch.Tensor]
        [context, target]. The context is an integer tensor with `block_size` columns and
        one row per sample, the target is a 1-D integer tensor with one entry per sample.
        For an empty list of words the tensors have zero rows / entries.

    Raises
    ------
    ValueError
        If `block_size` is smaller than 1.
    KeyError
        If a word contains a character that is missing from `stoi`.
    """
    if block_size < 1:
        raise ValueError(f"block_size must be a positive integer, got {block_size}")

    X, Y = [], []

    # Iterate through the words
    for w in words:
        # Start every word with a context made only of padding (index 0)
        context = [0] * block_size

        # Iterate through the characters of the word, with '.' added as the end marker
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            # Slide the window: drop the oldest character and append the current one.
            # This creates a new list, so the context already stored in X is not modified.
            context = context[1:] + [ix]

    # The explicit dtype and shape keep the result consistent when there are no samples:
    # without them an empty input would give a float tensor of shape (0,)
    X = torch.tensor(X, dtype=torch.long).reshape(len(Y), block_size)
    Y = torch.tensor(Y, dtype=torch.long)
    return X, Y

# Cut points in the word list: the first 80% of the words are used for training,
# the next 10% for validation and the remaining words for testing
n1, n2 = (int(fraction * len(words)) for fraction in (0.8, 0.9))

# Build the (context, target) tensors of the three subsets
(X_training, Y_training), (X_validation, Y_validation), (X_test, Y_test) = (
    build_dataset(subset, block_size)
    for subset in (words[:n1], words[n1:n2], words[n2:])
)