# **Character-Level Multilayer Perceptron Language Model**

In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import torch.nn.functional as F

In [2]:
# load dataset: read the whole file and split it into a list of lines;
# the context manager closes the file handle as soon as reading is done
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

In [3]:
# vocabulary: every distinct character that occurs in the dataset, sorted
chars = sorted(set(''.join(words)))

# string -> index; dataset characters are numbered from 1 so index 0 stays free
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}

# the start and end markers are not among the dataset characters, so they are
# registered by hand: start token at index 0, end token at the next free index
start_token, end_token = '<', '>'
stoi[start_token] = 0
stoi[end_token] = len(stoi)

# vocabulary size: unique dataset characters plus the start and end token
chars_count = len(stoi)

# index -> string (inverse mapping of stoi)
itos = {idx: ch for ch, idx in stoi.items()}

itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '<',
 27: '>'}

### Build dataset
Given a list of words, `build_dataset` creates an input tensor whose rows are sequences of `context_length` characters and a target tensor holding the character that follows each sequence.

In [4]:
def build_dataset(words, context_length=3):
    """Turn a list of words into (context, next-character) training pairs.

    Every word is extended with ``end_token`` and scanned left to right with a
    sliding window of ``context_length`` character indices that starts out
    filled with the index of ``start_token``. Each position yields one
    example: the current window as input and the index of the next character
    as target.

    Relies on the module-level ``stoi``, ``start_token`` and ``end_token``.

    Args:
        words: iterable of strings whose characters are all keys of ``stoi``.
        context_length: number of preceding characters per example (>= 1).

    Returns:
        Tuple ``(X, Y)`` of ``torch.long`` tensors. ``X`` has shape
        ``(N, context_length)`` and ``Y`` has shape ``(N,)``, where ``N`` is
        the total number of characters over all words, counting one end token
        per word. For empty ``words`` the shapes are ``(0, context_length)``
        and ``(0,)``.

    Raises:
        ValueError: if ``context_length`` is smaller than 1.
        KeyError: if a word contains a character that is missing from ``stoi``.
    """
    if context_length < 1:
        # a shorter window would produce rows of differing length below
        raise ValueError(f'context_length must be >= 1, got {context_length}')

    X, Y = [], []
    for w in words:
        context = [stoi[start_token]] * context_length # context window padded with start token
        for ch in w + end_token:
            ix = stoi[ch]
            X.append(context) # for given context ...
            Y.append(ix) # ... the next character is the target
            # crop and append - sliding window; builds a new list, so rows
            # already stored in X are not modified
            context = context[1:] + [ix]

    # explicit dtype and shape keep the result consistent even when there are
    # no examples (torch.tensor([]) alone would be a float tensor of shape (0,))
    X = torch.tensor(X, dtype=torch.long).reshape(len(Y), context_length)
    Y = torch.tensor(Y, dtype=torch.long)
    return X, Y

In [5]:
# Build the input tensor X and the target (label) tensor Y from the first word only
X, Y = build_dataset(words=words[:1])

### Create embedding lookup table

In [6]:
# Embedding lookup table: one randomly initialised row of size embedding_dim
# for every entry of the vocabulary
embedding_dim = 2
C = torch.randn(chars_count, embedding_dim)

# Embedding a single character, here the one with index 15.
# C has exactly one row per character index, so plain row indexing
# returns the embedding of that character.
embedding = C[15]
print(f'Embedding using index C[15]={embedding}')

# Alternative: one-hot encode the character and multiply by the embedding matrix.
# The one-hot vector holds a single 1 and zeros elsewhere, so the product
# just selects the matching row of C (returned here with a leading dimension of 1).
embedding = torch.matmul(F.one_hot(torch.tensor([15]), num_classes=chars_count).float(), C)
print(f'Embedding using one-hot encoding and matrix multiplication={embedding}')


# The rest of the notebook uses the index-based approach because it is more efficient.
# Tensor indexing also makes it easy to fetch the embeddings of a whole
# sequence of characters in one go.
embedding = C[torch.tensor([15, 20, 5])]
print(f'Embedding for sequence of three characters C[torch.tensor([15, 20, 5])]=\n{embedding}')


Embedding using index C[15]=tensor([-1.4355,  0.3942])
Embedding using one-hot encoding and matrix multiplication=tensor([[-1.4355,  0.3942]])
Embedding for sequence of three characters C[torch.tensor([15, 20, 5])]=
tensor([[-1.4355,  0.3942],
        [-1.2593,  0.7810],
        [ 0.7148,  0.1095]])
