In [12]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
from tqdm import tqdm 
import random
%matplotlib inline

In [13]:
#Define word dataset and put it into dictionary
word_set = open('names.txt', 'r').read().splitlines()
print(word_set[:10]) #print 10 words
print(len(word_set))

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']
32033


In [14]:
#Build vocabulary dictionary (Integer/Character Mappings)
chars = sorted(list(set(''.join(word_set))))
S_to_I = {s: i+1 for i,s in enumerate(chars)} #give each char mapping of i + 1
S_to_I['.'] = 0 #map end of sequence token
I_to_S = {i: s for s,i in S_to_I.items()} #reverse S to I

In [20]:
#Build dataset with Context : How many characters needed to predict the next character?
block_size = 3

def build_dataset(words):
    X, Y = [],[]
    for name in tqdm(words, desc = "Building dataset"):
        context = [0] * block_size 
        #Loop through chars, add . to symbolize end token
        for ch in name + '.':
            index = S_to_I[ch]
            X.append(context) #input context
            Y.append(index) #output integer integer to next character
            context = context[1:] + [index] #append context
    X = torch.tensor(X, dtype=torch.long)
    Y = torch.tensor(Y, dtype=torch.long)
    print(f"Shapes X: {X.shape}, Y: {Y.shape}")
    return X,Y


#Split up data and build datasets for Training, Validation, and Testing
random.seed(42)
random.shuffle(word_set)
idx1 = int(0.8 * len(word_set))
idx2 = int(0.9 * len(word_set))

train_x, train_y = build_dataset(word_set[:idx1]) #First 80 percent for Training
dev_x, dev_y = build_dataset(word_set[idx1:idx2]) #
test_x, test_y = build_dataset(word_set[idx2:]) #10 percent to test



Building dataset: 100%|███████████████████████████████████████| 25626/25626 [00:00<00:00, 142589.10it/s]


Shapes X: torch.Size([182437, 3]), Y: torch.Size([182437])


Building dataset: 100%|█████████████████████████████████████████| 3203/3203 [00:00<00:00, 537417.22it/s]


Shapes X: torch.Size([22781, 3]), Y: torch.Size([22781])


Building dataset: 100%|█████████████████████████████████████████| 3204/3204 [00:00<00:00, 495138.35it/s]

Shapes X: torch.Size([22928, 3]), Y: torch.Size([22928])





In [18]:
train_x

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5,  2],
        [ 5,  2, 18],
        [ 2, 18,  9],
        [18,  9, 13],
        [ 9, 13,  1]])

In [21]:
train_y

tensor([20,  1, 21,  ..., 14,  1,  0])