In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F 
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Read one name per line. The context manager closes the file handle as soon
# as the read finishes, and the explicit encoding avoids depending on the
# platform default.
with open('names.txt', encoding='utf-8') as f:
    names = f.read().splitlines()

# Character vocabulary: every character that occurs in the names plus '.',
# sorted so that the index assignment is deterministic.
vocab = sorted(set(''.join(names) + '.'))
# stoi maps character -> integer index; itos is the inverse (index -> character).
stoi = {ch: i for i, ch in enumerate(vocab)}
itos = {i: ch for ch, i in stoi.items()}

In [3]:
# Prefer the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Using device:", device)


Using device: cpu


  return torch._C._cuda_getDeviceCount() > 0


In [4]:
# dataset split and training hyperparameters
vocab_size = len(stoi)  # 27
# NOTE(review): batch_size and epochs are not read by train_ngram as defined in
# this notebook (it trains on the full batch and takes its own `epochs`
# argument) -- confirm whether they are still needed.
batch_size = 512
epochs = 20

# Hold out the last 20% of the names for testing.
test_size = int(0.2 * len(names))

# Slice with an explicit split index rather than a negative offset: when
# test_size is 0, names[:-0] is empty and names[-0:] is the whole list, which
# would silently swap the roles of the two splits.
tr_names = names[:len(names) - test_size]
ts_names = names[len(names) - test_size:]

In [5]:
def build_dataset(names_list, stoi, context_size):
    """Turn a list of names into (context, next-character) training pairs.

    Each name is left-padded with ``context_size`` '.' characters and
    terminated with one '.', then a window of ``context_size`` characters is
    slid across it; the character following each window is the target.

    Args:
        names_list: iterable of strings.
        stoi: mapping from character to integer index; must contain '.' and
            every character that appears in ``names_list``.
        context_size: number of preceding characters used as context.

    Returns:
        A tuple ``(X, Y)`` of ``torch.long`` tensors. ``X`` has shape
        ``(num_examples, context_size)`` and ``Y`` has shape
        ``(num_examples,)``, including when ``names_list`` is empty.

    Raises:
        KeyError: if a character is missing from ``stoi``.
    """
    X, Y = [], []
    for name in names_list:
        name = '.' * context_size + name + '.'
        for i in range(len(name) - context_size):
            context = [stoi[c] for c in name[i:i+context_size]]
            target = stoi[name[i+context_size]]
            X.append(context)
            Y.append(target)
    # Pin dtype and shape explicitly: torch.tensor([]) would otherwise yield a
    # float32 tensor of shape (0,), which an embedding lookup cannot consume.
    X_t = torch.tensor(X, dtype=torch.long).reshape(len(X), context_size)
    Y_t = torch.tensor(Y, dtype=torch.long)
    return X_t, Y_t

In [6]:
# model
class NeuralNGramModel(nn.Module):
    """Feed-forward character model: embedding -> ReLU hidden layer -> dropout -> logits.

    Args:
        vocab_size: number of distinct token indices (embedding rows and
            output logits).
        context_size: number of context tokens per example; stored on the
            instance and used to size the first linear layer.
        embed_dim: width of each token embedding.
        hidden_dim: width of the hidden layer.
        dropout: dropout probability applied after the hidden activation.
    """

    def __init__(self, vocab_size, context_size, embed_dim=50, hidden_dim=200, dropout=0.2):
        super().__init__()
        self.context_size = context_size
        # The per-token embeddings are concatenated, so the hidden layer sees
        # embed_dim features for each of the context_size positions.
        flat_dim = embed_dim * context_size
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.fc1 = nn.Linear(flat_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return unnormalised next-token logits for a batch of index rows.

        Everything after the first dimension of the embedded input is
        flattened, so each row must hold ``context_size`` indices for the
        flattened width to match ``fc1``.
        """
        batch = x.size(0)
        embedded = self.embed(x).view(batch, -1)
        hidden = self.dropout(F.relu(self.fc1(embedded)))
        return self.fc2(hidden)


In [7]:
# training
def train_ngram(context_size=1, embed_dim=64, hidden_dim=128, lr=1e-3, epochs=15):
    """Train a NeuralNGramModel on the notebook's train split and return it.

    Datasets are built from the module-level ``tr_names`` / ``ts_names`` with
    the module-level ``stoi`` and moved to the module-level ``device``. Each
    "epoch" is a single Adam step on the whole training set (no
    mini-batching). Train and test loss are printed on the first epoch and on
    every fifth epoch.

    Args:
        context_size: number of context characters fed to the model.
        embed_dim: embedding width passed to the model.
        hidden_dim: hidden-layer width passed to the model.
        lr: Adam learning rate.
        epochs: number of full-batch optimisation steps.

    Returns:
        The trained model. When at least one epoch ran, it is left in eval
        mode by the final evaluation pass.
    """
    print(f"\nTraining {context_size}-gram model...")
    train_x, train_y = build_dataset(tr_names, stoi, context_size)
    test_x, test_y = build_dataset(ts_names, stoi, context_size)

    model = NeuralNGramModel(len(stoi), context_size, embed_dim, hidden_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    train_x, train_y = train_x.to(device), train_y.to(device)
    test_x, test_y = test_x.to(device), test_y.to(device)

    for epoch in range(epochs):
        # One optimisation step on the full training set (dropout active).
        model.train()
        optimizer.zero_grad()
        loss = criterion(model(train_x), train_y)
        loss.backward()
        optimizer.step()

        # Held-out loss with dropout disabled and no autograd bookkeeping.
        model.eval()
        with torch.no_grad():
            val_loss = criterion(model(test_x), test_y).item()

        should_report = epoch == 0 or (epoch + 1) % 5 == 0
        if should_report:
            print(f"Epoch {epoch+1:2d} | train {loss.item():.4f} | test {val_loss:.4f}")

    return model

In [8]:
# inference
@torch.no_grad()
def sample(model, context_size=1, num=5, max_len=100):
    """Generate and print ``num`` names sampled character by character.

    Generation starts from a context of ``context_size`` '.' tokens and stops
    when the model emits '.', or when ``max_len`` characters have been
    produced. Uses the module-level ``stoi``, ``itos`` and ``device``.

    Args:
        model: callable mapping a ``(1, context_size)`` index tensor to
            ``(1, vocab)`` logits; ``context_size`` must match what the model
            was built with.
        context_size: number of context tokens fed to the model per step.
        num: number of names to generate.
        max_len: upper bound on generated characters per name so that a model
            which rarely emits '.' cannot loop forever; ``None`` disables
            the cap.

    Returns:
        None. Each generated name is printed on its own line.
    """
    # torch.no_grad() does not disable dropout, so sample in eval mode and put
    # the model back into its previous mode afterwards.
    is_module = isinstance(model, nn.Module)
    was_training = is_module and model.training
    if is_module:
        model.eval()
    try:
        for _ in range(num):
            context = [stoi['.']] * context_size
            out = []
            while max_len is None or len(out) < max_len:
                x = torch.tensor([context], dtype=torch.long, device=device)
                logits = model(x)
                probs = F.softmax(logits, dim=1)
                ix = torch.multinomial(probs, 1).item()
                ch = itos[ix]
                if ch == '.':
                    break
                out.append(ch)
                # Slide the fixed-width window. A negative slice would return
                # the whole list when context_size is 0, so keep the window at
                # exactly context_size entries instead.
                if context_size > 0:
                    context = context[1:] + [ix]
            print(''.join(out))
    finally:
        if was_training:
            model.train()

In [9]:
# Debug override (disabled): uncomment the next line to force CPU execution.
# device = 'cpu'

In [None]:
# Train the model that conditions on a single previous character, then switch
# it to eval mode (the trailing expression also displays the module summary).
bigram_model = train_ngram(
    context_size=1,
    embed_dim=64,
    hidden_dim=128,
    lr=3e-3,
    epochs=40,
)
bigram_model.eval()



Training 1-gram model...
Epoch  1 | train 3.3183 | test 3.2032
Epoch  5 | train 2.9080 | test 2.9154
Epoch 10 | train 2.6550 | test 2.7879
Epoch 15 | train 2.5694 | test 2.7275
Epoch 20 | train 2.5267 | test 2.6743
Epoch 25 | train 2.4982 | test 2.6415
Epoch 30 | train 2.4842 | test 2.6294
Epoch 35 | train 2.4724 | test 2.6191
Epoch 40 | train 2.4680 | test 2.6123


NeuralNGramModel(
  (embed): Embedding(27, 64)
  (fc1): Linear(in_features=64, out_features=128, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=128, out_features=27, bias=True)
)

In [11]:
# Two characters of context make this a trigram-style model; bind it to its own
# name so the context-size-1 model trained earlier is not overwritten.
trigram_model = train_ngram(context_size=2, embed_dim=64, hidden_dim=128, lr=3e-3, epochs=40)
trigram_model.eval()


Training 2-gram model...
Epoch  1 | train 3.3182 | test 3.1791
Epoch  5 | train 2.8053 | test 2.8610
Epoch 10 | train 2.5979 | test 2.7613
Epoch 15 | train 2.5044 | test 2.6738
Epoch 20 | train 2.4428 | test 2.6076
Epoch 25 | train 2.3977 | test 2.5690
Epoch 30 | train 2.3649 | test 2.5386
Epoch 35 | train 2.3413 | test 2.5142
Epoch 40 | train 2.3231 | test 2.5020


NeuralNGramModel(
  (embed): Embedding(27, 64)
  (fc1): Linear(in_features=128, out_features=128, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=128, out_features=27, bias=True)
)

In [None]:
# sample() requires the trained model; read the context length from the model
# itself so the sampling window always matches the model's input width.
sample(bigram_model, context_size=bigram_model.context_size)