In [2]:
import torch
import torch.nn as nn
from collections import defaultdict
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import tqdm
import math
from collections import defaultdict

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
FILL_IN = "FILL_IN"

In [4]:
# Dictionaries, {idx -> ch} and {ch -> idx}
itos = defaultdict(int)
stoi = defaultdict(int)
# Embedding dimension, per character
d_model = 10
# Hidden dimension for RNN and also MLP Language Models 
d_h = 200

# START = STOP token
stoi['.'] = 0
itos[0] = '.'

# Fill in stoi and itoi; loop over names.txt
with open("names.txt", "r") as file:
    data = file.read()

data = data.replace("\n", "")

i = 1
for char in data:
    if char not in stoi.keys():
        stoi[char] = i
        itos[i] = char
        i += 1

In [5]:
assert len(stoi) == len(itos)
vocab_size = len(stoi)
assert vocab_size == 27

In [6]:
stoi

defaultdict(int,
            {'.': 0,
             'e': 1,
             'm': 2,
             'a': 3,
             'o': 4,
             'l': 5,
             'i': 6,
             'v': 7,
             's': 8,
             'b': 9,
             'p': 10,
             'h': 11,
             'c': 12,
             'r': 13,
             't': 14,
             'y': 15,
             'n': 16,
             'g': 17,
             'z': 18,
             'f': 19,
             'd': 20,
             'u': 21,
             'k': 22,
             'w': 23,
             'q': 24,
             'x': 25,
             'j': 26})

## Attention RNN Language Model
- For each name, run an RNN character by character
- Use the recursion x = Tanh()(Wh @ h + Wx @ x + bh + bx) and y = Softmax()(Wy h + by)
- Do not use the RNN Cell from PyTorch, do this manually as hinted below
- Use attention as below to get the pprediction at each time step and define the next hidden state
- Token embedding size is d_model; d_h is the hidden size

In [61]:
class AttentionRNNLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        # For each token, we have an embedding of size d_model
        self.e = nn.Embedding(vocab_size, d_model)
        
        # Wh and Wx act on h_t and x_t, respectively
        self.Wh = nn.Linear(d_h, d_h, bias=True)
        self.Wx = nn.Linear(d_model, d_h, bias=True)
        
        # We will concatenate h_t and k_t to predict the y_t vector
        self.Wy = nn.Linear(2*d_h, vocab_size, bias=True)
        
        # Attention is used to get scores e_{t, s} = h_s @ Wa @ h_t
        self.Wa = nn.Linear(d_h, d_h, bias=False)
        
    """
    Example schematic of this model:
       c1    c2    c3    c4    c5
    
       ^     ^     ^     ^     ^
       |     |     |     |     |

h0 ->  h1 -> h2 -> h3 -> h4 -> h5
    
       ^     ^     ^     ^     ^
       |     |     |     |     | 
    
       c0    c1    c2    c3    c4
    
    As an example, to predict c3 take h3 with h2, h1 
    a3i = h3 @ hi with i <= 3
    Apply softmax to a3i
    k3 = a33 @ h3 + a32 @ h2 + a31@h1
    Use k3 and h3 to predict c3
    k3 really is [k3, h3] first and then we get the logits for c3
    
    """

    # h_past here is the past history of hidden states
    # At time 1, it is just h0
    # At time 2, it is [h0, h1]
    def forward(self, x, h_past):
        # Run through to get the embedding for the token
        # The embedding per token is the feature vector x  we pass into the 
        x = self.e(x)
        
        # Get the last hidden state
        h = h_past[-1]
        
        # Use the last hidden state and the current token to get the new hidden state
        # Pass through Tanh the combination of x and the last hidden state
        h = torch.tanh(self.Wh(h) + self.Wx(x))
        
        # Append the current state to the old states
        # (h0, ..., h_{t-1}) becomes (h0, ..., h_{t-1}, h_t)
        h_past.append(h)
        
        # Get the scores, which are given by e_{t, s} = h_s @ Wa @ h_t, a scalar
        # Get all the real valued scores [e_{t, 0}, ..., e_{t, t}]
        e = torch.matmul(self.Wa(torch.stack(h_past, dim=0)), h.unsqueeze(-1)).squeeze(-1) 

        # From the scores, get the probabilities
        # a_{t, 0} + ... + a_{t, t} = 1
        a = F.softmax(e, dim=0)
                        
        # Define k_t as a_{t, 0} * h_0 + ... + a_{t, t} * h_t
        k = torch.sum(a.unsqueeze(-1) * torch.stack(h_past), dim=0)
        

        # Reset and define k_t as (k_t, h_t)
        k = torch.cat([k, h], dim=-1)
        
        # Get the logits which are Wy(k_t) 
        z = self.Wy(k)
        
        # Return the logits and the current history of hidden states
        return z, h_past

In [62]:
model = AttentionRNNLanguageModel()
# Try some learning rates of the type 10^{x} where x is between -1 and -10
# Things should be going down, as below; use Adam
x=-4
optimizer = torch.optim.Adam(model.parameters(), lr=10**x)

In [63]:
total_loss = 0
total_ct = 0
total_epochs = 5

for _ in range(total_epochs):
    for name in open('names.txt', 'r'):
        # Lowercase and get rid of new lines and spaces at the end
        name = name.lower().strip()
        # Add the START and END padding token
        name = "." + name + "."
        
        # Get x_data, which is names[:-1]
        # Get y_data, which is names[1:]
        x_data = name[:-1]
        y_data = [stoi[char] for char in name[1:]]
        
        # Set the list of all logits
        logits = []
        
        # Reset gradients
        optimizer.zero_grad()
        
        # Set the hidden state to random
        # (1, d_h)
        h_past = [torch.randn(1, d_h)]
        
        # Go through each token of a word and get the logits
        for i, x in enumerate(x_data):
            z, h_past = model.forward(torch.tensor([stoi[x]]), h_past)
            logits.append(z)
        
        # Concatenate all the logits
        logits = torch.cat(logits, dim=0)
                                
        # Compute the loss
        loss = F.cross_entropy(logits, torch.tensor(y_data))

        # Get the new gradient
        loss.backward()

        # Clip the gradients at max norm 0.1
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.1)

        # Do a gradient update
        optimizer.step()

        # Get the loss for the batch and get the number of batches
        total_loss += loss.item()
        total_ct += 1

        if total_ct and total_ct % 100 == 0:
            print(total_loss / total_ct)
            total_loss = 0
            total_ct = 0
        

3.199035234451294
2.974315526485443
2.710927710533142
2.614125065803528
2.516718189716339
2.5330309784412384
2.384697015285492
2.337466984987259
2.4017140316963195
2.432653168439865
2.401982527971268
2.366298077106476
2.3614695620536805
2.407786922454834
2.3507401657104494
2.296804198026657
2.3507695710659027
2.332318141460419
2.373187232017517
2.3024566340446473
2.390524340867996
2.335330935716629
2.325185399055481
2.368083999156952
2.333436135053635
2.3262116730213167
2.327047199010849
2.2969601547718046
2.3146406519412994
2.272947039604187
2.285314515829086
2.33095516204834
2.283539936542511
2.2546438348293303
2.3122267186641694
2.2554650604724884
2.2945896553993226
2.3199011301994323
2.272285820245743
2.2598521268367766
2.3104511976242064
2.3297461807727813
2.2837871778011323
2.315853319168091
2.277763673067093
2.2679293429851533
2.270770425796509
2.2454948127269745
2.2239956057071684
2.2754459488391876
2.298145649433136
2.304756406545639
2.235808312892914
2.318499357700348
2.33308

In [64]:
with torch.no_grad():
    # Get perplexity
    sumneglogp = 0
    T = 0
    for name in open('names.txt', 'r'):
        name = name.lower().strip()
        T += len(name)
        # Pad with START and STOP
        name = "." + name + "."
        # Get the name from index 0 to -1 exclusive end
        x_data = name[:-1]
        # Get the y from index 1 to end inclusive end
        y_data = [stoi[char] for char in name[1:]]
        
        # Define the logits per token prediction
        logits = []
        
        # Initialize the h vector to random
        h_past = [torch.randn(1, d_h)]
        
        # Loop over each chracter in the name and pass h and this into the RNN
        # Get the new logit
        for x in x_data:
            # x_data above was raw; get the int representation if you also do that
            x = torch.tensor(stoi[x])
            z, h_past = model.forward(x, h_past)
            # Append the logit
            logits.append(z)

        logits = torch.cat(logits, dim=0)

        # Compute the loss
        loss = F.cross_entropy(logits, torch.tensor(y_data))
                
        # Change to log base 2
        # log2(x) = ln(x) / ln(2)
        loss *= (1 / math.log(2))

        sumneglogp += loss
        
    # sumneglogp is -log(p('.' + name1)) -log(p('.' + name2)) -log(p('.' + name3)) ...  
    # Divide by the appropriate term to get the answer we want 
    print('Perplexity: ', torch.pow(2, sumneglogp.clone().detach() / T).item())

Perplexity:  1.4525952339172363


In [83]:
# Generate a random word using this distributon
# Intialize the word with 
name = '.'
# Set this to [h_0]
h_past = [torch.randn(1, d_h)]
while True:
    # Get the last character in the name
    c = name[-1]
    # Make the distribution from c to any other word other than START
    logits, h_past = model.forward(torch.tensor(stoi[c]), h_past)
    # Get the probabilities 
    p = F.softmax(logits, dim=1)
    # Sample a character from th distribution above
    c = torch.distributions.Categorical(p).sample()
    # Halt generation if the token gotten is '.' or append the new token to the name
    if c.item() == 0:
        break
    else:
        name += itos[c.item()]
print('Generated name: ' , name[1:])

Generated name:  jaizo


## Gated CNN
- Implement a language model across names similar to that here: https://arxiv.org/abs/1612.08083v3
- Batch size will be 1
- For a name like "abcdef" we will use data [.abcdef] -> [abcdef.]
- '.' is a start and end token, which denotes either the start or end of a word

In [17]:
class GatedCNNLanguageModel(nn.Module):
    def __init__(self, n_layers, kernel_size):
        super().__init__()
        
        # Define an embedding layer so each character gets an embedding of size d_model
        # There are vocab_size tokens
        self.e = nn.Embedding(vocab_size, d_model)
        
        # Define n_layers
        self.n_layers = n_layers
        
        # Define A as the first Conv1d layer which takes in d_model returns d_model and uses kernel_size kernels
        self.A = nn.Conv1d(d_model, d_model, kernel_size)
        
        # Define B as the first Conv1d layer which takes in d_model returns d_model and uses kernel_size kernels
        self.B = nn.Conv1d(d_model, d_model, kernel_size)
        
        # Define a list of A layers as above; n_layers
        self.A_list = nn.ModuleList([nn.Conv1d(d_model, d_model, kernel_size) for _ in range(self.n_layers)])
        
        # Define a list of B layers as above; n_layers
        self.B_list = nn.ModuleList([nn.Conv1d(d_model, d_model, kernel_size) for _ in range(self.n_layers)])
        
        # Define a fully connected layer which takes d_model embedding and gives the logits across vocab_size tokens
        self.fc = nn.Linear(d_model, vocab_size, bias=False)
        
        # Pad on the left with an amount such that the input is the same dimension as the output
        # Remember, our data looks like [a, b, c] -> [b, c, d]
        # We want to make sure tokens to the left of b (a) are used to predict b
        # We want to make sure tokens to the left of c (a, b) are used to predict c
        # We want to make sure tokens to the left of d (a, b, c) are used to predict d
        # Etc.
        # Pad with on the left
        self.kernel_size = kernel_size

    def forward(self, x):
        # Run through to get the embedding for the token
        # The embedding per token is the feature vector x  we pass into the
        # (B, L, D)
        x = self.e(x)
        
        # Change the above to be (B, D, L)
        x = x.transpose(1, 2)
        
        # Pad on the left with the right amount
        # Remember, what goes in, need to come out the same dimension and you can't "look ahead"
        # You want to just pad on the left
        # (B, D, L + kernel_size - 1)
        x = F.pad(x, (self.kernel_size - 1, 0))
        
        # Run x through A, B and get h as defined for the gated CNN
        # Define the res_input as the hidden layer we get; we'll move forward with h 
        h = self.A(x) * F.sigmoid(self.B(x))
        res_input = h
        
        for i, (conv, conv_gate) in enumerate(zip(self.A_list, self.B_list)):
            # Pad on the left
            h = F.pad(h, (self.kernel_size - 1, 0))
            # Apply conv and conv_gate to h and then combine to get the new h
            h = conv(h) * F.sigmoid(conv_gate(h))
            # Get the new h, and also add to it the old non transformed hidden layer; the residual connection
            # Reset the residual so you use it next time
            h = h + res_input
            res_input = h

        # Get the final h to be of dimension (B, L, D)
        h = h.transpose(1, 2)

        # Get the logits which is of dimension (B, L, vocab_size)
        logit =  self.fc(h)
        
        # Transpose the logits to be of dimension (B, vocab_size, L)
        logit = logit.transpose(1, 2)
        return logit

In [18]:
n_layers = 3
kernel_size = 3
model = GatedCNNLanguageModel(n_layers, kernel_size)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [19]:
total_loss = 0
total_ct = 0
total_epochs = 5

for _ in range(total_epochs):
    for name in open('names.txt', 'r'):
        name = name.lower().strip()
        # Pad the name with a special token for start and stop; use the same token
        name = '.' + name + '.'
        
        # Add the start and end padding token; reshape as needed
        # Batch size is 1 here
        x_data = torch.tensor([stoi[char] for char in name[:-1]]).unsqueeze(0)
        y_data = torch.tensor([stoi[char] for char in name[1:]]).unsqueeze(0)

        
        # Get the logits acrosss the x_data above
        logits = model.forward(x_data)
                                        
        # Compute the loss
        loss = F.cross_entropy(logits, y_data)

        # Get the new gradient
        loss.backward()

        # Clip the gradients at max norm 0.1
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.1)

        # Do a gradient update
        optimizer.step()

        # Get the loss for the batch and get the number of batches
        total_loss += loss.item()
        total_ct += 1

        if total_ct and total_ct % 100 == 0:
            print(total_loss / total_ct)
            total_loss = 0
            total_ct = 0

3.1296815967559812
2.729227132797241
2.5844305753707886
2.546993268728256
2.4741526579856874
2.5060327434539795
2.298123800754547
2.3075155353546144
2.3779546415805815
2.4501003754138946
2.366399209499359
2.3753054296970366
2.36068701505661
2.393677386045456
2.34833744764328
2.3049030923843383
2.363401359319687
2.340136194229126
2.3559988141059875
2.278149976730347
2.433100252151489
2.310502210855484
2.336207876205444
2.372140152454376
2.356697041988373
2.310680719614029
2.3134259378910063
2.274416744709015
2.280162868499756
2.234075565338135
2.2531217443943024
2.266591421365738
2.244017570018768
2.277844171524048
2.270915917158127
2.1992665123939514
2.2677265644073485
2.3171198189258577
2.260699521303177
2.2295311188697813
2.2429118764400484
2.3280926406383515
2.2516039431095125
2.2728367757797243
2.2105138540267943
2.257842890024185
2.253420451879501
2.208000460863113
2.196133860349655
2.228449218273163
2.248399304151535
2.3111205399036407
2.250219111442566
2.258981885910034
2.298151

In [25]:
with torch.no_grad():
    # Get perplexity
    sumneglogp = 0
    T = 0
    for name in open('names.txt', 'r'):
        name = name.lower().strip()
        T += len(name)
        name = '.' + name + '.'
        
        # Get the name from index 0 to -1 exclusive end
        x_data = torch.tensor([stoi[char] for char in name[:-1]]).unsqueeze(0)
        
        # Get the y from index 1 to end inclusive end
        y_data = torch.tensor([stoi[char] for char in name[1:]]).unsqueeze(0)
        
        # logits per token prediction
        logits = model.forward(x_data)
        
        # Compute the loss
        loss = F.cross_entropy(logits, y_data)
                
        # Change to log base 2
        # log2(x) = ln(x) / ln(2)
        loss *= (1 / math.log(2))

        # Update the sum across the negative logs
        sumneglogp += loss
        
    # sumneglogp is -log(p('.' + name1)) -log(p('.' + name2)) -log(p('.' + name3)) ...  
    
    # Divide by the appropriate term to get the answer we want 
    print('Perplexity: ', torch.pow(2, sumneglogp.clone().detach() / T).item())

Perplexity:  1.5733716487884521


In [60]:
# Generate a random word using this distributon
# Intialize the word with 
name = '.'
i_name = []
while True:
    # Get the last character and and add to i_name as an int representation
    c = stoi[name[-1]]
    i_name.append(c)
    
    # Make the distribution from c to any other word other than START
    logits = model.forward(torch.tensor(i_name).unsqueeze(0))

    # Grab the last logits, for the new term
    # Alternatively, just push the appropriate amount into the model
    logits = logits[:, :, -1]
    # Get the probabilities
    p = F.softmax(logits, dim=1)

    # Sample from the above probability distribution
    c = torch.distributions.Categorical(p).sample()
    
    # If we sample 0, stop; otherwise, continue
    if c.item() == 0:
        break
    else:
        name += itos[c.item()]
print('Generated name: ' , name[1:])

Generated name:  zuck
