# Deep N-Grams

In [2]:
import numpy as np
from numpy import random 
np.random.seed(32)

from time import perf_counter

import trax
from trax import layers as tl
import trax.fastmath.numpy as tnp
from trax.supervised import training

import pickle
from trax import fastmath
import random as rnd
import os

import itertools

2023-12-18 11:51:21.194879: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-18 11:51:21.194941: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-18 11:51:21.225496: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
dirname = 'data/'
lines = [] # storing all the lines in a variable. 
for filename in os.listdir(dirname):
    with open(os.path.join(dirname, filename)) as files:
        for line in files:
            # remove leading and trailing whitespace
            pure_line = line.strip()
            
            # if pure_line is not the empty string,
            if pure_line:
                # append it to the list
                lines.append(pure_line)

In [4]:
n_lines = len(lines)
n_lines

125097

In [5]:
eval_lines = lines[-1000:] # create holdout valid seet
lines = lines[:-1000]

In [6]:
def line_to_tensor(line, EOS_int=1):
    
    tensor = []
    for c in line:
        c_int = ord(c)
        tensor.append(c_int)
    
    tensor.append(EOS_int)
    return tensor

In [7]:
def data_generator(
        batch_size, max_length, data_lines, 
        line_to_tensor=line_to_tensor, shuffle=True
):
    """Generator function that yields batches of data

    Args:
        batch_size (int): number of examples per batch.
        max_length (int): maximum length of the output tensor.
        NOTE: max_length includes the end-of-sentence character that will be added to the tensor. The length of the tensor is always 1 + the length
                of the original line of characters.
        data_lines (list): list of the sentences to group into batches.
        line_to_tensor (function, optional): function that converts line to tensor. Defaults to line_to_tensor.
        shuffle (bool, optional): True if the generator should generate random batches of data. Defaults to True.

    Yields:
        tuple: two copies of the batch (jax.interpreters.xla.DeviceArray) and mask (jax.interpreters.xla.DeviceArray).
        NOTE: jax.interpreters.xla.DeviceArray is trax's version of numpy.ndarray
    """
    index = 0
    
    cur_batch = []
    
    num_lines = len(data_lines)
    
    lines_index = [*range(num_lines)]
    
    if shuffle:
        rnd.shuffle(lines_index)
    
    while True:
        
        if index >= num_lines:
            index = 0
            if shuffle:
                rnd.shuffle(lines_index)
            
        line = data_lines[lines_index[index]]
        
        if len(line) < max_length:
            cur_batch.append(line)
            
        index += 1
        
        # if the current batch is now equal to the desired batch size
        if len(cur_batch) == batch_size:
            
            batch = []
            mask = []
            
            # go through each line in cur_batch
            for li in cur_batch:
                tensor = line_to_tensor(li)
                # how much padding 
                pad = [0] * (max_length - len(tensor))
                
                tensor_pad = tensor + pad
                
                batch.append(tensor_pad)

                # A mask for  tensor_pad is 1 wherever tensor_pad is not 0 and 0 wherever tensor_pad is 0
                example_mask = [0 if t == 0 else 1 for t in tensor_pad]
                mask.append(example_mask)
               
            batch_np_arr = np.array(batch)
            mask_np_arr = np.array(mask)
            
            
            # yield two copies of the batch and mask.
            yield batch_np_arr, batch_np_arr, mask_np_arr
            
            cur_batch = []
            

### Repeating Batch Generator

As the above generator will keep generating batches forever, but `itertools.cycle` is usefull when the generator eventually stops 

We cycle over the dataset multiple times during training, for small datasets, we can use 'itertools.cycle'

## Hidden State Activation 

In [9]:
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

$W_h = \left [ W_{hh} \ | \ W_{hx} \right ]$ - horizontal concatenation

2 ways to get this:

* Method 1 : np.concatenate()
* Method 2 : np.hstack()

In [10]:
w_hh = np.full((3, 2), 1)
w_hx = np.full((3, 3), 9)

w_h1 = np.concatenate((w_hh, w_hx), axis=1)
w_h2 = np.hstack((w_hh, w_hx))

print(w_h1, '\n\n', w_h2)

[[1 1 9 9 9]
 [1 1 9 9 9]
 [1 1 9 9 9]] 

 [[1 1 9 9 9]
 [1 1 9 9 9]
 [1 1 9 9 9]]


\begin{equation*}
[h^{<t-1>},x^{<t>}] = \left[ \frac{h^{<t-1>}}{x^{<t>}} \right]
\end{equation*}


In [11]:
h_t_prev = np.full((2, 1), 1)
x_t = np.full((3, 1), 9)  

ax_1 = np.concatenate((h_t_prev, x_t), axis=0)
ax_2 = np.vstack((h_t_prev, x_t))

print(ax_1, '\n\n', ax_2)

[[1]
 [1]
 [9]
 [9]
 [9]] 

 [[1]
 [1]
 [9]
 [9]
 [9]]


In [12]:
random.seed(10)                 
emb = 128                     
T = 256                       
h_dim = 16                    
h_0 = np.zeros((h_dim, 1))      

w1 = random.standard_normal((h_dim, emb+h_dim))
w2 = random.standard_normal((h_dim, emb+h_dim))
w3 = random.standard_normal((h_dim, emb+h_dim))
b1 = random.standard_normal((h_dim, 1))
b2 = random.standard_normal((h_dim, 1))
b3 = random.standard_normal((h_dim, 1))
X = random.standard_normal((T, emb, 1))
weights = [w1, w2, w3, b1, b2, b3]

In [13]:
def forward_V_RNN(inputs, weights): # forward vanilla RNN 
    x, h_t = inputs
    wh, _, _, bh, _, _ = weights

    h_t = np.dot(wh, np.concatenate([h_t, x])) + bh
    h_t = sigmoid(h_t)

    return h_t, h_t

GRU has relavance and update gate. 


In [14]:
def forward_GRU(inputs, weights): 
    x, h_t = inputs
    wu, wr, wc, bu, br, bc = weights

    u = np.dot(wu, np.concatenate([h_t, x])) + bu
    u = sigmoid(u)
    
    r = np.dot(wr, np.concatenate([h_t, x])) + br
    r = sigmoid(u)
    
    c = np.dot(wc, np.concatenate([r * h_t, x])) + bc
    c = np.tanh(c)
    
    h_t = u* c + (1 - u)* h_t
    return h_t, h_t

**Implementation of the `scan` function**

Used in the forward propagation of the RNNs.

`scan` goes through all the elements X in elems, calls the function fn with args ([x, h_t], weights), stores the computed hidden h_t and appends the result to a list y_s.

In [15]:
def scan(fn, elems, weights, h_0=None):    # Forward prop in RNN
    h_t = h_0
    ys = []

    for x in elems:
        y, h_t = fn([x, h_t], weights)
        ys.append(y)
    
    return ys, h_t

In [16]:
# vanilla RNNs
tic = perf_counter()
ys, h_T = scan(forward_V_RNN, X, weights, h_0)
toc = perf_counter()
RNN_time=(toc-tic)*1000
print (f"It took {RNN_time:.2f}ms to run the forward method for the vanilla RNN.")

It took 5.02ms to run the forward method for the vanilla RNN.


In [17]:
# GRUs
tic = perf_counter()
ys, h_T = scan(forward_GRU, X, weights, h_0)
toc = perf_counter()
GRU_time=(toc-tic)*1000
print (f"It took {GRU_time:.2f}ms to run the forward method for the GRU.")

It took 16.39ms to run the forward method for the GRU.


In [18]:
mode = 'train'
vocab_size = 256
model_dimension = 512
n_layers = 2

GRU = tl.Serial(
    tl.ShiftRight(mode=mode), # pass the mode parameter if it used for inference and training, 
    tl.Embedding(vocab_size=vocab_size, d_feature=model_dimension),
    [tl.GRU(n_units=model_dimension) for _ in range(n_layers)],
    tl.Dense(n_units=vocab_size),
    tl.LogSoftmax()
)

###  GRU Model

In [19]:
def GRULM(vocab_size=256, d_model=512, n_layers=2, mode='train'):
    
    model = tl.Serial(
      tl.ShiftRight(mode=mode),
      tl.Embedding(vocab_size=vocab_size, d_feature=d_model), 
      [tl.GRU(n_units=d_model) for _ in range(n_layers)], 
      tl.Dense(n_units=vocab_size), 
      tl.LogSoftmax() 
    )
    return model

model = GRULM()
print(model)

Serial[
  Serial[
    ShiftRight(1)
  ]
  Embedding_256_512
  GRU_512
  GRU_512
  Dense_256
  LogSoftmax
]


In [20]:
batch_size = 32
max_length = 64


def n_used_lines(lines, max_length):
 
    n_lines = 0
    for l in lines:
        if len(l) <= max_length:
            n_lines += 1
    return n_lines



num_used_lines = n_used_lines(lines, 32)
print('Number of used lines from the dataset:', num_used_lines)
print('Batch size (a power of 2):', int(batch_size))
steps_per_epoch = int(num_used_lines/batch_size)
print('Number of steps to cover one epoch:', steps_per_epoch)

Number of used lines from the dataset: 25773
Batch size (a power of 2): 32
Number of steps to cover one epoch: 805


In [21]:


def train_model(model, data_generator, batch_size=32, max_length=64, lines=lines, eval_lines=eval_lines, n_steps=1, output_dir='model/'): 
    
    bare_train_generator = data_generator(batch_size, max_length, data_lines=lines)
    infinite_train_generator = itertools.cycle(bare_train_generator)
    
    bare_eval_generator = data_generator(batch_size, max_length, data_lines=eval_lines)
    infinite_eval_generator = itertools.cycle(bare_eval_generator)
   
    train_task = training.TrainTask(
        labeled_data=infinite_train_generator, 
        loss_layer=tl.CrossEntropyLoss(),  
        optimizer=trax.optimizers.Adam(0.0005)    
    )

    eval_task = training.EvalTask(
        labeled_data=infinite_eval_generator,   
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()], 
        n_eval_batches=3   
    )
    
    training_loop = training.Loop(model,
                                  train_task,
                                  eval_task=eval_task,
                                  output_dir=output_dir)

    training_loop.run(n_steps=n_steps)
    
    return training_loop


## Calculating Perplexity

Language Model Evaluation
Measures how well a prob model predicts a sample

$$P(W) = \sqrt[N]{\prod_{i=1}^{N} \frac{1}{P(w_i| w_1,...,w_{n-1})}}$$

Taking log on both sides and solving, 

$$ logP(w)= -\frac{1}{N}{\big({\sum_{i=1}^{N}{logP(w_i| w_1,...,w_{n-1})}}\big)} $$

Log of products becomes the sum

In [22]:
def test_model(preds, target):
    """Function to test the model.

    Args:
        preds (jax.interpreters.xla.DeviceArray): Predictions of a list of batches of tensors corresponding to lines of text.
        target (jax.interpreters.xla.DeviceArray): Actual list of batches of tensors corresponding to lines of text.

    Returns:
        float: log_perplexity of the model.
    """
    
    log_p = np.sum(preds * tl.one_hot(target, preds.shape[-1]), axis= -1)

    non_pad = 1.0 - np.equal(target, 0)         
    log_p = log_p * non_pad                         
    
    log_ppx = np.sum(log_p, axis=-1) / np.sum(non_pad, axis=-1) 
    log_ppx = np.mean(log_ppx) 
    
    return -log_ppx

### Generating Language with Model

In [None]:
def gumbel_sample(log_probs, temperature=1.0):
    """Gumbel sampling from a categorical distribution."""
    u = numpy.random.uniform(low=1e-6, high=1.0 - 1e-6, size=log_probs.shape)
    g = -np.log(-np.log(u))
    return np.argmax(log_probs + g * temperature, axis=-1)

def predict(num_chars, prefix):
    inp = [ord(c) for c in prefix]
    result = [c for c in prefix]
    max_len = len(prefix) + num_chars
    for _ in range(num_chars):
        cur_inp = np.array(inp + [0] * (max_len - len(inp)))
        outp = model(cur_inp[None, :])  # Add batch dim.
        next_char = gumbel_sample(outp[0, len(inp)])
        inp += [int(next_char)]
       
        if inp[-1] == 1:
            break  # EOS
        result.append(chr(int(next_char)))
    
    return "".join(result)

print(predict(32, ""))

In [None]:
print(predict(32, ""))
print(predict(32, ""))
print(predict(32, ""))