# Test

### Imports

libraries and dataframes

In [None]:
import os
import torch 
import pandas as pd
import torch.nn as nn


dir_path = os.getcwd()[:-1]+'6'

train_df = pd.read_csv(dir_path+'/train_data.csv')
val_df = pd.read_csv(dir_path+'/train_data.csv')

Model class

In [None]:
class LSTMModel(nn.Module): # inherit nn.Module class
    def __init__(self, input_d, hidden_d, layer_d, output_d):
        """
        input_d : the number of expected features in the input
        hidden_d: the number of features in the hidden state.
        layer_d : the number of layers 
        output_d: the number of output nodes
        """
        super(LSTMModel, self).__init__()
        
        self.hidden_dim = hidden_d # the hidden d is the deadliest
        self.layer_dim = layer_d

        # LSTM model 
        self.lstm = nn.LSTM(input_d, hidden_d, layer_d, batch_first=True) 

        # fully connected
        self.fc = nn.Linear(hidden_d, output_d)

    def forward(self, x):
    
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()

        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()

        # This step takes place input_d times. We detach as we truncate Backpropagation
        # through time (BPTT). If we don't detach, we'll backprop to the start.
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))

        out = self.fc(out[:, -1, :]) 
        return out

Initialize model

In [None]:
input_dim = 30
hidden_dim = 120
output_dim = 15
layer_dim = 1

model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim)

Learning parameters

In [None]:
#step 4: calculating cross entropy loss
error = nn.CrossEntropyLoss()

#step 5: optimizer 
learning_rate = 0.1
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

Train loop

# Towards Data Science

In [None]:
pip install bpemb

In [1]:
# Imports
import os
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import Preprocessor as p
import math
import functools as ft
import operator
from bpemb import BPEmb

#import torchtext
#import datasets

from tqdm import tqdm

# Use GPU if possible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(0)

<torch._C.Generator at 0x10fe1b5d0>

In [2]:
# Load dataset
dir_path = os.getcwd()[:-1]+'6'
train_df = pd.read_csv(dir_path+'/train_data.csv')
val_df = pd.read_csv(dir_path+'/train_data.csv')

# Preprocess
train_df = p.DataFramePreprocessor(train_df, 
                                   ['question_text','document_plaintext'], 
                                   remove_stopwords=False, 
                                   remove_punctuation=False, 
                                   add_special_tokens=False,
                                   count=False)
val_df = p.DataFramePreprocessor(val_df, 
                                   ['question_text','document_plaintext'], 
                                   remove_stopwords=False, 
                                   remove_punctuation=False, # tokenize punctuation
                                   add_special_tokens=False,
                                   count=False)

# divide data set
train_arab = train_df.df[train_df.df['language'] == 'arabic']
train_indo = train_df.df[train_df.df['language'] == 'indonesian']
train_beng = train_df.df[train_df.df['language'] == 'bengali']

val_arab = val_df.df[val_df.df['language'] == 'arabic']
val_indo = val_df.df[val_df.df['language'] == 'indonesian']
val_beng = val_df.df[val_df.df['language'] == 'bengali']

Replacing punctuation with tokens...
Replacing punctuation with tokens...


In [22]:
# define dataloader 
def get_data(dataset: list, vocab, batch_size):
    """
    dataset: pandas series
    """
    encoded = vocab.encode_ids_with_eos(dataset.values)
    flat = ft.reduce(operator.iconcat, encoded, [])
    data = torch.tensor(flat)
    num_batches = data.shape[0] // batch_size
    data = data[:num_batches * batch_size] # remove modulus (leftovers)
    data = data.view(batch_size, num_batches) 
    return data


# Define LSTM model
class LSTM(nn.Module):
    def __init__(self, vocab_size: int, embedding_dim: int, hidden_dim: int, num_layers: int, 
                 dropout_rate: float, tie_weights: bool, vocab):
        """
        vocab_size: size of one-hot vector
        embedding_dim: dimension of the word representation.
        hidden_dim: network width
        num_layers: network depth
        dropout_rate: regularization method
        tie_weights: Weight tying is a method that dispenses with this redundancy and 
            simply uses a single set of embeddings at the input and softmax layers. That 
            is, we dispense with V and use E in both the start and end of the computation.
            In addition to providing improved model perplexity, this approach significantly 
            reduces the number of parameters required for the model.
        vocab: bpemb entity

        """
                
        super().__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        self.embedding = nn.Embedding.from_pretrained(torch.tensor(vocab.vectors)) # use bpemb embedding
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, 
                    dropout=dropout_rate, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        
        if tie_weights:
            assert embedding_dim == hidden_dim, 'cannot tie, check dims'
            self.embedding.weight = self.fc.weight
        self.init_weights()

    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hidden_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_emb)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.embedding_dim,
                    self.hidden_dim).uniform_(-init_range_other, init_range_other) 
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hidden_dim, 
                    self.hidden_dim).uniform_(-init_range_other, init_range_other) 

    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        cell = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        return hidden, cell
    
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach()
        cell = cell.detach()
        return hidden, cell

    def forward(self, src, hidden):
        embedding = self.dropout(self.embedding(src))
        output, hidden = self.lstm(embedding, hidden)          
        output = self.dropout(output) 
        prediction = self.fc(output)
        return prediction, hidden
  
    
def get_batch(data, seq_len, num_batches, idx):
    """
    given the index of the first batch of tokens in the batch returns the 
    corresponding batch of sequences.
    
    data: in [batch size, num_batches] format
    seq_len: length of sequence
    idx: index

    returns: input and targets of the LSTM  
    """    
    src = data[:, idx:idx+seq_len]        # 
    target = data[:, idx+1:idx+seq_len+1] # next word        
    return src, target


# Train loop
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    """
    model: LSTM entity
    data: result from get_data
     
    """
    epoch_loss = 0
    model.train() # training mode - dropout not disabled
    # drop all batches that are not a multiple of seq_len
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)
    
    # The last batch can't be a src
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):  
        optimizer.zero_grad()
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, num_batches, idx)
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)               

        prediction = prediction.reshape(batch_size * seq_len, -1)   
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches


# Evaluation
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad(): # we no longer need to backprop or keep track of gradients.
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, num_batches, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [23]:
# hyperparameters
vocab_size = 10000
num_layers = 2
dropout_rate = 0.65  
lr = 1e-3
batch_size = 128
embedding_dim = 50 
hidden_dim = 1024 
tie_weights = True if embedding_dim == hidden_dim else False

n_epochs = 1
seq_len = 5
clip = 0.25
saved = False

bpemb_en = BPEmb(lang="en", dim=embedding_dim, vs=vocab_size) # English (for testing)
bpemb_bn = BPEmb(lang="bn", dim=embedding_dim, vs=vocab_size) # Bengali
bpemb_ar = BPEmb(lang="ar", dim=embedding_dim, vs=vocab_size) # Arabic
bpemb_id = BPEmb(lang="id", dim=embedding_dim, vs=vocab_size) # indonesian

In [24]:
lang = [(bpemb_ar, train_arab, val_arab, 'arab'), 
        (bpemb_bn, train_beng, val_beng, 'beng'), 
        (bpemb_id, train_indo, val_indo, 'indo')]

# Train and validate all 6 models
for vocab, train_df, val_df, ln in lang:
    for doc in ['question_text', 'document_plaintext']:
        
        print(f'Lanuage: {ln}')
        print(f'{doc}------------------')
        
        train_data = get_data(train_df[doc], vocab, batch_size)
        val_data = get_data(val_df[doc], vocab, batch_size)

        model = LSTM(vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, tie_weights, vocab=vocab).to(device)
        optimizer = optim.Adam(model.parameters(), lr=lr)
        criterion = nn.CrossEntropyLoss()
        
        #num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        #print(f'The model has {num_params:,} trainable parameters')

        # reduce learning rate as we go along 
        lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0) 

        if saved:
            model.load_state_dict(torch.load('best-val-lstm_lm.pt',  map_location=device))
            test_loss = evaluate(model, val_data, criterion, batch_size, seq_len, device)
            print(f'Test Perplexity: {math.exp(test_loss):.3f}')
        else:
            best_valid_loss = float('inf')

            for epoch in range(n_epochs):
                
                train_loss = train(model, train_data, optimizer, criterion, 
                            batch_size, seq_len, clip, device)
                valid_loss = evaluate(model, val_data, criterion, batch_size, 
                            seq_len, device)
                
                lr_scheduler.step(valid_loss)

                if valid_loss < best_valid_loss:
                    best_valid_loss = valid_loss
                    name = 'best-val-'+ln+'-'+doc+'.pt'
                    torch.save(model.state_dict(), name)

                print(f'Epoch: {epoch+1}')
                print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
                print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

                

Lanuage: arab
question_text------------------
The model has 23,054,096 trainable parameters


                                                           

Epoch: 0
	Train Perplexity: 449.084
	Valid Perplexity: 193.028
Lanuage: arab
document_plaintext------------------
The model has 23,054,096 trainable parameters


                                                            

KeyboardInterrupt: 

In [None]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']:
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:
                break

            indices.append(prediction)

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [None]:
# Tests
prompt = '' # Some example
max_seq_len = 30
seed = 0

temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')