Imports

In [12]:
# Imports
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import math
import functools as ft
import operator
from bpemb import BPEmb

from tqdm import tqdm

# Use GPU if possible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [13]:
def enforce_reproducibility(seed=42):
    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    #random.seed(seed)
    #np.random.seed(seed)

enforce_reproducibility()

Data

In [132]:
# Load dataset
dir_path = os.getcwd()[:-1]+'6'
train_df = pd.read_csv(dir_path+'/train_data.csv')
val_df = pd.read_csv(dir_path+'/train_data.csv')

# divide data set
train_arab = train_df[train_df['language'] == 'arabic']
train_indo = train_df[train_df['language'] == 'indonesian']
train_beng = train_df[train_df['language'] == 'bengali']

val_arab = val_df[val_df['language'] == 'arabic']
val_indo = val_df[val_df['language'] == 'indonesian']
val_beng = val_df[val_df['language'] == 'bengali']

In [133]:
vocab_size = 10000
embedding_dim = 50

bpemb_bn = BPEmb(lang="bn", dim=embedding_dim, vs=vocab_size) # Bengali
bpemb_ar = BPEmb(lang="ar", dim=embedding_dim, vs=vocab_size) # Arabic
bpemb_id = BPEmb(lang="id", dim=embedding_dim, vs=vocab_size) # Indonesian

embeddings_ar = bpemb_ar.emb.vectors
embeddings_bn = bpemb_bn.emb.vectors
embeddings_id = bpemb_id.emb.vectors

Data loader

In [16]:
def get_batch(data, seq_len, num_batches, idx):
    """
    given the index of the first batch of tokens in the batch returns the 
    corresponding batch of sequences.
    
    data: in [batch size, num_batches] format
    seq_len: length of sequence
    idx: index

    returns: input and targets of the LSTM  
    """    
    src = data[:, idx:idx+seq_len]        # 
    target = data[:, idx+1:idx+seq_len+1] # next words       
    return src, target

# define dataloader 
def get_data(dataset: list, vocab, batch_size):
    """
    dataset: pandas series
    vocab: bpemb entity
    batch_size: size of batch
    """
    encoded = vocab.encode_ids_with_eos(dataset.values)
    flat = ft.reduce(operator.iconcat, encoded, [])
    data = torch.tensor(flat)
    num_batches = data.shape[0] // batch_size
    data = data[:num_batches * batch_size] # remove modulus (leftovers)
    data = data.view(batch_size, num_batches) 
    return data


Define model

In [108]:
# Define LSTM model
class LSTM(nn.Module):
    def __init__(self, hidden_dim: int, lstm_layers: int, 
                 dropout_rate: float, tie_weights: bool, vocab):
                
        super().__init__()
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab.vectors.shape[0]
        self.embedding_dim = vocab.vectors.shape[1]
        self.num_layers = lstm_layers

        self.model = nn.ModuleDict({
            'embeddings': nn.Embedding.from_pretrained(torch.tensor(vocab.vectors)),
            'lstm': nn.LSTM(
                self.embedding_dim,
                hidden_dim,
                lstm_layers,
                batch_first=True,
                dropout=dropout_rate,
                bidirectional=False),
            'fc': nn.Linear(hidden_dim, vocab_size)
        })
        self.dropout = nn.Dropout(p=dropout_rate)
        
        if tie_weights:
            assert embedding_dim == hidden_dim, 'cannot tie, check dims'
            self.embedding.weight = self.fc.weight

        self._init_weights()

    def _init_weights(self):
        all_params = list(self.model['lstm'].named_parameters()) + \
                     list(self.model['fc'].named_parameters())
        for n,p in all_params:
            if 'weight' in n:
                nn.init.xavier_normal_(p)
            elif 'bias' in n:
                nn.init.zeros_(p)

    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        cell = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        return hidden, cell
    
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach()
        cell = cell.detach()
        return hidden, cell

    def forward(self, src, hidden):
        embeds = self.model['embeddings'](src)
        lstm_out, hidden = self.model['lstm'](embeds)      
        fc_in = self.dropout(lstm_out) 
        logits = self.model['fc'](fc_in)

        return logits, hidden

#### Define training

In [109]:
# Train loop
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):

    epoch_loss = 0
    model.train() # training mode - dropout not disabled
    # drop all batches that are not a multiple of seq_len
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)
    
    # The last batch can't be a src
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):  
        optimizer.zero_grad()
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, num_batches, idx)
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)               
        prediction = prediction.reshape(batch_size * seq_len, -1)   
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches



#### Define evaluation

In [None]:
def evaluate(model, data, criterion, batch_size, seq_len, device):
    
    model.eval()
    eval_loss = 0
    
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad(): # we no longer need to backprop or keep track of gradients.
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, num_batches, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            eval_loss += loss.item() * seq_len
    return eval_loss / num_batches

#### Hyperparameters

In [148]:
# Model
num_layers = 2
dropout_rate = 0.1
hidden_dim = 128
tie_weights = True if embedding_dim == hidden_dim else False

# Training
lr = 1e-3
batch_size = 64
n_epochs = 10
seq_len = 10
clip = 0.25

saved = False

#### Train all models

In [146]:
ln = 'arab'
doc = 'question_text'
vocab = bpemb_ar

train_data = get_data(train_arab[doc], vocab, batch_size)
val_data = get_data(val_arab[doc], vocab, batch_size)

model = LSTM(hidden_dim, num_layers, dropout_rate, tie_weights, vocab).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

# reduce learning rate as we go along 
lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0) 

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    
    train_loss = train(model, train_data, optimizer, criterion, 
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, val_data, criterion, batch_size, 
                seq_len, device)
    
    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        name = 'best-val-'+ln+'-'+doc+'.pt'
        torch.save(model.state_dict(), name)

    print(f'Epoch: {epoch+1}')
    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

valid_loss = evaluate(model, train_data, criterion, batch_size, 
                seq_len, device)
print(f'\tFinal train Perplexity: {math.exp(valid_loss):.3f}')

    

                                                           

Epoch: 1
	Train Perplexity: 689.738
	Valid Perplexity: 345.348


                                                           

Epoch: 2
	Train Perplexity: 215.796
	Valid Perplexity: 157.724


                                                           

Epoch: 3
	Train Perplexity: 138.715
	Valid Perplexity: 109.905


                                                           

Epoch: 4
	Train Perplexity: 100.010
	Valid Perplexity: 81.075


                                                           

Epoch: 5
	Train Perplexity: 78.384
	Valid Perplexity: 65.756


                                                           

Epoch: 6
	Train Perplexity: 65.424
	Valid Perplexity: 55.403


                                                           

Epoch: 7
	Train Perplexity: 56.169
	Valid Perplexity: 47.816


                                                           

Epoch: 8
	Train Perplexity: 49.238
	Valid Perplexity: 42.169


                                                           

Epoch: 9
	Train Perplexity: 43.973
	Valid Perplexity: 37.701


                                                           

Epoch: 10
	Train Perplexity: 39.766
	Valid Perplexity: 34.174
	 Final train Perplexity: 39.766


#### Train all

In [149]:
print(device)
lang = [(bpemb_ar, train_arab, val_arab, 'arab'), 
        (bpemb_bn, train_beng, val_beng, 'beng'), 
        (bpemb_id, train_indo, val_indo, 'indo')]

# Train and validate all 6 models
for vocab, train_df, val_df, ln in lang:
    print(f'Lanuage: {ln}')
    for doc in ['question_text', 'document_plaintext']:
        print(f'Data: {doc}')
        
        train_data = get_data(train_df[doc], vocab, batch_size)
        val_data = get_data(val_df[doc], vocab, batch_size)

        model = LSTM(hidden_dim, num_layers, dropout_rate, tie_weights, vocab).to(device)
        optimizer = optim.Adam(model.parameters(), lr=lr)
        criterion = nn.CrossEntropyLoss()
        

        # reduce learning rate as we go along 
        lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=3) 

        if saved:
            model.load_state_dict(torch.load('best-val-lstm_lm.pt',  map_location=device))
            test_loss = evaluate(model, val_data, criterion, batch_size, seq_len, device)
            print(f'Test Perplexity: {math.exp(test_loss):.3f}')
        else:
            best_valid_loss = float('inf')

            for epoch in range(n_epochs):
                
                train_loss = train(model, train_data, optimizer, criterion, 
                            batch_size, seq_len, clip, device)
                valid_loss = evaluate(model, val_data, criterion, batch_size, 
                            seq_len, device)
                
                lr_scheduler.step(valid_loss)

                if valid_loss < best_valid_loss:
                    best_valid_loss = valid_loss
                    name = 'best-val-'+ln+'-'+doc+'.pt'
                    torch.save(model.state_dict(), name)

                print(f'Epoch: {epoch+1}')
                print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
                print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')
            
            valid_loss = evaluate(model, train_data, criterion, batch_size, 
                            seq_len, device)
            print(f'\tFinal train Perplexity: {math.exp(valid_loss):.3f}')

                

cpu
Lanuage: arab
Data: question_text


                                                           

Epoch: 1
	Train Perplexity: 729.760
	Valid Perplexity: 447.003


                                                           

Epoch: 2
	Train Perplexity: 229.833
	Valid Perplexity: 157.402


                                                           

Epoch: 3
	Train Perplexity: 135.634
	Valid Perplexity: 106.427


                                                           

Epoch: 4
	Train Perplexity: 96.356
	Valid Perplexity: 77.224


                                                           

Epoch: 5
	Train Perplexity: 74.539
	Valid Perplexity: 61.927


                                                           

Epoch: 6
	Train Perplexity: 62.014
	Valid Perplexity: 52.349


                                                           

Epoch: 7
	Train Perplexity: 53.508
	Valid Perplexity: 45.453


                                                           

Epoch: 8
	Train Perplexity: 47.203
	Valid Perplexity: 40.222


                                                           

Epoch: 9
	Train Perplexity: 42.238
	Valid Perplexity: 36.053


                                                           

Epoch: 10
	Train Perplexity: 38.171
	Valid Perplexity: 32.662
	Final train Perplexity: 32.662
Data: document_plaintext


                                                             

Epoch: 1
	Train Perplexity: 763.998
	Valid Perplexity: 429.367


                                                             

Epoch: 2
	Train Perplexity: 389.459
	Valid Perplexity: 308.140


                                                             

Epoch: 3
	Train Perplexity: 317.485
	Valid Perplexity: 260.609


                                                             

Epoch: 4
	Train Perplexity: 284.262
	Valid Perplexity: 234.704


                                                             

Epoch: 5
	Train Perplexity: 264.578
	Valid Perplexity: 218.266


                                                             

Epoch: 6
	Train Perplexity: 251.509
	Valid Perplexity: 206.825


                                                             

Epoch: 7
	Train Perplexity: 242.116
	Valid Perplexity: 198.229


                                                             

Epoch: 8
	Train Perplexity: 235.145
	Valid Perplexity: 191.544


                                                             

Epoch: 9
	Train Perplexity: 229.338
	Valid Perplexity: 186.380


                                                             

Epoch: 10
	Train Perplexity: 224.863
	Valid Perplexity: 182.146
	Final train Perplexity: 182.146
Lanuage: beng
Data: question_text


                                                         

Epoch: 1
	Train Perplexity: 1364.253
	Valid Perplexity: 554.291


                                                         

Epoch: 2
	Train Perplexity: 554.382
	Valid Perplexity: 490.031


                                                         

Epoch: 3
	Train Perplexity: 520.503
	Valid Perplexity: 470.890


                                                         

Epoch: 4
	Train Perplexity: 450.435
	Valid Perplexity: 352.905


                                                         

Epoch: 5
	Train Perplexity: 309.572
	Valid Perplexity: 248.056


                                                         

Epoch: 6
	Train Perplexity: 235.858
	Valid Perplexity: 199.050


                                                         

Epoch: 7
	Train Perplexity: 195.050
	Valid Perplexity: 166.260


                                                         

Epoch: 8
	Train Perplexity: 160.304
	Valid Perplexity: 134.363


                                                         

Epoch: 9
	Train Perplexity: 129.368
	Valid Perplexity: 107.532


                                                         

Epoch: 10
	Train Perplexity: 104.487
	Valid Perplexity: 87.187
	Final train Perplexity: 87.187
Data: document_plaintext


                                                             

Epoch: 1
	Train Perplexity: 1531.547
	Valid Perplexity: 928.089


                                                             

Epoch: 2
	Train Perplexity: 770.030
	Valid Perplexity: 588.396


                                                             

Epoch: 3
	Train Perplexity: 544.857
	Valid Perplexity: 436.749


                                                             

Epoch: 4
	Train Perplexity: 431.630
	Valid Perplexity: 351.806


                                                             

Epoch: 5
	Train Perplexity: 361.721
	Valid Perplexity: 294.926


                                                             

Epoch: 6
	Train Perplexity: 312.863
	Valid Perplexity: 254.823


                                                             

Epoch: 7
	Train Perplexity: 277.311
	Valid Perplexity: 224.669


                                                             

Epoch: 8
	Train Perplexity: 250.307
	Valid Perplexity: 201.438


                                                             

Epoch: 9
	Train Perplexity: 229.375
	Valid Perplexity: 183.322


                                                             

Epoch: 10
	Train Perplexity: 212.529
	Valid Perplexity: 168.675
	Final train Perplexity: 168.675
Lanuage: indo
Data: question_text


                                                           

Epoch: 1
	Train Perplexity: 760.252
	Valid Perplexity: 404.975


                                                           

Epoch: 2
	Train Perplexity: 410.653
	Valid Perplexity: 374.757


                                                           

Epoch: 3
	Train Perplexity: 352.168
	Valid Perplexity: 259.047


                                                           

Epoch: 4
	Train Perplexity: 182.629
	Valid Perplexity: 128.502


                                                           

Epoch: 5
	Train Perplexity: 114.853
	Valid Perplexity: 92.677


                                                           

Epoch: 6
	Train Perplexity: 87.125
	Valid Perplexity: 72.610


                                                           

Epoch: 7
	Train Perplexity: 70.647
	Valid Perplexity: 60.450


                                                           

Epoch: 8
	Train Perplexity: 60.059
	Valid Perplexity: 51.940


                                                           

Epoch: 9
	Train Perplexity: 52.285
	Valid Perplexity: 45.414


                                                           

Epoch: 10
	Train Perplexity: 46.154
	Valid Perplexity: 40.092
	Final train Perplexity: 40.092
Data: document_plaintext


                                                             

Epoch: 1
	Train Perplexity: 1355.529
	Valid Perplexity: 779.856


                                                             

Epoch: 2
	Train Perplexity: 580.226
	Valid Perplexity: 426.418


                                                             

Epoch: 3
	Train Perplexity: 396.635
	Valid Perplexity: 320.054


                                                             

Epoch: 4
	Train Perplexity: 321.987
	Valid Perplexity: 265.161


                                                             

Epoch: 5
	Train Perplexity: 278.658
	Valid Perplexity: 230.458


                                                             

Epoch: 6
	Train Perplexity: 250.102
	Valid Perplexity: 206.508


                                                             

Epoch: 7
	Train Perplexity: 229.761
	Valid Perplexity: 188.953


                                                             

Epoch: 8
	Train Perplexity: 214.528
	Valid Perplexity: 175.430


                                                             

Epoch: 9
	Train Perplexity: 202.755
	Valid Perplexity: 165.132


                                                             

Epoch: 10
	Train Perplexity: 193.284
	Valid Perplexity: 156.357
	Final train Perplexity: 156.357


Define train and evaluate loops

Next word generation

In [None]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']:
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:
                break

            indices.append(prediction)

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [None]:
# Tests
prompt = '' # Some example
max_seq_len = 30
seed = 0

temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')