# Towards Data Science

In [None]:
pip install bpemb

In [None]:
# Imports
#import os
import torch
import torch.nn as nn
import torch.optim as optim
#import pandas as pd
#import Preprocessor as p
import math
#import functools as ft
#import operator
from bpemb import BPEmb
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

#import torchtext
#import datasets

from tqdm import tqdm

# Pick the compute device: CUDA when PyTorch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Seed PyTorch's global RNG so that runs are repeatable.
torch.manual_seed(0)

In [None]:
# Load dataset
#dir_path = os.getcwd()[:-1]+'6'
#train_df = pd.read_csv(dir_path+'/train_data.csv')
#val_df = pd.read_csv(dir_path+'/train_data.csv')
from datasets import load_dataset

# Languages kept from the dataset; every other language is dropped.
_LANGUAGES = ['indonesian', 'arabic', 'bengali']


def _prepare_split(split):
    """Convert one dataset split into a filtered, feature-augmented DataFrame.

    Keeps only rows whose ``language`` is in ``_LANGUAGES`` and adds:

    * ``concat_text``   - ``'<q>' + question + '</q> <d>' + document + '</d>'``
    * ``is_answerable`` - 1 when the first ``answer_start`` in ``annotations``
      is not -1, else 0.
    """
    df = split.to_pandas()
    # .copy() makes the filtered frame independent of the original, so the
    # column assignments below do not write into a slice (which triggers
    # pandas' SettingWithCopyWarning and may not stick).
    df = df[df['language'].isin(_LANGUAGES)].copy()
    df['concat_text'] = '<q>' + df['question_text'] + '</q> <d>' + df['document_plaintext'] + '</d>'
    df['is_answerable'] = df['annotations'].apply(lambda x: int(x.get('answer_start', [-1])[0] != -1))
    return df


def _rows_for(df, language):
    """Return the rows of ``df`` whose ``language`` column equals ``language``."""
    return df[df['language'] == language]


dataset = load_dataset("copenlu/answerable_tydiqa")

train_df = _prepare_split(dataset['train'])
val_df = _prepare_split(dataset['validation'])

# divide data set: one frame per language and split
train_arab = _rows_for(train_df, 'arabic')
train_indo = _rows_for(train_df, 'indonesian')
train_beng = _rows_for(train_df, 'bengali')

val_arab = _rows_for(val_df, 'arabic')
val_indo = _rows_for(val_df, 'indonesian')
val_beng = _rows_for(val_df, 'bengali')

In [None]:
# Hyperparameters

# Model
vocab_size = 10000
embedding_dim = 50
hidden_dim = 128
num_layers = 2
output_dim = 2  # two classes: `is_answerable` is 0 or 1
dropout_rate = 0.65
# Weight tying is only requested when the two widths coincide.
tie_weights = embedding_dim == hidden_dim

# Optimisation
lr = 1e-3
batch_size = 128
n_epochs = 1

# Settings read by the language-model style train()/evaluate() cells further down
seq_len = 5
clip = 0.25    # max gradient norm passed to clip_grad_norm_
saved = False  # when True, the run cell loads a checkpoint instead of training

# Pretrained BPEmb subword models, one per language. All share the same
# vocabulary size and embedding width, and include an extra padding embedding.
_bpemb_options = dict(dim=embedding_dim, vs=vocab_size, add_pad_emb=True)
bpemb_en = BPEmb(lang="en", **_bpemb_options)  # English (for testing)
bpemb_bn = BPEmb(lang="bn", **_bpemb_options)  # Bengali
bpemb_ar = BPEmb(lang="ar", **_bpemb_options)  # Arabic
bpemb_id = BPEmb(lang="id", **_bpemb_options)  # Indonesian

In [None]:
def pad_collate(batch):
    """Collate ``(sequence, label)`` samples into one padded batch.

    Args:
        batch: iterable of ``(sequence_tensor, label_tensor)`` pairs.

    Returns:
        A triple ``(padded, labels, lengths)`` where ``padded`` stacks the
        sequences batch-first and right-padded with 0, ``labels`` is the
        stacked label tensor and ``lengths`` is a plain list holding each
        sequence's original length.
    """
    sequences, targets = zip(*batch)
    lengths = list(map(len, sequences))
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, torch.stack(targets), lengths

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each input with the label at the same index."""

    def __init__(self, inputs, labels):
        """Keep references to the inputs and labels.

        Args:
            inputs: indexable collection of samples.
            labels: indexable, sized collection of labels; its length defines
                the dataset length.
        """
        self.inputs, self.labels = inputs, labels

    def __len__(self):
        """Return the number of samples, taken from ``labels``."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return the ``(input, label)`` pair stored at ``index``."""
        sample = self.inputs[index]
        target = self.labels[index]
        return sample, target

def unpadded_set(df, vocab):
    """Build a ``Dataset`` of un-padded token-id tensors and labels from ``df``.

    Args:
        df: frame with a ``concat_text`` column (text to encode) and an
            ``is_answerable`` column (labels).
        vocab: object exposing ``encode_ids``; it is called once on the whole
            ``concat_text`` column and is expected to return one id sequence
            per text.

    Returns:
        ``Dataset`` whose inputs are one tensor per text (lengths differ, so
        padding is left to the collate function) and whose labels are a single
        tensor.
    """
    texts = df['concat_text'].values
    token_tensors = [torch.tensor(ids) for ids in vocab.encode_ids(texts)]
    targets = torch.tensor(df['is_answerable'].values)
    print("unpadded_set() called")
    return Dataset(token_tensors, targets)

In [None]:
# Both loaders use the same batching options; only the underlying data differs.
_loader_options = dict(
    batch_size=batch_size,
    num_workers=0,
    shuffle=True,
    collate_fn=pad_collate,
)

# Arabic training split
loader_train = torch.utils.data.DataLoader(unpadded_set(train_arab, bpemb_ar), **_loader_options)

# Arabic validation split
loader_val = torch.utils.data.DataLoader(unpadded_set(val_arab, bpemb_ar), **_loader_options)

In [None]:
# Define LSTM model
class LSTM(nn.Module):
    """LSTM sequence classifier on top of pretrained subword embeddings.

    Token ids are embedded with the vectors taken from ``vocab.vectors``, run
    through a (possibly multi-layer) LSTM, and the top-layer output at each
    sequence's last real (non-padded) time step is fed to a linear layer that
    produces ``output_dim`` scores.
    """

    def __init__(self, vocab_size: int, embedding_dim: int, hidden_dim: int, output_dim: int, num_layers: int,
                 dropout_rate: float, tie_weights: bool, vocab):
        """
        vocab_size: accepted for interface compatibility; the number of
            embedding rows is taken from ``vocab.vectors``.
        embedding_dim: dimension of the word representation (LSTM input size).
        hidden_dim: network width
        output_dim: number of scores produced per sequence.
        num_layers: network depth
        dropout_rate: dropout applied to the embeddings, between LSTM layers
            and to the final LSTM output.
        tie_weights: share one matrix between the embedding and the output
            layer. This is only possible when both matrices have the same
            shape, i.e. ``embedding_dim == hidden_dim`` and ``output_dim``
            equals the number of embedding rows; otherwise ValueError is
            raised.
        vocab: bpemb entity; its ``vectors`` attribute supplies the pretrained
            embedding matrix.

        Raises:
            ValueError: if ``tie_weights`` is requested but the shapes do not
                allow it.
        """
        super().__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        # torch.tensor copies, so the model never aliases vocab.vectors;
        # the explicit dtype keeps the embeddings compatible with the float32 LSTM.
        pretrained = torch.tensor(vocab.vectors, dtype=torch.float)
        self.embedding = nn.Embedding.from_pretrained(pretrained)  # use bpemb embedding
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                            dropout=dropout_rate, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim, output_dim)

        if tie_weights:
            if embedding_dim != hidden_dim:
                raise ValueError('cannot tie, check dims')
            if self.fc.weight.shape != self.embedding.weight.shape:
                # Tying would swap the embedding table for the much smaller
                # classifier weight and break every lookup of a larger token id.
                raise ValueError(
                    'cannot tie weights: output layer weight has shape '
                    f'{tuple(self.fc.weight.shape)} but the embedding has shape '
                    f'{tuple(self.embedding.weight.shape)}'
                )
            self.embedding.weight = self.fc.weight
        self.init_weights()

    def init_weights(self):
        """Initialise the output layer and the LSTM weight matrices in place.

        The embedding is deliberately left untouched so the pretrained vectors
        loaded in ``__init__`` are preserved (when weights are tied, the
        embedding is the output-layer weight and is initialised with it).
        """
        init_range_other = 1 / math.sqrt(self.hidden_dim)
        with torch.no_grad():
            self.fc.weight.uniform_(-init_range_other, init_range_other)
            self.fc.bias.zero_()
            # Modify the registered parameters themselves: assigning into the
            # list returned by ``self.lstm.all_weights`` has no effect.
            for layer in range(self.num_layers):
                for prefix in ('weight_ih_l', 'weight_hh_l'):
                    weight = getattr(self.lstm, f'{prefix}{layer}')
                    weight.uniform_(-init_range_other, init_range_other)

    def init_hidden(self, batch_size, device):
        """Return zero-filled ``(hidden, cell)`` states on ``device``.

        Each has shape ``(num_layers, batch_size, hidden_dim)``.
        """
        shape = (self.num_layers, batch_size, self.hidden_dim)
        hidden = torch.zeros(*shape).to(device)
        cell = torch.zeros(*shape).to(device)
        return hidden, cell

    def detach_hidden(self, hidden):
        """Return the ``(hidden, cell)`` pair detached from the autograd graph."""
        hidden, cell = hidden
        return hidden.detach(), cell.detach()

    def forward(self, xs, x_lens):
        """Score a padded batch of token-id sequences.

        Args:
            xs: batch-first integer tensor of token ids, ``(batch, max_len)``.
            x_lens: true length of each sequence in ``xs`` (list or tensor).

        Returns:
            Tensor of shape ``(batch, output_dim)`` with one score vector per
            sequence.
        """
        x_emb = self.dropout(self.embedding(xs))

        # pack_padded_sequence needs the lengths on the CPU, wherever xs lives.
        lengths = torch.as_tensor(x_lens, dtype=torch.int64, device='cpu')

        # Pack, forward, pad. No initial state is passed: nn.LSTM then starts
        # from zeros created on the same device/dtype as its input.
        x_packed = pack_padded_sequence(x_emb, lengths, batch_first=True, enforce_sorted=False)
        out_packed, _ = self.lstm(x_packed)
        out_padded, out_lengths = pad_packed_sequence(out_packed, batch_first=True)

        # Pick the output at the last real time step of every sequence.
        rows = torch.arange(xs.size(0), device=out_padded.device)
        last_step = (out_lengths - 1).to(out_padded.device)
        output = out_padded[rows, last_step, :]
        output = self.dropout(output)
        prediction = self.fc(output)
        return prediction


# Train classifier

## Train and eval funcs

In [None]:
import time
def train_model(net, optimizer, criterion, loader, epochs=200, *, verbose=True):
    """Train ``net`` on ``loader`` and return the mean batch loss of every epoch.

    Args:
        net: model called as ``net(inputs, x_lens)``.
        optimizer: optimizer over ``net``'s parameters.
        criterion: loss called as ``criterion(outputs, labels)`` with integer
            (long) labels.
        loader: iterable yielding ``(inputs, labels, x_lens)`` batches.
        epochs: number of passes over ``loader``.
        verbose: when True (default) print the batch index and the forward /
            backward timings for every batch.

    Returns:
        List with one float per epoch: the loss averaged over that epoch's
        batches (``nan`` for an epoch in which the loader yielded no batch).
    """
    # Same choice as the file-level ``device``: CUDA when available, else CPU.
    run_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net.to(run_device)
    net.train()

    reporting_interval = 20
    loss_lst = []
    for epoch in range(epochs):  # Loop over the dataset multiple times
        running_loss = 0.0
        n_batches = 0
        for i, (inputs, labels, x_lens) in enumerate(loader):
            if verbose:
                print(f'{i}, ', end="")

            # Only the tensors are moved. x_lens holds the sequence lengths
            # (a plain list from the collate function) and stays on the host.
            inputs = inputs.to(run_device)
            # .long() converts the dtype without changing the device.
            labels = labels.to(run_device).long()

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward + backward + optimize
            start = time.perf_counter()
            outputs = net(inputs, x_lens)
            loss = criterion(outputs, labels)
            if verbose:
                print(f'Time forward + loss: {time.perf_counter() - start}')
            start = time.perf_counter()
            loss.backward()
            if verbose:
                print(f'Time backwards: {time.perf_counter() - start}')
            optimizer.step()

            running_loss += loss.item()
            n_batches += 1

        # Epoch statistics: average over all batches, not just the last one.
        epoch_loss = running_loss / n_batches if n_batches else float('nan')
        loss_lst.append(epoch_loss)
        if epoch % reporting_interval == reporting_interval - 1:  # Print every reporting_interval epochs
            print(f'epoch loss: {epoch+1}, {epoch_loss}')

    print('Finished Training')
    return loss_lst

    
def eval_model(net, loader):
    """Evaluate ``net`` on ``loader`` and return ``(f1, accuracy)``.

    Args:
        net: model called as ``net(inputs, x_lens)`` and returning one score
            per class for every sample.
        loader: iterable yielding ``(inputs, labels, x_lens)`` batches.

    Returns:
        Tuple ``(f1_score, accuracy_score)`` computed over all batches. The
        accuracy is also printed as a percentage.

    Raises:
        ValueError: if ``loader`` yields no batch.
    """
    net.eval()

    # Run on whatever device the model currently lives on (training may have
    # moved it to the GPU while the loader still yields CPU tensors).
    first_param = next(net.parameters(), None)
    net_device = first_param.device if first_param is not None else torch.device('cpu')

    predict_lst = []
    labels_lst = []
    with torch.no_grad():
        for inputs, labels, x_lens in loader:
            outputs = net(inputs.to(net_device), x_lens)
            # Collect on the CPU: the sklearn metrics need host memory.
            predict_lst.append(outputs.argmax(dim=1).cpu())
            labels_lst.append(labels.cpu())

    if not labels_lst:
        raise ValueError('eval_model() needs at least one batch, but the loader was empty')

    y_pred = torch.cat(predict_lst).numpy()
    y_true = torch.cat(labels_lst).numpy()
    accuracy = accuracy_score(y_true, y_pred)
    print('Accuracy on test strings: %.2f %%' % (100 * accuracy))
    return f1_score(y_true, y_pred), accuracy


## Run

In [None]:
# Build the classifier on the Arabic BPEmb vectors, plus its optimiser and loss
model = LSTM(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    num_layers=num_layers,
    dropout_rate=dropout_rate,
    tie_weights=tie_weights,
    vocab=bpemb_ar,
).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = torch.nn.CrossEntropyLoss()

# Train on the Arabic training loader for `n_epochs` epochs and keep the
# per-epoch losses returned by train_model.
lstm_loss = train_model(model, optimizer, criterion, loader_train, epochs=n_epochs)

#### Def train and eval

In [None]:
# Train loop
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    """Run one training epoch over ``data`` in windows of ``seq_len`` columns.

    model: object providing ``init_hidden``, ``detach_hidden`` and a call
        ``model(src, hidden)`` that returns ``(prediction, hidden)``.
        NOTE(review): the ``LSTM`` class defined earlier in this file is called
        as ``model(xs, x_lens)`` and returns a single tensor - confirm which
        model this loop is meant for.
    data: result from get_data; sliced as ``data[:, :k]``, so it is indexed
        along at least two axes.
    optimizer, criterion: optimiser and loss used for the update.
    batch_size: batch size used for the initial hidden state.
    seq_len: number of columns consumed per step.
    clip: maximum gradient norm passed to ``clip_grad_norm_``.
    device: device the batches and the hidden state are moved to.

    Returns the summed ``loss * seq_len`` divided by the trimmed width of
    ``data``. ``get_batch`` is not defined in the visible part of the file.
    """
    model.train()  # training mode - dropout not disabled

    # Trim trailing columns so that (width - 1) is a multiple of seq_len.
    width = data.shape[-1]
    data = data[:, :width - (width - 1) % seq_len]
    num_batches = data.shape[-1]

    total_loss = 0
    hidden = model.init_hidden(batch_size, device)

    # The last column can only be a target, never a source.
    window_starts = range(0, num_batches - 1, seq_len)
    for idx in tqdm(window_starts, desc='Training: ', leave=False):
        optimizer.zero_grad()
        # Cut the graph so gradients do not flow into earlier windows.
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, num_batches, idx)
        src = src.to(device)
        target = target.to(device)
        rows = src.shape[0]
        prediction, hidden = model(src, hidden)

        # Flatten to (rows * seq_len, n_scores) vs. (rows * seq_len,) for the loss.
        flat_prediction = prediction.reshape(rows * seq_len, -1)
        loss = criterion(flat_prediction, target.reshape(-1))

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        total_loss += loss.item() * seq_len
    return total_loss / num_batches


# Evaluation
def evaluate(model, data, criterion, batch_size, seq_len, device):
    """Compute the loss of ``model`` over ``data`` without updating it.

    Mirrors ``train``: ``data`` is trimmed so that (width - 1) is a multiple
    of ``seq_len``, then walked in windows of ``seq_len`` columns while the
    hidden state is carried (detached) from window to window.

    The model is called as ``model(src, hidden)`` and must return
    ``(prediction, hidden)``. ``get_batch`` is not defined in the visible part
    of the file.

    Returns the summed ``loss * seq_len`` divided by the trimmed width of
    ``data``.
    """
    model.eval()

    width = data.shape[-1]
    data = data[:, :width - (width - 1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)
    total_loss = 0

    with torch.no_grad():  # no backprop needed, so skip gradient tracking
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, num_batches, idx)
            src = src.to(device)
            target = target.to(device)
            rows = src.shape[0]

            prediction, hidden = model(src, hidden)
            flat_prediction = prediction.reshape(rows * seq_len, -1)

            loss = criterion(flat_prediction, target.reshape(-1))
            total_loss += loss.item() * seq_len
    return total_loss / num_batches

### Run

In [None]:
# One entry per language: (BPEmb model, training frame, validation frame, short tag
# used in log output and checkpoint file names).
lang = [(bpemb_ar, train_arab, val_arab, 'arab'), 
        (bpemb_bn, train_beng, val_beng, 'beng'), 
        (bpemb_id, train_indo, val_indo, 'indo')]

# Train and validate all 6 models (3 languages x 2 text columns).
# NOTE: the loop variables rebind the module-level names `vocab`, `train_df`,
# `val_df`, and the body rebinds `model`, `optimizer` and `criterion`.
for vocab, train_df, val_df, ln in lang:
    for doc in ['question_text', 'document_plaintext']:
        
        print(f'Lanuage: {ln}')
        print(f'{doc}------------------')
        
        # NOTE(review): `get_data` is not defined in the visible part of this file.
        train_data = get_data(train_df[doc], vocab, batch_size)
        val_data = get_data(val_df[doc], vocab, batch_size)

        # NOTE(review): LSTM.__init__ in this file takes (vocab_size, embedding_dim,
        # hidden_dim, output_dim, num_layers, dropout_rate, tie_weights, vocab).
        # This call passes only six positional values, so `output_dim` appears to
        # be missing and the remaining arguments are shifted by one - confirm.
        model = LSTM(vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, tie_weights, vocab=vocab).to(device)
        optimizer = optim.Adam(model.parameters(), lr=lr)
        criterion = nn.CrossEntropyLoss()
        
        #num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        #print(f'The model has {num_params:,} trainable parameters')

        # Halve the learning rate as soon as the validation loss stops improving
        # (patience=0: no grace epochs).
        lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0) 

        if saved:
            # Evaluate a previously stored checkpoint instead of training.
            # NOTE(review): this loads 'best-val-lstm_lm.pt', whereas the training
            # branch below saves to 'best-val-<ln>-<doc>.pt' - verify the file name.
            model.load_state_dict(torch.load('best-val-lstm_lm.pt',  map_location=device))
            test_loss = evaluate(model, val_data, criterion, batch_size, seq_len, device)
            print(f'Test Perplexity: {math.exp(test_loss):.3f}')
        else:
            best_valid_loss = float('inf')

            for epoch in range(n_epochs):
                
                train_loss = train(model, train_data, optimizer, criterion, 
                            batch_size, seq_len, clip, device)
                valid_loss = evaluate(model, val_data, criterion, batch_size, 
                            seq_len, device)
                
                # The scheduler decides on the validation loss.
                lr_scheduler.step(valid_loss)

                # Keep only the weights of the best epoch so far.
                if valid_loss < best_valid_loss:
                    best_valid_loss = valid_loss
                    name = 'best-val-'+ln+'-'+doc+'.pt'
                    torch.save(model.state_dict(), name)

                # Perplexity is reported as exp(loss).
                print(f'Epoch: {epoch+1}')
                print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
                print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

                

### Text generation

In [None]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    """Sample a continuation of ``prompt`` token by token.

    Args:
        prompt: text passed to ``tokenizer``.
        max_seq_len: maximum number of tokens to sample.
        temperature: divisor applied to the scores before the softmax.
        model: object providing ``eval()``, ``init_hidden(batch_size, device)``
            and a call ``model(src, hidden)`` returning ``(prediction, hidden)``,
            where ``prediction[:, -1]`` holds the scores for the next token.
        tokenizer: callable turning ``prompt`` into a sequence of tokens.
        vocab: supports ``vocab[token]`` (token to id, including ``'<unk>'``
            and ``'<eos>'``) and ``get_itos()`` (id to token list).
        device: device the input ids and hidden state are placed on.
        seed: if given, seeds PyTorch's RNG before sampling.

    Returns:
        List of tokens: the prompt tokens followed by the sampled ones.
        Sampling stops early when ``'<eos>'`` is drawn; ``'<eos>'`` itself is
        not included.
    """
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()

    indices = [vocab[token] for token in tokenizer(prompt)]
    hidden = model.init_hidden(1, device)  # a single sequence is generated

    with torch.no_grad():
        for _ in range(max_seq_len):
            # The whole sequence so far is fed in again at every step.
            src = torch.LongTensor([indices]).to(device)
            scores, hidden = model(src, hidden)
            probs = torch.softmax(scores[:, -1] / temperature, dim=-1)

            # Draw the next token id, re-drawing for as long as it is <unk>.
            next_id = torch.multinomial(probs, num_samples=1).item()
            while next_id == vocab['<unk>']:
                next_id = torch.multinomial(probs, num_samples=1).item()

            if next_id == vocab['<eos>']:
                break

            indices.append(next_id)

    itos = vocab.get_itos()
    return [itos[i] for i in indices]

In [None]:
# Tests: sample one generation per temperature from the same prompt and seed.
# NOTE(review): `tokenizer` is not defined in the visible part of this file, and
# `vocab` / `model` are whatever the preceding cells last bound them to.
prompt = ''  # Some example
max_seq_len = 30
seed = 0

temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(
        prompt,
        max_seq_len,
        temperature,
        model,
        tokenizer,
        vocab,
        device,
        seed,
    )
    print(f"{temperature}\n{' '.join(generation)}\n")