# Towards Data Science

### -- REQUIREMENTS --

python 3.7.16

pytorch 1.6

In [None]:
pip install bpemb

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
from bpemb import BPEmb
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Use GPU if possible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

  from .autonotebook import tqdm as notebook_tqdm


cuda


In [2]:
# Load dataset
#dir_path = os.getcwd()[:-1]+'6'
#train_df = pd.read_csv(dir_path+'/train_data.csv')
#val_df = pd.read_csv(dir_path+'/train_data.csv')
from datasets import load_dataset
dataset = load_dataset("copenlu/answerable_tydiqa")

train_df = dataset['train'].to_pandas()
train_df = train_df[train_df['language'].isin(['indonesian', 'arabic', 'bengali'])]

val_df = dataset['validation'].to_pandas()
val_df = val_df[val_df['language'].isin(['indonesian', 'arabic', 'bengali'])]

train_df['concat_text'] =  '<q>' + train_df['question_text'] + '</q> <d>' + train_df['document_plaintext'] + '</d>'
val_df['concat_text'] =  '<q>' + val_df['question_text'] + '</q> <d>' + val_df['document_plaintext'] + '</d>'

train_df['is_answerable'] = train_df['annotations'].apply(lambda x: int(x.get('answer_start', [-1])[0] != -1))
val_df['is_answerable'] = val_df['annotations'].apply(lambda x: int(x.get('answer_start', [-1])[0] != -1))

# divide data set
train_arab = train_df[train_df['language'] == 'arabic']
train_indo = train_df[train_df['language'] == 'indonesian']
train_beng = train_df[train_df['language'] == 'bengali']

val_arab = val_df[val_df['language'] == 'arabic']
val_indo = val_df[val_df['language'] == 'indonesian']
val_beng = val_df[val_df['language'] == 'bengali']

Using custom data configuration copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6
Found cached dataset parquet (C:/Users/45311/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 2/2 [00:00<00:00, 143.06it/s]


In [3]:
# hyperparameters
vocab_size = 10000
num_layers = 2
dropout_rate = 0.65  
lr = 1e-3
batch_size = 128
embedding_dim = 50 
hidden_dim = 64
output_dim = 2 
tie_weights = True if embedding_dim == hidden_dim else False

n_epochs = 1
seq_len = 5
clip = 0.25
saved = False

bpemb_en = BPEmb(lang="en", dim=embedding_dim, vs=vocab_size, add_pad_emb=True) # English (for testing)
bpemb_bn = BPEmb(lang="bn", dim=embedding_dim, vs=vocab_size, add_pad_emb=True) # Bengali
bpemb_ar = BPEmb(lang="ar", dim=embedding_dim, vs=vocab_size, add_pad_emb=True) # Arabic
bpemb_id = BPEmb(lang="id", dim=embedding_dim, vs=vocab_size, add_pad_emb=True) # indonesian

In [4]:
def pad_collate(batch):
    (xx, yy) = zip(*batch)
    x_lens = [len(x) for x in xx]
    xx_pad = pad_sequence(xx, batch_first=True, padding_value=0) # maybe -1?
    yy = torch.stack(yy)
    return xx_pad, yy, x_lens

class Dataset(torch.utils.data.Dataset):
    def __init__(self, inputs, labels):
        'Initialization'
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.labels)

    def __getitem__(self, index):
        'Generates one sample of data'
        return self.inputs[index], self.labels[index]

def unpadded_set(df, vocab):
    inputs = vocab.encode_ids(df['concat_text'].values)
    inputs = list(map(torch.tensor, inputs))
    labels = torch.tensor(df['is_answerable'].values)
    return Dataset(inputs, labels)

In [5]:
loader_train = torch.utils.data.DataLoader(unpadded_set(train_arab, bpemb_ar), 
                                                 batch_size=batch_size, 
                                                 num_workers=0, 
                                                 shuffle=True, 
                                                 collate_fn=pad_collate)

loader_val = torch.utils.data.DataLoader(unpadded_set(val_arab, bpemb_ar), 
                                                batch_size=batch_size, 
                                                num_workers=0, 
                                                shuffle=True, 
                                                collate_fn=pad_collate)

In [6]:
# Define LSTM model
class LSTM(nn.Module):
    def __init__(self, vocab_size: int, embedding_dim: int, hidden_dim: int, output_dim: int, num_layers: int, 
                 dropout_rate: float, tie_weights: bool, vocab):
        """
        vocab_size: size of one-hot vector
        embedding_dim: dimension of the word representation.
        hidden_dim: network width
        num_layers: network depth
        dropout_rate: regularization method
        tie_weights: Weight tying is a method that dispenses with this redundancy and 
            simply uses a single set of embeddings at the input and softmax layers. That 
            is, we dispense with V and use E in both the start and end of the computation.
            In addition to providing improved model perplexity, this approach significantly 
            reduces the number of parameters required for the model.
        vocab: bpemb entity
        """
                
        super().__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        self.embedding = nn.Embedding.from_pretrained(torch.tensor(vocab.vectors)) # use bpemb embedding
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, 
                    dropout=dropout_rate, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
        if tie_weights:
            assert embedding_dim == hidden_dim, 'cannot tie, check dims'
            self.embedding.weight = self.fc.weight
        self.init_weights()

    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hidden_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_emb)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.embedding_dim,
                    self.hidden_dim).uniform_(-init_range_other, init_range_other) 
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hidden_dim, 
                    self.hidden_dim).uniform_(-init_range_other, init_range_other) 
    """
    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        cell = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        return hidden, cell
    
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach()
        cell = cell.detach()
        return hidden, cell
    """
    
    def forward(self, xs, x_lens):
        
        h0 = Variable(torch.zeros(self.num_layers, xs.size(0), self.hidden_dim)).to(device) #hidden state
        c0 = Variable(torch.zeros(self.num_layers, xs.size(0), self.hidden_dim)).to(device) #internal state
        x_emb = self.embedding(xs)
        x_emb = self.dropout(x_emb)

        # Pack, forward, pad
        x_packed = pack_padded_sequence(x_emb, x_lens, batch_first=True, enforce_sorted=False).to(device)
        out_packed, (hn, cn) = self.lstm(x_packed, (h0,c0))  
        out_padded, out_lengths = pad_packed_sequence(out_packed, batch_first=True)   

        output = out_padded[torch.arange(xs.size(0)), out_lengths-1, :]
        output = self.dropout(output) 
        prediction = self.fc(output)
        return prediction


## Train and eval funcs

In [14]:
def train_model(net, optimizer, criterion, loader, epochs = 200):
    net.train()
    
    gpu = torch.cuda.is_available()
    if(gpu):
        net.to(device)  
    
    loss_lst = []
    for epoch in range(epochs):  # Loop over the dataset multiple times
        epoch_loss = 0.0
        for i, data in enumerate(loader, 0):
            print(f'{i}, ',end="")
            # Get the inputs; data is a list of [inputs, labels]
            if (gpu):
                inputs, labels, x_lens = data[0].to(device), data[1].to(device), torch.tensor(data[2], device='cuda')
            else:
                inputs, labels, x_lens = data
          
            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward + backward + optimize
            outputs = net(inputs, x_lens)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        # Print epoch statistics
        reporting_interval = 1
        epoch_loss = loss.item()
        loss_lst.append(epoch_loss)
        if epoch % reporting_interval == reporting_interval-1:  # Print every reporting_interval mini-batches
            # report_loss = running_loss / reproint
            print(f'epoch loss: {epoch+1}, {epoch_loss}')

    print(f'Finished Training')
    return loss_lst

    
def eval_model(net, loader):
    net.eval()
    correct = 0
    total = 0
    predict_lst = []
    labels_lst = []
    with torch.no_grad():
        for data in loader:
            inputs, labels, x_lens = data[0].to(device), data[1].to(device), torch.tensor(data[2], device='cuda')
            outputs = net(inputs, x_lens)
            print(outputs)
            _, predicted = torch.max(outputs.data, 1) # indicies for highest value, axis=1
            print(predicted)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            predict_lst.append(predicted)
            labels_lst.append(labels)
    
    y_pred = torch.cat(predict_lst).to('cpu')
    y_true = torch.cat(labels_lst).to('cpu')
    print('Accuracy on test strings: %.2f %%' % (100 * correct / total))
    return f1_score(y_true, y_pred), accuracy_score(y_true, y_pred)


## Run

In [12]:
# Define model and stuff
model = LSTM(vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, 
             dropout_rate, tie_weights, bpemb_ar).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = torch.nn.CrossEntropyLoss()

lstm_loss = train_model(net = model,    # LSTM, arabic, training
            optimizer = optimizer,      # Adam
            criterion = criterion,      # CrossEntropyLoss
            loader = loader_train,      # Dataloader for 
            epochs = 5)          # 

0, torch.int64
1, torch.int64
2, torch.int64
3, torch.int64
4, torch.int64
5, torch.int64
6, torch.int64
7, torch.int64
8, torch.int64
9, torch.int64
10, torch.int64
11, torch.int64
12, torch.int64
13, torch.int64
14, torch.int64
15, torch.int64
16, torch.int64
17, torch.int64
18, torch.int64
19, torch.int64
20, torch.int64
21, torch.int64
22, torch.int64
23, torch.int64
24, torch.int64
25, torch.int64
26, torch.int64
27, torch.int64
28, torch.int64
29, torch.int64
30, torch.int64
31, torch.int64
32, torch.int64
33, torch.int64
34, torch.int64
35, torch.int64
36, torch.int64
37, torch.int64
38, torch.int64
39, torch.int64
40, torch.int64
41, torch.int64
42, torch.int64
43, torch.int64
44, torch.int64
45, torch.int64
46, torch.int64
47, torch.int64
48, torch.int64
49, torch.int64
50, torch.int64
51, torch.int64
52, torch.int64
53, torch.int64
54, torch.int64
55, torch.int64
56, torch.int64
57, torch.int64
58, torch.int64
59, torch.int64
60, torch.int64
61, torch.int64
62, torch.int64
63

In [13]:
lstm_loss = train_model(net = model,    # LSTM, arabic, training
            optimizer = optimizer,      # Adam
            criterion = criterion,      # CrossEntropyLoss
            loader = loader_train,      # Dataloader for 
            epochs = 20) 

0, torch.int64


1, torch.int64
2, torch.int64
3, torch.int64
4, torch.int64
5, torch.int64
6, torch.int64
7, torch.int64
8, torch.int64
9, torch.int64
10, torch.int64
11, torch.int64
12, torch.int64
13, torch.int64
14, torch.int64
15, torch.int64
16, torch.int64
17, torch.int64
18, torch.int64
19, torch.int64
20, torch.int64
21, torch.int64
22, torch.int64
23, torch.int64
24, torch.int64
25, torch.int64
26, torch.int64
27, torch.int64
28, torch.int64
29, torch.int64
30, torch.int64
31, torch.int64
32, torch.int64
33, torch.int64
34, torch.int64
35, torch.int64
36, torch.int64
37, torch.int64
38, torch.int64
39, torch.int64
40, torch.int64
41, torch.int64
42, torch.int64
43, torch.int64
44, torch.int64
45, torch.int64
46, torch.int64
47, torch.int64
48, torch.int64
49, torch.int64
50, torch.int64
51, torch.int64
52, torch.int64
53, torch.int64
54, torch.int64
55, torch.int64
56, torch.int64
57, torch.int64
58, torch.int64
59, torch.int64
60, torch.int64
61, torch.int64
62, torch.int64
63, torch.int64
6

In [15]:
f1, acc = eval_model(model, loader_val)

tensor([[-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0.0036],
        [-0.0035,  0

In [None]:
_, predicted = torch.max(outputs.data, 1)

In [None]:
tens = torch.tensor([[-1,0],[2,0]])
_, pred = torch.max(tens,1)
pred