In [33]:
import gensim.downloader
import pandas as pd
import numpy as np
import torch
import random
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score
import datasets
import matplotlib.pyplot as plt
# enabling inline plots in Jupyter
%matplotlib inline
datasets.logging.set_verbosity_error()
import nltk
nltk.download('punkt')

torch.manual_seed(42)
random.seed(42)
np.random.seed(42)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/emmastoklundlee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [34]:
pip install transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


In [35]:
# Preamble 
import sys 
sys.path.append('..')

In [36]:
from datasets import load_dataset
import pandas as pd

dataset = load_dataset("copenlu/answerable_tydiqa")

train_set = dataset["train"]
validation_set = dataset["validation"]

df_train = train_set.to_pandas()
df_val = validation_set.to_pandas()

df_train.head()

  0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,question_text,document_title,language,annotations,document_plaintext,document_url
0,Milloin Charles Fort syntyi?,Charles Fort,finnish,"{'answer_start': [18], 'answer_text': ['6. elo...",Charles Hoy Fort (6. elokuuta (joidenkin lähte...,https://fi.wikipedia.org/wiki/Charles%20Fort
1,“ダン” ダニエル・ジャドソン・キャラハンの出身はどこ,ダニエル・J・キャラハン,japanese,"{'answer_start': [35], 'answer_text': ['カリフォルニ...",“ダン”こと、ダニエル・ジャドソン・キャラハンは1890年7月26日、カリフォルニア州サンフ...,https://ja.wikipedia.org/wiki/%E3%83%80%E3%83%...
2,వేప చెట్టు యొక్క శాస్త్రీయ నామం ఏమిటి?,వేప,telugu,"{'answer_start': [12], 'answer_text': ['Azadir...","వేప (లాటిన్ Azadirachta indica, syn. Melia aza...",https://te.wikipedia.org/wiki/%E0%B0%B5%E0%B1%...
3,চেঙ্গিস খান কোন বংশের রাজা ছিলেন ?,চেঙ্গিজ খান,bengali,"{'answer_start': [414], 'answer_text': ['বোরজি...",চেঙ্গিজ খান (মঙ্গোলীয়: Чингис Хаан আ-ধ্ব-ব: ...,https://bn.wikipedia.org/wiki/%E0%A6%9A%E0%A7%...
4,రెయ్యలగడ్ద గ్రామ విస్తీర్ణత ఎంత?,రెయ్యలగడ్ద,telugu,"{'answer_start': [259], 'answer_text': ['27 హె...","రెయ్యలగడ్ద, విశాఖపట్నం జిల్లా, గంగరాజు మాడుగుల...",https://te.wikipedia.org/wiki/%E0%B0%B0%E0%B1%...


In [37]:
# Creating a binary column where if the question is answered it is equal to 1, and if not answerable it is 0
def check_annotations(annotation):
    return annotation == {'answer_start': [-1], 'answer_text': ['']}

df_train['correct_answer'] = df_train['annotations'].apply(check_annotations)
df_train['correct_answer'] = (~df_train['correct_answer']).astype(int)

df_val['correct_answer'] = df_val['annotations'].apply(check_annotations)
df_val['correct_answer'] = (~df_val['correct_answer']).astype(int)

In [38]:
# Get train and validation data for each language
df_train_bengali = df_train[df_train['language'] == 'bengali']
df_train_arabic = df_train[df_train['language'] == 'arabic']
df_train_indonesian = df_train[df_train['language'] == 'indonesian']

df_val_bengali = df_val[df_val['language'] == 'bengali']
df_val_arabic = df_val[df_val['language'] == 'arabic']
df_val_indonesian = df_val[df_val['language'] == 'indonesian']

# For testing
df_val_english = df_val[df_val['language'] == 'english']
df_train_english = df_train[df_train['language'] == 'english']

In [39]:
df_train_bengali_document = df_train[df_train['language'] == 'bengali']["document_plaintext"]
df_train_arab_document = df_train[df_train['language'] == 'arabic']["document_plaintext"]
df_train_indonesian_document = df_train[df_train['language'] == 'indonesian']["document_plaintext"]
df_train_indonesian_document.head()

df_train_english_document = df_train[df_train['language'] == 'english']["document_plaintext"]


In [40]:
# Tokenize the documents
from transformers import AutoTokenizer
mbert_tokeniser = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")

def tokenize(df, key, transformer_model):
  df.loc[:, f'{key}_tokenized'] = [transformer_model.tokenize(row) for row in df[key]]

# Tokinize train document_plaintext
tokenize(df_train_bengali, "document_plaintext", mbert_tokeniser)
tokenize(df_train_arabic, "document_plaintext", mbert_tokeniser)
tokenize(df_train_indonesian, "document_plaintext", mbert_tokeniser)

# Tokinize validation document_plaintext
tokenize(df_val_bengali, "document_plaintext", mbert_tokeniser)
tokenize(df_val_arabic, "document_plaintext", mbert_tokeniser)
tokenize(df_val_indonesian, "document_plaintext", mbert_tokeniser)


# For testing
tokenize(df_train_english, "document_plaintext", mbert_tokeniser)
tokenize(df_val_english, "document_plaintext", mbert_tokeniser)

Token indices sequence length is longer than the specified maximum sequence length for this model (611 > 512). Running this sequence through the model will result in indexing errors
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'{key}_tokenized'] = [transformer_model.tokenize(row) for row in df[key]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'{key}_tokenized'] = [transformer_model.tokenize(row) for row in df[key]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value ins

In [41]:
# added in tokenization of the questions
# Tokinize train question_text
tokenize(df_train_bengali, "question_text", mbert_tokeniser)
tokenize(df_train_arabic, "question_text", mbert_tokeniser)
tokenize(df_train_indonesian, "question_text", mbert_tokeniser)

# Tokinize validation question_text
tokenize(df_val_bengali, "question_text", mbert_tokeniser)
tokenize(df_val_arabic, "question_text", mbert_tokeniser)
tokenize(df_val_indonesian, "question_text", mbert_tokeniser)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'{key}_tokenized'] = [transformer_model.tokenize(row) for row in df[key]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'{key}_tokenized'] = [transformer_model.tokenize(row) for row in df[key]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'{key}_tokenized'] = [tr

In [42]:
# For testing tokenize in english
tokenize(df_train_english, "question_text", mbert_tokeniser)
tokenize(df_val_english, "question_text", mbert_tokeniser)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'{key}_tokenized'] = [transformer_model.tokenize(row) for row in df[key]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'{key}_tokenized'] = [transformer_model.tokenize(row) for row in df[key]]


In [43]:
# Training data document_plaintext tokenized
document_plaintext_tokenized_bengali = list(df_train_bengali["document_plaintext_tokenized"].explode())
document_plaintext_tokenized_arabic = list(df_train_arabic["document_plaintext_tokenized"].explode())
document_plaintext_tokenized_indonesian = list(df_train_indonesian["document_plaintext_tokenized"].explode())

# Validation data document_plaintext tokenized
document_plaintext_tokenized_val_bengali = list(df_val_bengali["document_plaintext_tokenized"].explode())
document_plaintext_tokenized_val_arabic = list(df_val_arabic["document_plaintext_tokenized"].explode())
document_plaintext_tokenized_val_indonesian = list(df_val_indonesian["document_plaintext_tokenized"].explode())

# Training data question_text tokenized
question_text_tokenized_bengali = list(df_train_bengali["question_text_tokenized"].explode())
question_text_tokenized_arabic = list(df_train_arabic["question_text_tokenized"].explode())
question_text_tokenized_indonesian = list(df_train_indonesian["question_text_tokenized"].explode())

# Validation data question_text tokenized
question_text_tokenized_val_bengali = list(df_val_bengali["question_text_tokenized"].explode())
question_text_tokenized_val_arabic = list(df_val_arabic["question_text_tokenized"].explode())
question_text_tokenized_val_indonesian = list(df_val_indonesian["question_text_tokenized"].explode())

# for testing in english
document_plaintext_tokenized_english = list(df_train_english["document_plaintext_tokenized"].explode())
document_plaintext_tokenized_val_english = list(df_val_english["document_plaintext_tokenized"].explode())
question_text_tokenized_english = list(df_train_english["question_text_tokenized"].explode())
question_text_tokenized_val_english = list(df_val_english["question_text_tokenized"].explode())


In [44]:
df_train_english['text_tokenized'] = df_train_english['document_plaintext_tokenized'] + df_train_english['question_text_tokenized']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_english['text_tokenized'] = df_train_english['document_plaintext_tokenized'] + df_train_english['question_text_tokenized']


In [45]:
english_tokenized_text = list(df_train_english['text_tokenized'])

In [48]:
df_train_english.head()

Unnamed: 0,question_text,document_title,language,annotations,document_plaintext,document_url,correct_answer,document_plaintext_tokenized,question_text_tokenized,text_tokenized
26,When was quantum field theory developed?,Quantum field theory,english,"{'answer_start': [159], 'answer_text': ['1920s']}",Quantum field theory naturally began with the ...,https://en.wikipedia.org/wiki/Quantum%20field%...,1,"[quantum, field, theory, naturally, began, wit...","[when, was, quantum, field, theory, developed, ?]","[quantum, field, theory, naturally, began, wit..."
43,Who was the first Nobel prize winner for Liter...,List of Nobel laureates in Literature,english,"{'answer_start': [610], 'answer_text': ['Sully...",The Nobel Prize in Literature (Swedish: Nobelp...,https://en.wikipedia.org/wiki/List%20of%20Nobe...,1,"[the, nobel, prize, in, literature, (, swedish...","[who, was, the, first, nobel, prize, winner, f...","[the, nobel, prize, in, literature, (, swedish..."
112,When is the dialectical method used?,Dialectic,english,"{'answer_start': [129], 'answer_text': ['disco...","Dialectic or dialectics (Greek: διαλεκτική, di...",https://en.wikipedia.org/wiki/Dialectic,1,"[dialect, ##ic, or, dialect, ##ics, (, greek, ...","[when, is, the, dialect, ##ical, method, used, ?]","[dialect, ##ic, or, dialect, ##ics, (, greek, ..."
123,Who invented Hangul?,Origin of Hangul,english,"{'answer_start': [88], 'answer_text': ['Sejong...",Hangul was personally created and promulgated ...,https://en.wikipedia.org/wiki/Origin%20of%20Ha...,1,"[hangul, was, personally, created, and, promu,...","[who, invented, hangul, ?]","[hangul, was, personally, created, and, promu,..."
125,What do Grasshoppers eat?,Grasshopper,english,"{'answer_start': [0], 'answer_text': ['Grassho...","Grasshoppers are plant-eaters, with a few spec...",https://en.wikipedia.org/wiki/Grasshopper,1,"[grasshoppers, are, plant, -, eat, ##ers, ,, w...","[what, do, grasshoppers, eat, ?]","[grasshoppers, are, plant, -, eat, ##ers, ,, w..."


In [49]:
# create list of correct answers in df_train_english
# labels for whether the question could be answered or not
correct_answer_train_english = list(df_train_english["correct_answer"])
correct_answer_val_english = list(df_val_english["correct_answer"])


In [55]:
# creating the full list of vocabulary in the tweet_eval data
total_vocabulary = set()
for document in document_plaintext_tokenized_english + document_plaintext_tokenized_val_english + question_text_tokenized_english + question_text_tokenized_val_english:
    total_vocabulary.add(document.lower())
total_vocabulary = sorted(list(total_vocabulary))

# appending an empty padding token at the beginning of the vocabulary
total_vocabulary = [""]+total_vocabulary


In [57]:

def create_embedding_matrix(tokens, embedding):
    """creates an embedding matrix from pre-trained embeddings for a new vocabulary. It also adds an extra vector
    vector of zeroes in row 0 to embed the padding token, and initializes missing tokens as vectors of 0s"""
    oov = set()
    size = embedding.vector_size
    # note the extra zero vector that will used for padding
    embedding_matrix=np.zeros((len(tokens),size))
    c = 0
    for i in range(1,len(tokens)):
        try:
            embedding_matrix[i]=embedding[tokens[i]]
        except KeyError: #to catch the words missing in the embeddings
            try:
                embedding_matrix[i]=embedding[tokens[i].lower()]
            except KeyError:
                #if the token does not have an embedding, we initialize it as a vector of 0s
                embedding_matrix[i] = np.zeros(size)
                #we keep track of the out of vocabulary tokens
                oov.add(tokens[i])
                c +=1
    print(f'{c/len(tokens)*100} % of tokens are out of vocabulary')
    return embedding_matrix, oov

# load the pretrained embeddings (these can be used as the embedding argument in create_embedding_matrix)
# look into other gloves - is glove_twitter_25 the best?
glove = gensim.downloader.load('glove-wiki-gigaword-100') 

#get the embedding matrix and out of vocabulary words for our tweet_eval vocabulary
embedding_matrix, oov = create_embedding_matrix(total_vocabulary, glove)


31.577384266993352 % of tokens are out of vocabulary


In [58]:
def text_to_indices(text, total_vocabulary):
    """Turns the input text (one tweet) into a vector of indices in total_vocabulary that corresponds to the tokenized words in the input text"""
    vocab_dict = {word: index for index, word in enumerate(total_vocabulary)}

    # Initialize a list to store the encoded text
    encoded_text = []

    for t in text:
        # Convert the token to lowercase to match the vocabulary
        t_lower = t.lower()
        if t_lower in vocab_dict:
            # Use the dictionary to quickly find the index
            encoded_text.append(vocab_dict[t_lower])

    return encoded_text

def add_padding(vector, max_length, padding_index):
    """adds copies of the padding token to make the input vector the max_length size, so that all inputs are the same length (the length of tweet with most words)"""
    if len(vector) < max_length:
        vector = [padding_index for _ in range(max_length-len(vector))] + vector
    return vector

In [24]:
# DELETE AS DOES NOT WORK
# train_english_features = [[text_to_indices(x, total_vocabulary) for x in document] for document in english_tokenized_text[0]]
# val_english_features = [[text_to_indices(x, total_vocabulary) for x in document] for document in english_tokenized_text[0]]

# longest_document = max(train_english_features+val_english_features, key=len)
# max_length = len(longest_document)
# padding_index = 0



In [59]:
# getting the feature vectors by applying the text_to_indices function to each
train_english_features = [text_to_indices(x, total_vocabulary) for x in english_tokenized_text]
val_english_features = [text_to_indices(x, total_vocabulary) for x in english_tokenized_text]

longest_document = max(train_english_features+val_english_features, key=len)
max_length = len(longest_document)
padding_index = 0

In [62]:
# padding the feature vectors by applying the add_padding function to each
train_english_features = [add_padding(x, max_length, padding_index) for x in train_english_features]
val_english_features = [add_padding(x, max_length, padding_index) for x in val_english_features]

In [63]:
class QuestionClassifierTrain(torch.utils.data.Dataset):
    # defining the sources of the data
    def __init__(self, features, labels):
        self.X = torch.from_numpy(np.array(features))
        self.y = torch.from_numpy(np.array(labels))

    def __getitem__(self, index):
        X = self.X[index]
        y = self.y[index].unsqueeze(0)
        return X, y

    def __len__(self):
        return len(self.y)
    

data_train_english = QuestionClassifierTrain(train_english_features, correct_answer_train_english)
data_val_english = QuestionClassifierTrain(val_english_features, correct_answer_val_english)

train_loader = torch.utils.data.DataLoader(data_train_english, batch_size=64)
val_loader = torch.utils.data.DataLoader(data_val_english, batch_size = 64)

In [64]:
# defining the embedding step and RNN model

class SimpleRNN(torch.nn.Module):
    def __init__(self, rnn_size, n_classes, embedding_matrix):
        # initialize the model with a certain dimension of the RNN unit activations (this is rnn_size)
        # and a certain number of output classes
        
        super().__init__()
        
        #applying the embeddings to the inputs
        self.embedding = torch.nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), padding_idx=0, freeze=True)
        emb_dim = embedding_matrix.shape[1] #this will be the size of the input for the RNN
        
        #define the RNN itself 
        self.rnn = torch.nn.RNN(emb_dim, rnn_size, batch_first=True)
        #set batch_first=True for your RNN layer
        
        #define the output layer (no softmax needed here; we will apply softmax as part of the loss calculation)
        #applies a linear transformation to the RNN
        #final layer state and outputs scores for the n classes
        self.outputs = torch.nn.Linear(rnn_size, n_classes)

        
    def forward(self, inputs):
        # encode the input vectors
        encoded_inputs = self.embedding(inputs)
        
        # The RNN returns two tensors: one representing the hidden states at all positions,
        # and another representing only the final hidden states.
        # In this many-to-one model, we only need the final hidden states.
        all_states, final_state = self.rnn(encoded_inputs)
        final_state = final_state.squeeze() #flatten to make sure it has the right dimensions for the next linear step
        
        # run the final state through the output layer
        outputs = self.outputs(final_state)
        return outputs


In [65]:
# initializing and training the model:
myRNN = SimpleRNN(rnn_size=100, n_classes=3, embedding_matrix=embedding_matrix)

print(myRNN)

SimpleRNN(
  (embedding): Embedding(26937, 100, padding_idx=0)
  (rnn): RNN(100, 100, batch_first=True)
  (outputs): Linear(in_features=100, out_features=3, bias=True)
)


In [66]:
# training loop
def training_loop(model, num_epochs):
    loss_function = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(num_epochs):
        losses = []
        for batch_index, (inputs, targets) in enumerate(train_loader):

            optimizer.zero_grad()
            outputs = model(inputs).squeeze()
            targets = targets.squeeze() #dependending on your torch version you might have to use targets = targets.squeeze().long()
            loss = loss_function(outputs, targets)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        print(f'Epoch {epoch+1}: loss {np.mean(losses)}')
    return model

def evaluate(model, val_loader):
    predictions = []
    labels = []
    with torch.no_grad(): # for evaluation we don't backpropagate and update weights anymore
        for batch_index, (inputs, targets) in enumerate(val_loader):
            outputs = torch.softmax(model(inputs), 1 ) # apply softmax to get probabilities/logits
            # getting the indices of the logit with the highest value, which corresponds to the predicted class (as labels 0, 1, 2)
            vals, indices = torch.max(outputs, 1)
            # accumulating the predictions
            predictions += indices.tolist()
            # accumulating the true labels
            labels += targets.tolist()
    
    acc = accuracy_score(predictions, labels)
    print(f'Model accuracy: {acc}')
    return acc, predictions

In [67]:
# initializing and training the model:
myRNN = SimpleRNN(rnn_size=100, n_classes=3, embedding_matrix=embedding_matrix)

myRNN = training_loop(myRNN, 3)
acc, preds = evaluate(myRNN, val_loader)

Epoch 1: loss 0.5597439453413244
Epoch 2: loss 0.8771149357379382
Epoch 3: loss 0.8947227676639912
Model accuracy: 0.5


In [68]:
# advanced version supporting multiple types of RNN layers

class RNN_or_LSTM(torch.nn.Module):
    def __init__(self, rnn_size, n_classes, embedding_matrix, type="RNN"):
        # initialize the model with a certain dimension of the RNN unit activations (this is rnn_size)
        # and a certain number of output classes
        
        super().__init__()
        
        #applying the embeddings to the inputs
        self.embedding = torch.nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), padding_idx=0, freeze=True)
        emb_dim = embedding_matrix.shape[1]
        
        #remember the batch_first=True argument
        if type == "RNN":
            self.rnn = torch.nn.RNN(emb_dim, rnn_size, batch_first=True)
        elif type == "LSTM":
            self.rnn = torch.nn.LSTM(emb_dim, rnn_size, batch_first=True)   
        else:
            raise LookupError("Only RNN and LSTM are supported.")
        self.output = torch.nn.Linear(rnn_size, n_classes)

    def forward(self, inputs):
        
        # encode the input vectors
        encoded_inputs = self.embedding(inputs)

        #apply the RNN or LSTM
        if type == "RNN":
            all_states, final_state = self.rnn(encoded_inputs)
        else:
            # LSTM's output is different and needs to be treated differently, see documentation for details
            all_states, (final_state, c_n) = self.rnn(encoded_inputs)
        
        # run the final states through the output layer
        outputs = self.output(final_state.squeeze())
        return outputs

In [69]:
myLSTM = RNN_or_LSTM(rnn_size=100, n_classes=3, type='LSTM', embedding_matrix=embedding_matrix)



myLSTM = training_loop(myLSTM, 3)
acc, preds = evaluate(myLSTM, val_loader)

Epoch 1: loss 0.8187243733524971
Epoch 2: loss 1.063502600416541
Epoch 3: loss 0.8644104276237816
Model accuracy: 0.5


In [70]:
class Bidirectional_RNN(torch.nn.Module):
    def __init__(self, rnn_size, n_classes, embedding_matrix):
        # initialize the model with a certain dimension of the RNN unit activations (this is rnn_size)
        # and a certain number of output classes
        
        super().__init__()
        
        #applying the embeddings to the inputs
        self.embedding = torch.nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), padding_idx=0, freeze=True)
        emb_dim = embedding_matrix.shape[1] #this will be the size of the input for the RNN
        
        #define the RNN itself 
        self.rnn = torch.nn.RNN(input_size=emb_dim, hidden_size=rnn_size, num_layers=1, bidirectional=True, batch_first=True)
        #set batch_first=True for your RNN layer
        
        #define the output layer (no softmax needed here; we will apply softmax as part of the loss calculation)
        #applies a linear transformation to the RNN
        #final layer state and outputs scores for the n classes
        self.fc_logits = torch.nn.Linear(2*rnn_size, n_classes)

        
    def forward(self, inputs):
        # encode the input vectors
        encoded_inputs = self.embedding(inputs)
        
        # NB: for a bidirectional RNN, the final state corresponds to the *last* token
        # in the forward direction and the *first* token in the backward direction.
        #Notice that we use torch.concat to concatenate the final states from the forward and backward directions
        rnn_out, final_state = self.rnn(encoded_inputs)
        final_states_combined = torch.cat([final_state[-2,:,:], final_state[-1,:,:]], dim=1)

        # run the output through the final linear layer
        outputslinear = self.fc_logits(final_states_combined)
        return outputslinear

In [71]:
biRNN = Bidirectional_RNN(rnn_size=100, n_classes=3, embedding_matrix=embedding_matrix)

biRNN = training_loop(biRNN, 3)
acc, preds = evaluate(biRNN, val_loader)

Epoch 1: loss 0.5578079082447873
Epoch 2: loss 0.8844208673429514
Epoch 3: loss 0.8676807398201319
Model accuracy: 0.5


**Look into ways of optimisation - will this fix the accuracy issue & the below does not work**

In [81]:
from torch.optim import Adam
import torch.optim as optim

from torch import nn
criterion = nn.CrossEntropyLoss()


In [82]:
biRNN = Bidirectional_RNN(rnn_size=100, n_classes=3, embedding_matrix=embedding_matrix)
optimizer = optim.Adam(biRNN.parameters(), lr=0.001)  # You can adjust the learning rate (lr) as needed


In [83]:
def training_loop(model, num_epochs):
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for batch_inputs, batch_labels in train_loader:
            optimizer.zero_grad()  # Clear gradients
            outputs = model(batch_inputs)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch + 1}: loss {avg_loss:.4f}')

    return model


In [84]:
biRNN = training_loop(biRNN, 3)
acc, preds = evaluate(biRNN, val_loader)


RuntimeError: 0D or 1D target tensor expected, multi-target not supported

In [85]:

# Define the model
class BiLSTMNetwork(nn.Module):
    """
    Basic BiLSTM network
    """
    def __init__(
            self,
            pretrained_embeddings: torch.tensor,
            lstm_dim: int,
            dropout_prob: float = 0.1,
            n_classes: int = 2
    ):
        """
        Initializer for basic BiLSTM network
        :param pretrained_embeddings: A tensor containing the pretrained BPE embeddings
        :param lstm_dim: The dimensionality of the BiLSTM network
        :param dropout_prob: Dropout probability
        :param n_classes: The number of output classes
        """

        # First thing is to call the superclass initializer
        super(BiLSTMNetwork, self).__init__()

        # We'll define the network in a ModuleDict, which makes organizing the model a bit nicer
        # The components are an embedding layer, a 2 layer BiLSTM, and a feed-forward output layer
        self.model = nn.ModuleDict({
            'embeddings': nn.Embedding.from_pretrained(pretrained_embeddings, padding_idx=pretrained_embeddings.shape[0] - 1),
            'bilstm': nn.LSTM(
                pretrained_embeddings.shape[1],
                lstm_dim,
                1,
                batch_first=True,
                dropout=dropout_prob,
                bidirectional=True),
            'cls': nn.Linear(2*lstm_dim, n_classes)
        })
        self.n_classes = n_classes
        self.dropout = nn.Dropout(p=dropout_prob)

        # Initialize the weights of the model
        self._init_weights()

    def _init_weights(self):
        all_params = list(self.model['bilstm'].named_parameters()) + \
                     list(self.model['cls'].named_parameters())
        for n,p in all_params:
            if 'weight' in n:
                nn.init.xavier_normal_(p)
            elif 'bias' in n:
                nn.init.zeros_(p)

    def forward(self, inputs, input_lens, labels = None):
        """
        Defines how tensors flow through the model
        :param inputs: (b x sl) The IDs into the vocabulary of the input samples
        :param input_lens: (b) The length of each input sequence
        :param labels: (b) The label of each sample
        :return: (loss, logits) if `labels` is not None, otherwise just (logits,)
        """

        # Get embeddings (b x sl x edim)
        embeds = self.model['embeddings'](inputs)

        # Pack padded: This is necessary for padded batches input to an RNN
        lstm_in = nn.utils.rnn.pack_padded_sequence(
            embeds,
            input_lens.cpu(),
            batch_first=True,
            enforce_sorted=False
        )

        # Pass the packed sequence through the BiLSTM
        lstm_out, hidden = self.model['bilstm'](lstm_in)

        # Unpack the packed sequence --> (b x sl x 2*lstm_dim)
        lstm_out,_ = nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)

        # Max pool along the last dimension
        ff_in = self.dropout(torch.max(lstm_out, 1)[0])
        # Some magic to get the last output of the BiLSTM for classification (b x 2*lstm_dim)
        #ff_in = lstm_out.gather(1, input_lens.view(-1,1,1).expand(lstm_out.size(0), 1, lstm_out.size(2)) - 1).squeeze()

        # Get logits (b x n_classes)
        logits = self.model['cls'](ff_in).view(-1, self.n_classes)
        outputs = (logits,)
        if labels is not None:
            # Xentropy loss
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
            outputs = (loss,) + outputs

        return outputs



In [86]:
biLSTM = BiLSTMNetwork(rnn_size=100, n_classes=3, embedding_matrix=embedding_matrix)

biLSTM = training_loop(biLSTM, 3)
acc, preds = evaluate(biLSTM, val_loader)

TypeError: __init__() got an unexpected keyword argument 'rnn_size'

In [88]:


# Initialize BiLSTMNetwork
biLSTM = BiLSTMNetwork(embedding_matrix, lstm_dim =100, dropout_prob= 0/1, n_classes=3)

biLSTM = training_loop(biLSTM, 3)
acc, preds = evaluate(biLSTM, val_loader)


AttributeError: 'numpy.ndarray' object has no attribute 'dim'

3. Experiment with a second recurrent layer to implement a deep (or stacked) RNN. This can be done using the parameters of [RNN](https://pytorch.org/docs/stable/generated/torch.nn.RNN.html).