In [7]:
from datasets import load_dataset
import gensim.downloader as api
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
import torch.nn.utils.rnn as rnn_utils
import senteval
import argparse

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# download Glove embeddings
Glove_model = api.load("glove-wiki-gigaword-300")

In [58]:
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()
        self.fully_connected_1 = nn.Linear(input_size, hidden_size)
        self.fully_connected_2 = nn.Linear(hidden_size, hidden_size)
        self.fully_connected_3 = nn.Linear(hidden_size, output_size)
        
        self.seq = nn.Sequential(
            self.fully_connected_1,
            self.fully_connected_2,
            self.fully_connected_3,
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        return self.seq(x)

def vocab(datapoint, vocabulary, UNK_TOKEN):
    tokenised = datapoint.lower().split()
    for token in tokenised:
        if token in Glove_model:
            vocabulary[token] = Glove_model[token]
        else:
            vocabulary[token] = vocabulary[UNK_TOKEN]
    return vocab

def evaluate_model(model, test_loader, checkpoint_path):
    # Load the checkpoint
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    correct = 0
    with torch.no_grad():
        for premise_hypothesis, label in test_loader:

            output = model(premise_hypothesis)
            _, predicted = torch.max(output, 1)
            print(predicted)
            correct = (predicted == label).sum().item()
    try:
        if predicted == 0:
            model_guess = "entailment"
        elif predicted == 1:
            model_guess = "neutral"
        elif predicted == 2:
            model_guess = "contradiction" 
        
        if label == 0:
            correct_ans = "entailment"
        elif label == 1:
            correct_ans = "neutral"
        elif label == 2:
            correct_ans = "contradiction" 
    except:
        print("Model could not make prediction, please inspect datapoint")
     
    print(f"The models answer was: {model_guess}, the correct label is: {correct_ans}")
    if correct == 1:
        a = 1
        print("So the model was correct")
    elif correct == 0:
        print("So the model was wrong")
        # for nicer visual
        print()
        print()

class BiLSTMEncoder(nn.Module):
    def __init__(self, embedding_dim, hidden_dim):
        super(BiLSTMEncoder, self).__init__()
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)

    def forward(self, embedded, lengths):
        packed = rnn_utils.pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)
        _, (hidden, _) = self.lstm(packed.float())
        # Concatenate the last hidden states of forward and backward LSTMs
        concatenated = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        return concatenated

def get_embeddings_BiLSTM(datapoint, encoder, vocabulary):
    lengths_list = []
    tokenised = datapoint.lower().split()
    indexed = [vocabulary[token] for token in tokenised if token in vocabulary and not np.all(vocabulary[token] == 0)]
    if len(indexed) == 0:
        return np.zeros(Glove_model.vector_size)
    lengths_list.append(len(indexed))
    indexed = torch.tensor(indexed).unsqueeze(0)
    indexed = torch.nn.utils.rnn.pad_sequence(indexed, batch_first=True, padding_value=1)
    embedding = encoder(indexed, torch.tensor(lengths_list))
    return embedding.detach().numpy()

def BiLSTM_dataset(data, lstm_encoder, vocabulary):
    all_embeddings = {}
    for datapoint in data:
        premise_embedding = get_embeddings_BiLSTM(datapoint['premise'], lstm_encoder, vocabulary)
        hypothesis_embedding = get_embeddings_BiLSTM(datapoint['hypothesis'], lstm_encoder, vocabulary)

        # Check if premise_embedding and hypothesis_embedding have the same number of dimensions
        if premise_embedding.shape[0] != hypothesis_embedding.shape[0]:
            print(f"Skipping datapoint: {datapoint}, premise_embedding and hypothesis_embedding have different dimensions")
            continue

        concat_embeddings = torch.cat([torch.tensor(premise_embedding, dtype=torch.float32), torch.tensor(hypothesis_embedding, dtype=torch.float32)], dim=1)
        elementwise_embeddings = torch.tensor(premise_embedding * hypothesis_embedding, dtype=torch.float32)
        abs_diff_embeddings = torch.tensor(np.abs(premise_embedding - hypothesis_embedding), dtype=torch.float32)

        embeddings = torch.cat([concat_embeddings, elementwise_embeddings, abs_diff_embeddings], dim=1).squeeze(0)
        all_embeddings[torch.tensor(embeddings, dtype=torch.float32).clone().detach()] = torch.tensor(datapoint['label'], dtype=torch.long).clone().detach()
    return all_embeddings


In [59]:
def example(premise, hypothesis, label):
    print("Premise:", premise)
    print("Hypothesis:", hypothesis)
    example_sentences = [{'premise': premise, 
                'hypothesis': hypothesis, 
                'label': label}]
    
    checkpoint_path = 'checkpoints_official/best_model_checkpoint_BiLSTM1.pth'

    # add unseen word token
    UNK_TOKEN = "<UNK>"
    vocabulary = {UNK_TOKEN: np.random.rand(Glove_model.vector_size)}

    for datapoint in example_sentences:
        vocab(datapoint['premise'], vocabulary, UNK_TOKEN)
        vocab(datapoint['hypothesis'], vocabulary, UNK_TOKEN)

    # check longest datapoint for padding
    longest = 0
    for datapoint in example_sentences:
        length = len(datapoint['premise'].lower().split())
        if length > longest:
            longest = length

        length = len(datapoint['hypothesis'].lower().split())
        if length > longest:
            longest = length
    # print(longest)

    input_size = 300
    hidden_size = 512
    Bilstm_encoder = BiLSTMEncoder(input_size, hidden_size)

    # BiLSTM version
    BiLSTM_test_data = BiLSTM_dataset(example_sentences, Bilstm_encoder, vocabulary)

    input_size = 4096
    output_size = 3
    BiLSTM_MLP_model = MLP(input_size, hidden_size, output_size)

    # Convert baseline_train_data dictionary to a list of tuples and filter -1 labels
    BiLSTM_test_data_list = [(embedding, label) for embedding, label in BiLSTM_test_data.items() if label != -1]

    BiLSTM_test_loader = DataLoader(BiLSTM_test_data_list, batch_size=64)

    evaluate_model(BiLSTM_MLP_model, BiLSTM_test_loader, checkpoint_path)


In [60]:
premise = "A man is walking a dog"
hypothesis = "No cat is outside"
label = 1

example(premise, hypothesis, label)



Premise: A man is walking a dog
Hypothesis: No cat is outside
tensor([0])
The models answer was: entailment, the correct label is: neutral
So the model was wrong




  all_embeddings[torch.tensor(embeddings, dtype=torch.float32).clone().detach()] = torch.tensor(datapoint['label'], dtype=torch.long).clone().detach()


In [61]:
premise = "Two men sitting in the sun"
hypothesis = "Nobody is sitting in the shade"
label = 1

example(premise, hypothesis, label)

Premise: Two men sitting in the sun
Hypothesis: Nobody is sitting in the shade
tensor([0])
The models answer was: entailment, the correct label is: neutral
So the model was wrong




  all_embeddings[torch.tensor(embeddings, dtype=torch.float32).clone().detach()] = torch.tensor(datapoint['label'], dtype=torch.long).clone().detach()


# Important note
Unfortunately our model(s) appear to consistently predict "entailment" regardless of the actual label. This means that our models perform no better than chance :(

This behaviour can also be seen in the two examples above. We would expect the model to predict "contradiction" (eventhough the actual label is "neutral"), however we consiquently get the prediction "entailment". Despite this issue, we will continue to evaluate and discuss the models' performance in our report for the sake of completing the practical requirements.

In [64]:
# version that only prints the predicted value so it's easy to see the model only predicts "entailment"
def evaluate_model2(model, test_loader, checkpoint_path):
    # Load the checkpoint
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    with torch.no_grad():
        for premise_hypothesis, label in test_loader:

            output = model(premise_hypothesis)
            _, predicted = torch.max(output, 1)
            print(predicted)


def example2(premise, hypothesis, label):
    example_sentences = [{'premise': premise, 
                'hypothesis': hypothesis, 
                'label': label}]
    
    checkpoint_path = 'checkpoints_official/best_model_checkpoint_BiLSTM1.pth'

    # add unseen word token
    UNK_TOKEN = "<UNK>"
    vocabulary = {UNK_TOKEN: np.random.rand(Glove_model.vector_size)}

    for datapoint in example_sentences:
        vocab(datapoint['premise'], vocabulary, UNK_TOKEN)
        vocab(datapoint['hypothesis'], vocabulary, UNK_TOKEN)

    # check longest datapoint for padding
    longest = 0
    for datapoint in example_sentences:
        length = len(datapoint['premise'].lower().split())
        if length > longest:
            longest = length

        length = len(datapoint['hypothesis'].lower().split())
        if length > longest:
            longest = length
    # print(longest)

    input_size = 300
    hidden_size = 512
    Bilstm_encoder = BiLSTMEncoder(input_size, hidden_size)

    # BiLSTM version
    BiLSTM_test_data = BiLSTM_dataset(example_sentences, Bilstm_encoder, vocabulary)

    input_size = 4096
    output_size = 3
    BiLSTM_MLP_model = MLP(input_size, hidden_size, output_size)

    # Convert baseline_train_data dictionary to a list of tuples and filter -1 labels
    BiLSTM_test_data_list = [(embedding, label) for embedding, label in BiLSTM_test_data.items() if label != -1]

    BiLSTM_test_loader = DataLoader(BiLSTM_test_data_list, batch_size=64)

    evaluate_model2(BiLSTM_MLP_model, BiLSTM_test_loader, checkpoint_path)


In [66]:
# Here we see that for all datapoints in the testset, the model predicts "entailment"
# even though this is wrong (for two thirds of the dataset)
test_dataset = load_dataset('stanfordnlp/snli', split='test')
# for time constrainst we show it for 100 datapoints
test_dataset = test_dataset.select(range(100))
for datapoint in test_dataset:
    premise = datapoint['premise']
    hypothesis = datapoint['hypothesis']
    label = datapoint['label']
    example2(premise, hypothesis, label)

  all_embeddings[torch.tensor(embeddings, dtype=torch.float32).clone().detach()] = torch.tensor(datapoint['label'], dtype=torch.long).clone().detach()


tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tensor([0])
tens