In [101]:
import random
from typing import Union

import torch
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import transformers

# ######################## PART 1: PROVIDED CODE ########################

def load_datasets(data_directory: str) -> Union[dict, dict]:
    """
    Reads the training and validation splits from disk and load
    them into memory.

    Parameters
    ----------
    data_directory: str
        The directory where the data is stored.
    
    Returns
    -------
    train: dict
        The train dictionary with keys 'premise', 'hypothesis', 'label'.
    validation: dict
        The validation dictionary with keys 'premise', 'hypothesis', 'label'.
    """
    import json
    import os

    with open(os.path.join(data_directory, "train.json"), "r") as f:
        train = json.load(f)

    with open(os.path.join(data_directory, "validation.json"), "r") as f:
        valid = json.load(f)

    return train, valid


class NLIDataset(torch.utils.data.Dataset):
    def __init__(self, data_dict: dict):
        self.data_dict = data_dict
        dd = data_dict

        if len(dd["premise"]) != len(dd["hypothesis"]) or len(dd["premise"]) != len(
            dd["label"]
        ):
            raise AttributeError("Incorrect length in data_dict")

    def __len__(self):
        return len(self.data_dict["premise"])

    def __getitem__(self, idx):
        dd = self.data_dict
        return dd["premise"][idx], dd["hypothesis"][idx], dd["label"][idx]


def train_distilbert(model, loader, device):
    model.train()
    criterion = model.get_criterion()
    total_loss = 0.0
    train_accuracy = []
    for premise, hypothesis, target in tqdm(loader):
        optimizer.zero_grad()

        inputs = model.tokenize(premise, hypothesis).to(device)
        target = target.to(device, dtype=torch.float32)
        #print(inputs)
        pred = model(inputs)
        #print(pred)
        #print(target)
        loss = criterion(pred, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    print("training accuracy is ")
    accuracy = torch.eq(target, pred.round())
    true_count = 0
    all_count = 0
    # Loop through each element
    for element in accuracy:
        all_count += 1
        if element:  # Check if element is True
             true_count += 1
    print(true_count/all_count)
    return total_loss / len(loader), true_count/all_count


@torch.no_grad()
def eval_distilbert(model, loader, device):
    model.eval()

    targets = []
    preds = []

    for premise, hypothesis, target in loader:
        preds.append(model(model.tokenize(premise, hypothesis).to(device)))

        targets.append(target)

    return torch.cat(preds), torch.cat(targets)

In [87]:
#A4 START HERE
class CustomDistilBert(nn.Module):
    def __init__(self):
        super().__init__()
        self.distilbert = transformers.DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.tokenizer = transformers.DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
        self.pred_layer = nn.Linear(self.distilbert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
        self.criterion = nn.BCELoss()

    # vvvvv DO NOT CHANGE BELOW THIS LINE vvvvv
    def get_distilbert(self):
        return self.distilbert

    def get_tokenizer(self):
        return self.tokenizer

    def get_pred_layer(self):
        return self.pred_layer

    def get_sigmoid(self):
        return self.sigmoid
    
    def get_criterion(self):
        return self.criterion
    # ^^^^^ DO NOT CHANGE ABOVE THIS LINE ^^^^^

    def assign_optimizer(self, **kwargs):
        model_params = self.parameters()
        optimizer = torch.optim.Adam(model_params, **kwargs)  
        return optimizer

    def slice_cls_hidden_state(
        self, x: transformers.modeling_outputs.BaseModelOutput
    ) -> torch.Tensor:
        last_states = x.last_hidden_state
        result =last_states[:, 0, :]
        return result

    def tokenize(
        self,
        premise: "list[str]",
        hypothesis: "list[str]",
        max_length: int = 128,
        truncation: bool = True,
        padding: bool = True,
    ):
        tokenizer = self.get_tokenizer()
        encoded_inputs = tokenizer(premise, hypothesis,
                              max_length=max_length,
                              truncation=truncation,
                              padding=padding,
                              return_tensors="pt")
        return encoded_inputs

    def forward(self, inputs: transformers.BatchEncoding):
        #print(inputs)
        outputs = self.distilbert(**inputs)
        cls_token_hidden_state = self.slice_cls_hidden_state(outputs)
        zero_one = self.pred_layer(cls_token_hidden_state)
        #print(zero_one)
        predictions = self.sigmoid(zero_one)
        return predictions[:, 0]

In [103]:
import pandas as pd
from sklearn.metrics import f1_score  # Make sure sklearn is installed
import random
random.seed(2022)
torch.manual_seed(2022)

# Parameters (you can change them)
sample_size = 2500  # Change this if you want to take a subset of data for testing
batch_size = 64
n_epochs = 10
num_words = 50000
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_raw, valid_raw = load_datasets("./data/nli")
if sample_size is not None:
        for key in ["premise", "hypothesis", "label"]:
            train_raw[key] = train_raw[key][:sample_size]
            valid_raw[key] = valid_raw[key][:sample_size]
            
full_text = (
        train_raw["premise"]
        + train_raw["hypothesis"]
        + valid_raw["premise"]
        + valid_raw["hypothesis"]
    )

print("=" * 80)
print("Running test code for part 1")
print("-" * 80)

train_loader = torch.utils.data.DataLoader(
        NLIDataset(train_raw), batch_size=batch_size, shuffle=True
    )
valid_loader = torch.utils.data.DataLoader(
        NLIDataset(valid_raw), batch_size=batch_size, shuffle=False
    )

model = CustomDistilBert().to(device)
#optimizer = model.assign_optimizer(lr=1e-4)
optimizer = model.assign_optimizer(lr=1e-4)
validation_accuracy = []
training_accuracy = []
for epoch in range(n_epochs):
        loss, train_acc = train_distilbert(model, train_loader, device=device)
        training_accuracy.append(train_acc)
        preds, targets = eval_distilbert(model, valid_loader, device=device)
        #print("validation accuracy is ")
        #accuracy = torch.mean(torch.eq(target, pred.round())).item()
        #print(accuracy)
        preds = preds.round()
        print("validation accuracy is ")
        accuracy = torch.eq(targets, preds.round())
        true_count = 0
        all_count = 0
        # Loop through each element
        for element in accuracy:
            all_count += 1
            if element:  # Check if element is True
                true_count += 1
        print(true_count/all_count)
        validation_accuracy.append(true_count/all_count)
        score = f1_score(targets.cpu(), preds.cpu())
        print("Epoch:", epoch)
        print("Training loss:", loss)
        print("Validation F1 score:", score)
        print()

Running test code for part 1
--------------------------------------------------------------------------------


100%|██████████| 40/40 [05:21<00:00,  8.03s/it]


training accuracy is 
0.75
validation accuracy is 
0.848
Epoch: 0
Training loss: 0.5917958706617356
Validation F1 score: 0.8364888123924269



100%|██████████| 40/40 [05:53<00:00,  8.83s/it]


training accuracy is 
0.5


KeyboardInterrupt: 

In [110]:
print(train_raw['premise'][0])
print(train_raw['hypothesis'][0])
print(train_raw['label'][0])
print(train_raw['premise'][1])
print(train_raw['hypothesis'][1])
print(train_raw['label'][1])
print(train_raw['premise'][2])
print(train_raw['hypothesis'][2])
print(train_raw['label'][2])

A woman wearing a bike helmet and a warm-up suit is sitting in the park meditating.
A woman is sitting.
0
A child in a tie dye shirt and one in a white shirt are on a climbing wall.
Two children climb a wall.
0
Two brown dogs barking at each other.
The animals are making noise.
0


In [111]:
print(train_raw['premise'][8])
print(train_raw['hypothesis'][8])
print(train_raw['label'][8])
print(train_raw['premise'][7])
print(train_raw['hypothesis'][7])
print(train_raw['label'][7])
print(train_raw['premise'][6])
print(train_raw['hypothesis'][6])
print(train_raw['label'][6])

Man with dreadlocks twirling batons near boats.
The man is sewing a boat sail.
1
A worker with face protection is using a machine.
a worker is working
0
four friends cheerfully jumping off the flight stairs.
Four people jumping off stairs.
0


In [108]:
def freeze_params(model):
    # TODO: your work below
    for param in model.parameters(recurse=True):
        param.requires_grad = False


def pad_attention_mask(mask, p):
    print(mask)
    print(p)
    padded_mask = torch.nn.functional.pad(mask, (p, 0), value=1) 
    print(padded_mask)
    return padded_mask


class SoftPrompting(nn.Module):
    def __init__(self, p: int, e: int):
        super().__init__()
        self.p = p
        self.e = e
        
        self.prompts = torch.randn((p, e), requires_grad=True)
        
    def forward(self, embedded):
        batch_prompts = self.prompts.unsqueeze(0).expand(embedded.size(0), -1, -1)
        prompted_embeddings = torch.cat([batch_prompts, embedded], dim=1)
        return prompted_embeddings

In [85]:
def load_models_and_tokenizer(q_name, a_name, t_name, device='cpu'):
    # TODO: your work below
    q_enc = transformers.AutoModel.from_pretrained(q_name).to(device)
    a_enc =  transformers.AutoModel.from_pretrained(a_name).to(device)
    tokenizer =  transformers.AutoTokenizer.from_pretrained(t_name)
    return q_enc, a_enc, tokenizer
    

def tokenize_qa_batch(tokenizer, q_titles, q_bodies, answers, max_length=64) -> transformers.BatchEncoding:
    q_text = ["[CLS] " +title+ " [SEP] " + body for title, body in zip(q_titles, q_bodies)]
    for text in q_text:
        q = tokenizer(text, padding=True, return_token_type_ids= True, return_attention_mask=True, truncation=True, max_length=max_length, return_tensors="pt")
        print(q)
    a_batch= tokenizer(answers, padding=True, return_token_type_ids= True, return_attention_mask=True, truncation=True, max_length=max_length, return_tensors="pt")
    return [], a_batch
    
    # return q_batch, a_batch

def get_class_output(model, batch):
    # Since this is similar to a previous question, it is left ungraded
    # TODO: your work below.
    print(q_)
    pass

def inbatch_negative_sampling(Q: Tensor, P: Tensor, device: str = 'cpu') -> Tensor:
    # TODO: your work below
    # Calculate dot product similarity between questions and all passages in the batch
    S = torch.einsum('ne,me->nm', Q, P) 
    mean_negatives = torch.mean(S, dim=1, keepdim=True)  
    S = S - mean_negatives
    return S.to(device) 
    
    # return S

def contrastive_loss_criterion(S: Tensor, labels: Tensor = None, device: str = 'cpu'):
    # TODO: your work below
    pass
    
    # return loss

def get_topk_indices(Q, P, k: int = None):
    # TODO: your work below
    pass

    # return indices, scores

def select_by_indices(indices: Tensor, passages: 'list[str]') -> 'list[str]':
    # TODO: your work below
    pass


def embed_passages(passages: 'list[str]', model, tokenizer, device='cpu', max_length=512):
    # TODO: your work below
    pass


def embed_questions(titles, bodies, model, tokenizer, device='cpu', max_length=512):
    # TODO: your work below
    pass


def recall_at_k(retrieved_indices: 'list[list[int]]', true_indices: 'list[int]', k: int):
    # TODO: your work below
    pass


def mean_reciprocal_rank(retrieved_indices: 'list[list[int]]', true_indices: 'list[int]'):
    # TODO: your work below
    pass


bsize = 8
qa_data = dict(
        train = pd.read_csv('data/qa/train.csv'),
        valid = pd.read_csv('data/qa/validation.csv'),
        answers = pd.read_csv('data/qa/answers.csv'),
    )

q_titles = qa_data['train'].loc[:bsize-1, 'QuestionTitle'].tolist()
q_bodies = qa_data['train'].loc[:bsize-1, 'QuestionBody'].tolist()
answers = qa_data['train'].loc[:bsize-1, 'Answer'].tolist()

    # Loading huggingface models and tokenizers    
name = 'google/electra-small-discriminator'
q_enc, a_enc, tokenizer = load_models_and_tokenizer(q_name=name, a_name=name, t_name=name)
q_batch, a_batch = tokenize_qa_batch(tokenizer, q_titles, q_bodies, answers)
#q_out = get_class_output(q_enc, q_batch)
#a_out = get_class_output(a_enc, a_batch)


{'input_ids': tensor([[  101,   101,  2054,  2828,  1997,  5404,  2003,  2190,  2005,  5404,
         17548,  3869,  1029,   102,  1045,  2001,  2559,  2005,  1037,  5404,
         17548,  3869, 17974,  1996,  2060,  2154,  2043,  1045,  4384,  2087,
          1997,  1996, 19328,  2123,  1005,  1056,  2110,  1037,  2806,  1997,
          5404,  2000,  2224,  1012,  2070,  1997,  1996, 19328,  2224,  1037,
          3278,  3815,  1997,  5404,  2061,  1045,  7868,  2008,  2070,  1997,
          1996, 14894,  6337,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [80]:
q_batch['input_ids'].size()
print(q_batch['token_type_ids'].size())
q_batch['attention_mask'].size()


torch.Size([8, 64])


torch.Size([8, 64])