# 6.8610  PROJECT

## Install and import libraries

In [None]:
!pip install datasets
!pip install transformers[torch]
!pip install tokenizers
!pip install huggingface_hub
!pip install tqdm

In [1]:
import numpy as np
from datasets import Dataset, load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
import os
import json
import torch
from tqdm import tqdm
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


## Helpers

### Prepare Dataset

In [99]:
def get_latest_checkpoint(checkpoint_dir):

    # List all directories in the results folder
    all_checkpoints = [d for d in os.listdir(checkpoint_dir) if os.path.isdir(os.path.join(checkpoint_dir, d))]
    # Extract the checkpoint numbers
    checkpoint_numbers = [int(d.split("-")[1]) for d in all_checkpoints if d.startswith("checkpoint-")]
    # Find the folder name of the latest checkpoint
    latest_checkpoint_folder = f"{checkpoint_dir}/checkpoint-{max(checkpoint_numbers)}"
    
    return latest_checkpoint_folder

In [100]:
def load_tokenizer(checkpoint_dir):
    checkpoint = get_latest_checkpoint(checkpoint_dir)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return tokenizer

In [101]:
def load_datasets(dataset_dir):
    dataset_train_path = f"{dataset_dir}/train.jsonl"
    #dataset_test_path = f"{dataset_dir}/test.jsonl"
    dataset_test_path = f"{dataset_dir}/test_small.jsonl"
    dataset_train = load_dataset('json', data_files=dataset_train_path)['train']
    dataset_test = load_dataset('json', data_files=dataset_test_path)['train']
    return dataset_train, dataset_test

In [102]:
PROMPT = """
Given the description after "Description:", complete the last sentence with a true statement about the contents
of the specified box according to the description.
Description: 
"""

def tokenize_dataset(dataset, tokenizer):
    input_ids = []
    attention_masks = []
    labels = []

    for entry in tqdm(dataset):
        # tokenize problem
        input_data = PROMPT + entry['sentence_masked'][:-15] + ":"
        input_encoding = tokenizer(input_data, truncation=True, padding='max_length', max_length=512, return_attention_mask = True, return_tensors = 'pt')
        input_ids.append(input_encoding['input_ids'])
        attention_masks.append(input_encoding['attention_mask'])
         # tokenize answer
        target = entry['masked_content'][13:]
        target_encoding = tokenizer(target, truncation=True, padding='max_length', max_length=512, return_attention_mask = True, return_tensors = 'pt')
        target_input_ids = target_encoding['input_ids']
        target_input_ids[target_input_ids == tokenizer.pad_token_id] = -100
        labels.append(target_input_ids)

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.cat(labels, dim=0)
    tokenized_data = TensorDataset(input_ids, attention_masks, labels)
    return tokenized_data

In [103]:
BATCH_SIZE = 4

def make_dataloader(tokenized_dataset):
    dataloader = DataLoader(tokenized_dataset, batch_size=BATCH_SIZE, shuffle=False)
    return dataloader

### Two-Shot Training

In [104]:
def load_model(checkpoint_dir):
    checkpoint = get_latest_checkpoint(checkpoint_dir)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)
    return model

In [105]:
def train_model(model, train_loader):
    
    optimizer = AdamW(model.parameters(), lr = 1e-4, eps = 1e-8)

    for _ in tqdm(range(5)):

        total_train_loss = 0
        model.train()

        for batch in tqdm(train_loader):

          input_ids = batch[0].to(device)
          input_mask = batch[1].to(device)
          labels = batch[2].to(device)
          
          model.zero_grad()
          outputs = model(input_ids, attention_mask=input_mask, labels=labels)
          loss = outputs.loss
          total_train_loss += loss
          loss.backward()
          torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
          optimizer.step()
    
    print(f"Total train loss: {total_train_loss}.")

### Generate Predictions

In [106]:
def make_prediction_all(model, tokenizer, dataloader):
    model_predictions = []

    for batch in tqdm(dataloader):

        # Generate sequences for the batch
        input_ids = batch[0].to(device)
        input_mask = batch[1].to(device)
        sequence_ids = model.generate(input_ids, attention_mask=input_mask)
        sequences = tokenizer.batch_decode(sequence_ids, skip_special_tokens=True)

        # Accumulate generated sequences
        model_predictions.extend(sequences)

    return model_predictions

In [107]:
def add_prediction_to_data(model_preds, model_name, dataset):
    dataset = dataset.add_column(f"model_{model_name}_pred", model_preds)
    return dataset

## Evaluation Loop

In [108]:
def evaluate_model(model_name="base", zeroshot=True):

    # prepare datasets
    if (model_name == "base"):
        tokenizer = T5Tokenizer.from_pretrained("t5-base")
    else:
        tokenizer = load_tokenizer(f"./results/{model_name}/")
    dataset_train, dataset_test = load_datasets("./eval/current_use")
    tokenized_dataset_train = tokenize_dataset(dataset_train, tokenizer)
    tokenized_dataset_test = tokenize_dataset(dataset_test, tokenizer)
    train_loader = make_dataloader(tokenized_dataset_train)
    test_loader = make_dataloader(tokenized_dataset_test)

    # load and potentially train model
    if (model_name == "base"):
        model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)
    else:
        model = load_model(f"./results/{model_name}/")
    if not zeroshot:
        train_model(model, train_loader)

    # make predictions
    model_preds = make_prediction_all(model, tokenizer, test_loader)
    dataset_test = add_prediction_to_data(model_preds, model_name, dataset_test)
    dataset_test.to_json(f"./results/eval/preds_{model_name}_zeroshot.jsonl", orient="records")
    #dataset_test.to_json(f"./results/eval/preds_{model_name}.jsonl", orient="records")
    
    # garbage collection
    del model, tokenizer, tokenized_dataset_train, tokenized_dataset_test, train_loader, test_loader
    gc.collect()
    torch.cuda.empty_cache()

    return model_preds, dataset_test

In [109]:
model_preds_math, dataset_test_math = evaluate_model(model_name="math", zeroshot=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading data files: 100%|██████████| 1/1 [00:00<?, ?it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 200.02it/s]
Generating train split: 14 examples [00:00, 2800.47 examples/s]
100%|██████████| 14/14 [00:00<00:00, 2000.14it/s]
100%|██████████| 100/100 [00:00<00:00, 1246.35it/s]
100%|██████████| 4/4 [00:06<00:00,  1.55s/it]
100%|██████████| 4/4 [00:05<00:00,  1.29s/it]
100%|██████████| 4/4 [00:05<00:00,  1.30s/it]
100%|██████████| 4/4 [00:05<00:00,  1.45s/it]
100%|██████████| 4/4 [00:04<00:00,  1.24s/it]
100%|██████████| 5/5 [00:27<00:00,  5.47s/it]


Total train loss: 4.514426231384277.


100%|██████████| 25/25 [00:06<00:00,  3.68it/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1043.62ba/s]


In [110]:
dataset_test_math

Dataset({
    features: ['sentence', 'sentence_masked', 'masked_content', 'sample_id', 'numops', 'model_math_pred'],
    num_rows: 100
})

In [111]:
model_preds_code1, dataset_test_code1 = evaluate_model(model_name="code1", zeroshot=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 14/14 [00:00<00:00, 286.07it/s]
100%|██████████| 100/100 [00:00<00:00, 1922.89it/s]
100%|██████████| 4/4 [00:04<00:00,  1.10s/it]
100%|██████████| 4/4 [00:05<00:00,  1.35s/it]
100%|██████████| 4/4 [00:05<00:00,  1.29s/it]
100%|██████████| 4/4 [00:05<00:00,  1.46s/it]
100%|██████████| 4/4 [00:04<00:00,  1.22s/it]
100%|██████████| 5/5 [00:25<00:00,  5.14s/it]


Total train loss: 4.321694850921631.


100%|██████████| 25/25 [00:05<00:00,  4.63it/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 499.68ba/s]


In [112]:
dataset_test_code1

Dataset({
    features: ['sentence', 'sentence_masked', 'masked_content', 'sample_id', 'numops', 'model_code1_pred'],
    num_rows: 100
})

In [113]:
model_preds_code2, dataset_test_code2 = evaluate_model(model_name="code2", zeroshot=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 14/14 [00:00<00:00, 285.71it/s]
100%|██████████| 100/100 [00:00<00:00, 1868.82it/s]
100%|██████████| 4/4 [00:04<00:00,  1.17s/it]
100%|██████████| 4/4 [00:05<00:00,  1.37s/it]
100%|██████████| 4/4 [00:05<00:00,  1.35s/it]
100%|██████████| 4/4 [00:06<00:00,  1.50s/it]
100%|██████████| 4/4 [00:05<00:00,  1.29s/it]
100%|██████████| 5/5 [00:26<00:00,  5.35s/it]


Total train loss: 4.195098876953125.


100%|██████████| 25/25 [00:05<00:00,  4.33it/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 499.32ba/s]


In [114]:
dataset_test_code2

Dataset({
    features: ['sentence', 'sentence_masked', 'masked_content', 'sample_id', 'numops', 'model_code2_pred'],
    num_rows: 100
})

In [115]:
model_preds_base, dataset_test_base = evaluate_model(model_name="base", zeroshot=False)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 14/14 [00:00<00:00, 307.37it/s]
100%|██████████| 100/100 [00:00<00:00, 1738.90it/s]
100%|██████████| 4/4 [00:04<00:00,  1.13s/it]
100%|██████████| 4/4 [00:05<00:00,  1.43s/it]
100%|██████████| 4/4 [00:05<00:00,  1.38s/it]
100%|██████████| 4/4 [00:06<00:00,  1.56s/it]
100%|██████████| 4/4 [00:05<00:00,  1.29s/it]
100%|██████████| 5/5 [00:27<00:00,  5.44s/it]


Total train loss: 2.103578567504883.


100%|██████████| 25/25 [00:06<00:00,  3.73it/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 500.22ba/s]


In [116]:
dataset_test_base

Dataset({
    features: ['sentence', 'sentence_masked', 'masked_content', 'sample_id', 'numops', 'model_base_pred'],
    num_rows: 100
})

In [117]:
dataset_test_all = Dataset.from_dict({
    # problem and solution
    'sentence_masked': dataset_test_base['sentence_masked'],
    'masked_content': dataset_test_base['masked_content'],
    # predictions
    'model_base_pred': model_preds_base,
    'model_math_pred': model_preds_math,
    'model_code1_pred': model_preds_code1,
    'model_code2_pred': model_preds_code2,
    # metadata
    'sample_id': dataset_test_base['sample_id'],
    'numops': dataset_test_base['numops'],
    })

In [118]:
dataset_test_all

Dataset({
    features: ['sentence_masked', 'masked_content', 'model_base_pred', 'model_math_pred', 'model_code1_pred', 'model_code2_pred', 'sample_id', 'numops'],
    num_rows: 100
})

In [119]:
dataset_test_all[:7]

{'sentence_masked': ['Box 0 contains the boat and the shoe and the television, Box 1 contains the camera and the car and the plane, Box 2 contains the cake, Box 3 contains the fan, Box 4 contains the cross and the file and the note, Box 5 contains the chemical, Box 6 contains nothing. Move the cake from Box 2 to Box 6. Put the dish and the stone into Box 5. Move the television from Box 0 to Box 6. Put the drink into Box 2. Move the chemical and the stone from Box 5 to Box 2. Box 0 contains <extra_id_0> .',
  'Box 0 contains the boat and the shoe and the television, Box 1 contains the camera and the car and the plane, Box 2 contains the cake, Box 3 contains the fan, Box 4 contains the cross and the file and the note, Box 5 contains the chemical, Box 6 contains nothing. Move the cake from Box 2 to Box 6. Put the dish and the stone into Box 5. Move the television from Box 0 to Box 6. Put the drink into Box 2. Move the chemical and the stone from Box 5 to Box 2. Box 1 contains <extra_id_0>

In [120]:
dataset_test_all.to_json(f"./results/eval/preds_twoshot_small.jsonl", orient="records")

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 999.83ba/s]


88025

### Zeroshot

In [121]:
model_preds_math, dataset_test_math = evaluate_model(model_name="math", zeroshot=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


100%|██████████| 14/14 [00:00<00:00, 333.34it/s]
100%|██████████| 100/100 [00:00<00:00, 1834.23it/s]
100%|██████████| 25/25 [00:06<00:00,  3.97it/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 999.60ba/s]


In [122]:
model_preds_code1, dataset_test_code1 = evaluate_model(model_name="code1", zeroshot=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 14/14 [00:00<00:00, 2333.04it/s]
100%|██████████| 100/100 [00:00<00:00, 1218.88it/s]
100%|██████████| 25/25 [00:05<00:00,  4.17it/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1000.55ba/s]


In [123]:
model_preds_code2, dataset_test_code2 = evaluate_model(model_name="code2", zeroshot=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 14/14 [00:00<00:00, 2333.60it/s]
100%|██████████| 100/100 [00:00<00:00, 1226.06it/s]
100%|██████████| 25/25 [00:06<00:00,  3.97it/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 999.12ba/s]


In [124]:
model_preds_base, dataset_test_base = evaluate_model(model_name="base", zeroshot=True)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 14/14 [00:00<00:00, 2000.21it/s]
100%|██████████| 100/100 [00:00<00:00, 1069.83it/s]
100%|██████████| 25/25 [00:06<00:00,  4.07it/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 990.16ba/s]


In [125]:
dataset_test_all = Dataset.from_dict({
    # problem and solution
    'sentence_masked': dataset_test_base['sentence_masked'],
    'masked_content': dataset_test_base['masked_content'],
    # predictions
    'model_base_pred': model_preds_base,
    'model_math_pred': model_preds_math,
    'model_code1_pred': model_preds_code1,
    'model_code2_pred': model_preds_code2,
    # metadata
    'sample_id': dataset_test_base['sample_id'],
    'numops': dataset_test_base['numops'],
    })

In [126]:
dataset_test_all[:7]

{'sentence_masked': ['Box 0 contains the boat and the shoe and the television, Box 1 contains the camera and the car and the plane, Box 2 contains the cake, Box 3 contains the fan, Box 4 contains the cross and the file and the note, Box 5 contains the chemical, Box 6 contains nothing. Move the cake from Box 2 to Box 6. Put the dish and the stone into Box 5. Move the television from Box 0 to Box 6. Put the drink into Box 2. Move the chemical and the stone from Box 5 to Box 2. Box 0 contains <extra_id_0> .',
  'Box 0 contains the boat and the shoe and the television, Box 1 contains the camera and the car and the plane, Box 2 contains the cake, Box 3 contains the fan, Box 4 contains the cross and the file and the note, Box 5 contains the chemical, Box 6 contains nothing. Move the cake from Box 2 to Box 6. Put the dish and the stone into Box 5. Move the television from Box 0 to Box 6. Put the drink into Box 2. Move the chemical and the stone from Box 5 to Box 2. Box 1 contains <extra_id_0>

In [127]:
dataset_test_all.to_json(f"./results/eval/preds_zeroshot.jsonl", orient="records")

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 500.75ba/s]


101648

## Metrics

### Helpers

In [51]:
def load_dataset_if_not_already(dataset_path="./results/eval/preds_twoshot.jsonl"):
    if "dataset_test_all" not in locals():
        dataset_test_all = load_dataset('json', data_files=dataset_path)['train']
    return dataset_test_all

In [52]:
def clean_labels(labels):
    # removes leading masked token
    cln_labels = list(map(lambda label: label[13:], labels))
    return cln_labels

In [53]:
def compute_metrics_for_model(labels_combined, preds_combined):

    # split labels and preds into list of items
    label_items = list(map(lambda label: label.split(" and "), labels_combined))
    preds_items = list(map(lambda pred: pred.split(" and "), preds_combined))

    # count predictions
    total_preds, total_correct, true_pos, false_pos, false_neg = 0, 0, 0, 0, 0
    for i in range(len(label_items)):
        labels, preds = label_items[i], preds_items[i]
        for label in labels:
            if (label in preds):     # correct answer that pred also has
                total_correct += 1
                true_pos += 1
            else:                    # correct answer that pred does not have
                false_neg += 1
            total_preds += 1         # count the item into total regardless
        for pred in preds:
            if (pred not in labels): # item that pred has but is not in answer
                false_pos += 1

    # compute metrics
    accuracy = total_correct / total_preds
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    f1 = (precision * recall) / (precision + recall)

    print(f"accuracy: {accuracy}")
    print(f"precision: {precision}")
    print(f"recall: {recall}")
    print(f"f1 score: {f1}")

    return accuracy, precision, recall, f1

### Actual Computation

In [54]:
def compute_metrics(model="base", dataset_path="./results/eval/preds_twoshot.jsonl"):
    dataset_test_all = load_dataset_if_not_already(dataset_path=dataset_path)
    cln_labels = clean_labels(dataset_test_all['masked_content'])
    metrics = compute_metrics_for_model(cln_labels, dataset_test_all[f"model_{model}_pred"])
    return metrics

In [55]:
metrics_base = compute_metrics("base", dataset_path="./results/eval/preds_twoshot_small.jsonl")

accuracy: 0.4057142857142857
precision: 0.3879781420765027
recall: 0.4057142857142857
f1 score: 0.19832402234636873


In [56]:
metrics_math = compute_metrics("math", dataset_path="./results/eval/preds_twoshot_small.jsonl")

accuracy: 0.08
precision: 0.08433734939759036
recall: 0.08
f1 score: 0.04105571847507331


In [57]:
metrics_code1 = compute_metrics("code1", dataset_path="./results/eval/preds_twoshot_small.jsonl")

accuracy: 0.10857142857142857
precision: 0.12179487179487179
recall: 0.10857142857142857
f1 score: 0.05740181268882176


In [58]:
metrics_code2 = compute_metrics("code2", dataset_path="./results/eval/preds_twoshot_small.jsonl")

accuracy: 0.10857142857142857
precision: 0.10106382978723404
recall: 0.10857142857142857
f1 score: 0.05234159779614325
