# 6.8610  PROJECT

## Install and import libraries

In [2]:
!pip install datasets
!pip install transformers[torch]
!pip install tokenizers
!pip install huggingface_hub
!pip install tqdm



In [3]:
import numpy as np
from datasets import Dataset, load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
import os
import json
import torch
from tqdm import tqdm
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
# `device` is read as a global by load_model, train_model, make_prediction_all
# and evaluate_model.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Echo the selected device so it is recorded in the notebook output.
print(device)

cuda


## Helpers

### Prepare Dataset

In [5]:
def get_latest_checkpoint(checkpoint_dir):
    """Return the path of the highest-numbered ``checkpoint-<step>`` directory.

    Only sub-directories whose name is exactly ``checkpoint-`` followed by
    decimal digits are considered; anything else (files, ``checkpoint-old``,
    log folders, ...) is ignored instead of crashing the integer conversion.

    Args:
        checkpoint_dir: Directory holding one sub-directory per checkpoint.
            A trailing path separator is accepted.

    Returns:
        ``checkpoint_dir`` joined with the name of the newest checkpoint folder.

    Raises:
        FileNotFoundError: If ``checkpoint_dir`` does not exist (raised by
            ``os.listdir``).
        ValueError: If no ``checkpoint-<step>`` directory is present.
    """
    prefix = "checkpoint-"

    candidates = []
    for name in os.listdir(checkpoint_dir):
        step = name[len(prefix):]
        if not (name.startswith(prefix) and step.isdecimal()):
            continue
        if os.path.isdir(os.path.join(checkpoint_dir, name)):
            candidates.append(name)

    if not candidates:
        raise ValueError(
            f"No '{prefix}<step>' directory found in {checkpoint_dir!r}."
        )

    # Compare by numeric step (not lexicographically) but return the folder's
    # real name so zero-padded names such as 'checkpoint-007' still resolve.
    latest = max(candidates, key=lambda name: int(name[len(prefix):]))
    return os.path.join(checkpoint_dir, latest)

In [6]:
def load_tokenizer(checkpoint_dir):
    """Load the tokenizer saved with the newest checkpoint in ``checkpoint_dir``.

    The checkpoint folder is resolved with ``get_latest_checkpoint`` and handed
    to ``AutoTokenizer.from_pretrained``.
    """
    return AutoTokenizer.from_pretrained(get_latest_checkpoint(checkpoint_dir))

In [7]:
def load_datasets(dataset_dir, train_file="train.jsonl", test_file="test_small.jsonl"):
    """Load the train and test JSON-lines files found in ``dataset_dir``.

    Args:
        dataset_dir: Directory containing the dataset files.
        train_file: File name of the training examples.
        test_file: File name of the evaluation examples. Defaults to the small
            test file; pass ``"test.jsonl"`` to evaluate on the full test file
            instead of editing this function.

    Returns:
        A ``(dataset_train, dataset_test)`` tuple.
    """
    def _load(file_name):
        # A single data file passed to the 'json' builder is exposed under the
        # 'train' split, for the test file as well.
        return load_dataset('json', data_files=f"{dataset_dir}/{file_name}")['train']

    return _load(train_file), _load(test_file)

In [8]:
PROMPT = """
Given the description after "Description:", complete the last sentence with a true statement about the contents
of the specified box according to the description.
Description: 
"""

def tokenize_dataset(dataset, tokenizer, max_length=512):
    """Turn raw examples into a ``TensorDataset`` of (input_ids, attention_mask, labels).

    For every entry the model input is ``PROMPT`` + the masked sentence with its
    trailing mask marker removed + ``":"``; the target is the masked content
    with its leading marker removed. Padding positions in the target are set to
    -100.

    Args:
        dataset: Iterable of mappings with 'sentence_masked' and
            'masked_content' string fields.
        tokenizer: Callable tokenizer returning 'input_ids' and
            'attention_mask' tensors and exposing ``pad_token_id``.
        max_length: Length every input and target is padded/truncated to.

    Returns:
        ``TensorDataset`` whose three tensors have one row per entry. An empty
        ``dataset`` yields an empty ``TensorDataset`` instead of an error.
    """
    # Trailing characters dropped from 'sentence_masked'. 15 is the length of
    # " <extra_id_0> ." which is how the sample sentences end.
    mask_suffix_len = 15
    # Leading characters dropped from 'masked_content'. 13 is the length of
    # "<extra_id_0> " - presumably the prefix of every target; TODO confirm
    # against the data files.
    mask_prefix_len = 13

    def encode(text):
        # Same settings for inputs and targets: fixed-length PyTorch tensors.
        return tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_attention_mask=True,
            return_tensors='pt',
        )

    input_ids = []
    attention_masks = []
    labels = []

    for entry in tqdm(dataset):
        # tokenize problem
        source_text = PROMPT + entry['sentence_masked'][:-mask_suffix_len] + ":"
        source_encoding = encode(source_text)
        input_ids.append(source_encoding['input_ids'])
        attention_masks.append(source_encoding['attention_mask'])

        # tokenize answer; padding is replaced by -100 in the labels
        target_ids = encode(entry['masked_content'][mask_prefix_len:])['input_ids']
        labels.append(target_ids.masked_fill(target_ids == tokenizer.pad_token_id, -100))

    if not input_ids:
        # torch.cat rejects an empty list, so build zero-row tensors directly.
        empty = torch.empty((0, max_length), dtype=torch.long)
        return TensorDataset(empty, empty.clone(), empty.clone())

    return TensorDataset(
        torch.cat(input_ids, dim=0),
        torch.cat(attention_masks, dim=0),
        torch.cat(labels, dim=0),
    )

In [9]:
BATCH_SIZE = 8

def make_dataloader(tokenized_dataset, batch_size=None, shuffle=False):
    """Wrap a tokenized dataset in a ``DataLoader``.

    Args:
        tokenized_dataset: Dataset to batch (e.g. the ``TensorDataset`` built
            by ``tokenize_dataset``).
        batch_size: Examples per batch. ``None`` uses the current value of the
            module-level ``BATCH_SIZE``, looked up at call time.
        shuffle: Whether to reshuffle the data on every pass. Off by default so
            predictions stay aligned with the dataset rows.

    Returns:
        The configured ``DataLoader``.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    return DataLoader(tokenized_dataset, batch_size=batch_size, shuffle=shuffle)

### Five-Shot Training

In [10]:
def load_model(checkpoint_dir):
    """Load the seq2seq model from the newest checkpoint and move it to ``device``.

    The checkpoint folder is resolved with ``get_latest_checkpoint``; the
    target device is the notebook-level ``device`` global.
    """
    newest_checkpoint = get_latest_checkpoint(checkpoint_dir)
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(newest_checkpoint)
    return seq2seq_model.to(device)

In [11]:
def train_model(model, train_loader, num_epochs=3, lr=1e-4):
    """Fine-tune ``model`` in place on the batches of ``train_loader``.

    Each batch is expected to be ``(input_ids, attention_mask, labels)``; the
    model is called with ``labels`` and must return an object with a ``loss``
    attribute. Gradients are clipped to a norm of 1.0 before every AdamW step.

    Args:
        model: Model to train; left in training mode afterwards.
        train_loader: Iterable of training batches.
        num_epochs: Number of passes over ``train_loader``.
        lr: AdamW learning rate.

    Returns:
        None. The summed batch loss of the final epoch is printed.
    """
    optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)

    # Defined up front so the final print still works when no epoch runs.
    total_train_loss = 0.0

    for _ in tqdm(range(num_epochs)):

        total_train_loss = 0.0
        model.train()

        for batch in tqdm(train_loader):

            input_ids = batch[0].to(device)
            input_mask = batch[1].to(device)
            labels = batch[2].to(device)

            model.zero_grad()
            outputs = model(input_ids, attention_mask=input_mask, labels=labels)
            loss = outputs.loss
            # Accumulate a plain float: adding the tensor itself would keep
            # autograd history alive across the whole training loop.
            total_train_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

    print(f"Total train loss: {total_train_loss}.")

### Generate Predictions

In [12]:
def make_prediction_all(model, tokenizer, dataloader):
    """Generate a decoded prediction for every example in ``dataloader``.

    Generation runs with the model in evaluation mode and gradient tracking
    disabled. ``train_model`` leaves the model in training mode, so without
    this switch a fine-tuned model would generate in training mode (e.g. with
    dropout layers active). The model's previous mode is restored afterwards.

    Args:
        model: Model exposing ``generate(input_ids, attention_mask=...)``.
        tokenizer: Tokenizer exposing ``batch_decode``.
        dataloader: Iterable of batches whose first two items are the input
            ids and the attention mask.

    Returns:
        List of decoded strings, in the order the examples were yielded.
    """
    model_predictions = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch in tqdm(dataloader):

                # Generate sequences for the batch
                input_ids = batch[0].to(device)
                input_mask = batch[1].to(device)
                sequence_ids = model.generate(input_ids, attention_mask=input_mask)
                sequences = tokenizer.batch_decode(sequence_ids, skip_special_tokens=True)

                # Accumulate generated sequences
                model_predictions.extend(sequences)
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    return model_predictions

In [13]:
def add_prediction_to_data(model_preds, model_name, dataset):
    """Return ``dataset`` extended with a ``model_<model_name>_pred`` column.

    The new column holds ``model_preds``; the result of ``dataset.add_column``
    is returned unchanged.
    """
    column_name = f"model_{model_name}_pred"
    return dataset.add_column(column_name, model_preds)

## Evaluation Loop

In [14]:
def evaluate_model(model_name="base", zeroshot=True):
    """Evaluate one model on the test split and save its predictions.

    Steps: load the tokenizer and datasets, optionally fine-tune on the
    training examples, generate predictions for the test split, write them to
    ``./results/eval/preds_<model_name>_small.jsonl`` and release the model.

    Args:
        model_name: ``"base"`` loads the pretrained ``t5-base`` tokenizer and
            model; any other name loads the newest checkpoint found under
            ``./results/<model_name>/``.
        zeroshot: When False, the model is first trained with ``train_model``
            on the training split. When True, the training split is neither
            tokenized nor used.

    Returns:
        ``(model_preds, dataset_test)``: the decoded predictions and the test
        dataset with a ``model_<model_name>_pred`` column added.
    """
    is_base = model_name == "base"
    checkpoint_dir = f"./results/{model_name}/"
    output_dir = "./results/eval"

    # prepare datasets
    if is_base:
        tokenizer = T5Tokenizer.from_pretrained("t5-base")
    else:
        tokenizer = load_tokenizer(checkpoint_dir)
    dataset_train, dataset_test = load_datasets("./eval/current_use")
    # The training split is only needed for fine-tuning; skip the work otherwise.
    train_loader = None
    if not zeroshot:
        train_loader = make_dataloader(tokenize_dataset(dataset_train, tokenizer))
    test_loader = make_dataloader(tokenize_dataset(dataset_test, tokenizer))

    # load and potentially train model
    if is_base:
        model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)
    else:
        model = load_model(checkpoint_dir)
    if train_loader is not None:
        train_model(model, train_loader)

    # make predictions
    model_preds = make_prediction_all(model, tokenizer, test_loader)
    dataset_test = add_prediction_to_data(model_preds, model_name, dataset_test)
    # Make sure the output folder exists before writing into it.
    os.makedirs(output_dir, exist_ok=True)
    # The "_small" suffix mirrors the test_small.jsonl split loaded above.
    dataset_test.to_json(f"{output_dir}/preds_{model_name}_small.jsonl", orient="records")

    # garbage collection: drop the model and loaders before the next call
    # loads another model.
    del model, tokenizer, train_loader, test_loader
    gc.collect()
    torch.cuda.empty_cache()

    return model_preds, dataset_test

In [15]:
# zeroshot=False: load the newest checkpoint under ./results/math/, train it on
# the training examples, then predict on the test split.
model_preds_math, dataset_test_math = evaluate_model(model_name="math", zeroshot=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 5/5 [00:00<00:00, 833.23it/s]
100%|██████████| 100/100 [00:00<00:00, 1904.30it/s]
100%|██████████| 1/1 [00:03<00:00,  3.86s/it]
100%|██████████| 1/1 [00:06<00:00,  6.83s/it]
100%|██████████| 1/1 [00:06<00:00,  6.54s/it]
100%|██████████| 3/3 [00:17<00:00,  5.75s/it]


Total train loss: 2.1575117111206055.


100%|██████████| 13/13 [01:12<00:00,  5.60s/it]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 58.82ba/s]


In [16]:
# Show the columns and row count of the test split with the "math" predictions attached.
dataset_test_math

Dataset({
    features: ['sentence', 'sentence_masked', 'masked_content', 'sample_id', 'numops', 'model_math_pred'],
    num_rows: 100
})

In [17]:
# Same procedure for the newest checkpoint under ./results/code/.
model_preds_code, dataset_test_code = evaluate_model(model_name="code", zeroshot=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 5/5 [00:00<00:00, 1667.58it/s]
100%|██████████| 100/100 [00:00<00:00, 1169.47it/s]
100%|██████████| 1/1 [00:03<00:00,  3.72s/it]
100%|██████████| 1/1 [00:07<00:00,  7.13s/it]
100%|██████████| 1/1 [00:07<00:00,  7.60s/it]
100%|██████████| 3/3 [00:18<00:00,  6.15s/it]


Total train loss: 1.92369544506073.


100%|██████████| 13/13 [01:26<00:00,  6.68s/it]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 500.16ba/s]


In [18]:
# Show the columns and row count of the test split with the "code" predictions attached.
dataset_test_code

Dataset({
    features: ['sentence', 'sentence_masked', 'masked_content', 'sample_id', 'numops', 'model_code_pred'],
    num_rows: 100
})

In [19]:
# Baseline: model_name="base" loads the pretrained "t5-base" tokenizer and model
# instead of a local checkpoint; it is trained on the same training examples.
model_preds_base, dataset_test_base = evaluate_model(model_name="base", zeroshot=False)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 

Total train loss: 2.442920446395874.


100%|██████████| 13/13 [01:12<00:00,  5.60s/it]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 499.92ba/s]


In [20]:
# Show the columns and row count of the test split with the "base" predictions attached.
dataset_test_base

Dataset({
    features: ['sentence', 'sentence_masked', 'masked_content', 'sample_id', 'numops', 'model_base_pred'],
    num_rows: 100
})

In [21]:
# Combine the three runs into one table. Inputs, gold answers and metadata are
# taken from the base run's test split; each model contributes one prediction
# column. Keyword order fixes the column order.
dataset_test_all = Dataset.from_dict(dict(
    # problem and solution
    sentence_masked=dataset_test_base['sentence_masked'],
    masked_content=dataset_test_base['masked_content'],
    # predictions
    model_base_pred=model_preds_base,
    model_math_pred=model_preds_math,
    model_code_pred=model_preds_code,
    # metadata
    sample_id=dataset_test_base['sample_id'],
    numops=dataset_test_base['numops'],
))

In [22]:
# Confirm the merged dataset has all seven columns and the expected row count.
dataset_test_all

Dataset({
    features: ['sentence_masked', 'masked_content', 'model_base_pred', 'model_math_pred', 'model_code_pred', 'sample_id', 'numops'],
    num_rows: 100
})

In [23]:
# Peek at the first five merged rows; the slice is displayed as a dict of column lists.
dataset_test_all[:5]

{'sentence_masked': ['Box 0 contains the boat and the shoe and the television, Box 1 contains the camera and the car and the plane, Box 2 contains the cake, Box 3 contains the fan, Box 4 contains the cross and the file and the note, Box 5 contains the chemical, Box 6 contains nothing. Move the cake from Box 2 to Box 6. Put the dish and the stone into Box 5. Move the television from Box 0 to Box 6. Put the drink into Box 2. Move the chemical and the stone from Box 5 to Box 2. Box 5 contains <extra_id_0> .',
  'Box 0 contains the boat and the shoe and the television, Box 1 contains the camera and the car and the plane, Box 2 contains the cake, Box 3 contains the fan, Box 4 contains the cross and the file and the note, Box 5 contains the chemical, Box 6 contains nothing. Move the cake from Box 2 to Box 6. Put the dish and the stone into Box 5. Move the television from Box 0 to Box 6. Put the drink into Box 2. Move the chemical and the stone from Box 5 to Box 2. Box 6 contains <extra_id_0>

In [24]:
# Save the merged five-shot predictions next to the per-model prediction files.
# (Plain string: the original f-string had no placeholders.)
dataset_test_all.to_json("./results/eval/preds_fiveshot_small.jsonl", orient="records")

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 333.52ba/s]


95517