# 6.8610  PROJECT

## Install and import libraries

In [17]:
# !pip install nltk
# !pip install datasets
# !pip install transformers[torch]
# !pip install tokenizers
# !pip install evaluate
# !pip install rouge_score
# !pip install sentencepiece
# !pip install huggingface_hub
# !pip install wandb

In [18]:
import nltk
import evaluate
import numpy as np
from datasets import load_dataset
from datasets import Dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import os
import json
import torch
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import gc
import pandas as pd
import random
random.seed(42)

In [19]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


## Load model

Instantiate a T5 tokenizer using the "t5-base" pre-trained model.

In [20]:
MODEL_NAME = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Create datasets

Code to preprocess and create the 5 datasets

### Create code dataset

In [21]:
def extract_text_before_example(text):
    return text.split("**Example 1:**")[0].strip()

In [22]:
def convert_folder_to_datasetdict_code(folder_path):
    """
    Converts data from a JSON file in the specified folder to a Hugging Face Dataset object.
    
    Parameters:
    - folder_path (str): Path to the folder containing the JSON file.

    Returns:
    datasets.Dataset: Hugging Face Dataset object containing 'code_with_problem', 'code_only', and 'question' fields.
    """
    with open(folder_path, 'r') as file:
        data = json.load(file)

    df = pd.json_normalize(data)
    dataset = Dataset.from_pandas(df[['code_with_problem', 'code_only']])
    dataset['question'] = dataset['code_with_problem'].apply(extract_text_before_example)
    return dataset

In [23]:
def convert_folder_to_datasetdict_code(folder_path):
    """
    Converts data from a JSON file in the specified folder to a Hugging Face Dataset object.
    
    Parameters:
    - folder_path (str): Path to the folder containing the JSON file.

    Returns:
    datasets.Dataset: Hugging Face Dataset object containing 'question' (extracted from 'code_with_problem') and 'answer' (renamed 'code_only') fields.
    """
    with open(folder_path, 'r') as file:
        data = json.load(file)

    df = pd.json_normalize(data)

    # Extract text before "Example 1" for the 'code_with_problem' column
    df['question'] = df['code_with_problem'].apply(extract_text_before_example)

    # Rename columns and create the dataset
    df.rename(columns={'code_only': 'answer'}, inplace=True)
    dataset = Dataset.from_pandas(df[['question', 'answer']])

    return dataset

In [24]:
code_dataset = convert_folder_to_datasetdict_code('data/code/leetcode-solutions.json')
train_dataset, test_dataset = train_test_split(code_dataset, test_size=0.2)
train_dataset = Dataset.from_dict(train_dataset)
test_dataset = Dataset.from_dict(test_dataset)

In [25]:
code_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

print(code_dict)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 1887
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 472
    })
})


In [26]:
n_train = code_dict['train'].num_rows
n_test = code_dict['test'].num_rows
n_total = n_train + n_test

In [27]:
code_dict["train"][0]

{'question': '# You are participating in an online chess tournament. There is a chess round that starts every `15` minutes. The first round of the day starts at `00:00`, and after every `15` minutes, a new round starts.\n\n*   For example, the second round starts at `00:15`, the fourth round starts at `00:45`, and the seventh round starts at `01:30`.\n\nYou are given two strings `loginTime` and `logoutTime` where:\n\n*   `loginTime` is the time you will login to the game, and\n*   `logoutTime` is the time you will logout from the game.\n\nIf `logoutTime` is **earlier** than `loginTime`, this means you have played from `loginTime` to midnight and from midnight to `logoutTime`.\n\nReturn _the number of full chess rounds you have played in the tournament_.\n\n**Note:** All the given times follow the 24-hour clock. That means the first round of the day starts at `00:00` and the last round of the day starts at `23:45`.',
 'answer': '```python\ndef second_largest_digit(s: str) -> int:\n    l

### Create general knowledge dataset

In [28]:
csv_file_path = 'data/general/general.csv'
general = pd.read_csv(csv_file_path)
general = general.sample(n=n_total, random_state=42)
general['id'] = range(len(general))
train_sample, test_sample = train_test_split(general, test_size=0.2, random_state=42)
train_sample.reset_index(drop=True, inplace=True)
test_sample.reset_index(drop=True, inplace=True)

train_dataset = Dataset.from_pandas(train_sample[['question', 'answer']])
test_dataset = Dataset.from_pandas(test_sample[['question', 'answer']])

general_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
})

print(general_dict)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 1887
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 472
    })
})


In [29]:
general_dict["train"][0]

{'question': 'Add 1 letter to "Iowa" to get the name of this tribe who lived south of the Iowa',
 'answer': 'Kiowa'}

### Create math dataset

In [30]:
def convert_folder_to_datasetdict_math(folder_path):
    #data = {"id": [], "question": [], "level": [], "type": [], "answer": []}
    data = {"question": [], "answer": []}

    subject_dictionary = {
        "algebra": 1,
        "counting_and_probability": 2,
        "geometry": 3,
        "intermediate_algebra": 4,
        "number_theory": 5,
        "prealgebra": 6,
        "precalculus": 7
    }

    for subdir, _, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(subdir, file)
            folder_name = os.path.basename(os.path.normpath(subdir))
            
            with open(file_path, "r") as f:
                json_data = json.load(f)
                problem = json_data.get("problem", "")
                level = json_data.get("level", "")
                type_ = json_data.get("type", "")
                solution = json_data.get("solution", "")

                # Generate id from subject code and file name
                file_name = os.path.splitext(file)[0]
                subject_code = subject_dictionary.get(folder_name, 0)  # Default to 0 if not found
                id_ = f"{subject_code}_{file_name}"

                #data["id"].append(id_)
                data["question"].append(problem)
                #data["level"].append(level)
                #data["type"].append(type_)
                data["answer"].append(solution)

    dataset = Dataset.from_dict(data)
    return dataset

In [14]:
math_train = convert_folder_to_datasetdict_math("data/math/train/")
math_test = convert_folder_to_datasetdict_math("data/math/test/")

In [15]:
math_train = math_train.shuffle(seed=42)
math_train = math_train.select(range(n_train))

math_test = math_test.shuffle(seed=42)
math_test = math_test.select(range(n_test))

math_dict = DatasetDict({
    'train': math_train,
    'test': math_test
})

print(math_dict)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 1887
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 472
    })
})


In [16]:
math_dict["train"][0]

{'question': 'What is the number of units in the distance between $(2,5)$ and $(-6,-1)$?',
 'answer': 'We use the distance formula: $\\sqrt{(-6 - 2)^2 + (-1 - 5)^2},$ so then we find that $\\sqrt{64 + 36} = \\boxed{10}$.\n\n- OR -\n\nWe note that the points $(2, 5)$, $(-6, -1)$, and $(2, -1)$ form a right triangle with legs of length 6 and 8. This is a Pythagorean triple, so the length of the hypotenuse must be $\\boxed{10}$.'}

### Create 50% samples

Preprocessing to create 50% samples. Will be used to created the 50% general and 50% math, and the 50% general and 50% code datasets.

In [31]:
math_train_sample = math_dict['train'].shuffle(seed=42).select([i for i in range(math_dict['train'].num_rows // 2)])
math_test_sample = math_dict['test'].shuffle(seed=42).select([i for i in range(math_dict['test'].num_rows // 2)])

general_train_sample = general_dict['train'].shuffle(seed=42).select([i for i in range(general_dict['train'].num_rows // 2+1)])
general_test_sample = general_dict['test'].shuffle(seed=42).select([i for i in range(general_dict['test'].num_rows // 2)])

code_train_sample = code_dict['train'].shuffle(seed=42).select([i for i in range(code_dict['train'].num_rows // 2)])
code_test_sample = code_dict['test'].shuffle(seed=42).select([i for i in range(code_dict['test'].num_rows // 2)])

### Create 50% general 50% math dataset


In [32]:
general_math_dict = DatasetDict({
    'train': Dataset.from_dict({
        'question': math_train_sample['question'] + general_train_sample['question'],
        'answer': math_train_sample['answer'] + general_train_sample['answer'],
    }),
    'test': Dataset.from_dict({
        'question': math_test_sample['question'] + general_test_sample['question'],
        'answer': math_test_sample['answer'] + general_test_sample['answer'],
    })
})


general_math_dict = DatasetDict({
    'train': general_math_dict['train'].shuffle(seed=42),
    'test': general_math_dict['test'].shuffle(seed=42)
})

In [33]:
general_math_dict["train"][0]

{'question': 'Find the monic quadratic polynomial, in $x,$ with real coefficients, which has $1 - i$ as a root.',
 'answer': 'If a polynomial has real coefficients, then any complex conjugate of a root must also be a root.  Hence, the other root is $1 + i.$  Thus, the polynomial is\n\\[(x - 1 - i)(x - 1 + i) = (x - 1)^2 - i^2 = \\boxed{x^2 - 2x + 2}.\\]'}

### Create 50% general 50% code dataset


In [34]:
general_code_dict = DatasetDict({
    'train': Dataset.from_dict({
        'question': code_train_sample['question'] + general_train_sample['question'],
        'answer': code_train_sample['answer'] + general_train_sample['answer'],
    }),
    'test': Dataset.from_dict({
        'question': code_test_sample['question'] + general_test_sample['question'],
        'answer': code_test_sample['answer'] + general_test_sample['answer'],
    })
})

general_code_dict = DatasetDict({
    'train': general_code_dict['train'].shuffle(seed=42),
    'test': general_code_dict['test'].shuffle(seed=42)
})

print(general_code_dict)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 1887
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 472
    })
})


In [35]:
general_code_dict["train"][0]

{'question': '# You are given a `m x n` matrix `grid` consisting of **non-negative** integers where `grid[row][col]` represents the **minimum** time required to be able to visit the cell `(row, col)`, which means you can visit the cell `(row, col)` only when the time you visit it is greater than or equal to `grid[row][col]`.\n\nYou are standing in the **top-left** cell of the matrix in the `0th` second, and you must move to **any** adjacent cell in the four directions: up, down, left, and right. Each move you make takes 1 second.\n\nReturn _the **minimum** time required in which you can visit the bottom-right cell of the matrix_. If you cannot visit the bottom-right cell, then return `-1`.',
 'answer': '```python\nfrom collections import deque\n\ndef minTime(grid: list[list[int]]) -> int:\n    m, n = len(grid), len(grid[0])\n    visited = [[1000000] * n for _ in range(m)]\n\n    dx = [-1, 0, 1, 0]\n    dy = [0, 1, 0, -1]\n\n    visited[0][0] = 0\n\n    q = deque([(0, 0)])\n\n    while 

## Different Sizes

Preprocessing to create smaller versions of the datasets (fewer number of fine-tuning examples)

In [36]:
def smaller_df(train, test, num_train = 500, seed = 42):
    """
    Creates smaller training and testing datasets by randomly selecting a specified number of samples.
    
    Parameters:
    - train (datasets.Dataset): Original training dataset.
    - test (datasets.Dataset): Original testing dataset.
    - num_train (int): Number of samples to include in the smaller training dataset.
    - seed (int): Seed for reproducibility.

    Returns:
    Tuple[datasets.DatasetDict, datasets.Dataset, datasets.Dataset]: A DatasetDict containing 'train' and 'test' keys, and the smaller training and testing datasets.
    """
    num_test = int(((100*num_train)/80) - num_train) #to get 20% test

    train_small = train.shuffle(seed = seed).select(range(num_train))
    test_small = test.shuffle(seed = seed).select(range(num_test))
    
    dict_final =  DatasetDict({
    'train': train_small,
    'test': test_small
})
    
    return dict_final, train_small, test_small

In [37]:
# General Knowledge/Facts
general_dict_small, general_train_small, general_test_small = smaller_df(general_dict['train'],
                                                                         general_dict['test'])
# Math
math_dict_small, math_train_small, math_test_small = smaller_df(math_train, math_test)

#  Code
code_dict_small, code_train_small, code_test_small = smaller_df(code_dict['train'], code_dict['test'])

## Preprocessing

### Main

In [None]:
prefix = "Please answer this question: "

def preprocess_function(examples):
    """
    Add prefix to the sentences, tokenize the text, and set the labels.

    Parameters:
    - examples (dict): A dictionary containing 'question' and 'answer' fields.

    Returns:
    dict: Tokenized model inputs with added prefix and labels.
    """
   # The "inputs" are the tokenized answer:
   inputs = [prefix + doc for doc in examples["question"]]
   model_inputs = tokenizer(inputs, max_length=128, truncation=True)
  
   # The "labels" are the tokenized outputs:
   labels = tokenizer(text_target=examples["answer"], 
                      max_length=512,         
                      truncation=True)

   model_inputs["labels"] = labels["input_ids"]
   return model_inputs

### Extension - Prompts

In [38]:
prefix = "Please answer this question while paying attention to how variables are used to track information: "

def preprocess_function2(examples):
    """
    Add prefix to the sentences, tokenize the text, and set the labels.

    Parameters:
    - examples (dict): A dictionary containing 'question' and 'answer' fields.

    Returns:
    dict: Tokenized model inputs with added prefix and labels.
    """
   # The "inputs" are the tokenized answer:
   inputs = [prefix + doc for doc in examples["question"]]
   model_inputs = tokenizer(inputs, max_length=128, truncation=True)
  
   # The "labels" are the tokenized outputs:
   labels = tokenizer(text_target=examples["answer"], 
                      max_length=512,         
                      truncation=True)

   model_inputs["labels"] = labels["input_ids"]
   return model_inputs

In [39]:
tokenized_dataset_math = math_dict.map(preprocess_function, batched=True)
tokenized_dataset_code = code_dict.map(preprocess_function, batched=True)
tokenized_dataset_general = general_dict.map(preprocess_function, batched=True)
tokenized_dataset_general_code = general_code_dict.map(preprocess_function, batched=True)
tokenized_dataset_general_math = general_math_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/1887 [00:00<?, ? examples/s]Map: 100%|██████████| 1887/1887 [00:00<00:00, 2173.53 examples/s]
Map: 100%|██████████| 472/472 [00:00<00:00, 2383.41 examples/s]
Map: 100%|██████████| 1887/1887 [00:00<00:00, 2005.92 examples/s]
Map: 100%|██████████| 472/472 [00:00<00:00, 1820.35 examples/s]
Map: 100%|██████████| 1887/1887 [00:00<00:00, 8841.24 examples/s]
Map: 100%|██████████| 472/472 [00:00<00:00, 8733.93 examples/s]
Map: 100%|██████████| 1887/1887 [00:00<00:00, 2998.78 examples/s]
Map: 100%|██████████| 472/472 [00:00<00:00, 3199.70 examples/s]
Map: 100%|██████████| 1887/1887 [00:00<00:00, 3656.19 examples/s]
Map: 100%|██████████| 472/472 [00:00<00:00, 3766.08 examples/s]


### Smaller Models

In [40]:
tokenized_dataset_math_small = math_dict_small.map(preprocess_function, batched=True)
tokenized_dataset_code_small = code_dict_small.map(preprocess_function, batched=True)
tokenized_dataset_general_small = general_dict_small.map(preprocess_function, batched=True)

Map: 100%|██████████| 500/500 [00:00<00:00, 2127.33 examples/s]
Map: 100%|██████████| 125/125 [00:00<00:00, 2292.92 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 2109.43 examples/s]
Map: 100%|██████████| 125/125 [00:00<00:00, 1999.54 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 9171.13 examples/s]
Map: 100%|██████████| 125/125 [00:00<00:00, 7812.48 examples/s]


In [41]:
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

In [42]:
def compute_metrics(eval_preds):
    """
    Compute metrics for evaluation predictions.

    Parameters:
    - eval_preds (tuple): Tuple containing predictions (preds) and labels.

    Returns:
    dict: Computed metrics using RougeLSum with newline-separated sentences.
    """
   preds, labels = eval_preds

   # decode preds and labels
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

   # rougeLSum expects newline after each sentence
   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  
   return result

## Fine-tuning

### Experiment

In [43]:
def training_args(output_dir, L_RATE = 3e-4, BATCH_SIZE = 4, PER_DEVICE_EVAL_BATCH = 4, WEIGHT_DECAY = 0.01, SAVE_TOTAL_LIM = 3, NUM_EPOCHS = 3, OVERWRITE_OUTPUT_DIR = True, LOAD_BEST_MODEL_AT_END = True):
    """
    Define training arguments for Seq2Seq training.

    Parameters:
    - output_dir (str): Output directory for saving model checkpoints and logs.
    - L_RATE (float): Learning rate for training.
    - BATCH_SIZE (int): Batch size for training.
    - PER_DEVICE_EVAL_BATCH (int): Batch size for evaluation.
    - WEIGHT_DECAY (float): Weight decay for optimization.
    - SAVE_TOTAL_LIM (int): Total number of checkpoints to save.
    - NUM_EPOCHS (int): Number of training epochs.
    - OVERWRITE_OUTPUT_DIR (bool): Whether to overwrite the output directory if it exists.
    - LOAD_BEST_MODEL_AT_END (bool): Whether to load the best model at the end of training.

    Returns:
    transformers.Seq2SeqTrainingArguments: Training arguments for Seq2Seq model.
    """
    training_args = Seq2SeqTrainingArguments(
       output_dir=output_dir,
       overwrite_output_dir=OVERWRITE_OUTPUT_DIR,
       save_strategy="epoch",
       evaluation_strategy="epoch",
       load_best_model_at_end=LOAD_BEST_MODEL_AT_END,
       learning_rate=L_RATE,
       per_device_train_batch_size=BATCH_SIZE,
       per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
       weight_decay=WEIGHT_DECAY,
       save_total_limit=SAVE_TOTAL_LIM,
       num_train_epochs=NUM_EPOCHS,
       predict_with_generate=True,
       push_to_hub=False
    )
    return training_args

In [44]:
lrs = [3e-3, 3e-4, 3e-5] 

### Math

In [45]:
for lr in lrs:
    model_math = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
    data_collator_math = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_math)
    
    output_dir_root = "./results/prompt_math"
    output_dir = f"{output_dir_root}/{lr:.0e}".replace("0", "")
    training_args_math = training_args(L_RATE = lr, BATCH_SIZE = 4, NUM_EPOCHS = 3, output_dir=output_dir)

    trainer_math = Seq2SeqTrainer(
       model=model_math,
       args=training_args_math,
       train_dataset=tokenized_dataset_math["train"], 
       eval_dataset=tokenized_dataset_math["test"],   
       tokenizer=tokenizer,
       data_collator=data_collator_math,
       compute_metrics=compute_metrics
    )

    trainer_math.train()

    del model_math, trainer_math
    gc.collect()
    torch.cuda.empty_cache()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myezixuanclara[0m. Use [1m`wandb login --relogin`[0m to force relogin


                                                  
 33%|███▎      | 472/1416 [02:08<02:55,  5.37it/s]

{'eval_loss': 2.157402276992798, 'eval_rouge1': 0.15791446753160732, 'eval_rouge2': 0.06253428545548179, 'eval_rougeL': 0.13231324716097564, 'eval_rougeLsum': 0.1450682450058146, 'eval_runtime': 41.9676, 'eval_samples_per_second': 11.247, 'eval_steps_per_second': 2.812, 'epoch': 1.0}


 35%|███▌      | 501/1416 [02:25<03:29,  4.36it/s]  

{'loss': 2.8422, 'learning_rate': 1.9406779661016948e-05, 'epoch': 1.06}


                                                  
 67%|██████▋   | 944/1416 [04:26<01:07,  6.95it/s]

{'eval_loss': 2.022892475128174, 'eval_rouge1': 0.143764168423235, 'eval_rouge2': 0.05453990187547328, 'eval_rougeL': 0.12376864720037017, 'eval_rougeLsum': 0.13246826930254563, 'eval_runtime': 39.6309, 'eval_samples_per_second': 11.91, 'eval_steps_per_second': 2.977, 'epoch': 2.0}


 71%|███████   | 1001/1416 [05:01<01:10,  5.88it/s] 

{'loss': 2.3288, 'learning_rate': 8.8135593220339e-06, 'epoch': 2.12}


                                                   
100%|██████████| 1416/1416 [06:59<00:00,  5.14it/s]

{'eval_loss': 1.9891810417175293, 'eval_rouge1': 0.14696774065600438, 'eval_rouge2': 0.056186690687112587, 'eval_rougeL': 0.12576572183593526, 'eval_rougeLsum': 0.13543265872320354, 'eval_runtime': 42.8265, 'eval_samples_per_second': 11.021, 'eval_steps_per_second': 2.755, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1416/1416 [07:15<00:00,  3.25it/s]


{'train_runtime': 438.19, 'train_samples_per_second': 12.919, 'train_steps_per_second': 3.231, 'train_loss': 2.4877741754391773, 'epoch': 3.0}


### Code

In [46]:
for lr in lrs:
    model_code = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
    data_collator_code = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_code)
    
    output_dir_root = "./results/prompt_code"
    output_dir = f"{output_dir_root}/{lr:.0e}".replace("0", "")
    training_args_code = training_args(L_RATE = lr, BATCH_SIZE = 4, NUM_EPOCHS = 3, output_dir=output_dir)

    trainer_code = Seq2SeqTrainer(
       model=model_code,
       args=training_args_code,
       train_dataset=tokenized_dataset_code["train"], 
       eval_dataset=tokenized_dataset_code["test"],   
       tokenizer=tokenizer,
       data_collator=data_collator_code,
       compute_metrics=compute_metrics
    )

    trainer_code.train()

    del model_code, trainer_code
    gc.collect()
    torch.cuda.empty_cache()


 33%|███▎      | 472/1416 [01:47<02:22,  6.64it/s]

{'eval_loss': 2.1423404216766357, 'eval_rouge1': 0.1182444351868111, 'eval_rouge2': 0.039196967880248804, 'eval_rougeL': 0.11625639712495212, 'eval_rougeLsum': 0.11622626648431872, 'eval_runtime': 38.4477, 'eval_samples_per_second': 12.276, 'eval_steps_per_second': 3.069, 'epoch': 1.0}


 35%|███▌      | 501/1416 [02:15<02:17,  6.64it/s]  

{'loss': 2.9453, 'learning_rate': 1.9406779661016948e-05, 'epoch': 1.06}



 67%|██████▋   | 944/1416 [04:02<01:09,  6.80it/s]

{'eval_loss': 1.9248026609420776, 'eval_rouge1': 0.11479931025698409, 'eval_rouge2': 0.04229243095290905, 'eval_rougeL': 0.1132031791854636, 'eval_rougeLsum': 0.1130226621738151, 'eval_runtime': 41.8051, 'eval_samples_per_second': 11.29, 'eval_steps_per_second': 2.823, 'epoch': 2.0}


 71%|███████   | 1001/1416 [04:29<00:56,  7.32it/s] 

{'loss': 2.2719, 'learning_rate': 8.8135593220339e-06, 'epoch': 2.12}



100%|██████████| 1416/1416 [06:08<00:00,  7.46it/s]

{'eval_loss': 1.8727754354476929, 'eval_rouge1': 0.10983940613667617, 'eval_rouge2': 0.04154851600131902, 'eval_rougeL': 0.10833415097896029, 'eval_rougeLsum': 0.10812533490939708, 'eval_runtime': 38.956, 'eval_samples_per_second': 12.116, 'eval_steps_per_second': 3.029, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1416/1416 [06:32<00:00,  3.61it/s]


{'train_runtime': 392.7687, 'train_samples_per_second': 14.413, 'train_steps_per_second': 3.605, 'train_loss': 2.4671413184559277, 'epoch': 3.0}


### General

In [None]:
for lr in lrs:
    model_general = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
    data_collator_general = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_general)

    output_dir_root = "./results/general"
    output_dir = f"{output_dir_root}/{lr:.0e}".replace("0", "")
    training_args_general = training_args(L_RATE = lr, BATCH_SIZE = 4, NUM_EPOCHS = 3, output_dir=output_dir)

    trainer_general = Seq2SeqTrainer(
       model=model_general,
       args=training_args_general,
       train_dataset=tokenized_dataset_general["train"], 
       eval_dataset=tokenized_dataset_general["test"],   
       tokenizer=tokenizer,
       data_collator=data_collator_general,
       compute_metrics=compute_metrics
    )

    trainer_general.train()

    del model_general, trainer_general
    gc.collect()
    torch.cuda.empty_cache()



### General + Code

In [29]:
for lr in lrs:
    model_general_code = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
    data_collator_general_code = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_general_code)

    output_dir_root = "./results/general_code"
    output_dir = f"{output_dir_root}/{lr:.0e}".replace("0", "")
    training_args_general_code = training_args(L_RATE = lr, BATCH_SIZE = 4, NUM_EPOCHS = 3, output_dir=output_dir)

    trainer_general_code = Seq2SeqTrainer(
       model=model_general_code,
       args=training_args_general_code,
       train_dataset=tokenized_dataset_general_code["train"], 
       eval_dataset=tokenized_dataset_general_code["test"],   
       tokenizer=tokenizer,
       data_collator=data_collator_general_code,
       compute_metrics=compute_metrics
    )

    trainer_general_code.train()

    del model_general_code, trainer_general_code
    gc.collect()
    torch.cuda.empty_cache()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myezixuanclara[0m. Use [1m`wandb login --relogin`[0m to force relogin


                                                  
 33%|███▎      | 472/1416 [01:43<02:13,  7.08it/s]

{'eval_loss': 2.132077693939209, 'eval_rouge1': 0.055051808944643704, 'eval_rouge2': 0.016222703488948487, 'eval_rougeL': 0.05433968281931898, 'eval_rougeLsum': 0.05441902197562565, 'eval_runtime': 38.6494, 'eval_samples_per_second': 12.212, 'eval_steps_per_second': 3.053, 'epoch': 1.0}


 35%|███▌      | 501/1416 [02:10<02:00,  7.60it/s]  

{'loss': 2.6281, 'learning_rate': 0.001940677966101695, 'epoch': 1.06}


                                                  
 67%|██████▋   | 944/1416 [03:46<01:12,  6.51it/s]

{'eval_loss': 1.8248320817947388, 'eval_rouge1': 0.07404124374521334, 'eval_rouge2': 0.019232725796420602, 'eval_rougeL': 0.07357504130459036, 'eval_rougeLsum': 0.07368692572109758, 'eval_runtime': 36.6937, 'eval_samples_per_second': 12.863, 'eval_steps_per_second': 3.216, 'epoch': 2.0}


 71%|███████   | 1001/1416 [04:05<00:53,  7.71it/s] 

{'loss': 1.7786, 'learning_rate': 0.0008813559322033899, 'epoch': 2.12}


                                                   
100%|██████████| 1416/1416 [05:37<00:00,  7.48it/s]

{'eval_loss': 1.736644983291626, 'eval_rouge1': 0.07237803222073752, 'eval_rouge2': 0.020723353671982178, 'eval_rougeL': 0.07175227425435451, 'eval_rougeLsum': 0.07182516121018465, 'eval_runtime': 36.3015, 'eval_samples_per_second': 13.002, 'eval_steps_per_second': 3.251, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1416/1416 [05:48<00:00,  4.06it/s]


{'train_runtime': 351.6636, 'train_samples_per_second': 16.098, 'train_steps_per_second': 4.027, 'train_loss': 1.9456297017760196, 'epoch': 3.0}


                                                  
 33%|███▎      | 472/1416 [01:38<01:56,  8.08it/s]

{'eval_loss': 1.760346531867981, 'eval_rouge1': 0.12132619838044889, 'eval_rouge2': 0.0371879520789538, 'eval_rougeL': 0.1208510402218295, 'eval_rougeLsum': 0.1210844252051826, 'eval_runtime': 34.9811, 'eval_samples_per_second': 13.493, 'eval_steps_per_second': 3.373, 'epoch': 1.0}


 35%|███▌      | 501/1416 [01:49<01:55,  7.90it/s]  

{'loss': 2.3452, 'learning_rate': 0.00019406779661016945, 'epoch': 1.06}


                                                  
 67%|██████▋   | 944/1416 [03:23<01:11,  6.58it/s]

{'eval_loss': 1.5788134336471558, 'eval_rouge1': 0.11538274777158411, 'eval_rouge2': 0.03491347388730446, 'eval_rougeL': 0.11516147987108587, 'eval_rougeLsum': 0.11438624150784589, 'eval_runtime': 36.8886, 'eval_samples_per_second': 12.795, 'eval_steps_per_second': 3.199, 'epoch': 2.0}


 71%|███████   | 1001/1416 [03:42<00:53,  7.70it/s] 

{'loss': 1.659, 'learning_rate': 8.813559322033898e-05, 'epoch': 2.12}


                                                   
100%|██████████| 1416/1416 [05:14<00:00,  7.26it/s]

{'eval_loss': 1.5343466997146606, 'eval_rouge1': 0.12369657569645576, 'eval_rouge2': 0.038815463721845155, 'eval_rougeL': 0.12219236045938289, 'eval_rougeLsum': 0.12186390364948171, 'eval_runtime': 36.8111, 'eval_samples_per_second': 12.822, 'eval_steps_per_second': 3.206, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1416/1416 [05:22<00:00,  4.39it/s]


{'train_runtime': 322.2697, 'train_samples_per_second': 17.566, 'train_steps_per_second': 4.394, 'train_loss': 1.8327802668857036, 'epoch': 3.0}



 33%|███▎      | 472/1416 [01:41<02:24,  6.52it/s]

{'eval_loss': 2.3541741371154785, 'eval_rouge1': 0.10543474610190923, 'eval_rouge2': 0.027912390306222754, 'eval_rougeL': 0.104541700149297, 'eval_rougeLsum': 0.10379574014373033, 'eval_runtime': 35.2405, 'eval_samples_per_second': 13.394, 'eval_steps_per_second': 3.348, 'epoch': 1.0}


 35%|███▌      | 501/1416 [01:59<02:44,  5.56it/s]  

{'loss': 3.1622, 'learning_rate': 1.9406779661016948e-05, 'epoch': 1.06}



 67%|██████▋   | 944/1416 [03:40<01:12,  6.55it/s]

{'eval_loss': 2.1470742225646973, 'eval_rouge1': 0.11440688184860422, 'eval_rouge2': 0.03585640348614795, 'eval_rougeL': 0.11380251997686144, 'eval_rougeLsum': 0.11309426185751145, 'eval_runtime': 36.0755, 'eval_samples_per_second': 13.084, 'eval_steps_per_second': 3.271, 'epoch': 2.0}


 71%|███████   | 1001/1416 [04:03<01:09,  5.99it/s] 

{'loss': 2.4896, 'learning_rate': 8.8135593220339e-06, 'epoch': 2.12}



100%|██████████| 1416/1416 [05:54<00:00,  6.22it/s]

{'eval_loss': 2.0940744876861572, 'eval_rouge1': 0.11471425599899554, 'eval_rouge2': 0.03639041958518499, 'eval_rougeL': 0.11419256284976584, 'eval_rougeLsum': 0.11350022159156378, 'eval_runtime': 36.0568, 'eval_samples_per_second': 13.09, 'eval_steps_per_second': 3.273, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1416/1416 [06:11<00:00,  6.22it/s]

{'train_runtime': 370.9641, 'train_samples_per_second': 15.26, 'train_steps_per_second': 3.817, 'train_loss': 2.68684063927602, 'epoch': 3.0}


100%|██████████| 1416/1416 [06:11<00:00,  3.81it/s]


### General + Math

In [29]:
for lr in lrs:
    model_general_math = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
    data_collator_general_math = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_general_math)

    output_dir_root = "./results/general_math"
    output_dir = f"{output_dir_root}/{lr:.0e}".replace("0", "")
    training_args_general_math = training_args(L_RATE = lr, BATCH_SIZE = 4, NUM_EPOCHS = 3, output_dir=output_dir)

    trainer_general_math = Seq2SeqTrainer(
       model=model_general_math,
       args=training_args_general_math,
       train_dataset=tokenized_dataset_general_math["train"], 
       eval_dataset=tokenized_dataset_general_math["test"],   
       tokenizer=tokenizer,
       data_collator=data_collator_general_math,
       compute_metrics=compute_metrics
    )

    trainer_general_math.train()

    del model_general_math, trainer_general_math
    gc.collect()
    torch.cuda.empty_cache()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myezixuanclara[0m. Use [1m`wandb login --relogin`[0m to force relogin


                                                  
 33%|███▎      | 472/1416 [02:07<02:35,  6.08it/s]

{'eval_loss': 2.5402798652648926, 'eval_rouge1': 0.032277776555268506, 'eval_rouge2': 0.0029141143878364728, 'eval_rougeL': 0.03204248391573171, 'eval_rougeLsum': 0.031950849918273065, 'eval_runtime': 37.0247, 'eval_samples_per_second': 12.748, 'eval_steps_per_second': 3.187, 'epoch': 1.0}


 35%|███▌      | 501/1416 [02:32<02:17,  6.63it/s]  

{'loss': 3.0606, 'learning_rate': 0.001940677966101695, 'epoch': 1.06}


                                                  
 67%|██████▋   | 944/1416 [04:16<01:17,  6.08it/s]

{'eval_loss': 2.3229095935821533, 'eval_rouge1': 0.032444724403872735, 'eval_rouge2': 0.0035144391149949, 'eval_rougeL': 0.03171422707637668, 'eval_rougeLsum': 0.03172260187056673, 'eval_runtime': 36.9298, 'eval_samples_per_second': 12.781, 'eval_steps_per_second': 3.195, 'epoch': 2.0}


 71%|███████   | 1001/1416 [04:32<01:09,  6.00it/s] 

{'loss': 2.2633, 'learning_rate': 0.0008813559322033899, 'epoch': 2.12}


                                                   
100%|██████████| 1416/1416 [06:11<00:00,  6.53it/s]

{'eval_loss': 2.247628927230835, 'eval_rouge1': 0.039121258214399816, 'eval_rouge2': 0.0032839848504259083, 'eval_rougeL': 0.03872789528552882, 'eval_rougeLsum': 0.038698280799125157, 'eval_runtime': 35.8744, 'eval_samples_per_second': 13.157, 'eval_steps_per_second': 3.289, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1416/1416 [06:22<00:00,  3.70it/s]


{'train_runtime': 385.1121, 'train_samples_per_second': 14.7, 'train_steps_per_second': 3.677, 'train_loss': 2.4215501257255254, 'epoch': 3.0}



 33%|███▎      | 472/1416 [01:48<02:38,  5.97it/s]

{'eval_loss': 1.9895418882369995, 'eval_rouge1': 0.1320215180700664, 'eval_rouge2': 0.03874952300747349, 'eval_rougeL': 0.12353501415658114, 'eval_rougeLsum': 0.1275214497003628, 'eval_runtime': 36.0791, 'eval_samples_per_second': 13.082, 'eval_steps_per_second': 3.271, 'epoch': 1.0}


 35%|███▌      | 501/1416 [02:00<02:13,  6.84it/s]  

{'loss': 2.4809, 'learning_rate': 0.00019406779661016945, 'epoch': 1.06}



 67%|██████▋   | 944/1416 [03:44<01:12,  6.51it/s]

{'eval_loss': 1.8589943647384644, 'eval_rouge1': 0.135461050614511, 'eval_rouge2': 0.038585915833919224, 'eval_rougeL': 0.12459804955343722, 'eval_rougeLsum': 0.12869890674568712, 'eval_runtime': 36.548, 'eval_samples_per_second': 12.915, 'eval_steps_per_second': 3.229, 'epoch': 2.0}


 71%|███████   | 1001/1416 [03:55<01:08,  6.07it/s] 

{'loss': 1.9168, 'learning_rate': 8.813559322033898e-05, 'epoch': 2.12}



100%|██████████| 1416/1416 [05:35<00:00,  6.85it/s]

{'eval_loss': 1.8303388357162476, 'eval_rouge1': 0.14259260962717454, 'eval_rouge2': 0.04300571794768096, 'eval_rougeL': 0.13308619842145114, 'eval_rougeLsum': 0.1364182526233655, 'eval_runtime': 36.3798, 'eval_samples_per_second': 12.974, 'eval_steps_per_second': 3.244, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1416/1416 [05:42<00:00,  4.14it/s]


{'train_runtime': 342.522, 'train_samples_per_second': 16.527, 'train_steps_per_second': 4.134, 'train_loss': 2.0566785995569608, 'epoch': 3.0}



 33%|███▎      | 472/1416 [01:48<02:35,  6.06it/s]

{'eval_loss': 2.3861546516418457, 'eval_rouge1': 0.1119125107451445, 'eval_rouge2': 0.0407551038633739, 'eval_rougeL': 0.10108785562620667, 'eval_rougeLsum': 0.10601038009687778, 'eval_runtime': 36.2431, 'eval_samples_per_second': 13.023, 'eval_steps_per_second': 3.256, 'epoch': 1.0}


 35%|███▌      | 501/1416 [02:01<02:24,  6.35it/s]  

{'loss': 3.0762, 'learning_rate': 1.9406779661016948e-05, 'epoch': 1.06}


                                                  
 67%|██████▋   | 944/1416 [03:46<01:13,  6.42it/s]

{'eval_loss': 2.2488908767700195, 'eval_rouge1': 0.12060288745848656, 'eval_rouge2': 0.039178581996189996, 'eval_rougeL': 0.11036107868460496, 'eval_rougeLsum': 0.11576035112598533, 'eval_runtime': 37.3415, 'eval_samples_per_second': 12.64, 'eval_steps_per_second': 3.16, 'epoch': 2.0}


 71%|███████   | 1001/1416 [04:03<01:13,  5.62it/s] 

{'loss': 2.5417, 'learning_rate': 8.8135593220339e-06, 'epoch': 2.12}



100%|██████████| 1416/1416 [05:46<00:00,  6.60it/s]

{'eval_loss': 2.2149879932403564, 'eval_rouge1': 0.11827782446039363, 'eval_rouge2': 0.0367991779285983, 'eval_rougeL': 0.1077602373969512, 'eval_rougeLsum': 0.11287944637073197, 'eval_runtime': 37.475, 'eval_samples_per_second': 12.595, 'eval_steps_per_second': 3.149, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1416/1416 [05:49<00:00,  4.05it/s]


{'train_runtime': 349.4659, 'train_samples_per_second': 16.199, 'train_steps_per_second': 4.052, 'train_loss': 2.6996212544414284, 'epoch': 3.0}


## Extension

In [32]:
lrs = [3e-4, 3e-5] 

### Small Math

In [33]:
for lr in lrs:
    model_small_math = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
    data_collator_math_small = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_small_math)
    
    output_dir_root = "./results/small_math"
    output_dir = f"{output_dir_root}/{lr:.0e}".replace("0", "")
    training_args_math_small = training_args(L_RATE = lr, BATCH_SIZE = 4, NUM_EPOCHS = 3, output_dir=output_dir)

    trainer_math_small = Seq2SeqTrainer(
       model=model_small_math,
       args=training_args_math_small,
       train_dataset=tokenized_dataset_math_small["train"], 
       eval_dataset=tokenized_dataset_math_small["test"],   
       tokenizer=tokenizer,
       data_collator=data_collator_math_small,
       compute_metrics=compute_metrics
    )

    trainer_math_small.train()

    del model_small_math, trainer_math_small
    gc.collect()
    torch.cuda.empty_cache()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myezixuanclara[0m. Use [1m`wandb login --relogin`[0m to force relogin


                                                 
 33%|███▎      | 125/375 [00:34<00:48,  5.17it/s]

{'eval_loss': 2.0678186416625977, 'eval_rouge1': 0.14753079887204634, 'eval_rouge2': 0.05311448081010642, 'eval_rougeL': 0.12152703358065112, 'eval_rougeLsum': 0.13587401576603353, 'eval_runtime': 11.2295, 'eval_samples_per_second': 11.131, 'eval_steps_per_second': 2.85, 'epoch': 1.0}


                                                 
 67%|██████▋   | 250/375 [01:24<00:26,  4.68it/s]

{'eval_loss': 1.9126132726669312, 'eval_rouge1': 0.15403537605975903, 'eval_rouge2': 0.06527070297054133, 'eval_rougeL': 0.13088972856643843, 'eval_rougeLsum': 0.1433835411893397, 'eval_runtime': 11.7696, 'eval_samples_per_second': 10.621, 'eval_steps_per_second': 2.719, 'epoch': 2.0}


                                                 
100%|██████████| 375/375 [02:09<00:00,  5.60it/s]

{'eval_loss': 1.875769019126892, 'eval_rouge1': 0.14871922689408157, 'eval_rouge2': 0.061755863660551844, 'eval_rougeL': 0.1268634629697864, 'eval_rougeLsum': 0.13713029994356685, 'eval_runtime': 11.2254, 'eval_samples_per_second': 11.135, 'eval_steps_per_second': 2.851, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 375/375 [02:11<00:00,  2.85it/s]


{'train_runtime': 134.7008, 'train_samples_per_second': 11.136, 'train_steps_per_second': 2.784, 'train_loss': 2.2278942057291666, 'epoch': 3.0}



 33%|███▎      | 125/375 [00:34<00:50,  4.96it/s]

{'eval_loss': 2.5613465309143066, 'eval_rouge1': 0.124674201440485, 'eval_rouge2': 0.048904178710419695, 'eval_rougeL': 0.10723991623451987, 'eval_rougeLsum': 0.11489920487529967, 'eval_runtime': 11.6595, 'eval_samples_per_second': 10.721, 'eval_steps_per_second': 2.745, 'epoch': 1.0}



 67%|██████▋   | 250/375 [01:20<00:22,  5.55it/s]

{'eval_loss': 2.4130988121032715, 'eval_rouge1': 0.1486222523556518, 'eval_rouge2': 0.0655636808190668, 'eval_rougeL': 0.12396740526687447, 'eval_rougeLsum': 0.13558528422910707, 'eval_runtime': 11.1454, 'eval_samples_per_second': 11.215, 'eval_steps_per_second': 2.871, 'epoch': 2.0}



100%|██████████| 375/375 [01:58<00:00,  5.31it/s]

{'eval_loss': 2.375945806503296, 'eval_rouge1': 0.1529917901464747, 'eval_rouge2': 0.06731742202098422, 'eval_rougeL': 0.1275433978983742, 'eval_rougeLsum': 0.140074213341659, 'eval_runtime': 12.2609, 'eval_samples_per_second': 10.195, 'eval_steps_per_second': 2.61, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 375/375 [02:19<00:00,  2.69it/s]


{'train_runtime': 139.3229, 'train_samples_per_second': 10.766, 'train_steps_per_second': 2.692, 'train_loss': 3.0271315104166665, 'epoch': 3.0}


### Small Code

In [34]:
for lr in lrs:
    model_small_code = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
    data_collator_code_small = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_small_code)
    
    output_dir_root = "./results/small_code"
    output_dir = f"{output_dir_root}/{lr:.0e}".replace("0", "")
    training_args_code_small = training_args(L_RATE = lr, BATCH_SIZE = 4, NUM_EPOCHS = 3, output_dir=output_dir)

    trainer_code_small = Seq2SeqTrainer(
       model=model_small_code,
       args=training_args_code_small,
       train_dataset=tokenized_dataset_code_small["train"], 
       eval_dataset=tokenized_dataset_code_small["test"],   
       tokenizer=tokenizer,
       data_collator=data_collator_code_small,
       compute_metrics=compute_metrics
    )

    trainer_code_small.train()

    del model_small_code, trainer_code_small
    gc.collect()
    torch.cuda.empty_cache()


 33%|███▎      | 125/375 [00:29<00:34,  7.34it/s]

{'eval_loss': 2.016775369644165, 'eval_rouge1': 0.11612913432142086, 'eval_rouge2': 0.04115832449191913, 'eval_rougeL': 0.11456502677605027, 'eval_rougeLsum': 0.11431124814196948, 'eval_runtime': 10.6537, 'eval_samples_per_second': 11.733, 'eval_steps_per_second': 3.004, 'epoch': 1.0}



 67%|██████▋   | 250/375 [01:20<00:18,  6.64it/s]

{'eval_loss': 1.7889060974121094, 'eval_rouge1': 0.11480620883423139, 'eval_rouge2': 0.04466836894982739, 'eval_rougeL': 0.11265325718667234, 'eval_rougeLsum': 0.11256915855636684, 'eval_runtime': 10.3492, 'eval_samples_per_second': 12.078, 'eval_steps_per_second': 3.092, 'epoch': 2.0}


                                                 
100%|██████████| 375/375 [01:56<00:00,  6.40it/s]

{'eval_loss': 1.7229335308074951, 'eval_rouge1': 0.11195872617542241, 'eval_rouge2': 0.04489759086861367, 'eval_rougeL': 0.11069500056759757, 'eval_rougeLsum': 0.11065847908914592, 'eval_runtime': 11.2752, 'eval_samples_per_second': 11.086, 'eval_steps_per_second': 2.838, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 375/375 [02:12<00:00,  2.83it/s]


{'train_runtime': 132.309, 'train_samples_per_second': 11.337, 'train_steps_per_second': 2.834, 'train_loss': 2.09246728515625, 'epoch': 3.0}



 33%|███▎      | 125/375 [00:46<00:50,  4.97it/s]

{'eval_loss': 2.841360569000244, 'eval_rouge1': 0.10304150997549864, 'eval_rouge2': 0.03568081898237045, 'eval_rougeL': 0.10188450896375748, 'eval_rougeLsum': 0.10186907991614094, 'eval_runtime': 13.5703, 'eval_samples_per_second': 9.211, 'eval_steps_per_second': 2.358, 'epoch': 1.0}



 67%|██████▋   | 250/375 [01:39<00:26,  4.65it/s]

{'eval_loss': 2.602370262145996, 'eval_rouge1': 0.12498260261167471, 'eval_rouge2': 0.03906354867748432, 'eval_rougeL': 0.12195713295643335, 'eval_rougeLsum': 0.12199950239308704, 'eval_runtime': 12.1936, 'eval_samples_per_second': 10.251, 'eval_steps_per_second': 2.624, 'epoch': 2.0}



100%|██████████| 375/375 [02:31<00:00,  4.86it/s]

{'eval_loss': 2.5382773876190186, 'eval_rouge1': 0.12598068779024985, 'eval_rouge2': 0.040002422760855236, 'eval_rougeL': 0.12354459804026416, 'eval_rougeLsum': 0.12334207922655581, 'eval_runtime': 11.4736, 'eval_samples_per_second': 10.895, 'eval_steps_per_second': 2.789, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 375/375 [02:54<00:00,  4.86it/s]

{'train_runtime': 174.6364, 'train_samples_per_second': 8.589, 'train_steps_per_second': 2.147, 'train_loss': 3.161347330729167, 'epoch': 3.0}


100%|██████████| 375/375 [02:55<00:00,  2.14it/s]


### Small General Knowledge

In [35]:
for lr in lrs:
    model_small_general = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
    data_collator_general_small = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_small_general)

    output_dir_root = "./results/small_general"
    output_dir = f"{output_dir_root}/{lr:.0e}".replace("0", "")
    training_args_general_small = training_args(L_RATE = lr, BATCH_SIZE = 4, NUM_EPOCHS = 3, output_dir=output_dir)

    trainer_general_small = Seq2SeqTrainer(
       model=model_small_general,
       args=training_args_general_small,
       train_dataset=tokenized_dataset_general_small["train"], 
       eval_dataset=tokenized_dataset_general_small["test"],   
       tokenizer=tokenizer,
       data_collator=data_collator_general_small,
       compute_metrics=compute_metrics
    )

    trainer_general_small.train()

    del model_small_general, trainer_general_small
    gc.collect()
    torch.cuda.empty_cache()

                                                 
 33%|███▎      | 125/375 [00:19<00:30,  8.27it/s]

{'eval_loss': 2.662008762359619, 'eval_rouge1': 0.11345518925518927, 'eval_rouge2': 0.026666666666666665, 'eval_rougeL': 0.11248473748473747, 'eval_rougeLsum': 0.11274676434676437, 'eval_runtime': 4.4644, 'eval_samples_per_second': 27.999, 'eval_steps_per_second': 7.168, 'epoch': 1.0}


                                                 
 67%|██████▋   | 250/375 [00:56<00:13,  9.15it/s]

{'eval_loss': 2.707836389541626, 'eval_rouge1': 0.13003809523809523, 'eval_rouge2': 0.029333333333333333, 'eval_rougeL': 0.12880317460317464, 'eval_rougeLsum': 0.13023809523809526, 'eval_runtime': 3.8162, 'eval_samples_per_second': 32.755, 'eval_steps_per_second': 8.385, 'epoch': 2.0}


                                                 
100%|██████████| 375/375 [01:27<00:00,  8.46it/s]

{'eval_loss': 2.7521719932556152, 'eval_rouge1': 0.13297633477633478, 'eval_rouge2': 0.03911111111111112, 'eval_rougeL': 0.133212987012987, 'eval_rougeLsum': 0.13399278499278505, 'eval_runtime': 4.3759, 'eval_samples_per_second': 28.566, 'eval_steps_per_second': 7.313, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 375/375 [01:36<00:00,  8.46it/s]

{'train_runtime': 96.4893, 'train_samples_per_second': 15.546, 'train_steps_per_second': 3.886, 'train_loss': 2.331829264322917, 'epoch': 3.0}


100%|██████████| 375/375 [01:36<00:00,  3.87it/s]

 33%|███▎      | 125/375 [00:20<00:30,  8.25it/s]

{'eval_loss': 2.800891637802124, 'eval_rouge1': 0.06090000000000001, 'eval_rouge2': 0.015238095238095238, 'eval_rougeL': 0.06083333333333333, 'eval_rougeLsum': 0.061333333333333344, 'eval_runtime': 4.6068, 'eval_samples_per_second': 27.134, 'eval_steps_per_second': 6.946, 'epoch': 1.0}



 67%|██████▋   | 250/375 [00:46<00:14,  8.40it/s]

{'eval_loss': 2.699368476867676, 'eval_rouge1': 0.08234285714285713, 'eval_rouge2': 0.019999999999999997, 'eval_rougeL': 0.08327619047619048, 'eval_rougeLsum': 0.08407619047619047, 'eval_runtime': 4.142, 'eval_samples_per_second': 30.179, 'eval_steps_per_second': 7.726, 'epoch': 2.0}



100%|██████████| 375/375 [01:16<00:00,  8.44it/s]

{'eval_loss': 2.682535171508789, 'eval_rouge1': 0.10452380952380952, 'eval_rouge2': 0.03047619047619048, 'eval_rougeL': 0.10563809523809525, 'eval_rougeLsum': 0.10454285714285715, 'eval_runtime': 4.1929, 'eval_samples_per_second': 29.812, 'eval_steps_per_second': 7.632, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 375/375 [01:40<00:00,  3.73it/s]


{'train_runtime': 100.6401, 'train_samples_per_second': 14.905, 'train_steps_per_second': 3.726, 'train_loss': 3.2527493489583335, 'epoch': 3.0}
