# 6.8610  PROJECT

## Install and import libraries

In [1]:
# !pip install nltk
# !pip install datasets
# !pip install transformers[torch]
# !pip install tokenizers
# !pip install evaluate
# !pip install rouge_score
# !pip install sentencepiece
# !pip install huggingface_hub
# !pip install wandb

In [2]:
import nltk
import evaluate
import numpy as np
from datasets import load_dataset
from datasets import Dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import os
import json
import torch
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import gc
import pandas as pd
import random
random.seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


## Load model

In [4]:
# Checkpoint name used for the tokenizer here and for every model loaded in the
# fine-tuning loops further down.
MODEL_NAME = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Create datasets

### Create code dataset

In [5]:
def extract_text_before_example(text):
    """Return the part of `text` that precedes the first "**Example 1:**" marker.

    Surrounding whitespace is stripped. When the marker does not occur, the whole
    (stripped) text is returned.
    """
    statement, _marker, _remainder = text.partition("**Example 1:**")
    return statement.strip()

In [6]:
def convert_folder_to_datasetdict_code(folder_path):
    """Load the code-solutions JSON file into a `Dataset` with a derived 'question' column.

    Note: a later cell in this notebook defines a function with the same name
    (returning 'question'/'answer' columns); that later definition is the one in
    effect once the notebook has been run top to bottom.

    Args:
        folder_path: Path passed straight to `open()`, i.e. the JSON file itself
            (despite the parameter name, not a directory).

    Returns:
        A `Dataset` with the columns 'code_with_problem', 'code_only' and 'question'.
    """
    with open(folder_path, 'r') as file:
        data = json.load(file)

    df = pd.json_normalize(data)
    # Derive the new column on the DataFrame, before conversion: a `datasets.Dataset`
    # supports neither item assignment nor a pandas-style `.apply` on a column.
    df['question'] = df['code_with_problem'].apply(extract_text_before_example)
    return Dataset.from_pandas(df[['code_with_problem', 'code_only', 'question']])

In [7]:
def convert_folder_to_datasetdict_code(folder_path):
    """Build a question/answer `Dataset` from the code-solutions JSON file.

    Args:
        folder_path: Path passed straight to `open()`, i.e. the JSON file itself
            (despite the parameter name, not a directory).

    Returns:
        A `Dataset` with two columns: 'question' (the text of 'code_with_problem'
        that precedes the first example) and 'answer' (the original 'code_only').
    """
    with open(folder_path, 'r') as handle:
        records = json.load(handle)

    frame = pd.json_normalize(records)
    # The problem statement is whatever comes before the first worked example.
    frame = frame.assign(question=frame['code_with_problem'].apply(extract_text_before_example))
    frame = frame.rename(columns={'code_only': 'answer'})

    return Dataset.from_pandas(frame[['question', 'answer']])

In [8]:
# Hold out 20% of the code problems for evaluation. The split is seeded explicitly:
# `random.seed(42)` only seeds Python's own generator, not scikit-learn, so without
# `random_state` a different partition would be drawn on every run.
code_dataset = convert_folder_to_datasetdict_code('data/code/leetcode-solutions.json')
train_dataset, test_dataset = train_test_split(code_dataset, test_size=0.2, random_state=42)
# The two pieces are passed through Dataset.from_dict to get Dataset objects back.
train_dataset = Dataset.from_dict(train_dataset)
test_dataset = Dataset.from_dict(test_dataset)

In [9]:
# Bundle the two code splits under 'train'/'test' keys and print the summary.
code_splits = {'train': train_dataset, 'test': test_dataset}
code_dict = DatasetDict(code_splits)

print(code_dict)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 1887
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 472
    })
})


In [10]:
# Row counts of the code splits; the general and math corpora below are
# subsampled to these sizes.
n_train, n_test = (code_dict[split].num_rows for split in ('train', 'test'))
n_total = n_train + n_test

In [11]:
# Show the first code training example to sanity-check the question/answer columns.
code_dict["train"][0]

{'question': '# You are given an integer array `nums` consisting of `n` elements, and an integer `k`.\n\nFind a contiguous subarray whose **length is greater than or equal to** `k` that has the maximum average value and return _this value_. Any answer with a calculation error less than `10-5` will be accepted.',
 'answer': '```python\ndef findMaxAverage(nums, k):\n    total = sum(nums[:k])\n    max_avg = total / k\n    for i in range(k, len(nums)):\n        total += nums[i] - nums[i - k]\n        max_avg = max(max_avg, total / k)\n    return max_avg\n```\n\n'}

### Create general knowledge dataset

In [12]:
# Draw as many general-knowledge rows as the code corpus has in total, then split
# them 80/20 with a fixed seed.
csv_file_path = 'data/general/general.csv'
general = (
    pd.read_csv(csv_file_path)
    .sample(n=n_total, random_state=42)
    .assign(id=lambda frame: range(len(frame)))
)
train_sample, test_sample = (
    part.reset_index(drop=True)
    for part in train_test_split(general, test_size=0.2, random_state=42)
)

# Only the question/answer columns are carried into the datasets.
train_dataset = Dataset.from_pandas(train_sample[['question', 'answer']])
test_dataset = Dataset.from_pandas(test_sample[['question', 'answer']])

general_splits = {'train': train_dataset, 'test': test_dataset}
general_dict = DatasetDict(general_splits)

print(general_dict)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 1887
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 472
    })
})


In [13]:
# Show the first general-knowledge training example.
general_dict["train"][0]

{'question': 'Add 1 letter to "Iowa" to get the name of this tribe who lived south of the Iowa',
 'answer': 'Kiowa'}

### Create math dataset

In [14]:
def convert_folder_to_datasetdict_math(folder_path):
    """Collect every problem JSON file under `folder_path` into a question/answer `Dataset`.

    The directory tree is walked recursively. From each JSON file the "problem"
    field becomes the question and the "solution" field becomes the answer; a
    missing field is recorded as an empty string.

    Args:
        folder_path: Root directory to walk.

    Returns:
        A `Dataset` with the columns 'question' and 'answer', one row per JSON
        file, ordered by sorted directory and file name.
    """
    data = {"question": [], "answer": []}

    for subdir, dirs, files in os.walk(folder_path):
        # os.walk yields entries in a filesystem-dependent order. Sorting (in place
        # for `dirs`, which steers the traversal) fixes the row order, so the seeded
        # shuffle/select applied afterwards picks the same rows on every machine.
        dirs.sort()
        for file in sorted(files):
            # Skip stray non-JSON files (e.g. OS metadata) instead of failing in json.load.
            if not file.lower().endswith(".json"):
                continue

            file_path = os.path.join(subdir, file)
            with open(file_path, "r", encoding="utf-8") as f:
                json_data = json.load(f)

            data["question"].append(json_data.get("problem", ""))
            data["answer"].append(json_data.get("solution", ""))

    return Dataset.from_dict(data)

In [15]:
# Load the math train and test problems from their respective directory trees.
math_train = convert_folder_to_datasetdict_math("data/math/train/")
math_test = convert_folder_to_datasetdict_math("data/math/test/")

In [16]:
# Shuffle each math split with a fixed seed and keep as many rows as the code
# corpus has in the corresponding split.
math_train = math_train.shuffle(seed=42).select(range(n_train))
math_test = math_test.shuffle(seed=42).select(range(n_test))

math_splits = {'train': math_train, 'test': math_test}
math_dict = DatasetDict(math_splits)

print(math_dict)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 1887
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 472
    })
})


In [17]:
# Show the first math training example.
math_dict["train"][0]

{'question': 'What is the number of units in the distance between $(2,5)$ and $(-6,-1)$?',
 'answer': 'We use the distance formula: $\\sqrt{(-6 - 2)^2 + (-1 - 5)^2},$ so then we find that $\\sqrt{64 + 36} = \\boxed{10}$.\n\n- OR -\n\nWe note that the points $(2, 5)$, $(-6, -1)$, and $(2, -1)$ form a right triangle with legs of length 6 and 8. This is a Pythagorean triple, so the length of the hypotenuse must be $\\boxed{10}$.'}

### Create 50% samples

In [18]:
def _leading_rows(split, count):
    """Shuffle `split` with seed 42 and keep its first `count` rows."""
    return split.shuffle(seed=42).select(list(range(count)))


# Half of every split. The general train half takes one extra row, presumably so
# that an odd-sized train set still adds up when two halves are mixed.
math_train_sample = _leading_rows(math_dict['train'], math_dict['train'].num_rows // 2)
math_test_sample = _leading_rows(math_dict['test'], math_dict['test'].num_rows // 2)

general_train_sample = _leading_rows(general_dict['train'], general_dict['train'].num_rows // 2 + 1)
general_test_sample = _leading_rows(general_dict['test'], general_dict['test'].num_rows // 2)

code_train_sample = _leading_rows(code_dict['train'], code_dict['train'].num_rows // 2)
code_test_sample = _leading_rows(code_dict['test'], code_dict['test'].num_rows // 2)

### Create 50% general 50% math dataset


In [19]:
# Concatenate the math half and the general half column by column, then shuffle
# each mixed split with a fixed seed so the two domains are interleaved.
_general_math_parts = {
    'train': (math_train_sample, general_train_sample),
    'test': (math_test_sample, general_test_sample),
}

general_math_dict = DatasetDict({
    split: Dataset.from_dict({
        column: math_part[column] + general_part[column]
        for column in ('question', 'answer')
    }).shuffle(seed=42)
    for split, (math_part, general_part) in _general_math_parts.items()
})

In [20]:
# Show the first example of the mixed general + math training split.
general_math_dict["train"][0]

{'question': 'Find the monic quadratic polynomial, in $x,$ with real coefficients, which has $1 - i$ as a root.',
 'answer': 'If a polynomial has real coefficients, then any complex conjugate of a root must also be a root.  Hence, the other root is $1 + i.$  Thus, the polynomial is\n\\[(x - 1 - i)(x - 1 + i) = (x - 1)^2 - i^2 = \\boxed{x^2 - 2x + 2}.\\]'}

### Create 50% general 50% code dataset


In [21]:
# Concatenate the code half and the general half column by column, then shuffle
# each mixed split with a fixed seed so the two domains are interleaved.
_general_code_parts = {
    'train': (code_train_sample, general_train_sample),
    'test': (code_test_sample, general_test_sample),
}

general_code_dict = DatasetDict({
    split: Dataset.from_dict({
        column: code_part[column] + general_part[column]
        for column in ('question', 'answer')
    }).shuffle(seed=42)
    for split, (code_part, general_part) in _general_code_parts.items()
})

print(general_code_dict)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 1887
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 472
    })
})


In [22]:
# Show the first example of the mixed general + code training split.
general_code_dict["train"][0]

{'question': '# You are given a 2D integer array `ranges` and two integers `left` and `right`. Each `ranges[i] = [starti, endi]` represents an **inclusive** interval between `starti` and `endi`.\n\nReturn `true` _if each integer in the inclusive range_ `[left, right]` _is covered by **at least one** interval in_ `ranges`. Return `false` _otherwise_.\n\nAn integer `x` is covered by an interval `ranges[i] = [starti, endi]` if `starti <= x <= endi`.',
 'answer': '```python\ndef isCovered(ranges: List[List[int]], left: int, right: int) -> bool:\n    for i in range(left, right + 1):\n        found = False\n        for _range in ranges:\n            if _range[0] <= i <= _range[1]:\n                found = True\n                break\n        if not found:\n            return False\n    return True\n```\n\n'}

## Preprocessing

In [23]:
prefix = "Please answer this question: "

def preprocess_function(examples):
    """Tokenize a batch of question/answer pairs for seq2seq training.

    Args:
        examples: Batch mapping with a "question" list and an "answer" list.

    Returns:
        The tokenizer output for the prefixed questions (truncated to 128 tokens)
        with a "labels" entry holding the token ids of the answers (truncated to
        512 tokens).
    """
    # Model inputs: every question with the task prefix prepended.
    prompts = [prefix + question for question in examples["question"]]
    model_inputs = tokenizer(prompts, max_length=128, truncation=True)

    # Labels: the answers, tokenized as targets.
    targets = tokenizer(text_target=examples["answer"], max_length=512, truncation=True)
    model_inputs["labels"] = targets["input_ids"]

    return model_inputs

In [24]:
# Tokenize all five corpora with the same batched preprocessing, in the order
# math, code, general, general+code, general+math.
(
    tokenized_dataset_math,
    tokenized_dataset_code,
    tokenized_dataset_general,
    tokenized_dataset_general_code,
    tokenized_dataset_general_math,
) = (
    corpus.map(preprocess_function, batched=True)
    for corpus in (math_dict, code_dict, general_dict, general_code_dict, general_math_dict)
)

Map:   0%|          | 0/1887 [00:00<?, ? examples/s]

Map: 100%|██████████| 1887/1887 [00:00<00:00, 1888.70 examples/s]
Map: 100%|██████████| 472/472 [00:00<00:00, 1377.98 examples/s]
Map: 100%|██████████| 1887/1887 [00:01<00:00, 1378.70 examples/s]
Map: 100%|██████████| 472/472 [00:00<00:00, 1298.12 examples/s]
Map: 100%|██████████| 1887/1887 [00:00<00:00, 7385.19 examples/s]
Map: 100%|██████████| 472/472 [00:00<00:00, 7433.36 examples/s]
Map: 100%|██████████| 1887/1887 [00:00<00:00, 2077.31 examples/s]
Map: 100%|██████████| 472/472 [00:00<00:00, 2913.18 examples/s]
Map: 100%|██████████| 1887/1887 [00:00<00:00, 1953.16 examples/s]
Map: 100%|██████████| 472/472 [00:00<00:00, 2563.30 examples/s]


In [25]:
# Fetch the NLTK "punkt" resource (quietly) for the sentence splitting done in
# compute_metrics, and load the ROUGE metric it scores with.
# NOTE(review): newer NLTK releases may look up "punkt_tab" instead — confirm
# against the installed version.
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

In [26]:
def compute_metrics(eval_preds):
    """Score generated sequences against the reference answers with ROUGE.

    Args:
        eval_preds: A `(predictions, labels)` pair of token-id arrays. The
            predictions may also be a tuple whose first element holds the ids.

    Returns:
        The result of `metric.compute` for the decoded predictions and labels.
    """
    preds, labels = eval_preds
    # Some models return a tuple of outputs; the generated ids come first.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a vocabulary id, so replace it
    # with the pad token before decoding — in the predictions as well as the labels.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    return metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

## Fine-tuning

### Experiment

In [27]:
def training_args(output_dir, L_RATE = 3e-4, BATCH_SIZE = 4, PER_DEVICE_EVAL_BATCH = 4, WEIGHT_DECAY = 0.01, SAVE_TOTAL_LIM = 3, NUM_EPOCHS = 3, OVERWRITE_OUTPUT_DIR = True, LOAD_BEST_MODEL_AT_END = True, GENERATION_MAX_LENGTH = None):
    """Build the `Seq2SeqTrainingArguments` shared by all fine-tuning runs.

    Checkpoints are saved and evaluation is run once per epoch, evaluation uses
    generation (`predict_with_generate=True`), and nothing is pushed to the Hub.

    Args:
        output_dir: Directory for checkpoints and trainer outputs.
        L_RATE: Learning rate.
        BATCH_SIZE: Per-device training batch size.
        PER_DEVICE_EVAL_BATCH: Per-device evaluation batch size.
        WEIGHT_DECAY: Weight decay.
        SAVE_TOTAL_LIM: Maximum number of checkpoints kept in `output_dir`.
        NUM_EPOCHS: Number of training epochs.
        OVERWRITE_OUTPUT_DIR: Whether existing content of `output_dir` may be overwritten.
        LOAD_BEST_MODEL_AT_END: Whether the best checkpoint is reloaded after training.
        GENERATION_MAX_LENGTH: Maximum length of the sequences generated during
            evaluation. The default `None` keeps the library/model default, which
            may be far shorter than the 512-token labels — verify before comparing
            ROUGE scores.

    Returns:
        The configured `Seq2SeqTrainingArguments` instance.
    """
    return Seq2SeqTrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=OVERWRITE_OUTPUT_DIR,
        save_strategy="epoch",
        evaluation_strategy="epoch",
        load_best_model_at_end=LOAD_BEST_MODEL_AT_END,
        learning_rate=L_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
        weight_decay=WEIGHT_DECAY,
        save_total_limit=SAVE_TOTAL_LIM,
        num_train_epochs=NUM_EPOCHS,
        predict_with_generate=True,
        generation_max_length=GENERATION_MAX_LENGTH,
        push_to_hub=False
    )

In [28]:
# Learning rates swept by each of the fine-tuning loops below.
lrs = [3e-3, 3e-4, 3e-5] 

### Math

In [29]:
# Fine-tune on the math corpus once per learning rate.
for lr in lrs:
    # Start every run from fresh pretrained weights so the runs are independent.
    model_math = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
    data_collator_math = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_math)

    # Tag the run directory with the learning rate (e.g. 3e-03 -> "3e-3"). Zeros
    # are stripped from the tag only, so the root path itself is never altered.
    output_dir_root = "./results/math"
    lr_tag = f"{lr:.0e}".replace("0", "")
    output_dir = f"{output_dir_root}/{lr_tag}"
    training_args_math = training_args(L_RATE = lr, BATCH_SIZE = 4, NUM_EPOCHS = 3, output_dir=output_dir)

    trainer_math = Seq2SeqTrainer(
       model=model_math,
       args=training_args_math,
       train_dataset=tokenized_dataset_math["train"],
       eval_dataset=tokenized_dataset_math["test"],
       tokenizer=tokenizer,
       data_collator=data_collator_math,
       compute_metrics=compute_metrics
    )

    trainer_math.train()

    # The collator was built with a reference to the model, so it must be dropped
    # too; otherwise the weights stay alive while the next model is loaded.
    del model_math, trainer_math, data_collator_math
    gc.collect()
    torch.cuda.empty_cache()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myezixuanclara[0m. Use [1m`wandb login --relogin`[0m to force relogin


                                                  
 33%|███▎      | 472/1416 [02:08<02:52,  5.46it/s]

{'eval_loss': 1.8149617910385132, 'eval_rouge1': 0.10995362110086625, 'eval_rouge2': 0.03205449885714297, 'eval_rougeL': 0.09534439305275969, 'eval_rougeLsum': 0.10215049099534834, 'eval_runtime': 41.9023, 'eval_samples_per_second': 11.264, 'eval_steps_per_second': 2.816, 'epoch': 1.0}


 35%|███▌      | 501/1416 [02:26<02:50,  5.36it/s]  

{'loss': 2.3497, 'learning_rate': 0.001940677966101695, 'epoch': 1.06}


                                                  
 67%|██████▋   | 944/1416 [04:31<01:10,  6.69it/s]

{'eval_loss': 1.5980454683303833, 'eval_rouge1': 0.14648329059044826, 'eval_rouge2': 0.05500205586630799, 'eval_rougeL': 0.1257581518045653, 'eval_rougeLsum': 0.13622542966603407, 'eval_runtime': 43.3151, 'eval_samples_per_second': 10.897, 'eval_steps_per_second': 2.724, 'epoch': 2.0}


 71%|███████   | 1001/1416 [04:53<01:09,  6.01it/s] 

{'loss': 1.6592, 'learning_rate': 0.0008813559322033899, 'epoch': 2.12}


                                                   
100%|██████████| 1416/1416 [06:52<00:00,  5.17it/s]

{'eval_loss': 1.5386518239974976, 'eval_rouge1': 0.13239823500131376, 'eval_rouge2': 0.048245988176787366, 'eval_rougeL': 0.11548947331316065, 'eval_rougeLsum': 0.12385955189580117, 'eval_runtime': 42.7494, 'eval_samples_per_second': 11.041, 'eval_steps_per_second': 2.76, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1416/1416 [07:00<00:00,  3.37it/s]


{'train_runtime': 422.5263, 'train_samples_per_second': 13.398, 'train_steps_per_second': 3.351, 'train_loss': 1.8174031144481595, 'epoch': 3.0}


                                                  
 33%|███▎      | 472/1416 [02:09<02:54,  5.42it/s]

{'eval_loss': 1.7312365770339966, 'eval_rouge1': 0.15218676157844974, 'eval_rouge2': 0.06163434755852851, 'eval_rougeL': 0.13078325801480137, 'eval_rougeLsum': 0.14037009582489374, 'eval_runtime': 43.6071, 'eval_samples_per_second': 10.824, 'eval_steps_per_second': 2.706, 'epoch': 1.0}


 35%|███▌      | 501/1416 [02:23<02:54,  5.24it/s]  

{'loss': 2.2384, 'learning_rate': 0.00019406779661016945, 'epoch': 1.06}


                                                  
 67%|██████▋   | 944/1416 [04:27<01:09,  6.76it/s]

{'eval_loss': 1.6180236339569092, 'eval_rouge1': 0.1419294322606442, 'eval_rouge2': 0.05933814874088656, 'eval_rougeL': 0.12402573039460824, 'eval_rougeLsum': 0.13327062586543964, 'eval_runtime': 42.3857, 'eval_samples_per_second': 11.136, 'eval_steps_per_second': 2.784, 'epoch': 2.0}


 71%|███████   | 1001/1416 [04:40<01:09,  5.99it/s] 

{'loss': 1.7438, 'learning_rate': 8.813559322033898e-05, 'epoch': 2.12}


                                                   
100%|██████████| 1416/1416 [06:36<00:00,  5.23it/s]

{'eval_loss': 1.587369680404663, 'eval_rouge1': 0.15091141282307097, 'eval_rouge2': 0.06177349036870975, 'eval_rougeL': 0.1303818569072119, 'eval_rougeLsum': 0.140499861351289, 'eval_runtime': 41.1036, 'eval_samples_per_second': 11.483, 'eval_steps_per_second': 2.871, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1416/1416 [06:39<00:00,  3.54it/s]


{'train_runtime': 399.528, 'train_samples_per_second': 14.169, 'train_steps_per_second': 3.544, 'train_loss': 1.878079990882658, 'epoch': 3.0}


                                                  
 33%|███▎      | 472/1416 [02:22<03:14,  4.85it/s]

{'eval_loss': 2.173102855682373, 'eval_rouge1': 0.1509541489128586, 'eval_rouge2': 0.05838174564159471, 'eval_rougeL': 0.1268694329637305, 'eval_rougeLsum': 0.1389241976155938, 'eval_runtime': 41.4301, 'eval_samples_per_second': 11.393, 'eval_steps_per_second': 2.848, 'epoch': 1.0}


 35%|███▌      | 501/1416 [02:31<03:12,  4.77it/s]  

{'loss': 2.861, 'learning_rate': 1.9406779661016948e-05, 'epoch': 1.06}


                                                  
 67%|██████▋   | 944/1416 [04:43<01:16,  6.16it/s]

{'eval_loss': 2.0327463150024414, 'eval_rouge1': 0.14705630121860458, 'eval_rouge2': 0.056810334381106616, 'eval_rougeL': 0.12533656587980546, 'eval_rougeLsum': 0.1356657964352232, 'eval_runtime': 41.0923, 'eval_samples_per_second': 11.486, 'eval_steps_per_second': 2.872, 'epoch': 2.0}


 71%|███████   | 1001/1416 [04:57<01:16,  5.39it/s] 

{'loss': 2.343, 'learning_rate': 8.8135593220339e-06, 'epoch': 2.12}


                                                   
100%|██████████| 1416/1416 [07:03<00:00,  4.64it/s]

{'eval_loss': 1.9992949962615967, 'eval_rouge1': 0.1474048317605285, 'eval_rouge2': 0.05774543918221979, 'eval_rougeL': 0.12583707187584028, 'eval_rougeLsum': 0.13708150892638965, 'eval_runtime': 40.9129, 'eval_samples_per_second': 11.537, 'eval_steps_per_second': 2.884, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1416/1416 [07:10<00:00,  3.29it/s]


{'train_runtime': 430.8929, 'train_samples_per_second': 13.138, 'train_steps_per_second': 3.286, 'train_loss': 2.502729642189155, 'epoch': 3.0}


### Code

In [30]:
# Fine-tune on the code corpus once per learning rate.
for lr in lrs:
    # Start every run from fresh pretrained weights so the runs are independent.
    model_code = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
    data_collator_code = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_code)

    # Tag the run directory with the learning rate (e.g. 3e-03 -> "3e-3"). Zeros
    # are stripped from the tag only, so the root path itself is never altered.
    output_dir_root = "./results/code"
    lr_tag = f"{lr:.0e}".replace("0", "")
    output_dir = f"{output_dir_root}/{lr_tag}"
    training_args_code = training_args(L_RATE = lr, BATCH_SIZE = 4, NUM_EPOCHS = 3, output_dir=output_dir)

    trainer_code = Seq2SeqTrainer(
       model=model_code,
       args=training_args_code,
       train_dataset=tokenized_dataset_code["train"],
       eval_dataset=tokenized_dataset_code["test"],
       tokenizer=tokenizer,
       data_collator=data_collator_code,
       compute_metrics=compute_metrics
    )

    trainer_code.train()

    # The collator was built with a reference to the model, so it must be dropped
    # too; otherwise the weights stay alive while the next model is loaded.
    del model_code, trainer_code, data_collator_code
    gc.collect()
    torch.cuda.empty_cache()

                                                  
 33%|███▎      | 472/1416 [05:20<07:42,  2.04it/s]

{'eval_loss': 1.5350375175476074, 'eval_rouge1': 0.11721102782605994, 'eval_rouge2': 0.034336015307031405, 'eval_rougeL': 0.11502648637877302, 'eval_rougeLsum': 0.11501143176245161, 'eval_runtime': 62.031, 'eval_samples_per_second': 7.609, 'eval_steps_per_second': 1.902, 'epoch': 1.0}


 35%|███▌      | 501/1416 [05:54<08:08,  1.87it/s]  

{'loss': 2.061, 'learning_rate': 0.001940677966101695, 'epoch': 1.06}


                                                  
 67%|██████▋   | 944/1416 [11:05<03:55,  2.00it/s]

{'eval_loss': 1.264697790145874, 'eval_rouge1': 0.10659240077007925, 'eval_rouge2': 0.03670836595815304, 'eval_rougeL': 0.10468458144177192, 'eval_rougeLsum': 0.10468716007322851, 'eval_runtime': 55.0834, 'eval_samples_per_second': 8.569, 'eval_steps_per_second': 2.142, 'epoch': 2.0}


 71%|███████   | 1001/1416 [11:37<01:08,  6.02it/s] 

{'loss': 1.3332, 'learning_rate': 0.0008813559322033899, 'epoch': 2.12}


                                                   
100%|██████████| 1416/1416 [15:57<00:00,  3.33it/s]

{'eval_loss': 1.1811020374298096, 'eval_rouge1': 0.10671584122236769, 'eval_rouge2': 0.038957483768928686, 'eval_rougeL': 0.10457401587359638, 'eval_rougeLsum': 0.10459596143137674, 'eval_runtime': 53.9787, 'eval_samples_per_second': 8.744, 'eval_steps_per_second': 2.186, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1416/1416 [16:03<00:00,  1.47it/s]


{'train_runtime': 963.0746, 'train_samples_per_second': 5.878, 'train_steps_per_second': 1.47, 'train_loss': 1.5088243861656405, 'epoch': 3.0}



 33%|███▎      | 472/1416 [06:28<08:43,  1.80it/s]

{'eval_loss': 1.4910694360733032, 'eval_rouge1': 0.11911878258078282, 'eval_rouge2': 0.0444059650080581, 'eval_rougeL': 0.11698277072628263, 'eval_rougeLsum': 0.11654976093618492, 'eval_runtime': 57.2262, 'eval_samples_per_second': 8.248, 'eval_steps_per_second': 2.062, 'epoch': 1.0}


 35%|███▌      | 502/1416 [07:04<06:18,  2.42it/s]  

{'loss': 2.0653, 'learning_rate': 0.00019406779661016945, 'epoch': 1.06}



 67%|██████▋   | 944/1416 [13:00<06:30,  1.21it/s]

{'eval_loss': 1.328734040260315, 'eval_rouge1': 0.11465649503651226, 'eval_rouge2': 0.046958915863290096, 'eval_rougeL': 0.11268805229691237, 'eval_rougeLsum': 0.11251093973210091, 'eval_runtime': 57.9208, 'eval_samples_per_second': 8.149, 'eval_steps_per_second': 2.037, 'epoch': 2.0}


 71%|███████   | 1001/1416 [13:51<01:11,  5.80it/s] 

{'loss': 1.4615, 'learning_rate': 8.813559322033898e-05, 'epoch': 2.12}



100%|██████████| 1416/1416 [19:40<00:00,  2.25it/s]

{'eval_loss': 1.2787977457046509, 'eval_rouge1': 0.11488158867090989, 'eval_rouge2': 0.04650912829189177, 'eval_rougeL': 0.11301840310581476, 'eval_rougeLsum': 0.11313591044535914, 'eval_runtime': 58.0379, 'eval_samples_per_second': 8.133, 'eval_steps_per_second': 2.033, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1416/1416 [19:50<00:00,  1.19it/s]


{'train_runtime': 1190.0221, 'train_samples_per_second': 4.757, 'train_steps_per_second': 1.19, 'train_loss': 1.6281611986752957, 'epoch': 3.0}



 33%|███▎      | 472/1416 [14:13<19:45,  1.26s/it]

{'eval_loss': 2.1649770736694336, 'eval_rouge1': 0.12094778151993357, 'eval_rouge2': 0.041445618743021245, 'eval_rougeL': 0.11924863436296773, 'eval_rougeLsum': 0.1191366371153875, 'eval_runtime': 126.3156, 'eval_samples_per_second': 3.737, 'eval_steps_per_second': 0.934, 'epoch': 1.0}


 35%|███▌      | 500/1416 [15:17<23:27,  1.54s/it]   

{'loss': 2.939, 'learning_rate': 1.9406779661016948e-05, 'epoch': 1.06}



 67%|██████▋   | 944/1416 [28:35<12:25,  1.58s/it]

{'eval_loss': 1.9423874616622925, 'eval_rouge1': 0.1140291826879622, 'eval_rouge2': 0.04396361155780263, 'eval_rougeL': 0.11122934139816165, 'eval_rougeLsum': 0.11116114169005785, 'eval_runtime': 125.1329, 'eval_samples_per_second': 3.772, 'eval_steps_per_second': 0.943, 'epoch': 2.0}


 71%|███████   | 1000/1416 [30:12<06:28,  1.07it/s] 

{'loss': 2.2665, 'learning_rate': 8.8135593220339e-06, 'epoch': 2.12}



100%|██████████| 1416/1416 [43:04<00:00,  1.15s/it]

{'eval_loss': 1.8862913846969604, 'eval_rouge1': 0.11416862090139981, 'eval_rouge2': 0.043537742104730416, 'eval_rougeL': 0.11160464533282688, 'eval_rougeLsum': 0.1115809059668732, 'eval_runtime': 125.6944, 'eval_samples_per_second': 3.755, 'eval_steps_per_second': 0.939, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1416/1416 [43:09<00:00,  1.83s/it]


{'train_runtime': 2589.9732, 'train_samples_per_second': 2.186, 'train_steps_per_second': 0.547, 'train_loss': 2.470928213690634, 'epoch': 3.0}


### General

In [None]:
# Fine-tune on the general-knowledge corpus once per learning rate.
for lr in lrs:
    # Start every run from fresh pretrained weights so the runs are independent.
    model_general = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
    data_collator_general = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_general)

    # Tag the run directory with the learning rate (e.g. 3e-03 -> "3e-3"). Zeros
    # are stripped from the tag only, so the root path itself is never altered.
    output_dir_root = "./results/general"
    lr_tag = f"{lr:.0e}".replace("0", "")
    output_dir = f"{output_dir_root}/{lr_tag}"
    training_args_general = training_args(L_RATE = lr, BATCH_SIZE = 4, NUM_EPOCHS = 3, output_dir=output_dir)

    trainer_general = Seq2SeqTrainer(
       model=model_general,
       args=training_args_general,
       train_dataset=tokenized_dataset_general["train"],
       eval_dataset=tokenized_dataset_general["test"],
       tokenizer=tokenizer,
       data_collator=data_collator_general,
       compute_metrics=compute_metrics
    )

    trainer_general.train()

    # The collator was built with a reference to the model, so it must be dropped
    # too; otherwise the weights stay alive while the next model is loaded.
    del model_general, trainer_general, data_collator_general
    gc.collect()
    torch.cuda.empty_cache()



### General + Code

In [29]:
# Fine-tune on the mixed general + code corpus once per learning rate.
for lr in lrs:
    # Start every run from fresh pretrained weights so the runs are independent.
    model_general_code = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
    data_collator_general_code = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_general_code)

    # Tag the run directory with the learning rate (e.g. 3e-03 -> "3e-3"). Zeros
    # are stripped from the tag only, so the root path itself is never altered.
    output_dir_root = "./results/general_code"
    lr_tag = f"{lr:.0e}".replace("0", "")
    output_dir = f"{output_dir_root}/{lr_tag}"
    training_args_general_code = training_args(L_RATE = lr, BATCH_SIZE = 4, NUM_EPOCHS = 3, output_dir=output_dir)

    trainer_general_code = Seq2SeqTrainer(
       model=model_general_code,
       args=training_args_general_code,
       train_dataset=tokenized_dataset_general_code["train"],
       eval_dataset=tokenized_dataset_general_code["test"],
       tokenizer=tokenizer,
       data_collator=data_collator_general_code,
       compute_metrics=compute_metrics
    )

    trainer_general_code.train()

    # The collator was built with a reference to the model, so it must be dropped
    # too; otherwise the weights stay alive while the next model is loaded.
    del model_general_code, trainer_general_code, data_collator_general_code
    gc.collect()
    torch.cuda.empty_cache()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myezixuanclara[0m. Use [1m`wandb login --relogin`[0m to force relogin


                                                  
 33%|███▎      | 472/1416 [01:43<02:13,  7.08it/s]

{'eval_loss': 2.132077693939209, 'eval_rouge1': 0.055051808944643704, 'eval_rouge2': 0.016222703488948487, 'eval_rougeL': 0.05433968281931898, 'eval_rougeLsum': 0.05441902197562565, 'eval_runtime': 38.6494, 'eval_samples_per_second': 12.212, 'eval_steps_per_second': 3.053, 'epoch': 1.0}


 35%|███▌      | 501/1416 [02:10<02:00,  7.60it/s]  

{'loss': 2.6281, 'learning_rate': 0.001940677966101695, 'epoch': 1.06}


                                                  
 67%|██████▋   | 944/1416 [03:46<01:12,  6.51it/s]

{'eval_loss': 1.8248320817947388, 'eval_rouge1': 0.07404124374521334, 'eval_rouge2': 0.019232725796420602, 'eval_rougeL': 0.07357504130459036, 'eval_rougeLsum': 0.07368692572109758, 'eval_runtime': 36.6937, 'eval_samples_per_second': 12.863, 'eval_steps_per_second': 3.216, 'epoch': 2.0}


 71%|███████   | 1001/1416 [04:05<00:53,  7.71it/s] 

{'loss': 1.7786, 'learning_rate': 0.0008813559322033899, 'epoch': 2.12}


                                                   
100%|██████████| 1416/1416 [05:37<00:00,  7.48it/s]

{'eval_loss': 1.736644983291626, 'eval_rouge1': 0.07237803222073752, 'eval_rouge2': 0.020723353671982178, 'eval_rougeL': 0.07175227425435451, 'eval_rougeLsum': 0.07182516121018465, 'eval_runtime': 36.3015, 'eval_samples_per_second': 13.002, 'eval_steps_per_second': 3.251, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1416/1416 [05:48<00:00,  4.06it/s]


{'train_runtime': 351.6636, 'train_samples_per_second': 16.098, 'train_steps_per_second': 4.027, 'train_loss': 1.9456297017760196, 'epoch': 3.0}


                                                  
 33%|███▎      | 472/1416 [01:38<01:56,  8.08it/s]

{'eval_loss': 1.760346531867981, 'eval_rouge1': 0.12132619838044889, 'eval_rouge2': 0.0371879520789538, 'eval_rougeL': 0.1208510402218295, 'eval_rougeLsum': 0.1210844252051826, 'eval_runtime': 34.9811, 'eval_samples_per_second': 13.493, 'eval_steps_per_second': 3.373, 'epoch': 1.0}


 35%|███▌      | 501/1416 [01:49<01:55,  7.90it/s]  

{'loss': 2.3452, 'learning_rate': 0.00019406779661016945, 'epoch': 1.06}


                                                  
 67%|██████▋   | 944/1416 [03:23<01:11,  6.58it/s]

{'eval_loss': 1.5788134336471558, 'eval_rouge1': 0.11538274777158411, 'eval_rouge2': 0.03491347388730446, 'eval_rougeL': 0.11516147987108587, 'eval_rougeLsum': 0.11438624150784589, 'eval_runtime': 36.8886, 'eval_samples_per_second': 12.795, 'eval_steps_per_second': 3.199, 'epoch': 2.0}


 71%|███████   | 1001/1416 [03:42<00:53,  7.70it/s] 

{'loss': 1.659, 'learning_rate': 8.813559322033898e-05, 'epoch': 2.12}


                                                   
100%|██████████| 1416/1416 [05:14<00:00,  7.26it/s]

{'eval_loss': 1.5343466997146606, 'eval_rouge1': 0.12369657569645576, 'eval_rouge2': 0.038815463721845155, 'eval_rougeL': 0.12219236045938289, 'eval_rougeLsum': 0.12186390364948171, 'eval_runtime': 36.8111, 'eval_samples_per_second': 12.822, 'eval_steps_per_second': 3.206, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1416/1416 [05:22<00:00,  4.39it/s]


{'train_runtime': 322.2697, 'train_samples_per_second': 17.566, 'train_steps_per_second': 4.394, 'train_loss': 1.8327802668857036, 'epoch': 3.0}



 33%|███▎      | 472/1416 [01:41<02:24,  6.52it/s]

{'eval_loss': 2.3541741371154785, 'eval_rouge1': 0.10543474610190923, 'eval_rouge2': 0.027912390306222754, 'eval_rougeL': 0.104541700149297, 'eval_rougeLsum': 0.10379574014373033, 'eval_runtime': 35.2405, 'eval_samples_per_second': 13.394, 'eval_steps_per_second': 3.348, 'epoch': 1.0}


 35%|███▌      | 501/1416 [01:59<02:44,  5.56it/s]  

{'loss': 3.1622, 'learning_rate': 1.9406779661016948e-05, 'epoch': 1.06}



 67%|██████▋   | 944/1416 [03:40<01:12,  6.55it/s]

{'eval_loss': 2.1470742225646973, 'eval_rouge1': 0.11440688184860422, 'eval_rouge2': 0.03585640348614795, 'eval_rougeL': 0.11380251997686144, 'eval_rougeLsum': 0.11309426185751145, 'eval_runtime': 36.0755, 'eval_samples_per_second': 13.084, 'eval_steps_per_second': 3.271, 'epoch': 2.0}


 71%|███████   | 1001/1416 [04:03<01:09,  5.99it/s] 

{'loss': 2.4896, 'learning_rate': 8.8135593220339e-06, 'epoch': 2.12}



100%|██████████| 1416/1416 [05:54<00:00,  6.22it/s]

{'eval_loss': 2.0940744876861572, 'eval_rouge1': 0.11471425599899554, 'eval_rouge2': 0.03639041958518499, 'eval_rougeL': 0.11419256284976584, 'eval_rougeLsum': 0.11350022159156378, 'eval_runtime': 36.0568, 'eval_samples_per_second': 13.09, 'eval_steps_per_second': 3.273, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1416/1416 [06:11<00:00,  6.22it/s]

{'train_runtime': 370.9641, 'train_samples_per_second': 15.26, 'train_steps_per_second': 3.817, 'train_loss': 2.68684063927602, 'epoch': 3.0}


100%|██████████| 1416/1416 [06:11<00:00,  3.81it/s]


### General + Math

In [29]:
# Learning-rate sweep on the combined General + Math dataset: for every value in
# `lrs`, load a fresh pretrained model, train it, and write the run to its own
# sub-directory of `output_dir_root`.
output_dir_root = "./results/general_math"  # loop-invariant, so defined once

for lr in lrs:
    # Reload the pretrained weights each iteration so runs do not build on each other.
    model_general_math = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
    data_collator_general_math = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_general_math)

    # Compact per-run tag such as "3e-3". Only the zero padding of the exponent
    # is dropped: a blanket .replace("0", "") would also remove significant
    # zeros (1e-10 -> "1e-1") and any "0" that appears in the root path.
    mantissa, exponent = f"{lr:.0e}".split("e")
    output_dir = f"{output_dir_root}/{mantissa}e{int(exponent)}"
    training_args_general_math = training_args(L_RATE=lr, BATCH_SIZE=4, NUM_EPOCHS=3, output_dir=output_dir)

    trainer_general_math = Seq2SeqTrainer(
        model=model_general_math,
        args=training_args_general_math,
        train_dataset=tokenized_dataset_general_math["train"],
        eval_dataset=tokenized_dataset_general_math["test"],
        tokenizer=tokenizer,
        data_collator=data_collator_general_math,
        compute_metrics=compute_metrics,
    )

    trainer_general_math.train()

    # Drop every name that still points at this run's model before collecting.
    # The collator was constructed with model=..., so leaving it bound would keep
    # the model alive while the next iteration loads a new one.
    del model_general_math, trainer_general_math, data_collator_general_math
    gc.collect()
    torch.cuda.empty_cache()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myezixuanclara[0m. Use [1m`wandb login --relogin`[0m to force relogin


                                                  
 33%|███▎      | 472/1416 [02:07<02:35,  6.08it/s]

{'eval_loss': 2.5402798652648926, 'eval_rouge1': 0.032277776555268506, 'eval_rouge2': 0.0029141143878364728, 'eval_rougeL': 0.03204248391573171, 'eval_rougeLsum': 0.031950849918273065, 'eval_runtime': 37.0247, 'eval_samples_per_second': 12.748, 'eval_steps_per_second': 3.187, 'epoch': 1.0}


 35%|███▌      | 501/1416 [02:32<02:17,  6.63it/s]  

{'loss': 3.0606, 'learning_rate': 0.001940677966101695, 'epoch': 1.06}


                                                  
 67%|██████▋   | 944/1416 [04:16<01:17,  6.08it/s]

{'eval_loss': 2.3229095935821533, 'eval_rouge1': 0.032444724403872735, 'eval_rouge2': 0.0035144391149949, 'eval_rougeL': 0.03171422707637668, 'eval_rougeLsum': 0.03172260187056673, 'eval_runtime': 36.9298, 'eval_samples_per_second': 12.781, 'eval_steps_per_second': 3.195, 'epoch': 2.0}


 71%|███████   | 1001/1416 [04:32<01:09,  6.00it/s] 

{'loss': 2.2633, 'learning_rate': 0.0008813559322033899, 'epoch': 2.12}


                                                   
100%|██████████| 1416/1416 [06:11<00:00,  6.53it/s]

{'eval_loss': 2.247628927230835, 'eval_rouge1': 0.039121258214399816, 'eval_rouge2': 0.0032839848504259083, 'eval_rougeL': 0.03872789528552882, 'eval_rougeLsum': 0.038698280799125157, 'eval_runtime': 35.8744, 'eval_samples_per_second': 13.157, 'eval_steps_per_second': 3.289, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1416/1416 [06:22<00:00,  3.70it/s]


{'train_runtime': 385.1121, 'train_samples_per_second': 14.7, 'train_steps_per_second': 3.677, 'train_loss': 2.4215501257255254, 'epoch': 3.0}



 33%|███▎      | 472/1416 [01:48<02:38,  5.97it/s]

{'eval_loss': 1.9895418882369995, 'eval_rouge1': 0.1320215180700664, 'eval_rouge2': 0.03874952300747349, 'eval_rougeL': 0.12353501415658114, 'eval_rougeLsum': 0.1275214497003628, 'eval_runtime': 36.0791, 'eval_samples_per_second': 13.082, 'eval_steps_per_second': 3.271, 'epoch': 1.0}


 35%|███▌      | 501/1416 [02:00<02:13,  6.84it/s]  

{'loss': 2.4809, 'learning_rate': 0.00019406779661016945, 'epoch': 1.06}



 67%|██████▋   | 944/1416 [03:44<01:12,  6.51it/s]

{'eval_loss': 1.8589943647384644, 'eval_rouge1': 0.135461050614511, 'eval_rouge2': 0.038585915833919224, 'eval_rougeL': 0.12459804955343722, 'eval_rougeLsum': 0.12869890674568712, 'eval_runtime': 36.548, 'eval_samples_per_second': 12.915, 'eval_steps_per_second': 3.229, 'epoch': 2.0}


 71%|███████   | 1001/1416 [03:55<01:08,  6.07it/s] 

{'loss': 1.9168, 'learning_rate': 8.813559322033898e-05, 'epoch': 2.12}



100%|██████████| 1416/1416 [05:35<00:00,  6.85it/s]

{'eval_loss': 1.8303388357162476, 'eval_rouge1': 0.14259260962717454, 'eval_rouge2': 0.04300571794768096, 'eval_rougeL': 0.13308619842145114, 'eval_rougeLsum': 0.1364182526233655, 'eval_runtime': 36.3798, 'eval_samples_per_second': 12.974, 'eval_steps_per_second': 3.244, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1416/1416 [05:42<00:00,  4.14it/s]


{'train_runtime': 342.522, 'train_samples_per_second': 16.527, 'train_steps_per_second': 4.134, 'train_loss': 2.0566785995569608, 'epoch': 3.0}



 33%|███▎      | 472/1416 [01:48<02:35,  6.06it/s]

{'eval_loss': 2.3861546516418457, 'eval_rouge1': 0.1119125107451445, 'eval_rouge2': 0.0407551038633739, 'eval_rougeL': 0.10108785562620667, 'eval_rougeLsum': 0.10601038009687778, 'eval_runtime': 36.2431, 'eval_samples_per_second': 13.023, 'eval_steps_per_second': 3.256, 'epoch': 1.0}


 35%|███▌      | 501/1416 [02:01<02:24,  6.35it/s]  

{'loss': 3.0762, 'learning_rate': 1.9406779661016948e-05, 'epoch': 1.06}


                                                  
 67%|██████▋   | 944/1416 [03:46<01:13,  6.42it/s]

{'eval_loss': 2.2488908767700195, 'eval_rouge1': 0.12060288745848656, 'eval_rouge2': 0.039178581996189996, 'eval_rougeL': 0.11036107868460496, 'eval_rougeLsum': 0.11576035112598533, 'eval_runtime': 37.3415, 'eval_samples_per_second': 12.64, 'eval_steps_per_second': 3.16, 'epoch': 2.0}


 71%|███████   | 1001/1416 [04:03<01:13,  5.62it/s] 

{'loss': 2.5417, 'learning_rate': 8.8135593220339e-06, 'epoch': 2.12}



100%|██████████| 1416/1416 [05:46<00:00,  6.60it/s]

{'eval_loss': 2.2149879932403564, 'eval_rouge1': 0.11827782446039363, 'eval_rouge2': 0.0367991779285983, 'eval_rougeL': 0.1077602373969512, 'eval_rougeLsum': 0.11287944637073197, 'eval_runtime': 37.475, 'eval_samples_per_second': 12.595, 'eval_steps_per_second': 3.149, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1416/1416 [05:49<00:00,  4.05it/s]


{'train_runtime': 349.4659, 'train_samples_per_second': 16.199, 'train_steps_per_second': 4.052, 'train_loss': 2.6996212544414284, 'epoch': 3.0}
