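# 6.8610 Project

Fine-tunes four separate `t5-base` models (math problems, two variants of a LeetCode code dataset, and general-knowledge questions) and reports ROUGE on each held-out split.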

## Install and import libraries

In [1]:
!pip install nltk
!pip install datasets
!pip install transformers[torch]
!pip install tokenizers
!pip install evaluate
!pip install rouge_score
!pip install sentencepiece
!pip install huggingface_hub
!pip install wandb



In [1]:
import nltk
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import os
import json
import torch
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import gc
import pandas as pd
import random
random.seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


## Load model

In [3]:
MODEL_NAME = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)


def _load_model_with_collator():
    """Load a fresh pretrained T5 onto `device` and build its seq2seq collator."""
    net = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
    collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=net)
    return net, collator


# One independent copy of the pretrained weights per fine-tuning task.
model_math, data_collator_math = _load_model_with_collator()
model_code1, data_collator_code1 = _load_model_with_collator()
model_code2, data_collator_code2 = _load_model_with_collator()
model_general, data_collator_general = _load_model_with_collator()

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Create datasets

### Create code dataset

In [4]:
def convert_folder_to_datasetdict_code(folder_path):
    """Load the code-solutions JSON file into a Hugging Face ``Dataset``.

    Args:
        folder_path: Path to the JSON file. Despite the parameter name, it is
            opened as a single file, not walked as a directory.

    Returns:
        A ``Dataset`` holding exactly the columns ``code_with_problem``,
        ``code_with_data`` and ``code_only``.

    Raises:
        KeyError: If one of the three expected columns is absent from the
            normalized JSON records.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # text encoding.
    with open(folder_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    df = pd.json_normalize(data)
    dataset = Dataset.from_pandas(df[['code_with_problem', 'code_with_data', 'code_only']])
    return dataset

In [5]:
code_dataset = convert_folder_to_datasetdict_code('data/code/leetcode-solutions.json')
# Fix the split seed so that the same rows land in train/test on every run;
# without random_state the split changes on each kernel restart.
train_dataset, test_dataset = train_test_split(code_dataset, test_size=0.2, random_state=42)
# Rebuild Dataset objects from the two halves returned by the split.
train_dataset = Dataset.from_dict(train_dataset)
test_dataset = Dataset.from_dict(test_dataset)

In [6]:
# Bundle the two code splits under the conventional 'train'/'test' keys.
code_dict = DatasetDict(train=train_dataset, test=test_dataset)

In [7]:
# Code variant 1: the question is the problem statement ('code_with_problem'),
# the answer is the bare solution ('code_only').
train_dataset_modified = (
    code_dict['train']
    .rename_column('code_with_problem', 'question')
    .rename_column('code_only', 'answer')
    .remove_columns('code_with_data')
)

test_dataset_modified = (
    code_dict['test']
    .rename_column('code_with_problem', 'question')
    .rename_column('code_only', 'answer')
    .remove_columns('code_with_data')
)

code_dict1 = DatasetDict(train=train_dataset_modified, test=test_dataset_modified)

print(code_dict1)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 1887
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 472
    })
})


In [8]:
# Code variant 2: the question comes from 'code_with_data' instead of the
# problem statement; the answer is still the bare solution ('code_only').
train_dataset_modified = (
    code_dict['train']
    .rename_column('code_with_data', 'question')
    .rename_column('code_only', 'answer')
    .remove_columns('code_with_problem')
)

test_dataset_modified = (
    code_dict['test']
    .rename_column('code_with_data', 'question')
    .rename_column('code_only', 'answer')
    .remove_columns('code_with_problem')
)

code_dict2 = DatasetDict(train=train_dataset_modified, test=test_dataset_modified)

print(code_dict2)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 1887
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 472
    })
})


In [9]:
# Split sizes of the code dataset; the other corpora are subsampled to match.
n_train, n_test = code_dict['train'].num_rows, code_dict['test'].num_rows
n_total = sum((n_train, n_test))

In [10]:
# Only the last bare expression of a cell is rendered, so the first lookup was
# silently discarded. display() shows the first training record of both variants.
display(code_dict1["train"][0], code_dict2["train"][0])

{'question': '# maximum-side-length-of-a-square-with-sum-less-than-or-equal-to-threshold\n# Maximum Side Length of a Square with Sum Less than or Equal to Threshold\n# Medium\n# Given a `m x n` matrix `mat` and an integer `threshold`, return _the maximum side-length of a square with a sum less than or equal to_ `threshold` _or return_ `0` _if there is no such square_.\n\n**Example 1:**\n\n**Input:** mat = \\[\\[1,1,3,2,4,3,2\\],\\[1,1,3,2,4,3,2\\],\\[1,1,3,2,4,3,2\\]\\], threshold = 4\n**Output:** 2\n**Explanation:** The maximum side length of square with sum less than 4 is 2 as shown.\n\n**Example 2:**\n\n**Input:** mat = \\[\\[2,2,2,2,2\\],\\[2,2,2,2,2\\],\\[2,2,2,2,2\\],\\[2,2,2,2,2\\],\\[2,2,2,2,2\\]\\], threshold = 1\n**Output:** 0\n\n**Constraints:**\n\n*   `m == mat.length`\n*   `n == mat[i].length`\n*   `1 <= m, n <= 300`\n*   `0 <= mat[i][j] <= 104`\n*   `0 <= threshold <= 105`\n```python\ndef maxSideLength(mat: List[List[int]], threshold: int) -> int:\n    m, n = len(mat), le

### Create general knowledge dataset

In [11]:
csv_file_path = 'data/general/general.csv'

# Draw as many rows as the code dataset has in total, then number them.
general = pd.read_csv(csv_file_path).sample(n=n_total, random_state=42)
general = general.assign(id=range(len(general)))

# 80/20 split with a fixed seed; both halves get a fresh 0..n-1 index.
train_sample, test_sample = (
    part.reset_index(drop=True)
    for part in train_test_split(general, test_size=0.2, random_state=42)
)

qa_columns = ['question', 'answer']
train_dataset = Dataset.from_pandas(train_sample[qa_columns])
test_dataset = Dataset.from_pandas(test_sample[qa_columns])

general_dict = DatasetDict(train=train_dataset, test=test_dataset)

print(general_dict)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 1887
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 472
    })
})


In [12]:
# Peek at the first training record of the general-knowledge split.
general_dict["train"][0]

{'question': 'Add 1 letter to "Iowa" to get the name of this tribe who lived south of the Iowa',
 'answer': 'Kiowa'}

### Create math dataset

In [13]:
def convert_folder_to_datasetdict_math(folder_path):
    """Collect every JSON problem file under ``folder_path`` into a ``Dataset``.

    The directory tree is walked recursively. Each ``.json`` file contributes
    one row: its ``"problem"`` field becomes ``question`` and its
    ``"solution"`` field becomes ``answer`` (a missing field yields ``""``).

    Args:
        folder_path: Root directory containing the problem files, possibly in
            nested sub-directories.

    Returns:
        A ``Dataset`` with the columns ``question`` and ``answer``. Rows are
        ordered by sorted directory name, then sorted file name.
    """
    data = {"question": [], "answer": []}

    for subdir, dirnames, files in os.walk(folder_path):
        # os.walk yields entries in arbitrary filesystem order. Sorting the
        # sub-directories in place (which steers the traversal) and the file
        # names makes the row order, and therefore any later seeded
        # shuffle/select, identical on every machine.
        dirnames.sort()
        for file in sorted(files):
            # Skip stray non-JSON files (e.g. .DS_Store) instead of crashing
            # inside json.load.
            if not file.lower().endswith(".json"):
                continue

            file_path = os.path.join(subdir, file)
            with open(file_path, "r", encoding="utf-8") as f:
                json_data = json.load(f)

            data["question"].append(json_data.get("problem", ""))
            data["answer"].append(json_data.get("solution", ""))

    dataset = Dataset.from_dict(data)
    return dataset

In [14]:
# Read the math train and test folders with the same loader.
math_train, math_test = map(
    convert_folder_to_datasetdict_math,
    ("data/math/train/", "data/math/test/"),
)

In [15]:
# Shuffle with a fixed seed, then keep as many rows as the code dataset has in
# the corresponding split.
math_train = math_train.shuffle(seed=42).select(range(n_train))
math_test = math_test.shuffle(seed=42).select(range(n_test))

math_dict = DatasetDict(train=math_train, test=math_test)

print(math_dict)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 1887
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 472
    })
})


In [16]:
# Peek at the first training record of the math split.
math_dict["train"][0]

{'question': 'What is the number of units in the distance between $(2,5)$ and $(-6,-1)$?',
 'answer': 'We use the distance formula: $\\sqrt{(-6 - 2)^2 + (-1 - 5)^2},$ so then we find that $\\sqrt{64 + 36} = \\boxed{10}$.\n\n- OR -\n\nWe note that the points $(2, 5)$, $(-6, -1)$, and $(2, -1)$ form a right triangle with legs of length 6 and 8. This is a Pythagorean triple, so the length of the hypotenuse must be $\\boxed{10}$.'}

## Preprocessing

In [17]:
prefix = "Please answer this question: "

def preprocess_function(examples, max_input_length=128, max_target_length=512):
    """Tokenize a batch of question/answer pairs for seq2seq training.

    Args:
        examples: Batch mapping with the keys ``"question"`` and ``"answer"``,
            each holding a list of strings.
        max_input_length: Token limit for the prefixed question; longer inputs
            are truncated.
        max_target_length: Token limit for the answer; longer targets are
            truncated.

    Returns:
        The tokenizer output for the prefixed questions, with the tokenized
        answers' ``input_ids`` stored under ``"labels"``.
    """
    # The model inputs are the prefixed *questions*.
    inputs = [prefix + question for question in examples["question"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # The labels are the tokenized *answers*.
    labels = tokenizer(
        text_target=examples["answer"],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [18]:
# Tokenize every split of each corpus in batches.
(
    tokenized_dataset_code1,
    tokenized_dataset_code2,
    tokenized_dataset_general,
    tokenized_dataset_math,
) = (
    corpus.map(preprocess_function, batched=True)
    for corpus in (code_dict1, code_dict2, general_dict, math_dict)
)

Map: 100%|██████████| 1887/1887 [00:01<00:00, 1098.34 examples/s]
Map: 100%|██████████| 472/472 [00:00<00:00, 1102.44 examples/s]
Map: 100%|██████████| 1887/1887 [00:02<00:00, 832.68 examples/s]
Map: 100%|██████████| 472/472 [00:00<00:00, 825.52 examples/s]
Map: 100%|██████████| 1887/1887 [00:00<00:00, 13538.74 examples/s]
Map: 100%|██████████| 472/472 [00:00<00:00, 13174.28 examples/s]
Map: 100%|██████████| 1887/1887 [00:00<00:00, 2471.16 examples/s]
Map: 100%|██████████| 472/472 [00:00<00:00, 2535.50 examples/s]


In [19]:
# nltk.sent_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK
# releases look for it under "punkt_tab", older ones under "punkt"; nltk is
# installed unpinned here, so fetch both. A name unknown to the installed
# version is reported as a failed download, not raised.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
metric = evaluate.load("rouge")

In [20]:
def compute_metrics(eval_preds):
    """Decode generated and reference token ids and score them with ROUGE.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            generated ids.

    Returns:
        The dictionary returned by ``metric.compute`` (the ROUGE scores).
    """
    preds, labels = eval_preds
    # Some models return a tuple; the token ids are its first element.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the "ignore" marker, not a vocabulary id, so it cannot be
    # decoded. It is present in the labels and can also appear in the
    # predictions as padding. Map it to the pad token, which
    # skip_special_tokens then drops.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    return result

## Fine-tuning

In [21]:
# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 4
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 3
OVERWRITE_OUTPUT_DIR = True
LOAD_BEST_MODEL_AT_END = True


def _make_training_args(output_dir):
    """Build the training arguments shared by every fine-tuning run.

    All runs use the global hyper-parameters above and differ only in where
    their checkpoints are written.

    Args:
        output_dir: Directory that receives this run's checkpoints.

    Returns:
        A ``Seq2SeqTrainingArguments`` instance for one run.
    """
    return Seq2SeqTrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=OVERWRITE_OUTPUT_DIR,
        # Save and evaluate on the same (per-epoch) schedule, as
        # load_best_model_at_end requires.
        save_strategy="epoch",
        evaluation_strategy="epoch",
        load_best_model_at_end=LOAD_BEST_MODEL_AT_END,
        learning_rate=L_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
        weight_decay=WEIGHT_DECAY,
        save_total_limit=SAVE_TOTAL_LIM,
        num_train_epochs=NUM_EPOCHS,
        # Evaluate with generate() so compute_metrics receives generated ids.
        predict_with_generate=True,
        push_to_hub=False,
    )


# Set up training arguments (one output directory per run)
training_args_math = _make_training_args("./results/math")
training_args_code1 = _make_training_args("./results/code1")
training_args_code2 = _make_training_args("./results/code2")
training_args_general = _make_training_args("./results/base")

### Math

In [None]:
# Trainer for the math run: trains on the math train split, evaluates on the
# math test split, and scores generations with compute_metrics.
trainer_math = Seq2SeqTrainer(
    model=model_math,
    tokenizer=tokenizer,
    data_collator=data_collator_math,
    args=training_args_math,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset_math["train"],
    eval_dataset=tokenized_dataset_math["test"],
)

In [None]:
# Fine-tune model_math; with evaluation_strategy="epoch" the ROUGE metrics are
# computed on the test split after every epoch.
trainer_math.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myezixuanclara[0m. Use [1m`wandb login --relogin`[0m to force relogin


                                                  
 33%|███▎      | 472/1416 [02:31<03:47,  4.15it/s]

{'eval_loss': 1.7312365770339966, 'eval_rouge1': 0.1519675858878297, 'eval_rouge2': 0.0613339060797851, 'eval_rougeL': 0.1306172730330254, 'eval_rougeLsum': 0.14008236218685827, 'eval_runtime': 42.2795, 'eval_samples_per_second': 11.164, 'eval_steps_per_second': 2.791, 'epoch': 1.0}


 35%|███▌      | 501/1416 [02:48<03:59,  3.83it/s]  

{'loss': 2.2384, 'learning_rate': 0.00019406779661016945, 'epoch': 1.06}


                                                  
 67%|██████▋   | 944/1416 [05:26<02:15,  3.48it/s]

{'eval_loss': 1.6180236339569092, 'eval_rouge1': 0.1419497361392592, 'eval_rouge2': 0.05908574050106513, 'eval_rougeL': 0.124237828156588, 'eval_rougeLsum': 0.1334857867543956, 'eval_runtime': 41.4327, 'eval_samples_per_second': 11.392, 'eval_steps_per_second': 2.848, 'epoch': 2.0}


 71%|███████   | 1001/1416 [05:43<01:05,  6.30it/s] 

{'loss': 1.7438, 'learning_rate': 8.813559322033898e-05, 'epoch': 2.12}


                                                   
100%|██████████| 1416/1416 [08:02<00:00,  3.46it/s]

{'eval_loss': 1.587369680404663, 'eval_rouge1': 0.15096462116192183, 'eval_rouge2': 0.061661991398463525, 'eval_rougeL': 0.1308013245441378, 'eval_rougeLsum': 0.1403668682057218, 'eval_runtime': 42.7915, 'eval_samples_per_second': 11.03, 'eval_steps_per_second': 2.758, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1416/1416 [08:13<00:00,  2.87it/s]

{'train_runtime': 496.0192, 'train_samples_per_second': 11.413, 'train_steps_per_second': 2.855, 'train_loss': 1.878079990882658, 'epoch': 3.0}





TrainOutput(global_step=1416, training_loss=1.878079990882658, metrics={'train_runtime': 496.0192, 'train_samples_per_second': 11.413, 'train_steps_per_second': 2.855, 'train_loss': 1.878079990882658, 'epoch': 3.0})

In [25]:
# trainer_math was built with model=model_math and so holds its own reference
# to the model. Both names must be dropped before the garbage collector and
# empty_cache() can actually give the GPU memory back.
del model_math, trainer_math
gc.collect()
torch.cuda.empty_cache()

### Code 1

In [22]:
# Trainer for code variant 1: trains and evaluates on the code1 splits and
# scores generations with compute_metrics.
trainer_code1 = Seq2SeqTrainer(
    model=model_code1,
    tokenizer=tokenizer,
    data_collator=data_collator_code1,
    args=training_args_code1,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset_code1["train"],
    eval_dataset=tokenized_dataset_code1["test"],
)

In [23]:
# Fine-tune model_code1; with evaluation_strategy="epoch" the ROUGE metrics are
# computed on the test split after every epoch.
trainer_code1.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myezixuanclara[0m. Use [1m`wandb login --relogin`[0m to force relogin


                                                  
 33%|███▎      | 472/1416 [01:47<02:26,  6.44it/s]

{'eval_loss': 1.548426628112793, 'eval_rouge1': 0.11080682240102191, 'eval_rouge2': 0.04262286448926207, 'eval_rougeL': 0.10877352614367107, 'eval_rougeLsum': 0.10891993020584771, 'eval_runtime': 38.24, 'eval_samples_per_second': 12.343, 'eval_steps_per_second': 3.086, 'epoch': 1.0}


 35%|███▌      | 501/1416 [02:04<02:52,  5.30it/s]  

{'loss': 2.0498, 'learning_rate': 0.00019406779661016945, 'epoch': 1.06}


                                                  
 67%|██████▋   | 944/1416 [04:04<01:02,  7.61it/s]

{'eval_loss': 1.3797942399978638, 'eval_rouge1': 0.11251429177517958, 'eval_rouge2': 0.04183903182471378, 'eval_rougeL': 0.1104787605899227, 'eval_rougeLsum': 0.11062254492669521, 'eval_runtime': 38.0585, 'eval_samples_per_second': 12.402, 'eval_steps_per_second': 3.1, 'epoch': 2.0}


 71%|███████   | 1001/1416 [04:19<01:10,  5.89it/s] 

{'loss': 1.4466, 'learning_rate': 8.813559322033898e-05, 'epoch': 2.12}


                                                   
100%|██████████| 1416/1416 [06:03<00:00,  6.57it/s]

{'eval_loss': 1.3264946937561035, 'eval_rouge1': 0.11379369743961823, 'eval_rouge2': 0.04372187033392986, 'eval_rougeL': 0.11183294741338806, 'eval_rougeLsum': 0.11210169735163278, 'eval_runtime': 38.8487, 'eval_samples_per_second': 12.15, 'eval_steps_per_second': 3.037, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1416/1416 [06:15<00:00,  3.77it/s]

{'train_runtime': 377.8306, 'train_samples_per_second': 14.983, 'train_steps_per_second': 3.748, 'train_loss': 1.6090646086439575, 'epoch': 3.0}





TrainOutput(global_step=1416, training_loss=1.6090646086439575, metrics={'train_runtime': 377.8306, 'train_samples_per_second': 14.983, 'train_steps_per_second': 3.748, 'train_loss': 1.6090646086439575, 'epoch': 3.0})

In [24]:
# Drop the code-1 trainer and model, then collect garbage and release the
# cached GPU memory.
del trainer_code1
del model_code1
gc.collect()
torch.cuda.empty_cache()

### Code 2

In [22]:
# Trainer for code variant 2: trains and evaluates on the code2 splits and
# scores generations with compute_metrics.
trainer_code2 = Seq2SeqTrainer(
    model=model_code2,
    tokenizer=tokenizer,
    data_collator=data_collator_code2,
    args=training_args_code2,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset_code2["train"],
    eval_dataset=tokenized_dataset_code2["test"],
)

In [23]:
# Fine-tune model_code2; with evaluation_strategy="epoch" the ROUGE metrics are
# computed on the test split after every epoch.
trainer_code2.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myezixuanclara[0m. Use [1m`wandb login --relogin`[0m to force relogin


                                                  
 33%|███▎      | 472/1416 [02:51<03:02,  5.17it/s]

{'eval_loss': 1.490283727645874, 'eval_rouge1': 0.11464440722915901, 'eval_rouge2': 0.045900396760247576, 'eval_rougeL': 0.11331536076790824, 'eval_rougeLsum': 0.11363860320917193, 'eval_runtime': 67.7477, 'eval_samples_per_second': 6.967, 'eval_steps_per_second': 1.742, 'epoch': 1.0}


 35%|███▌      | 501/1416 [03:16<04:16,  3.57it/s]  

{'loss': 2.0369, 'learning_rate': 0.00019406779661016945, 'epoch': 1.06}


                                                  
 67%|██████▋   | 944/1416 [05:47<01:16,  6.21it/s]

{'eval_loss': 1.3298611640930176, 'eval_rouge1': 0.11525105091498902, 'eval_rouge2': 0.04818595199598334, 'eval_rougeL': 0.11378199927723637, 'eval_rougeLsum': 0.11388280600541119, 'eval_runtime': 42.9693, 'eval_samples_per_second': 10.985, 'eval_steps_per_second': 2.746, 'epoch': 2.0}


 71%|███████   | 1001/1416 [06:21<02:26,  2.83it/s] 

{'loss': 1.4452, 'learning_rate': 8.813559322033898e-05, 'epoch': 2.12}


                                                   
100%|██████████| 1416/1416 [08:39<00:00,  2.96it/s]

{'eval_loss': 1.2836138010025024, 'eval_rouge1': 0.116787621913371, 'eval_rouge2': 0.05169632646560274, 'eval_rougeL': 0.11546628985189317, 'eval_rougeLsum': 0.11574938911925602, 'eval_runtime': 42.1202, 'eval_samples_per_second': 11.206, 'eval_steps_per_second': 2.802, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1416/1416 [08:55<00:00,  2.64it/s]

{'train_runtime': 538.0575, 'train_samples_per_second': 10.521, 'train_steps_per_second': 2.632, 'train_loss': 1.602813677599201, 'epoch': 3.0}





TrainOutput(global_step=1416, training_loss=1.602813677599201, metrics={'train_runtime': 538.0575, 'train_samples_per_second': 10.521, 'train_steps_per_second': 2.632, 'train_loss': 1.602813677599201, 'epoch': 3.0})

In [24]:
# Drop the code-2 trainer and model, then collect garbage and release the
# cached GPU memory.
del trainer_code2
del model_code2
gc.collect()
torch.cuda.empty_cache()

### General

In [22]:
# Trainer for the general-knowledge run: trains and evaluates on the general
# splits and scores generations with compute_metrics.
trainer_general = Seq2SeqTrainer(
    model=model_general,
    tokenizer=tokenizer,
    data_collator=data_collator_general,
    args=training_args_general,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset_general["train"],
    eval_dataset=tokenized_dataset_general["test"],
)

In [23]:
# Fine-tune model_general; with evaluation_strategy="epoch" the ROUGE metrics
# are computed on the test split after every epoch.
trainer_general.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myezixuanclara[0m. Use [1m`wandb login --relogin`[0m to force relogin


                                                  
 33%|███▎      | 472/1416 [01:26<02:45,  5.70it/s]

{'eval_loss': 2.7745418548583984, 'eval_rouge1': 0.11132939565142949, 'eval_rouge2': 0.01281779661016949, 'eval_rougeL': 0.11067186697483301, 'eval_rougeLsum': 0.1098510834739648, 'eval_runtime': 24.9347, 'eval_samples_per_second': 18.929, 'eval_steps_per_second': 4.732, 'epoch': 1.0}


 35%|███▌      | 501/1416 [01:48<02:47,  5.46it/s]  

{'loss': 3.1223, 'learning_rate': 0.00019406779661016945, 'epoch': 1.06}


                                                  
 67%|██████▋   | 944/1416 [03:36<01:23,  5.64it/s]

{'eval_loss': 2.8065195083618164, 'eval_rouge1': 0.12483368796928113, 'eval_rouge2': 0.018573446327683617, 'eval_rougeL': 0.12515461821605883, 'eval_rougeLsum': 0.12404248294078796, 'eval_runtime': 29.1169, 'eval_samples_per_second': 16.211, 'eval_steps_per_second': 4.053, 'epoch': 2.0}


 71%|███████   | 1001/1416 [03:58<01:14,  5.54it/s] 

{'loss': 2.2116, 'learning_rate': 8.813559322033898e-05, 'epoch': 2.12}


                                                   
100%|██████████| 1416/1416 [05:41<00:00,  5.68it/s]

{'eval_loss': 2.8924808502197266, 'eval_rouge1': 0.11745886528725508, 'eval_rouge2': 0.019067796610169493, 'eval_rougeL': 0.1173657733509428, 'eval_rougeLsum': 0.11738381086898031, 'eval_runtime': 28.663, 'eval_samples_per_second': 16.467, 'eval_steps_per_second': 4.117, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1416/1416 [05:59<00:00,  3.94it/s]

{'train_runtime': 361.36, 'train_samples_per_second': 15.666, 'train_steps_per_second': 3.919, 'train_loss': 2.380347041760461, 'epoch': 3.0}





TrainOutput(global_step=1416, training_loss=2.380347041760461, metrics={'train_runtime': 361.36, 'train_samples_per_second': 15.666, 'train_steps_per_second': 3.919, 'train_loss': 2.380347041760461, 'epoch': 3.0})

In [24]:
# trainer_general was built with model=model_general and so holds its own
# reference to the model. Both names must be dropped before the garbage
# collector and empty_cache() can actually give the GPU memory back.
del model_general, trainer_general
gc.collect()
torch.cuda.empty_cache()