# 6.8610  PROJECT

## Install and import libraries

In [1]:
!pip install nltk
!pip install datasets
!pip install transformers[torch]
!pip install tokenizers
!pip install evaluate
!pip install rouge_score
!pip install sentencepiece
!pip install huggingface_hub
!pip install wandb



In [2]:
import nltk
import sentencepiece
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import os
import json
import torch
import wandb
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


## Load model

In [4]:
MODEL_NAME = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

model_math = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
model_code = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
model_base = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)

data_collator_math = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_math)
data_collator_code = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_code)
data_collator_base = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_base)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Create datasets

### Create general knowledge dataset

In [None]:
def convert_folder_to_datasetdict_gen(folder_path):

    dataset = Dataset.from_dict(folder_path)
    return dataset

In [None]:
general_train = convert_folder_to_datasetdict_gen("data/general/train.gz")
general_test = convert_folder_to_datasetdict_gen("data/general/test.gz")

general_dict = DatasetDict({
    'train': general_train,
    'test': general_test
})

### Create math dataset

In [49]:
def convert_folder_to_datasetdict_math(folder_path):
    data = {"id": [], "question": [], "level": [], "type": [], "answer": []}

    subject_dictionary = {
        "algebra": 1,
        "counting_and_probability": 2,
        "geometry": 3,
        "intermediate_algebra": 4,
        "number_theory": 5,
        "prealgebra": 6,
        "precalculus": 7
    }

    for subdir, _, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(subdir, file)
            folder_name = os.path.basename(os.path.normpath(subdir))
            
            with open(file_path, "r") as f:
                json_data = json.load(f)
                problem = json_data.get("problem", "")
                level = json_data.get("level", "")
                type_ = json_data.get("type", "")
                solution = json_data.get("solution", "")

                # Generate id from subject code and file name
                file_name = os.path.splitext(file)[0]
                subject_code = subject_dictionary.get(folder_name, 0)  # Default to 0 if not found
                id_ = f"{subject_code}_{file_name}"

                data["id"].append(id_)
                data["question"].append(problem)
                data["level"].append(level)
                data["type"].append(type_)
                data["answer"].append(solution)

    dataset = Dataset.from_dict(data)
    return dataset

In [50]:
math_train = convert_folder_to_datasetdict_math("data/math/train/")
math_test = convert_folder_to_datasetdict_math("data/math/test/")

math_dict = DatasetDict({
    'train': math_train,
    'test': math_test
})

In [51]:
math_dict["train"][2]

{'id': '1_1000',
 'question': 'What is the degree of the polynomial $(4 +5x^3 +100 +2\\pi x^4 + \\sqrt{10}x^4 +9)$?',
 'level': 'Level 3',
 'type': 'Algebra',
 'answer': "This polynomial is not written in standard form.  However, we don't need to write it in standard form, nor do we need to pay attention to the coefficients.  We just look for the exponents on $x$.  We have an $x^4$ term and no other term of higher degree, so $\\boxed{4}$ is the degree of the polynomial."}

### Create code dataset

In [8]:
def convert_folder_to_datasetdict_code(folder_path):
    data = {"id": [], "question": [], "type": [], "answer": []}

    for subdir, _, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(subdir, file)
            folder_name = os.path.basename(os.path.normpath(subdir))
            file_name = os.path.splitext(file)[0]
            id_ = f"{file_name}"

            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                content = f.readlines()

                # Extract comments as questions
                comments = [line.strip()[2:] for line in content if line.strip().startswith('//')]
                question = '\n'.join(comments)
                # Extract non-commented lines as answers
                code_lines = [line.strip() for line in content if not line.strip().startswith('//')]
                answer = '\n'.join(code_lines)

                data["id"].append(id_)
                data["question"].append(question)
                data["type"].append(folder_name)
                data["answer"].append(answer)
                
    dataset = Dataset.from_dict(data)
    return dataset

In [9]:
code_dataset = convert_folder_to_datasetdict_code("data/code/")

In [10]:
code_dict = code_dataset.train_test_split(test_size=0.3)

In [11]:
code_dict

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'type', 'answer'],
        num_rows: 126
    })
    test: Dataset({
        features: ['id', 'question', 'type', 'answer'],
        num_rows: 54
    })
})

In [23]:
code_dict["train"][1]

{'id': 'GeneralizedAbbreviation',
 'question': ' Write a function to generate the generalized abbreviations of a word.\n Example:\n Given word = "word", return the following list (order does not matter):\n ["word", "1ord", "w1rd", "wo1d", "wor1", "2rd", "w2d", "wo2", "1o1d", "1or1", "w1r1", "1o2", "2r1", "3d", "w3", "4"]',
 'type': 'backtracking',
 'answer': '\n\npublic class GeneralizedAbbreviation {\npublic List<String> generateAbbreviations(String word) {\nList<String> result = new ArrayList<String>();\n\nbacktrack(result, word, 0, "", 0);\n\nreturn result;\n}\n\nvoid backtrack(List result, String word, int position, String current, int count) {\nif(position == word.length()) {\nif(count > 0) {\ncurrent += count;\n}\n\nresult.add(current);\n} else {\nbacktrack(result, word, position + 1, current, count + 1);\nbacktrack(result, word, position + 1, current + (count > 0 ? count : "") + word.charAt(position), 0);\n}\n}\n}'}

## Preprocessing

In [13]:
prefix = "Please answer this question: "

def preprocess_function(examples):
   """Add prefix to the sentences, tokenize the text, and set the labels"""
   # The "inputs" are the tokenized answer:
   inputs = [prefix + doc for doc in examples["question"]]
   model_inputs = tokenizer(inputs, max_length=128, truncation=True)
  
   # The "labels" are the tokenized outputs:
   labels = tokenizer(text_target=examples["answer"], 
                      max_length=512,         
                      truncation=True)

   model_inputs["labels"] = labels["input_ids"]
   return model_inputs

In [14]:
tokenized_dataset_math = math_dict.map(preprocess_function, batched=True)
tokenized_dataset_code = code_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

Map:   0%|          | 0/5004 [00:00<?, ? examples/s]

Map:   0%|          | 0/136 [00:00<?, ? examples/s]

Map:   0%|          | 0/59 [00:00<?, ? examples/s]

In [15]:
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

In [16]:
def compute_metrics(eval_preds):
   preds, labels = eval_preds

   # decode preds and labels
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

   # rougeLSum expects newline after each sentence
   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  
   return result

## Fine-tuning

In [17]:
# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 8
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 3
OVERWRITE_OUTPUT_DIR = True
LOAD_BEST_MODEL_AT_END = True

# Set up training arguments
training_args_math = Seq2SeqTrainingArguments(
   output_dir="./results/math",
   overwrite_output_dir=OVERWRITE_OUTPUT_DIR,
   save_strategy="epoch",
   evaluation_strategy="epoch",
   load_best_model_at_end=LOAD_BEST_MODEL_AT_END,
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False
)

training_args_code = Seq2SeqTrainingArguments(
   output_dir="./results/code",
   overwrite_output_dir=OVERWRITE_OUTPUT_DIR,
   save_strategy="epoch",
   evaluation_strategy="epoch",
   load_best_model_at_end=LOAD_BEST_MODEL_AT_END,
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False
)

training_args_base = Seq2SeqTrainingArguments(
   output_dir="./results/base",
   overwrite_output_dir=OVERWRITE_OUTPUT_DIR,
   save_strategy="epoch",
   evaluation_strategy="epoch",
   load_best_model_at_end=LOAD_BEST_MODEL_AT_END,
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False
)

### Math

In [18]:
trainer_math = Seq2SeqTrainer(
   model=model_math,
   args=training_args_math,
   train_dataset=tokenized_dataset_math["train"], 
   eval_dataset=tokenized_dataset_math["test"],   
   tokenizer=tokenizer,
   data_collator=data_collator_math,
   compute_metrics=compute_metrics
)

In [19]:
trainer_math.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myezixuanclara[0m. Use [1m`wandb login --relogin`[0m to force relogin


 18%|█▊        | 501/2814 [00:59<04:35,  8.40it/s]

{'loss': 0.0484, 'learning_rate': 0.0002466950959488273, 'epoch': 0.53}


                                                  
 33%|███▎      | 938/2814 [02:47<03:37,  8.64it/s] 

{'eval_loss': 0.0, 'eval_rouge1': 0.0, 'eval_rouge2': 0.0, 'eval_rougeL': 0.0, 'eval_rougeLsum': 0.0, 'eval_runtime': 58.1889, 'eval_samples_per_second': 85.927, 'eval_steps_per_second': 21.482, 'epoch': 1.0}


 36%|███▌      | 1002/2814 [02:57<03:28,  8.67it/s] 

{'loss': 0.0, 'learning_rate': 0.00019339019189765458, 'epoch': 1.07}


 53%|█████▎    | 1501/2814 [03:56<02:49,  7.77it/s]

{'loss': 0.0, 'learning_rate': 0.00014008528784648186, 'epoch': 1.6}


                                                   
 67%|██████▋   | 1876/2814 [05:40<01:53,  8.27it/s]

{'eval_loss': 0.0, 'eval_rouge1': 0.0, 'eval_rouge2': 0.0, 'eval_rougeL': 0.0, 'eval_rougeLsum': 0.0, 'eval_runtime': 59.3238, 'eval_samples_per_second': 84.283, 'eval_steps_per_second': 21.071, 'epoch': 2.0}


 71%|███████   | 2002/2814 [05:57<01:31,  8.85it/s]  

{'loss': 0.0, 'learning_rate': 8.678038379530917e-05, 'epoch': 2.13}


 89%|████████▉ | 2502/2814 [06:54<00:35,  8.89it/s]

{'loss': 0.0, 'learning_rate': 3.347547974413646e-05, 'epoch': 2.67}


                                                   
100%|██████████| 2814/2814 [08:32<00:00,  8.96it/s]

{'eval_loss': 0.0, 'eval_rouge1': 0.0, 'eval_rouge2': 0.0, 'eval_rougeL': 0.0, 'eval_rougeLsum': 0.0, 'eval_runtime': 61.2103, 'eval_samples_per_second': 81.686, 'eval_steps_per_second': 20.421, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 2814/2814 [08:36<00:00,  5.45it/s]

{'train_runtime': 518.6125, 'train_samples_per_second': 43.385, 'train_steps_per_second': 5.426, 'train_loss': 0.00859833234957453, 'epoch': 3.0}





TrainOutput(global_step=2814, training_loss=0.00859833234957453, metrics={'train_runtime': 518.6125, 'train_samples_per_second': 43.385, 'train_steps_per_second': 5.426, 'train_loss': 0.00859833234957453, 'epoch': 3.0})

In [20]:
del model_math
gc.collect()
torch.cuda.empty_cache()

### Code

In [21]:
trainer_code = Seq2SeqTrainer(
   model=model_code,
   args=training_args_code,
   train_dataset=tokenized_dataset_code["train"],    
   eval_dataset=tokenized_dataset_code["test"],    
   tokenizer=tokenizer,
   data_collator=data_collator_code,
   compute_metrics=compute_metrics
)

In [22]:
trainer_code.train()


 33%|███▎      | 16/48 [03:47<07:55, 14.87s/it]

{'eval_loss': 2.496579885482788, 'eval_rouge1': 0.03383376969610924, 'eval_rouge2': 0.0016460905349794238, 'eval_rougeL': 0.028476556253146653, 'eval_rougeLsum': 0.029104003651678312, 'eval_runtime': 18.5042, 'eval_samples_per_second': 2.918, 'eval_steps_per_second': 0.757, 'epoch': 1.0}



 67%|██████▋   | 32/48 [07:42<02:47, 10.45s/it]

{'eval_loss': 2.085508108139038, 'eval_rouge1': 0.14193502014963366, 'eval_rouge2': 0.05224203937530196, 'eval_rougeL': 0.1311783087726054, 'eval_rougeLsum': 0.1319044507434023, 'eval_runtime': 18.0557, 'eval_samples_per_second': 2.991, 'eval_steps_per_second': 0.775, 'epoch': 2.0}



100%|██████████| 48/48 [11:16<00:00, 12.77s/it]

{'eval_loss': 1.9809935092926025, 'eval_rouge1': 0.1505172341955189, 'eval_rouge2': 0.05507006806796853, 'eval_rougeL': 0.1411070353465603, 'eval_rougeLsum': 0.1417876510239457, 'eval_runtime': 18.1912, 'eval_samples_per_second': 2.968, 'eval_steps_per_second': 0.77, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 48/48 [11:22<00:00, 14.22s/it]

{'train_runtime': 682.7185, 'train_samples_per_second': 0.554, 'train_steps_per_second': 0.07, 'train_loss': 2.8900632858276367, 'epoch': 3.0}





TrainOutput(global_step=48, training_loss=2.8900632858276367, metrics={'train_runtime': 682.7185, 'train_samples_per_second': 0.554, 'train_steps_per_second': 0.07, 'train_loss': 2.8900632858276367, 'epoch': 3.0})

In [None]:
del model_code
gc.collect()
torch.cuda.empty_cache()