# 6.8610  PROJECT

## Install and import libraries

In [None]:
pip install nltk
pip install datasets
pip install transformers[torch]
pip install tokenizers
pip install evaluate
pip install rouge_score
pip install sentencepiece
pip install huggingface_hub

In [1]:
import nltk
import sentencepiece
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import os
import json
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

## Load model

In [2]:
MODEL_NAME = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


## Create datasets

### Create math dataset

In [49]:
def convert_folder_to_datasetdict_math(folder_path):
    data = {"id": [], "question": [], "level": [], "type": [], "answer": []}

    subject_dictionary = {
        "algebra": 1,
        "counting_and_probability": 2,
        "geometry": 3,
        "intermediate_algebra": 4,
        "number_theory": 5,
        "prealgebra": 6,
        "precalculus": 7
    }

    for subdir, _, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(subdir, file)
            folder_name = os.path.basename(os.path.normpath(subdir))
            
            with open(file_path, "r") as f:
                json_data = json.load(f)
                problem = json_data.get("problem", "")
                level = json_data.get("level", "")
                type_ = json_data.get("type", "")
                solution = json_data.get("solution", "")

                # Generate id from subject code and file name
                file_name = os.path.splitext(file)[0]
                subject_code = subject_dictionary.get(folder_name, 0)  # Default to 0 if not found
                id_ = f"{subject_code}_{file_name}"

                data["id"].append(id_)
                data["question"].append(problem)
                data["level"].append(level)
                data["type"].append(type_)
                data["answer"].append(solution)

    dataset = Dataset.from_dict(data)
    return dataset

In [50]:
math_train = convert_folder_to_datasetdict_math("math/train/")
math_test = convert_folder_to_datasetdict_math("math/test/")

math_dict = DatasetDict({
    'train': math_train,
    'test': math_test
})

In [51]:
math_dict["train"][2]

{'id': '1_1000',
 'question': 'What is the degree of the polynomial $(4 +5x^3 +100 +2\\pi x^4 + \\sqrt{10}x^4 +9)$?',
 'level': 'Level 3',
 'type': 'Algebra',
 'answer': "This polynomial is not written in standard form.  However, we don't need to write it in standard form, nor do we need to pay attention to the coefficients.  We just look for the exponents on $x$.  We have an $x^4$ term and no other term of higher degree, so $\\boxed{4}$ is the degree of the polynomial."}

### Create code dataset

In [6]:
def convert_folder_to_datasetdict_code(folder_path):
    data = {"id": [], "question": [], "type": [], "answer": []}

    for subdir, _, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(subdir, file)
            folder_name = os.path.basename(os.path.normpath(subdir))
            file_name = os.path.splitext(file)[0]
            id_ = f"{file_name}"

            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                content = f.readlines()

                # Extract comments as questions
                comments = [line.strip()[2:] for line in content if line.strip().startswith('//')]
                question = '\n'.join(comments)
                # Extract non-commented lines as answers
                code_lines = [line.strip() for line in content if not line.strip().startswith('//')]
                answer = '\n'.join(code_lines)

                data["id"].append(id_)
                data["question"].append(question)
                data["type"].append(folder_name)
                data["answer"].append(answer)
                
    dataset = Dataset.from_dict(data)
    return dataset

In [7]:
code_dataset = convert_folder_to_datasetdict_code("code/")

In [8]:
code_dict = code_dataset.train_test_split(test_size=0.3)

In [9]:
code_dict

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'type', 'answer'],
        num_rows: 136
    })
    test: Dataset({
        features: ['id', 'question', 'type', 'answer'],
        num_rows: 59
    })
})

In [23]:
code_dict["train"][1]

{'id': 'GeneralizedAbbreviation',
 'question': ' Write a function to generate the generalized abbreviations of a word.\n Example:\n Given word = "word", return the following list (order does not matter):\n ["word", "1ord", "w1rd", "wo1d", "wor1", "2rd", "w2d", "wo2", "1o1d", "1or1", "w1r1", "1o2", "2r1", "3d", "w3", "4"]',
 'type': 'backtracking',
 'answer': '\n\npublic class GeneralizedAbbreviation {\npublic List<String> generateAbbreviations(String word) {\nList<String> result = new ArrayList<String>();\n\nbacktrack(result, word, 0, "", 0);\n\nreturn result;\n}\n\nvoid backtrack(List result, String word, int position, String current, int count) {\nif(position == word.length()) {\nif(count > 0) {\ncurrent += count;\n}\n\nresult.add(current);\n} else {\nbacktrack(result, word, position + 1, current, count + 1);\nbacktrack(result, word, position + 1, current + (count > 0 ? count : "") + word.charAt(position), 0);\n}\n}\n}'}

## Preprocessing

In [10]:
prefix = "Please answer this question: "

def preprocess_function(examples):
   """Add prefix to the sentences, tokenize the text, and set the labels"""
   # The "inputs" are the tokenized answer:
   inputs = [prefix + doc for doc in examples["question"]]
   model_inputs = tokenizer(inputs, max_length=128, truncation=True)
  
   # The "labels" are the tokenized outputs:
   labels = tokenizer(text_target=examples["answer"], 
                      max_length=512,         
                      truncation=True)

   model_inputs["labels"] = labels["input_ids"]
   return model_inputs

In [11]:
tokenized_dataset_math = math_dict.map(preprocess_function, batched=True)
tokenized_dataset_code = code_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

Map:   0%|          | 0/5004 [00:00<?, ? examples/s]

Map:   0%|          | 0/136 [00:00<?, ? examples/s]

Map:   0%|          | 0/59 [00:00<?, ? examples/s]

In [12]:
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

In [13]:
def compute_metrics(eval_preds):
   preds, labels = eval_preds

   # decode preds and labels
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

   # rougeLSum expects newline after each sentence
   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  
   return result

## Fine-tuning

In [14]:
# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 8
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 3

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir="./results",
   evaluation_strategy="epoch",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False
)

### Math

In [15]:
trainer_math = Seq2SeqTrainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_dataset_math["train"], 
   eval_dataset=tokenized_dataset_math["test"],   
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

In [None]:
trainer_math.train()

### Code

In [16]:
trainer_code = Seq2SeqTrainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_dataset_code["train"],    
   eval_dataset=tokenized_dataset_code["test"],    
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

In [None]:
trainer_code.train()