# 6.8610  PROJECT

## Install and import libraries

In [1]:
!pip install nltk
!pip install datasets
!pip install transformers[torch]
!pip install tokenizers
!pip install evaluate
!pip install rouge_score
!pip install sentencepiece
!pip install huggingface_hub
!pip install wandb



In [1]:
import nltk
import sentencepiece
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import os
import json
import torch
import wandb
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import gc
import pandas as pd
import random
random.seed(42)

In [2]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cpu


## Load model

In [40]:
MODEL_NAME = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

# One independent copy of the pretrained checkpoint per experiment, each moved
# to the selected device (math, code1, code2, general - in that order).
model_math, model_code1, model_code2, model_general = (
    T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
    for _ in range(4)
)

# Each collator is bound to the model it will batch for.
data_collator_math, data_collator_code1, data_collator_code2, data_collator_general = (
    DataCollatorForSeq2Seq(tokenizer=tokenizer, model=seq2seq_model)
    for seq2seq_model in (model_math, model_code1, model_code2, model_general)
)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


## Create datasets

### Create code dataset

In [12]:
def convert_folder_to_datasetdict_code(folder_path):
    """Load the code-solutions JSON file into a Hugging Face ``Dataset``.

    Args:
        folder_path: Path to a JSON *file* (the parameter name is kept for
            backward compatibility). Its content is flattened with
            ``pd.json_normalize``.

    Returns:
        A single ``Dataset`` (not a ``DatasetDict``) with the columns
        ``code_with_problem``, ``code_with_data`` and ``code_only``.

    Raises:
        KeyError: If any of the three expected columns is missing.
    """
    required_columns = ['code_with_problem', 'code_with_data', 'code_only']

    # Read as UTF-8 explicitly: the platform default encoding is not UTF-8
    # everywhere, which would make the load machine-dependent.
    with open(folder_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    df = pd.json_normalize(data)

    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise KeyError(f"{folder_path} is missing expected column(s): {missing}")

    return Dataset.from_pandas(df[required_columns])

In [13]:
code_dataset = convert_folder_to_datasetdict_code('data/code/leetcode-solutions.json')
# Pin the split explicitly: random.seed() only seeds Python's `random` module,
# not the RNG scikit-learn uses, so without random_state every run would
# produce a different train/test partition.
train_dataset, test_dataset = train_test_split(code_dataset, test_size=0.2, random_state=42)
# train_test_split hands back column dictionaries here, so rebuild Datasets.
train_dataset = Dataset.from_dict(train_dataset)
test_dataset = Dataset.from_dict(test_dataset)

In [14]:
# Bundle both code splits (all three raw columns still present) under one object.
code_dict = DatasetDict({'train': train_dataset, 'test': test_dataset})

In [15]:
# Variant 1: the question is `code_with_problem`, the answer is `code_only`;
# `code_with_data` is dropped.
train_dataset_modified = (
    code_dict['train']
    .rename_column('code_with_problem', 'question')
    .rename_column('code_only', 'answer')
    .remove_columns('code_with_data')
)

test_dataset_modified = (
    code_dict['test']
    .rename_column('code_with_problem', 'question')
    .rename_column('code_only', 'answer')
    .remove_columns('code_with_data')
)

code_dict1 = DatasetDict({'train': train_dataset_modified, 'test': test_dataset_modified})

print(code_dict1)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 1887
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 472
    })
})


In [16]:
# Variant 2: the question is `code_with_data`, the answer is `code_only`;
# `code_with_problem` is dropped.
train_dataset_modified = (
    code_dict['train']
    .rename_column('code_with_data', 'question')
    .rename_column('code_only', 'answer')
    .remove_columns('code_with_problem')
)

test_dataset_modified = (
    code_dict['test']
    .rename_column('code_with_data', 'question')
    .rename_column('code_only', 'answer')
    .remove_columns('code_with_problem')
)

code_dict2 = DatasetDict({'train': train_dataset_modified, 'test': test_dataset_modified})

print(code_dict2)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 1887
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 472
    })
})


In [17]:
# Row counts of the code splits; the general and math datasets are
# subsampled to these same sizes further down.
n_train, n_test = (code_dict[split_name].num_rows for split_name in ('train', 'test'))
n_total = n_train + n_test

In [23]:
# Only the final expression of a cell is rendered automatically, so the first
# example used to be evaluated and silently discarded. Display both explicitly.
display(code_dict1["train"][0])
display(code_dict2["train"][0])

{'question': '# distinct-numbers-in-each-subarray\n# Distinct Numbers in Each Subarray\n# Medium\n# Given an integer array `nums` and an integer `k`, you are asked to construct the array `ans` of size `n-k+1` where `ans[i]` is the number of **distinct** numbers in the subarray `nums[i:i+k-1] = [nums[i], nums[i+1], ..., nums[i+k-1]]`.\n\nReturn _the array_ `ans`.\n\n**Example 1:**\n\n**Input:** nums = \\[1,2,3,2,2,1,3\\], k = 3\n**Output:** \\[3,2,2,2,3\\]\n**Explanation:** The number of distinct elements in each subarray goes as follows:\n- nums\\[0:2\\] = \\[1,2,3\\] so ans\\[0\\] = 3\n- nums\\[1:3\\] = \\[2,3,2\\] so ans\\[1\\] = 2\n- nums\\[2:4\\] = \\[3,2,2\\] so ans\\[2\\] = 2\n- nums\\[3:5\\] = \\[2,2,1\\] so ans\\[3\\] = 2\n- nums\\[4:6\\] = \\[2,1,3\\] so ans\\[4\\] = 3\n\n**Example 2:**\n\n**Input:** nums = \\[1,1,1,1,2,3,4\\], k = 4\n**Output:** \\[1,2,3,4\\]\n**Explanation:** The number of distinct elements in each subarray goes as follows:\n- nums\\[0:3\\] = \\[1,1,1,1\\] s

### Create general knowledge dataset

In [20]:
csv_file_path = 'data/general/general.csv'

# Draw as many rows as the code dataset has in total, then tag each sampled
# row with a running id.
general = (
    pd.read_csv(csv_file_path)
    .sample(n=n_total, random_state=42)
    .assign(id=lambda frame: range(len(frame)))
)

# 80/20 split with a fixed seed; both parts get a fresh 0..n-1 index.
train_sample, test_sample = (
    part.reset_index(drop=True)
    for part in train_test_split(general, test_size=0.2, random_state=42)
)

qa_columns = ['question', 'answer']
train_dataset = Dataset.from_pandas(train_sample[qa_columns])
test_dataset = Dataset.from_pandas(test_sample[qa_columns])

general_dict = DatasetDict({'train': train_dataset, 'test': test_dataset})

print(general_dict)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 1887
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 472
    })
})


In [22]:
# Show the first general-knowledge training example as a sanity check.
general_dict["train"][0]

{'question': 'Add 1 letter to "Iowa" to get the name of this tribe who lived south of the Iowa',
 'answer': 'Kiowa'}

### Create math dataset

In [24]:
def convert_folder_to_datasetdict_math(folder_path):
    """Collect every math problem file under ``folder_path`` into a ``Dataset``.

    The directory tree is walked recursively. Each ``.json`` file contributes
    one row: its ``"problem"`` field becomes ``question`` and its
    ``"solution"`` field becomes ``answer`` (empty string when a field is
    absent).

    Args:
        folder_path: Root directory containing the problem files, possibly
            nested in sub-directories.

    Returns:
        A single ``Dataset`` (not a ``DatasetDict``) with the columns
        ``question`` and ``answer``, ordered by sorted directory and file name.
    """
    data = {"question": [], "answer": []}

    for subdir, dirs, files in os.walk(folder_path):
        # os.walk yields entries in filesystem order. Sorting in place pins the
        # traversal order so that a later seeded shuffle selects the same
        # examples on every machine.
        dirs.sort()
        for file in sorted(files):
            # Skip stray non-data files (e.g. OS metadata) that would make
            # json.load raise.
            if not file.endswith(".json"):
                continue

            file_path = os.path.join(subdir, file)
            with open(file_path, "r", encoding="utf-8") as f:
                json_data = json.load(f)

            data["question"].append(json_data.get("problem", ""))
            data["answer"].append(json_data.get("solution", ""))

    return Dataset.from_dict(data)

In [25]:
# Load the math train and test folders into two separate Datasets.
math_train = convert_folder_to_datasetdict_math("data/math/train/")
math_test = convert_folder_to_datasetdict_math("data/math/test/")

In [26]:
# Shuffle with a fixed seed, then keep the first n_train / n_test rows so the
# math splits have the same sizes as the code splits.
math_train = math_train.shuffle(seed=42).select(range(n_train))
math_test = math_test.shuffle(seed=42).select(range(n_test))

math_dict = DatasetDict({'train': math_train, 'test': math_test})

print(math_dict)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 1887
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 472
    })
})


In [27]:
# Show the first math training example as a sanity check.
math_dict["train"][0]

{'question': 'What is the number of units in the distance between $(2,5)$ and $(-6,-1)$?',
 'answer': 'We use the distance formula: $\\sqrt{(-6 - 2)^2 + (-1 - 5)^2},$ so then we find that $\\sqrt{64 + 36} = \\boxed{10}$.\n\n- OR -\n\nWe note that the points $(2, 5)$, $(-6, -1)$, and $(2, -1)$ form a right triangle with legs of length 6 and 8. This is a Pythagorean triple, so the length of the hypotenuse must be $\\boxed{10}$.'}

## Preprocessing

In [28]:
prefix = "Please answer this question: "

def preprocess_function(examples, max_input_length=128, max_target_length=512):
    """Tokenize a batch of question/answer pairs for seq2seq training.

    Args:
        examples: Batch mapping with a ``"question"`` and an ``"answer"``
            list of strings (as passed by ``Dataset.map(..., batched=True)``).
        max_input_length: Token limit for the prefixed question; longer
            inputs are truncated. Defaults to 128.
        max_target_length: Token limit for the answer; longer targets are
            truncated. Defaults to 512.

    Returns:
        The tokenizer output for the inputs, with the tokenized answers'
        ``input_ids`` stored under ``"labels"``.
    """
    # The "inputs" are the questions with the task prefix prepended:
    inputs = [prefix + doc for doc in examples["question"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # The "labels" are the tokenized answers:
    labels = tokenizer(text_target=examples["answer"],
                       max_length=max_target_length,
                       truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [29]:
# Tokenize every DatasetDict in batches (code1, code2, general, math - in that order).
(
    tokenized_dataset_code1,
    tokenized_dataset_code2,
    tokenized_dataset_general,
    tokenized_dataset_math,
) = (
    raw_splits.map(preprocess_function, batched=True)
    for raw_splits in (code_dict1, code_dict2, general_dict, math_dict)
)

Map:   0%|          | 0/1887 [00:00<?, ? examples/s]

Map:   0%|          | 0/472 [00:00<?, ? examples/s]

Map:   0%|          | 0/1887 [00:00<?, ? examples/s]

Map:   0%|          | 0/472 [00:00<?, ? examples/s]

Map:   0%|          | 0/1887 [00:00<?, ? examples/s]

Map:   0%|          | 0/472 [00:00<?, ? examples/s]

Map:   0%|          | 0/1887 [00:00<?, ? examples/s]

Map:   0%|          | 0/472 [00:00<?, ? examples/s]

In [30]:
# Fetch the "punkt" resource quietly; compute_metrics calls nltk.sent_tokenize.
nltk.download("punkt", quiet=True)
# ROUGE is the evaluation metric used by compute_metrics.
metric = evaluate.load("rouge")

In [31]:
def compute_metrics(eval_preds):
    """Compute ROUGE scores for a batch of generated predictions.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays as
            supplied by the trainer. ``predictions`` may also be a tuple
            whose first element holds the generated ids.

    Returns:
        The dictionary produced by the ROUGE metric's ``compute`` call.
    """
    preds, labels = eval_preds
    # Some models return additional outputs next to the generated ids.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a valid vocabulary id, so
    # it must be replaced before decoding - in the predictions as well as in
    # the labels, otherwise batch_decode can fail on padded generations.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    return result

## Fine-tuning

In [42]:
# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 8
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 3
OVERWRITE_OUTPUT_DIR = True
LOAD_BEST_MODEL_AT_END = True

# Set up training arguments
training_args_math = Seq2SeqTrainingArguments(
   output_dir="./results/math",
   overwrite_output_dir=OVERWRITE_OUTPUT_DIR,
   save_strategy="epoch",
   evaluation_strategy="epoch",
   load_best_model_at_end=LOAD_BEST_MODEL_AT_END,
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False
)

training_args_code1 = Seq2SeqTrainingArguments(
   output_dir="./results/code1",
   overwrite_output_dir=OVERWRITE_OUTPUT_DIR,
   save_strategy="epoch",
   evaluation_strategy="epoch",
   load_best_model_at_end=LOAD_BEST_MODEL_AT_END,
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False
)

training_args_code2 = Seq2SeqTrainingArguments(
   output_dir="./results/code2",
   overwrite_output_dir=OVERWRITE_OUTPUT_DIR,
   save_strategy="epoch",
   evaluation_strategy="epoch",
   load_best_model_at_end=LOAD_BEST_MODEL_AT_END,
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False
)

training_args_general = Seq2SeqTrainingArguments(
   output_dir="./results/base",
   overwrite_output_dir=OVERWRITE_OUTPUT_DIR,
   save_strategy="epoch",
   evaluation_strategy="epoch",
   load_best_model_at_end=LOAD_BEST_MODEL_AT_END,
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False
)

### Math

In [33]:
# Trainer for the math run: fits on the tokenized "train" split and scores
# the "test" split with compute_metrics.
trainer_math = Seq2SeqTrainer(
    model=model_math,
    args=training_args_math,
    tokenizer=tokenizer,
    data_collator=data_collator_math,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset_math["train"],
    eval_dataset=tokenized_dataset_math["test"],
)

In [19]:
# Fine-tune the math model.
trainer_math.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myezixuanclara[0m. Use [1m`wandb login --relogin`[0m to force relogin


 18%|█▊        | 501/2814 [00:59<04:35,  8.40it/s]

{'loss': 0.0484, 'learning_rate': 0.0002466950959488273, 'epoch': 0.53}


                                                  
 33%|███▎      | 938/2814 [02:47<03:37,  8.64it/s] 

{'eval_loss': 0.0, 'eval_rouge1': 0.0, 'eval_rouge2': 0.0, 'eval_rougeL': 0.0, 'eval_rougeLsum': 0.0, 'eval_runtime': 58.1889, 'eval_samples_per_second': 85.927, 'eval_steps_per_second': 21.482, 'epoch': 1.0}


 36%|███▌      | 1002/2814 [02:57<03:28,  8.67it/s] 

{'loss': 0.0, 'learning_rate': 0.00019339019189765458, 'epoch': 1.07}


 53%|█████▎    | 1501/2814 [03:56<02:49,  7.77it/s]

{'loss': 0.0, 'learning_rate': 0.00014008528784648186, 'epoch': 1.6}


                                                   
 67%|██████▋   | 1876/2814 [05:40<01:53,  8.27it/s]

{'eval_loss': 0.0, 'eval_rouge1': 0.0, 'eval_rouge2': 0.0, 'eval_rougeL': 0.0, 'eval_rougeLsum': 0.0, 'eval_runtime': 59.3238, 'eval_samples_per_second': 84.283, 'eval_steps_per_second': 21.071, 'epoch': 2.0}


 71%|███████   | 2002/2814 [05:57<01:31,  8.85it/s]  

{'loss': 0.0, 'learning_rate': 8.678038379530917e-05, 'epoch': 2.13}


 89%|████████▉ | 2502/2814 [06:54<00:35,  8.89it/s]

{'loss': 0.0, 'learning_rate': 3.347547974413646e-05, 'epoch': 2.67}


                                                   
100%|██████████| 2814/2814 [08:32<00:00,  8.96it/s]

{'eval_loss': 0.0, 'eval_rouge1': 0.0, 'eval_rouge2': 0.0, 'eval_rougeL': 0.0, 'eval_rougeLsum': 0.0, 'eval_runtime': 61.2103, 'eval_samples_per_second': 81.686, 'eval_steps_per_second': 20.421, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 2814/2814 [08:36<00:00,  5.45it/s]

{'train_runtime': 518.6125, 'train_samples_per_second': 43.385, 'train_steps_per_second': 5.426, 'train_loss': 0.00859833234957453, 'epoch': 3.0}





TrainOutput(global_step=2814, training_loss=0.00859833234957453, metrics={'train_runtime': 518.6125, 'train_samples_per_second': 43.385, 'train_steps_per_second': 5.426, 'train_loss': 0.00859833234957453, 'epoch': 3.0})

In [20]:
# Drop the math model and ask Python and the CUDA allocator to release memory.
# NOTE(review): trainer_math and data_collator_math were both constructed with
# model=model_math and are still alive, so removing only this name presumably
# does not free the weights - confirm, and delete those objects as well if the
# memory is actually needed.
del model_math
gc.collect()
torch.cuda.empty_cache()

### Code 1

In [36]:
# Trainer for the first code variant: fits on the tokenized "train" split and
# scores the "test" split with compute_metrics.
trainer_code1 = Seq2SeqTrainer(
    model=model_code1,
    args=training_args_code1,
    tokenizer=tokenizer,
    data_collator=data_collator_code1,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset_code1["train"],
    eval_dataset=tokenized_dataset_code1["test"],
)

In [22]:
# Fine-tune the first code model.
trainer_code1.train()


 33%|███▎      | 16/48 [03:47<07:55, 14.87s/it]

{'eval_loss': 2.496579885482788, 'eval_rouge1': 0.03383376969610924, 'eval_rouge2': 0.0016460905349794238, 'eval_rougeL': 0.028476556253146653, 'eval_rougeLsum': 0.029104003651678312, 'eval_runtime': 18.5042, 'eval_samples_per_second': 2.918, 'eval_steps_per_second': 0.757, 'epoch': 1.0}



 67%|██████▋   | 32/48 [07:42<02:47, 10.45s/it]

{'eval_loss': 2.085508108139038, 'eval_rouge1': 0.14193502014963366, 'eval_rouge2': 0.05224203937530196, 'eval_rougeL': 0.1311783087726054, 'eval_rougeLsum': 0.1319044507434023, 'eval_runtime': 18.0557, 'eval_samples_per_second': 2.991, 'eval_steps_per_second': 0.775, 'epoch': 2.0}



100%|██████████| 48/48 [11:16<00:00, 12.77s/it]

{'eval_loss': 1.9809935092926025, 'eval_rouge1': 0.1505172341955189, 'eval_rouge2': 0.05507006806796853, 'eval_rougeL': 0.1411070353465603, 'eval_rougeLsum': 0.1417876510239457, 'eval_runtime': 18.1912, 'eval_samples_per_second': 2.968, 'eval_steps_per_second': 0.77, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 48/48 [11:22<00:00, 14.22s/it]

{'train_runtime': 682.7185, 'train_samples_per_second': 0.554, 'train_steps_per_second': 0.07, 'train_loss': 2.8900632858276367, 'epoch': 3.0}





TrainOutput(global_step=48, training_loss=2.8900632858276367, metrics={'train_runtime': 682.7185, 'train_samples_per_second': 0.554, 'train_steps_per_second': 0.07, 'train_loss': 2.8900632858276367, 'epoch': 3.0})

In [None]:
# Drop the first code model and ask Python and the CUDA allocator to release memory.
# NOTE(review): trainer_code1 and data_collator_code1 were both constructed
# with model=model_code1 and are still alive, so removing only this name
# presumably does not free the weights - confirm before relying on it.
del model_code1
gc.collect()
torch.cuda.empty_cache()

### Code 2

In [37]:
# Trainer for the second code variant: fits on the tokenized "train" split and
# scores the "test" split with compute_metrics.
trainer_code2 = Seq2SeqTrainer(
    model=model_code2,
    args=training_args_code2,
    tokenizer=tokenizer,
    data_collator=data_collator_code2,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset_code2["train"],
    eval_dataset=tokenized_dataset_code2["test"],
)

In [None]:
# Fine-tune the second code model.
trainer_code2.train()

In [None]:
# Drop the second code model and ask Python and the CUDA allocator to release memory.
# NOTE(review): trainer_code2 and data_collator_code2 were both constructed
# with model=model_code2 and are still alive, so removing only this name
# presumably does not free the weights - confirm before relying on it.
del model_code2
gc.collect()
torch.cuda.empty_cache()

### General

In [43]:
# Trainer for the general-knowledge run: fits on the tokenized "train" split
# and scores the "test" split with compute_metrics.
trainer_general = Seq2SeqTrainer(
    model=model_general,
    args=training_args_general,
    tokenizer=tokenizer,
    data_collator=data_collator_general,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset_general["train"],
    eval_dataset=tokenized_dataset_general["test"],
)

In [None]:
# Fine-tune the general-knowledge model.
trainer_general.train()

In [None]:
# Drop the general model and ask Python and the CUDA allocator to release memory.
# NOTE(review): trainer_general and data_collator_general were both constructed
# with model=model_general and are still alive, so removing only this name
# presumably does not free the weights - confirm before relying on it.
del model_general
gc.collect()
torch.cuda.empty_cache()