## Install Libraries

In [None]:
# Installing Libraries
!pip install datasets
!pip install transformers[torch]
!pip install tokenizers
!pip install evaluate
!pip install huggingface_hub
!pip install peft

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2

In [None]:
# Loading Libraries
import evaluate
import numpy as np
import pandas as pd
import nltk

from datasets import load_dataset
from datasets import Dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from peft import LoraConfig
from peft import get_peft_model
from sklearn.metrics import precision_recall_fscore_support
from peft import TaskType
from typing import List, Tuple
from nltk.tokenize import sent_tokenize

import warnings
# Ignore every warning category (Warning is the base class) for the rest of the
# session. NOTE(review): this also hides deprecation notices from the libraries
# imported above.
warnings.simplefilter(action='ignore', category=Warning)
# Download the NLTK "punkt" resource without progress output; presumably this is
# what sent_tokenize (imported above) needs at call time - confirm.
nltk.download("punkt", quiet=True)

True

## Data Preprocessing

In [None]:
def add_prompt_template_mohler(mohler_df):
  """Build one grading prompt per row and return it in an ``input_prompts`` column.

  Each prompt is assembled, in this order, from: the grader premise, the
  question, the expected (reference) answer, the given (student) answer and
  the output guard rails. Consecutive segments are separated by a single
  space so that the end of one segment never runs into the start of the next.

  Args:
    mohler_df: DataFrame with the text columns ``question``, ``refanswer``
      and ``answer``. It is not modified.

  Returns:
    A new DataFrame holding every original column plus ``input_prompts``.

  Raises:
    KeyError: if one of the three required columns is missing.
  """
  premise = "You are a grader for a undergraduate computer science course. For the given 'Question', analyze the 'Given Answer' against the 'Expected Answer' and provide a score based on the relevancy of the answer."
  question_prefix = "'Question': "
  given_prefix = "'Given Answer': "
  expect_prefix = "'Expected Answer': "
  guard_rails = "Provide your score on the given scale. In order of least relevant answer to most relevant, the score can be one of the following: 'Incorrect', 'Poor', 'Fair', 'Adequate', 'Good', 'Excellent'. Do not display any other content other than the single score word."

  def _text(column):
    # A missing cell becomes an empty string; otherwise a single NaN would turn
    # the whole concatenated prompt for that row into NaN.
    return mohler_df[column].fillna("").astype(str)

  input_prompts = (
      premise + " "
      + question_prefix + _text('question') + " "
      + expect_prefix + _text('refanswer') + " "
      + given_prefix + _text('answer') + " "
      + guard_rails
  )

  # assign() returns a new frame, so re-running the cell never stacks changes on
  # a frame the caller still holds.
  return mohler_df.assign(input_prompts=input_prompts)

# Read the scored answers (the first CSV column is used as the index) and
# attach the prompt column in one step.
mohler_df = add_prompt_template_mohler(
    pd.read_csv('combined_string_scoring.csv', index_col=0)
)
# Preview the first rows, including the generated prompts.
mohler_df.head()


Unnamed: 0,question,refanswer,answer,rounded_score,ScoreDescription,input_prompts
0,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,High risk problems are address in the prototyp...,4.0,Good,You are a grader for a undergraduate computer ...
1,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,To simulate portions of the desired final prod...,5.0,Excellent,You are a grader for a undergraduate computer ...
2,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,A prototype program simulates the behaviors of...,4.0,Good,You are a grader for a undergraduate computer ...
3,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,Defined in the Specification phase a prototype...,5.0,Excellent,You are a grader for a undergraduate computer ...
4,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,It is used to let the users have a first idea ...,3.0,Adequate,You are a grader for a undergraduate computer ...


In [None]:
# Keep only the model input (prompt) and the target (grade word).
mohler_dataset = Dataset.from_pandas(mohler_df[['input_prompts', 'ScoreDescription']])

# Performing train/test split (70% / 30%). The fixed seed makes the shuffle, and
# therefore every metric computed on the test split, reproducible after a
# kernel restart.
mohler_dataset = mohler_dataset.train_test_split(test_size=0.3, seed=42)

mohler_dataset

DatasetDict({
    train: Dataset({
        features: ['input_prompts', 'ScoreDescription', '__index_level_0__'],
        num_rows: 4143
    })
    test: Dataset({
        features: ['input_prompts', 'ScoreDescription', '__index_level_0__'],
        num_rows: 1776
    })
})

## Model Configuration

In [None]:
# Checkpoint name used to load the tokenizer in this cell.
MODEL_NAME = "google/flan-t5-base"

# Global training hyper-parameters
NUM_EPOCHS = 20
L_RATE = 3e-4
WEIGHT_DECAY = 0.01
BATCH_SIZE = 8
PER_DEVICE_EVAL_BATCH = 4
SAVE_TOTAL_LIM = 3

# PEFT task type for a sequence-to-sequence language model.
TASK_TYPE = TaskType.SEQ_2_SEQ_LM

# Only the tokenizer is created in this cell.
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Tokenization

In [None]:
def preprocess_function(examples, max_input_length=512, max_target_length=8):
   """Tokenize a batch of prompts and attach the tokenized grade as ``labels``.

   Args:
      examples: batch with the columns ``input_prompts`` and
         ``ScoreDescription`` (as passed by ``Dataset.map(..., batched=True)``).
      max_input_length: truncation limit for the prompt, in tokens. The prompt
         template places the student's answer and the scale instructions at the
         end of the text, so a tight limit cuts off exactly those parts.
         NOTE(review): 512 is assumed to be within the checkpoint's supported
         input length - confirm against ``tokenizer.model_max_length``.
      max_target_length: truncation limit for the grade word, in tokens. The
         limit includes the end-of-sequence token, so it is kept comfortably
         above the length of any single grade word to avoid clipping a label
         that splits into several sub-word pieces.

   Returns:
      The tokenizer output for the prompts with an extra ``labels`` entry
      holding the target token ids.
   """
   # Inputs
   model_inputs = tokenizer(examples['input_prompts'], max_length=max_input_length, truncation=True)
   # Outputs (str() guards against non-string grade values)
   labels = tokenizer(text_target=[str(score) for score in examples["ScoreDescription"]], max_length=max_target_length, truncation=True)

   model_inputs["labels"] = labels["input_ids"]

   return model_inputs

tokenized_dataset = mohler_dataset.map(preprocess_function, batched=True)
# Preview a few rows only: slicing the split returns a dict of column lists, so
# the whole training set is never materialised or rendered just for a sanity check.
pd.DataFrame(tokenized_dataset['train'][:5])


Map:   0%|          | 0/4143 [00:00<?, ? examples/s]

Map:   0%|          | 0/1776 [00:00<?, ? examples/s]

Unnamed: 0,input_prompts,ScoreDescription,__index_level_0__,input_ids,attention_mask,labels
0,You are a grader for a undergraduate computer ...,Excellent,213,"[148, 33, 3, 9, 2769, 52, 21, 3, 9, 12260, 121...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[11497, 1]"
1,You are a grader for a undergraduate computer ...,Excellent,2170,"[148, 33, 3, 9, 2769, 52, 21, 3, 9, 12260, 121...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[11497, 1]"
2,You are a grader for a undergraduate computer ...,Good,50,"[148, 33, 3, 9, 2769, 52, 21, 3, 9, 12260, 121...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1804, 1]"
3,You are a grader for a undergraduate computer ...,Excellent,662,"[148, 33, 3, 9, 2769, 52, 21, 3, 9, 12260, 121...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[11497, 1]"
4,You are a grader for a undergraduate computer ...,Excellent,2578,"[148, 33, 3, 9, 2769, 52, 21, 3, 9, 12260, 121...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[11497, 1]"
...,...,...,...,...,...,...
4138,You are a grader for a undergraduate computer ...,Excellent,295,"[148, 33, 3, 9, 2769, 52, 21, 3, 9, 12260, 121...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[11497, 1]"
4139,You are a grader for a undergraduate computer ...,Good,1789,"[148, 33, 3, 9, 2769, 52, 21, 3, 9, 12260, 121...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1804, 1]"
4140,You are a grader for a undergraduate computer ...,Excellent,615,"[148, 33, 3, 9, 2769, 52, 21, 3, 9, 12260, 121...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[11497, 1]"
4141,You are a grader for a undergraduate computer ...,Good,3366,"[148, 33, 3, 9, 2769, 52, 21, 3, 9, 12260, 121...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1804, 1]"


## Evaluation Metrics

In [None]:
# NOTE(review): `metric` is not referenced by compute_metrics_huh below, which
# computes precision/recall/F1 with sklearn's precision_recall_fscore_support.
# This load may be removable - confirm nothing else in the notebook uses it.
metric = evaluate.load("f1")



def postprocess_text(
    preds: List[str], labels: List[str]
) -> Tuple[List[str], List[str]]:
    """Normalise decoded predictions and references.

    Every string is stripped of surrounding whitespace and then re-joined with
    one sentence per line (sentences are found with ``sent_tokenize``).

    Returns:
        ``(preds, labels)`` as two new lists, in the same order as the inputs.
    """

    def _one_sentence_per_line(text: str) -> str:
        # Strip first, then put a newline between the detected sentences.
        return "\n".join(sent_tokenize(text.strip()))

    cleaned_preds = [_one_sentence_per_line(pred_text) for pred_text in preds]
    cleaned_labels = [_one_sentence_per_line(label_text) for label_text in labels]
    return cleaned_preds, cleaned_labels

def compute_metrics_huh(eval_preds):
    """Compute weighted precision, recall and F1 over the six grade classes.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is a tuple, its first element is used.

    Returns:
        Dict with the keys ``precision``, ``recall`` and ``f1``.

    Decoded text that is not exactly one of the six grade words is mapped to
    the sentinel class ``-1`` instead of to a real grade, so an unparseable
    generation can never be scored as a correct 'Incorrect' prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # Grade words ordered from least to most relevant; position == class id.
    grade_scale = ('Incorrect', 'Poor', 'Fair', 'Adequate', 'Good', 'Excellent')
    label2id = {name: idx for idx, name in enumerate(grade_scale)}
    unknown_id = -1

    # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be
    # decoded. Replace it in the predictions as well as in the labels.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing (whitespace stripping, sentence re-joining)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    pred_ids = [label2id.get(text, unknown_id) for text in decoded_preds]
    label_ids = [label2id.get(text, unknown_id) for text in decoded_labels]

    # zero_division=0 scores a class with no predicted/true samples as 0
    # explicitly rather than relying on the warn-and-zero default.
    precision, recall, f1, _ = precision_recall_fscore_support(
        label_ids, pred_ids, average="weighted", zero_division=0
    )
    return {"precision": precision, "recall": recall, "f1": f1}



## Model

In [None]:
# Load the pretrained model weights for the checkpoint named by MODEL_NAME.
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
# Collator built from the tokenizer and the model; it is handed to the trainer
# as data_collator further down.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir="./results",
   evaluation_strategy="epoch",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False
)

### Lora Finetuning

In [None]:

# LoRA adapters are attached to the attention query and value projections.
# In the T5 architecture these sub-modules are named "q" and "v"; names such as
# "q_proj"/"v_proj" belong to other model families and match nothing here.
target_modules = ["q", "v"]

lora_config = LoraConfig(
  r=128, # Rank of the low-rank update matrices
  lora_alpha=128, # Scaling factor applied to the LoRA update
  target_modules=target_modules,
  lora_dropout=0.05, bias="none",
  task_type= TASK_TYPE
  )


# Wrap the base model with the adapters described by lora_config.
lora_model = get_peft_model(model, lora_config)

# Trainer wiring: the LoRA-wrapped model is trained on the "train" split and
# evaluated on the "test" split, with compute_metrics_huh supplying
# precision/recall/F1.
lora_trainer = Seq2SeqTrainer(
   model=lora_model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["test"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics_huh
)


# Start fine-tuning with the settings held in training_args.
lora_trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.779,0.735094,0.402477,0.443131,0.313479
2,0.5534,0.49356,0.555748,0.600788,0.568591
3,0.4996,0.456574,0.638185,0.641892,0.604944
4,0.4547,0.425005,0.642749,0.653153,0.624003
5,0.4281,0.444025,0.655248,0.664977,0.630132
6,0.4087,0.436151,0.665952,0.677365,0.661477
7,0.385,0.423747,0.68317,0.688063,0.659714
8,0.3585,0.444639,0.69194,0.684685,0.658795
9,0.3406,0.424513,0.683979,0.695946,0.681095
10,0.3083,0.419018,0.698827,0.703266,0.693721


Checkpoint destination directory ./results/checkpoint-4000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-4500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-5000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=10360, training_loss=0.3454996366758604, metrics={'train_runtime': 5933.2726, 'train_samples_per_second': 13.965, 'train_steps_per_second': 1.746, 'total_flos': 1.50855723122688e+16, 'train_loss': 0.3454996366758604, 'epoch': 20.0})

In [None]:
# Evaluate on the trainer's eval_dataset, i.e. tokenized_dataset["test"].
# NOTE(review): this is the same split that was evaluated during training, so
# these numbers are not from a separately held-out set.
lora_eval_results = lora_trainer.evaluate()
print(lora_eval_results)

{'eval_loss': 0.508598268032074, 'eval_precision': 0.7215385805962586, 'eval_recall': 0.7257882882882883, 'eval_f1': 0.7174305733275148, 'eval_runtime': 182.8692, 'eval_samples_per_second': 9.712, 'eval_steps_per_second': 2.428, 'epoch': 20.0}
