In [None]:
!pip install -U transformers
!pip install bitsandbytes
!pip install accelerate
!pip install datasets
!pip install huggingface_hub
!pip install peft
!pip install trl
!pip install accelerate
!pip install evaluate
!pip install wandb

# Load the data

In [None]:
import locale

# Force the preferred encoding to UTF-8 (presumably a workaround for hosted
# runtimes that report a non-UTF-8 locale -- confirm).
# The replacement mirrors the real signature `getpreferredencoding(do_setlocale=True)`
# so it works whether callers pass the argument or not.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [None]:
import os
import sys

import pandas as pd

data_path = "./data.csv"

# Fail early, with a specific exception type, when the input file is missing.
# FileNotFoundError is a subclass of Exception, so existing handlers still match.
if not os.path.exists(data_path):
    raise FileNotFoundError("File not found : {}".format(data_path))

df = pd.read_csv(data_path)

# Log the interpreter version for reproducibility.
print(sys.version)

# Preview the frame. A bare expression is only rendered when it is the last
# statement of the cell, so the preview is placed at the end.
df.head()

# Import the libraries

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from huggingface_hub import login
from peft import LoraConfig, PeftModelForCausalLM
from peft import get_peft_model, TaskType

from sklearn.model_selection import train_test_split
from tqdm import tqdm
import numpy as np
from sklearn.metrics import mean_absolute_error

from datasets import Dataset
import json
from trl import SFTTrainer
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification, GemmaForSequenceClassification, MistralForSequenceClassification

import pandas as pd
import os
from sklearn.metrics import f1_score
import json

import evaluate

# Authenticate with the Hugging Face Hub without embedding a credential in the
# notebook. The token is read from the HF_TOKEN environment variable; when the
# variable is unset, `login` receives None and asks for the token interactively.
# NOTE(review): the access token that used to be hard-coded on this line has
# been exposed through the notebook and should be revoked.
login(token=os.environ.get("HF_TOKEN"))

# Set the CONFIG Files
- Change the MODEL_ID, BNB Config based on the input model

In [None]:
class CONFIG:
    """Single place for the checkpoint, quantisation, tokenizer and LoRA settings."""

    # ---- base checkpoint -------------------------------------------------
    # Alternatives kept for reference:
    #   MODEL_ID = "meta-llama/Llama-2-7b-hf"
    #   MODEL_ID = "NousResearch/Llama-2-7b-chat-hf"
    MODEL_ID = "mistralai/Mistral-7B-v0.1"

    # ---- 4-bit bitsandbytes quantisation ---------------------------------
    # (bnb_4bit_use_double_quant=True is intentionally left disabled.)
    BNB_CONFIG = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # ---- device placement ------------------------------------------------
    DEVICE_MAP = "auto"
    DEVICE = "cuda:0"

    # ---- tokenizer behaviour ---------------------------------------------
    ADD_EOS_TOKEN = False
    PADDING_SIDE = "left"
    ADD_PREFIX_SPACE = True

    # ---- classification head ---------------------------------------------
    NUM_LABELS = 6

    # ---- LoRA adapter (sequence classification, attention q/v projections)
    LORA_CONFIG = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=128,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["q_proj", "v_proj"],
    )

## Init model and tokenizer

In [None]:
# Base checkpoint with a sequence-classification head sized by CONFIG.NUM_LABELS.
# The quantised / device-mapped variant is currently disabled:
#   quantization_config=CONFIG.BNB_CONFIG, device_map=CONFIG.DEVICE_MAP
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=CONFIG.MODEL_ID,
    num_labels=CONFIG.NUM_LABELS,
)
# Reuse the end-of-sequence id as the padding id on the model side.
model.config.pad_token_id = model.config.eos_token_id

tokenizer = AutoTokenizer.from_pretrained(
    CONFIG.MODEL_ID,
    add_prefix_space=CONFIG.ADD_PREFIX_SPACE,
)
# Same substitution on the tokenizer side: EOS doubles as the padding token.
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token


## Get the prompt template with guardrails

In [None]:
import evaluate
from transformers import pipeline


def get_prompt_gaurdrails(question: str, ref_answer: str, student_answer: str) -> str:
    """Build the grading prompt for one (question, reference, student answer) triple.

    The function is a pure string builder: it reads no global state and
    performs no tokenisation.

    Args:
        question: The exam question text.
        ref_answer: The reference (model) answer to the question.
        student_answer: The answer submitted by the student.

    Returns:
        The filled-in prompt. It starts with a newline and ends with the
        "Classification :" cue followed by a blank line and a tab.
    """
    # The template is spelled out with explicit escapes so the tabs and blank
    # lines that are part of the prompt are visible and cannot be altered by
    # an editor re-indenting the file. The resulting text is unchanged.
    # NOTE(review): the stray double quote after {ref_answer}, the wording
    # "The give question", and the category names (which differ from the
    # score_mapping labels used for training) are kept verbatim because any
    # change alters the prompt the model sees -- confirm before editing.
    prompt_template = (
        "\n"
        "\t<start_of_turn>user\n"
        "\tThe give question is :\n"
        "\t{question}\n"
        "\n"
        "\tFor the above question the reference answer is :\n"
        "\t{ref_answer}\"\n"
        "\n"
        "\tNow a student has provided the below answer :\n"
        "\t{student_answer}\n"
        "\n"
        "\tClassify the essay into one of the following categories: bad, worse, satisfactory, good, very good, excellent.\n"
        "\n"
        "\tClassification :\n"
        "\n"
        "\t"
    )

    return prompt_template.format(
        question=question,
        ref_answer=ref_answer,
        student_answer=student_answer,
    )


def get_output_from_model(input_model: AutoModelForCausalLM, input_tokenizer :AutoTokenizer, input_df: pd.DataFrame, check = False) -> list:
    """Classify every row of `input_df` with a text-classification pipeline.

    Args:
        input_model: Model handed to the pipeline.
        input_tokenizer: Tokenizer handed to the pipeline.
        input_df: Frame with "question", "refanswer" and "answer" columns.
        check: When True, stop after the first three rows (quick smoke test).

    Returns:
        One pipeline result per processed row, in row order.
    """
    # Build the pipeline once. It depends only on the model and tokenizer, so
    # re-creating it for every row was loop-invariant work.
    clf = pipeline(task="text-classification",
                   model=input_model,
                   tokenizer=input_tokenizer)

    outputs = []
    for i, (_, row) in tqdm(enumerate(input_df.iterrows()), total=len(input_df)):
        if check and i == 3:
            break

        prompt = get_prompt_gaurdrails(question=row["question"],
                                       ref_answer=row["refanswer"],
                                       student_answer=row["answer"])

        # The pipeline tokenises the prompt itself. The former manual
        # tokenisation (with the global tokenizer, moved to CONFIG.DEVICE) was
        # never used and has been removed.
        outputs.append(clf(prompt))

    return outputs


def calc_rmse(y_true: list, y_pred: list) -> float:
    """Return the root-mean-squared error between two equally long sequences.

    The previous implementation took the square root of the mean *absolute*
    error, which is not RMSE.

    Args:
        y_true: Ground-truth values; each item must be convertible by float().
        y_pred: Predicted values; each item must be convertible by float().

    Returns:
        sqrt(mean((y_true - y_pred) ** 2)) as a Python float.

    Raises:
        ValueError: If an item is not numeric, the inputs are empty, or their
            lengths differ.
    """
    true_arr = np.asarray(list(map(float, y_true)), dtype=float)
    pred_arr = np.asarray(list(map(float, y_pred)), dtype=float)

    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            "y_true and y_pred must have the same length, got {} and {}".format(
                true_arr.size, pred_arr.size
            )
        )
    if true_arr.size == 0:
        raise ValueError("calc_rmse requires at least one sample")

    return float(np.sqrt(np.mean((true_arr - pred_arr) ** 2)))

def calc_f1(y_true: list, y_pred: list, average: str = "weighted") -> float:
    """Return the F1 score between true and predicted class labels.

    The targets in this notebook have six classes. scikit-learn's default
    `average="binary"` raises on multiclass input, so a multiclass-capable
    averaging mode is used by default (the same "weighted" mode that
    `compute_metrics` uses).

    Args:
        y_true: Ground-truth labels; each item must be convertible by float().
        y_pred: Predicted labels; each item must be convertible by float().
        average: Averaging mode forwarded to `sklearn.metrics.f1_score`.

    Returns:
        The F1 score computed by scikit-learn.
    """
    y_pred = list(map(float, y_pred))
    y_true = list(map(float, y_true))

    return f1_score(y_true, y_pred, average=average)

def compute_metrics(eval_pred):
    """Compute weighted precision/recall/F1 and accuracy for the Trainer.

    Args:
        eval_pred: Tuple of (logits, labels) as handed over by the Trainer.

    Returns:
        Dict with the keys "precision", "recall", "f1-score" and "accuracy".
    """
    # Import locally under a private alias. A later cell defines a function
    # called `evaluate`, which rebinds the global name and would otherwise
    # break this function when it runs after that cell.
    import evaluate as hf_evaluate

    logits, labels = eval_pred
    # Predicted class = index of the largest logit.
    predictions = np.argmax(logits, axis=-1)

    results = {}
    # (key in the returned dict, name of the metric in the `evaluate` package)
    for result_key, metric_name in (("precision", "precision"),
                                    ("recall", "recall"),
                                    ("f1-score", "f1")):
        metric = hf_evaluate.load(metric_name)
        results[result_key] = metric.compute(
            predictions=predictions, references=labels, average="weighted"
        )[metric_name]

    accuracy_metric = hf_evaluate.load("accuracy")
    results["accuracy"] = accuracy_metric.compute(
        predictions=predictions, references=labels
    )["accuracy"]

    # The Trainer expects a flat {metric name: value} dictionary.
    return results


def get_cleaned_outputs(outputs):
    """Strip everything around the score from each generated string.

    For every item, keep the text after the last "Score : " marker and cut it
    at the first newline. Items without the marker are cut at their first
    newline only.
    """
    return [
        outputs[idx].split("Score : ")[-1].split("\n")[0]
        for idx in range(len(outputs))
    ]


## Split the data into train and test df

In [None]:
# split the data into train and test set
# Hold out 20% of the rows, stratified on the average score, with a fixed seed.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["score_avg"],
)

for split_label, split_frame in (("train", df_train), ("test", df_test)):
    print(split_label + " shape : ", split_frame.shape)

## Execute the below code to make sure the model is working properly and get a sample output

In [None]:
# Hand-written example used to eyeball the prompt produced by the template.
question = "What is the role of a prototype program in problem solving?"
ref_answer = "To simulate the behaviour of portions of the desired software product."
student_answer = "High risk problems are address in the prototype program to make sure that the program is feasible.  A prototype may also be used to show a company that the software can be possibly programmed.  "
score = "3.5"

sample_fields = {
    "question": question,
    "ref_answer": ref_answer,
    "student_answer": student_answer,
}
result = get_prompt_gaurdrails(**sample_fields)

print(result)

## Run the below code to generate the outputs from the model

In [None]:
# `check=True` makes get_output_from_model stop after three rows. Run it on a
# three-row copy so the predictions line up one-to-one with the frame they are
# stored in; assigning three results to the full `df_train` raises a length
# mismatch.
sanity_df = df_train.head(3).copy()
outputs = get_output_from_model(model, tokenizer, sanity_df, check=True)
print(outputs)

# Each element of `outputs` is the pipeline result for one prompt: a
# one-element list holding a {"label", "score"} dict. Map the label name back
# to its class index through the model config so it can be compared with
# `score_avg`.
predictions = [result[0] for result in outputs]
sanity_df["model_output"] = [
    model.config.label2id[prediction["label"]] for prediction in predictions
]
display(sanity_df.head(2))

# Error between the reference scores and the predicted class indices.
calc_rmse(sanity_df["score_avg"].tolist(), sanity_df["model_output"].tolist())

# Fine-tune the model using QLoRA

## Convert the data into prompts and store it as a json object

In [None]:
# Features without the target column.
X = df.drop(columns="score_avg")

# Collapse the averaged score to an integer class using Python's built-in round().
df["score_avg"] = df["score_avg"].apply(lambda value: round(value))

# Integer class -> human-readable label.
score_mapping = {
    0: "Bad",
    1: "Try Harder",
    2: "Satisfactory",
    3: "Good",
    4: "Very Good",
    5: "Excellent"
}

df["score_word"] = df["score_avg"].map(score_mapping)
y = df["score_word"]

# Fresh 80/20 split on the relabelled frame.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Per-class loss weight: len(train_df) / (2 * number of rows with that label).
class_counts = train_df["score_word"].value_counts()
loss_weights_mp = {}
for target in train_df["score_word"].unique():
    w = len(train_df) / (2 * class_counts[target])
    loss_weights_mp[target] = w

# Longest question, in characters and in whitespace-separated words.
question_col = train_df['question']
max_char = question_col.str.len().max()
max_words = question_col.str.split().str.len().max()

print("Weights for classes : ", loss_weights_mp)

print("len of x_train : ", len(train_df))
print("len of x_test : ", len(test_df))


def mistral_preprocessing_function(prompt):
    """Tokenise `prompt` with the notebook-level `tokenizer` and return its output as-is."""
    encoded = tokenizer(prompt)
    return encoded

def convert_txt_to_qlora_dict(input_df : pd.DataFrame):
    """Turn every row of `input_df` into a tokenised training record.

    Args:
        input_df: Frame with "question", "refanswer", "answer" and "score_avg"
            columns. "score_avg" is assumed to already hold the integer class
            (the preparation cell rounds it before this function is called).

    Returns:
        A list with one dict per row, holding "label", "input_ids" and
        "attention_mask".
    """
    records = []

    for _, row in tqdm(input_df.iterrows(), total=len(input_df)):
        user_prompt = get_prompt_gaurdrails(question=row["question"],
                                            ref_answer=row["refanswer"],
                                            student_answer=row["answer"])
        encoded = mistral_preprocessing_function(user_prompt)

        records.append({
            # Coerce to a plain int: the records are written with json.dump,
            # which rejects NumPy integer scalars, and used as class indices.
            "label": int(row["score_avg"]),
            "input_ids": encoded["input_ids"],
            "attention_mask": encoded["attention_mask"],
        })

    # The unused "Score : {score}" assistant template and the assistant prompt
    # built from it were dead code and have been removed.
    return records




In [None]:
# get the dict version of the prompts
# Tokenised records for both splits.
train_json = convert_txt_to_qlora_dict(train_df)
test_json = convert_txt_to_qlora_dict(test_df)

# Target files for the serialised records.
train_file_path = "EN-train_chatml.json"
test_file_path = "EN-val_chatml.json"

# Write each split to its JSON file.
for out_path, records in ((train_file_path, train_json), (test_file_path, test_json)):
    with open(out_path, "w") as json_file:
        json.dump(records, json_file)

# Read the files back and wrap them as `datasets.Dataset` objects.
save_path = "./"
dataset_train_name = 'EN-train'
dataset_val_name = 'EN-val'

file_name_train_chatml = dataset_train_name + "_chatml.json"
file_name_val_chatml = dataset_val_name + "_chatml.json"

with open(save_path + file_name_train_chatml, 'r') as f:
    train_records = json.load(f)
dataset_train = Dataset.from_list(train_records)

with open(save_path + file_name_val_chatml, 'r') as f:
    val_records = json.load(f)
dataset_val = Dataset.from_list(val_records)


In [None]:
# Materialise both datasets as DataFrames and report how many distinct index
# values each one has.
dataset_train_df = pd.DataFrame(dataset_train)
dataset_val_df = pd.DataFrame(dataset_val)

for split_frame in (dataset_train_df, dataset_val_df):
    print(split_frame.index.nunique())


In [None]:
  # lora_model = AutoModelForCausalLM.from_pretrained(CONFIG.MODEL_ID, quantization_config=CONFIG.BNB_CONFIG, device_map=CONFIG.DEVICE_MAP)
# Wrap the classifier with the LoRA adapter described in CONFIG and report how
# many parameters are trainable. The wrapper is bound to the same name `model`.
peft_wrapped_model = get_peft_model(model, CONFIG.LORA_CONFIG)
model = peft_wrapped_model
model.print_trainable_parameters()


# Separate tokenizer instance created with the configured padding side.
lora_tokenizer = AutoTokenizer.from_pretrained(
    CONFIG.MODEL_ID,
    padding_side=CONFIG.PADDING_SIDE,
)

## Train the QLoRA params

In [None]:
from transformers import Trainer
from transformers import DataCollatorWithPadding

print(loss_weights_mp)


class WeightedCELossTrainer(Trainer):
    """Trainer that replaces the default loss with class-weighted cross-entropy.

    The per-class weights are read from the notebook-level `loss_weights_mp`
    dict, keyed by the label names of `score_mapping`.
    """

    # Label names in class-index order (index 0 .. 5), matching `score_mapping`.
    _CLASS_ORDER = ("Bad", "Try Harder", "Satisfactory", "Good", "Very Good", "Excellent")

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained.
            inputs: Batch dict; its "labels" entry is removed before the
                forward pass.
            return_outputs: When True, return (loss, outputs) instead of loss.
            num_items_in_batch: Accepted for compatibility with Trainer
                versions that pass it as a keyword; not used by this loss.

        Returns:
            The loss tensor, or (loss, model outputs) when `return_outputs`.

        Raises:
            KeyError: If a class in `_CLASS_ORDER` has no entry in
                `loss_weights_mp` (for example, a class absent from the
                training split).
        """
        labels = inputs.pop("labels")

        # Forward pass without the labels so the model does not compute its
        # own (unweighted) loss.
        outputs = model(**inputs)
        logits = outputs.get("logits")

        weight = [loss_weights_mp[class_name] for class_name in self._CLASS_ORDER]

        # Build the weight tensor on the same device and dtype as the logits.
        loss_fct = torch.nn.CrossEntropyLoss(
            weight=torch.tensor(weight, device=logits.device, dtype=logits.dtype)
        )
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Optimisation hyper-parameters.
lr = 1e-5
batch_size = 10
num_epochs = 1

training_args = TrainingArguments(
    output_dir="mistral-lora-token-classification",
    learning_rate=lr,
    # NOTE(review): with the plain "constant" schedule the warmup ratio below
    # presumably has no effect ("constant_with_warmup" is the variant that
    # warms up) -- confirm which one is intended.
    lr_scheduler_type= "constant",
    warmup_ratio= 0.1,
    max_grad_norm= 0.3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.001,
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    fp16=True,
    gradient_checkpointing=True,
    # `label_names` lists the keys of the *input dict* that hold the labels,
    # not the class names. The collator delivers the targets under "labels";
    # with the class names here the Trainer finds no labels during evaluation.
    label_names=["labels"]
)

# Pads every batch to its longest sequence with the tokenizer's padding token.
mistral_data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

mistral_trainer = WeightedCELossTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    data_collator=mistral_data_collator,
    compute_metrics=compute_metrics,
)

mistral_trainer.train()

## Save the finetuned model

In [None]:
# Directory the trained adapter is written to. The leading slash makes this an
# absolute path at the filesystem root.
model_save_name = "/mistral_class_new"

# Persist the trainer's model to that directory.
mistral_trainer.model.save_pretrained(model_save_name)
# Reload the saved adapter on top of `model`.
# NOTE(review): the LoRA config uses TaskType.SEQ_CLS, but the reload goes
# through PeftModelForCausalLM, and `model` is already the PEFT-wrapped model
# returned by get_peft_model -- confirm this reload does what is intended.
finetuned_model = PeftModelForCausalLM.from_pretrained(model=model, model_id=model_save_name)


In [None]:
# Publish the trained adapter under the intended repository id.
# `Trainer.push_to_hub` takes the *commit message* as its first positional
# argument and derives the repository from the training arguments, so passing
# the repo id there does not select the repository. The model's own
# `push_to_hub` takes the repository id directly.
mistral_trainer.model.push_to_hub("adhi29/mistral-7b-lora-answer-evaluator-new")

# Sample Evaluation

In [None]:
# NOTE(review): `timm` is only referenced by commented-out code in the next
# cell -- confirm this install is still needed.
!pip install timm

In [None]:
from transformers import AutoModel, AutoTokenizer

# finetuned_model = timm.create_model("hf_hub:adhi29/mistral-7b-lora-answer-evaluator", pretrained=True)

# Load the published repository through the generic AutoModel class.
# NOTE(review): AutoModel presumably returns the backbone without a
# classification head, and the repository name lacks the "-new" suffix used
# when pushing above -- confirm both are intended. A later cell loads the same
# repository as a PEFT adapter on top of a sequence-classification model.
finetuned_model = AutoModel.from_pretrained("adhi29/mistral-7b-lora-answer-evaluator")
# Set after loading; this only changes the config attribute.
finetuned_model.config.num_labels = 6
# Tokenizer of the base checkpoint.
finetuned_tokenizer = AutoTokenizer.from_pretrained(CONFIG.MODEL_ID)

# finetuned_model = mistral_trainer.model




In [None]:

from transformers import pipeline, AutoModel
import pandas as pd
from tqdm import tqdm
from peft import PeftModel

# classifier_finetuned = pipeline("text-classification",
#                                 "adhi29/mistral-7b-lora-answer-evaluator",
#                                 binary_output=False,
#                                 return_all_scores=True)
# Rebuild the base sequence classifier and attach the published LoRA adapter.
# The result is rebound to the notebook-level name `model`.
base_classifier = AutoModelForSequenceClassification.from_pretrained(
    CONFIG.MODEL_ID,
    num_labels=CONFIG.NUM_LABELS,
)
model = PeftModel.from_pretrained(
    base_classifier,
    "adhi29/mistral-7b-lora-answer-evaluator",
)
# Alternative that was tried:
# model.load_adapter("adhi29/mistral-lora-token-classification")

# Fresh tokenizer for the base checkpoint (rebinds `tokenizer`).
tokenizer = AutoTokenizer.from_pretrained(CONFIG.MODEL_ID)

# Text-classification pipeline over the adapter-equipped model.
classifier_finetuned = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
)

# classifier_model = pipeline("text-classification", model = )

def convert_txt_to_eval_dict(input_df : pd.DataFrame):
    """Build the evaluation prompt for every row of `input_df`.

    Args:
        input_df: Frame with "question", "refanswer" and "answer" columns.

    Returns:
        A list of prompt strings, one per row, in row order.
    """
    # The per-row label/input_ids/attention_mask dict that used to be created
    # here was never filled or returned, so it has been removed.
    return [
        get_prompt_gaurdrails(question=row["question"],
                              ref_answer=row["refanswer"],
                              student_answer=row["answer"])
        for _, row in tqdm(input_df.iterrows(), total=len(input_df))
    ]


def evaluate(check=False):
    """Classify the prompts built from `test_df` with `classifier_finetuned`.

    NOTE(review): this function has the same name as the imported `evaluate`
    package and rebinds that global name once the cell runs. The name is kept
    for compatibility -- consider renaming it together with its call sites.

    Args:
        check: When True, stop after the first three prompts.

    Returns:
        Tuple (labels, scores, actuals): the predicted label and its score for
        each processed prompt, and the matching `score_avg` from `test_df`.
    """
    outputs = []
    scores = []
    actuals = []

    prompts = convert_txt_to_eval_dict(test_df)

    for i, prompt in tqdm(enumerate(prompts), total=len(prompts)):
        # Check the limit before printing, so a prompt that is never
        # classified is not echoed.
        if check and i == 3:
            break
        print(prompt)

        actuals.append(float(test_df.iloc[i]["score_avg"]))

        # The pipeline returns a list; its first element is the top prediction.
        output = classifier_finetuned(prompt)[0]
        print(output)

        outputs.append(output["label"])
        scores.append(output["score"])

    return outputs, scores, actuals

pred_outputs, pred_scores, actual_outputs = evaluate(check=True)

In [None]:
# Show predicted labels, reference scores and prediction confidences, in that order.
for values in (pred_outputs, actual_outputs, pred_scores):
    print(values)


In [None]:
# This first `messages` value is overwritten by the assignment right below and
# never used.
messages=[
    {
        'role':'user',
        'content':'Who is Francesco Lelli?',
    }
]

# Single-turn chat holding a grading request (question, reference answer,
# student answer, and a request for a score from 1 to 5).
messages = [
    {
        "role": "user",
        "content": "\n  The give question is : \n    What are the elements typically included in a class definition\n\n    For the above question the reference answer is : \n    Function members and data members\n\n    Now a student has provided the below answer : \n    the functions and variables used when the object is defined for the class\n\n    For the above answer, what is the appropriate score you will provide on a score of 1 to 5 with a\n    precision of 0.5.\n    "
        }
]

# Render the chat with the tokenizer's chat template and move the ids to the GPU.
input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_tensors="pt").to("cuda")

# print(input_ids)
# NOTE(review): at this point in the notebook `finetuned_model` comes from
# AutoModel and `model` is a sequence-classification model wrapped by PEFT;
# whether either supports `.generate` should be confirmed. This cell looks
# like a leftover from a text-generation experiment.
outputs_finetuned = finetuned_model.generate(input_ids=input_ids, max_new_tokens=1024, do_sample=False)
outputs = model.generate(input_ids=input_ids, max_new_tokens=1024, do_sample=False)

# Print only the text following the last '<start_of_turn>model\n' marker.
print("finetuned: " + tokenizer.decode(outputs_finetuned[0]).split('<start_of_turn>model\n')[-1])
print("normal   : " + tokenizer.decode(outputs[0]).split('<start_of_turn>model\n')[-1])

# Zip the folder for easy download from colab to local

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

!zip -r ./output.zip /mistral_class

# Evaluate the model

In [None]:
def get_output_from_model_using_prompt(prompt : str, model, tokenizer, finetuned = False) -> str:
    """Generate a completion for `prompt` and return the decoded text.

    The prompt is tokenised, moved to CONFIG.DEVICE, passed to
    `model.generate` (sampling enabled, at most 1000 new tokens, EOS used as
    the padding id), and the first generated sequence is decoded without
    special tokens. `finetuned` is accepted but not used.
    """
    target_device = CONFIG.DEVICE

    model_inputs = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=True,
    ).to(target_device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=1000,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)


In [None]:
outputs_finetuned = []
outputs_model = []
actual_outputs = []

# NOTE(review): this loop indexes each row as a list of chat messages
# (row[0] = user turn, row[1] = assistant turn). The `dataset_val` built
# earlier in this notebook stores "label" / "input_ids" / "attention_mask"
# records instead -- confirm which dataset this cell is meant to run on.
# NOTE(review): both generations below use the same `model`, so the
# "finetuned" and "model" outputs come from one and the same model -- confirm
# which objects should be compared.
for i, row in tqdm(enumerate(dataset_val)):
    # Reference score: the text after the last ": " in the assistant turn, up
    # to the first newline.
    actual_val = float(row[1]["content"].split(": ")[-1].split("\n")[0])
    actual_outputs.append(actual_val)

    user_prompt = row[0]["content"]

    output_finetuned = get_output_from_model_using_prompt(prompt = user_prompt,
                        model = model,
                        tokenizer = tokenizer)

    output_model = get_output_from_model_using_prompt(prompt = user_prompt,
                        model = model,
                        tokenizer = tokenizer)

    # Keep the first line that follows the "model\n\n" marker.
    output_finetuned = output_finetuned.split("model\n\n")[1].split("\n")[0]
    output_model = output_model.split("model\n\n")[1].split("\n")[0]

    # Best-effort numeric parse: when the text is not a number, the raw string
    # is kept for the later clean-up cell. Only ValueError is expected from
    # float(); catching everything also hid unrelated errors and interrupts.
    try:
        output_finetuned = float(output_finetuned.split(": ")[-1].split(" ")[0])
    except ValueError:
        pass

    try:
        output_model = float(output_model.split(": ")[-1].split(" ")[0])
    except ValueError:
        pass

    outputs_finetuned.append(output_finetuned)
    outputs_model.append(output_model)






In [None]:
# Reference scores, then the two lists of generated scores.
for values in (actual_outputs, outputs_finetuned, outputs_model):
    print(values)

# Manually check the values with no guardrails implemented

In [None]:
# List every output that could not be parsed into a float, per model. The
# second message used to be labelled "Fine Tuned" as well, which made the two
# sources indistinguishable in the printout.
for i in range(len(outputs_finetuned)):
    if not isinstance(outputs_finetuned[i], float):
        print("Fine Tuned : ", i, " - ", outputs_finetuned[i])

    if not isinstance(outputs_model[i], float):
        print("Base Model : ", i, " - ", outputs_model[i])

## Clean the outputs using regex

In [None]:
import re

# Captures the number that follows "Score :" (optional spaces around the colon).
pattern = r"Score\s*:\s*([\d.]+)\b"


def _clean_score(raw, score_pattern=pattern):
    """Return the float score found in `raw`, or `raw` unchanged.

    Floats pass through untouched. For strings, the first match of
    `score_pattern` is converted to float. Anything that does not match, does
    not convert (e.g. "." or "1.2.3"), or is not a string is returned as-is.
    """
    if isinstance(raw, float) or not isinstance(raw, str):
        return raw

    match = re.search(score_pattern, raw)
    if not match:
        return raw

    try:
        return float(match.group(1))
    except ValueError:
        # The character class also matches non-numbers such as "." -- keep
        # the original text for manual inspection instead of crashing.
        return raw


# Same clean-up for both lists.
outputs_finetuned_cleaned = [_clean_score(item) for item in outputs_finetuned]
outputs_model_cleaned = [_clean_score(item) for item in outputs_model]



# Save the outputs to disk

In [None]:
# Example NumPy array
# Convert the three result lists to NumPy arrays.
aoarr = np.array(actual_outputs)
finetuned_output_arr = np.array(outputs_finetuned_cleaned)
model_output_arr = np.array(outputs_model_cleaned)

# Persist each array to its own .npy file.
for npy_path, arr in (
    ('./actual_outputs.npy', aoarr),
    ('./outputs_finetuned_cleaned.npy', finetuned_output_arr),
    ('./outputs_model2_cleaned.npy', model_output_arr),
):
    np.save(npy_path, arr)

# Sequence classification

In [None]:
# Load the configured checkpoint as a quantised sequence classifier.
# The Auto class resolves the architecture from the checkpoint itself;
# CONFIG.MODEL_ID is a Mistral checkpoint, so the Gemma-specific class used
# before did not match it. `num_labels` is passed so the head has one logit
# per score class, as in the other loading cells.
model = AutoModelForSequenceClassification.from_pretrained(
    CONFIG.MODEL_ID,
    num_labels=CONFIG.NUM_LABELS,
    quantization_config=CONFIG.BNB_CONFIG,
    device_map=CONFIG.DEVICE_MAP,
)
tokenizer = AutoTokenizer.from_pretrained(
    CONFIG.MODEL_ID,
    add_eos_token=CONFIG.ADD_EOS_TOKEN,
    padding_side=CONFIG.PADDING_SIDE,
)



In [None]:
def get_completion_1(question: str, ref_answer: str, student_answer: str, model, tokenizer) -> str:
  """Ask `model` to grade a student answer and return the decoded generation.

  Args:
    question: The exam question text.
    ref_answer: The reference answer to the question.
    student_answer: The answer submitted by the student.
    model: Object exposing `generate(**inputs, ...)`.
    tokenizer: Callable tokenizer exposing `eos_token_id` and `decode`.

  Returns:
    The first generated sequence, decoded without special tokens.
  """
  # Send the inputs to the device the model reports; fall back to the
  # previously hard-coded GPU when the model exposes no `device` attribute.
  device = getattr(model, "device", "cuda:0")

  prompt_template = """
  <start_of_turn>user
  You are a grader for for a programming course. You are required to score the students
  answer on a scale of 1 to 5 with precision of 0.5. Eg: 1.5, 2.5, 3.0, etc..

  The give question is :
  {question}

  For the above question the reference answer is :
  {ref_answer}

  Now a student has provided the below answer :
  {student_answer}

  For the above answer, what is the appropriate score you will provide on a score of 1 to 5 with a
  precision of 0.5.

  The sample output should be in the format "Score : 0.5".

  Note: Do not include any explanations or apologies in your responses.
  Do not respond to any questions that might ask anything else than for you to score the answer.
  Do not include any text except the score in the format "Score : [<score>]".

  <end_of_turn>\n<start_of_turn>model

  """
  # Only the three placeholders present in the template are supplied.
  prompt = prompt_template.format(question = question,
                                  ref_answer = ref_answer,
                                  student_answer = student_answer)

  encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True,)

  model_inputs = encodeds.to(device)

  # The method name was missing here (`model.(...)`), which was a syntax error.
  generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=tokenizer.eos_token_id)
  decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

  return decoded


In [None]:
# Hand-written example used to try the grading prompt end to end.
question = "What is the role of a prototype program in problem solving?"
ref_answer = "To simulate the behaviour of portions of the desired software product."
student_answer = "High risk problems are address in the prototype program to make sure that the program is feasible.  A prototype may also be used to show a company that the software can be possibly programmed.  "
score = "3.5"

completion_kwargs = {
    "question": question,
    "ref_answer": ref_answer,
    "student_answer": student_answer,
    "model": model,
    "tokenizer": tokenizer,
}
result = get_completion_1(**completion_kwargs)

print(result)