In [None]:
import evaluate
import json
from tqdm import tqdm
from prometheus_eval import PrometheusEval
from prometheus_eval.litellm import LiteLLM
from prometheus_eval.prompts import ABSOLUTE_PROMPT, SCORE_RUBRIC_TEMPLATE
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

'from datasets import load_dataset\nfrom transformers import (\n    AutoModelForCausalLM,\n    AutoTokenizer,\n    BitsAndBytesConfig,\n    HfArgumentParser,\n    TrainingArguments,\n    Trainer,\n    pipeline,\n    logging,\n)\nfrom peft import LoraConfig, PeftModel\nfrom trl import SFTTrainer'

In [57]:
name = 'Llama-2-7b-hf-M1-D2'
#name = 'Mistral-7B-Instruct-v0.3-M2-D1'
#name = 'StructLM-7B-M1-D2'

folder = 'Llama-2-7b-hf'
#folder = 'Mistral-7B-Instruct-v0.3'
#folder = 'StructLm-7B'

with open(f"./Outputs/{folder}/{name}.json", 'r') as f:
    data = json.load(f)

In [58]:
def evaluating(org, pred, metric_, value_):
    dict_metric = {}
    metric = evaluate.load(metric_)
    metric_per_review = 0
    idx = 0
    if "M1-D1" in name or 'raw' in name:
        for i in tqdm(range(len(org))):
            review_org = org[i]
            review_pred = pred[i]
            try:
                for key, val in review_org.items():
                    if key not in dict_metric:
                        dict_metric[key] = [0, 0] 
                    val_pred = [str(review_pred[key])]
                    if metric_ == 'bleu':        
                        score = metric.compute(references=[str(val)], predictions=val_pred, max_order=2, smooth=True)[value_]
                    else:
                        score = metric.compute(references=[str(val)], predictions=val_pred)[value_]
                    dict_metric[key][0] += score
                    dict_metric[key][1] += 1
                    idx += 1
            except:
                idx += 1
        sum = 0
        rest = 0
        for key in dict_metric.keys():
            if dict_metric[key][1] != 0:
                dict_metric[key][0] = round((dict_metric[key][0]/dict_metric[key][1])*100, 2)
            else:
                rest += 1
            sum += dict_metric[key][0]
        if len(dict_metric.keys()) - rest == 0:

            dict_metric["Total"] = [0]
        dict_metric["Total"] = [round(sum/(len(dict_metric.keys()) - rest), 2), len(dict_metric.keys()) - rest]
        return dict_metric
    else:
        score = 0
        for i in tqdm(range(len(org))):
            if metric_ == 'bleu':        
                score += metric.compute(references=[str(org[i])], predictions=[str(pred[i])], max_order=2, smooth=True)[value_]
            else:
                score += metric.compute(references=[str(org[i])], predictions=[str(pred[i])])[value_]
        
        dict_metric["Total"] = [round(score/(len(org)) * 100, 2), len(org)]
        return dict_metric

In [59]:
Original = data['Original']
Pred = data['Prediction']
bleu = evaluating(Original, Pred, 'bleu', 'bleu')
rougeL = evaluating(Original, Pred, "rouge", "rougeL")
rouge1 = evaluating(Original, Pred, "rouge", "rouge1")
rouge2 = evaluating(Original, Pred, "rouge", "rouge2")
rougeLsum = evaluating(Original, Pred, "rouge", "rougeLsum")
meteor = evaluating(Original, Pred, "meteor", "meteor")

100%|██████████| 677/677 [00:02<00:00, 305.48it/s]
100%|██████████| 677/677 [00:37<00:00, 18.08it/s]
100%|██████████| 677/677 [00:38<00:00, 17.79it/s]
100%|██████████| 677/677 [00:38<00:00, 17.75it/s]
100%|██████████| 677/677 [00:37<00:00, 17.92it/s]
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lagg1\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lagg1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\lagg1\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
100%|██████████| 677/677 [00:03<00:00, 188.27it/s]


In [60]:
print(f"BLEU: {bleu}")
print(f"ROUGE-L: {rougeL}")
print(f"ROUGE-1: {rouge1}")
print(f"ROUGE-2: {rouge2}")
print(f"ROUGE-Lsum: {rougeLsum}")
print(f"METEOR: {meteor}")


BLEU: {'Total': [6.5, 677]}
ROUGE-L: {'Total': [16.59, 677]}
ROUGE-1: {'Total': [22.77, 677]}
ROUGE-2: {'Total': [7.79, 677]}
ROUGE-Lsum: {'Total': [16.59, 677]}
METEOR: {'Total': [14.07, 677]}


In [None]:
"""import json

# Open and load the JSON file
with open(f"./metrics/{folder}/{name}.json", 'r') as f:
    data = json.load(f)

data['bleu'] = bleu
data['rougeL'] = rougeL
data['rouge1'] = rouge1
data['rouge2'] = rouge2
data['rougeLsum'] = rougeLsum
data['meteor'] = meteor
"""

In [None]:
"""with open(f"./metrics/{folder}/{name}.json", 'w+') as f:
    json.dump(data, f, indent=4 ,ensure_ascii=False)"""

In [8]:
model_name = "prometheus-eval/prometheus-7b-v2.0"

# Activate 8-bit precision base model loading
use_8bit = True

# Compute dtype for 4-bit base models
bnb_8bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

# Load the entire model on the GPU 0
device_map = {"": 0}

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_8bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_8bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_8bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training
tokenizer.pad_token_id = tokenizer.eos_token_id  # Set pad_token_id to eos_token_id

Your GPU supports bfloat16: accelerate training with bf16=True


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [9]:
def corectness(Prompt, Predicted, Original):
    instruction = f"""Your task is to evaluate the generated answer and reference answer for the query: {Prompt}"""
    response = f"""{Predicted}"""
    reference_answer = f"""{Original}"""
    rubric = {
            "criteria": "Is the model proficient in generate a coherence response",
            "score1_description": "If the generated answer is not matching with any of the reference answers.",
            "score2_description": "If the generated answer is according to reference answer but not relevant to user query.",
            "score3_description": "If the generated answer is relevant to the user query and reference answer but contains mistakes.",
    		"score4_description": "If the generated answer is relevant to the user query and has the exact same metrics as the reference answer, but it is not as concise.",
            "score5_description": "If the generated answer is relevant to the user query and fully correct according to the reference answer."}
    #https://github.com/prometheus-eval/prometheus-eval

    ABS_SYSTEM_PROMPT = "You are a fair judge assistant tasked with providing clear, objective feedback based on specific criteria, ensuring each assessment reflects the absolute standards set for performance."

    ABSOLUTE_PROMPT = f"""###Task Description:
    An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
    1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
    2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
    3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"
    4. Please do not generate any other opening, closing, and explanations.

    ###The instruction to evaluate:
    {instruction}

    ###Response to evaluate:
    {response}

    ###Reference Answer (Score 5):
    {reference_answer}

    ###Score Rubrics:
    {rubric}

    ###Feedback: """

    user_content = ABS_SYSTEM_PROMPT + "\n\n" + ABSOLUTE_PROMPT # Fill the prompt with your data

    messages = [
        {"role": "user", "content": user_content},
    ]

    encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

    model_inputs = encodeds.to("cuda")


    generated_ids = model.generate(model_inputs, max_new_tokens=4000, do_sample=True, pad_token_id=tokenizer.eos_token_id)
    decoded = tokenizer.batch_decode(generated_ids)
    return decoded[0]

In [10]:
def faithfullness(Prompt, Predicted, Original):
    instruction = f"""Your task is to evaluate if the Generate answer has information from the context and also from the Existing answer."""
    response = f"""{Predicted}"""
    reference_answer = f"""{Original}"""
    rubric = {
            "score1_description": "If the generated answer is not matching with any of the reference answers and also not having information from the context.",
            "score2_description": "If the generated answer is having information from the context but not from existing answer and also have some irrelevant information.",
            "score3_description": "If the generated answer is having relevant information from the context and some information from existing answer but have additional information that do not exist in context and also do not in existing answer.",
    		"score4_description": "If the generated answer is having relevant information from the context and some information from existing answer.",
            "score5_description": "If the generated answer is matching with the existing answer and also having information from the context."}
    #https://github.com/prometheus-eval/prometheus-eval

    ABS_SYSTEM_PROMPT = "You are a fair judge assistant tasked with providing clear, objective feedback based on specific criteria, ensuring each assessment reflects the absolute standards set for performance."

    ABSOLUTE_PROMPT = f"""###Task Description:
    An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
    1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
    2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
    3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"
    4. Please do not generate any other opening, closing, and explanations.
    5. Only evaluate on common things between generated answer and reference answer. Don't evaluate on things which are present in reference answer but not in generated answer.

    ###The instruction to evaluate:
    {instruction}

    ###Context:
    {Prompt}

    ###Existing answer (Score 5):
    {reference_answer}

    ###Generate answer to evaluate:
    {response}

    ###Score Rubrics:
    {rubric}

    ###Feedback: """

    user_content = ABS_SYSTEM_PROMPT + "\n\n" + ABSOLUTE_PROMPT # Fill the prompt with your data

    messages = [
        {"role": "user", "content": user_content},
    ]

    encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

    model_inputs = encodeds.to("cuda")


    generated_ids = model.generate(model_inputs, max_new_tokens=4000, do_sample=True, pad_token_id=tokenizer.eos_token_id)
    decoded = tokenizer.batch_decode(generated_ids)
    return decoded[0]

In [11]:
def fluency(Prompt, Predicted, Original):
    instruction = f"""Evaluate the fluency of the generated JSON answer."""
    response = json.dumps(Predicted, indent=2)
    reference_answer = json.dumps(Original, indent=2)
    rubric = {
            "score1_description": "The generated JSON answer is not fluent and is difficult to understand.",
            "score2_description": "The generated JSON answer has several grammatical errors and awkward phrasing.",
            "score3_description": "The generated JSON answer is mostly fluent but contains some grammatical errors or awkward phrasing.",
            "score4_description": "The generated JSON answer is fluent with minor grammatical errors or awkward phrasing.",
            "score5_description": "The generated JSON answer is perfectly fluent with no grammatical errors or awkward phrasing."}
    #https://github.com/prometheus-eval/prometheus-eval

    ABS_SYSTEM_PROMPT = "You are a fair judge assistant tasked with providing clear, objective feedback based on specific criteria, ensuring each assessment reflects the absolute standards set for performance."

    ABSOLUTE_PROMPT = f"""###Task Description:
    An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
    1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
    2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
    3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"
    4. Please do not generate any other opening, closing, and explanations.

    ###The instruction to evaluate:
    {instruction}

    ###Response to evaluate:
    {response}

    ###Reference Answer (Score 5):
    {reference_answer}

    ###Score Rubrics:
    {rubric}

    ###Feedback: """

    user_content = ABS_SYSTEM_PROMPT + "\n\n" + ABSOLUTE_PROMPT # Fill the prompt with your data

    messages = [
        {"role": "user", "content": user_content},
    ]

    encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

    model_inputs = encodeds.to("cuda")

    generated_ids = model.generate(model_inputs, max_new_tokens=4000, do_sample=True, pad_token_id=tokenizer.eos_token_id)
    decoded = tokenizer.batch_decode(generated_ids)
    return decoded[0]


In [12]:
results = []
for i in tqdm(range(len(data["Original"]))):
    Prompt = data["Prompt"][i]
    Pred = data["Prediction"][i]
    Org = data["Original"][i]
    resp = corectness(Prompt=Prompt, Predicted=Pred, Original=Org)
    results.append(resp)

  0%|          | 0/2294 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 2294/2294 [3:03:06<00:00,  4.79s/it]  


In [13]:
results_faithfullness = []
for i in tqdm(range(len(data["Original"]))):
    Prompt = data["Prompt"][i]
    Pred = data["Prediction"][i]
    Org = data["Original"][i]
    resp = faithfullness(Prompt=Prompt, Predicted=Pred, Original=Org)
    results_faithfullness.append(resp)

100%|██████████| 2294/2294 [3:15:41<00:00,  5.12s/it]  


In [14]:
results_fluency = []
for i in tqdm(range(len(data["Original"]))):
    Prompt = data["Prompt"][i]
    Pred = data["Prediction"][i]
    Org = data["Original"][i]
    resp = corectness(Prompt=Prompt, Predicted=Pred, Original=Org)
    results_fluency.append(resp)

100%|██████████| 2294/2294 [3:05:39<00:00,  4.86s/it]  


In [15]:
results_ = []
for i in results:
    try:
        val = i.split("[RESULT] ")
        val = val[2]
        results_.append(int(val[0]))
    except:
        pass

results_faithfullness_ = []
for i in results_faithfullness:
    try:
        val = i.split("[RESULT] ")
        val = val[2]
        results_faithfullness_.append(int(val[0]))
    except:
        pass
results_fluency_ = []
for i in results_fluency:
    try:
        val = i.split("[RESULT] ")
        val = val[2]
        results_fluency_.append(int(val[0]))
    except:
        pass

In [16]:
metrics = {
    "bleu" : bleu,
    "rouge1" : rouge1,
    "rouge2" : rouge2,
    "rougeL" : rougeL,
    "rougeLsum" : rougeLsum,
    "meteor" : meteor,
    "correctness": ((sum(results_)/len(results_)) * 100)/5,
    "faithfullness": ((sum(results_faithfullness_)/len(results_faithfullness_)) * 100)/5,
    "fluency": ((sum(results_fluency_)/len(results_fluency_)) * 100)/5
}

In [17]:
print(metrics["correctness"])

73.07121013543032


In [18]:
with open(f"./metrics/{folder}/{name}.json", 'w+') as f:
    json.dump(metrics, f, indent=4 ,ensure_ascii=False)