# Imports
*Importing neccessary libraries*

In [1]:
import os
import dotenv
hugging_face_api_token = os.getenv("HUGGINGFACE_API_TOKEN")
from langchain.llms import HuggingFaceHub
from datasets import load_metric
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
import pandas as pd
import numpy as np


# Initialising the LLMs
- codellama/CodeLlama-7b-hf
- mistralai/Mistral-7B-v0.1
- h2oai/h2o-danube-1.8b-chat
- HuggingFaceH4/zephyr-7b-beta

In [2]:
llm1 = HuggingFaceHub(huggingfacehub_api_token=hugging_face_api_token,
                     repo_id = "codellama/CodeLlama-7b-hf",
                     model_kwargs = {"temperature" : 0.7,"max_length": 150}
)
llm2 = HuggingFaceHub(huggingfacehub_api_token=hugging_face_api_token,
                     repo_id = "mistralai/Mistral-7B-v0.1",
                     model_kwargs = {"temperature" : 0.7,"max_length": 150}
)
llm3 = HuggingFaceHub(huggingfacehub_api_token=hugging_face_api_token,
                     repo_id = "h2oai/h2o-danube-1.8b-chat",
                     model_kwargs = {"temperature" : 0.7,"max_length": 150}
)
llm4 = HuggingFaceHub(huggingfacehub_api_token=hugging_face_api_token,
                     repo_id = "HuggingFaceH4/zephyr-7b-beta",
                     model_kwargs = {"temperature" : 0.7,"max_length": 150}
)

llm=[llm1,llm2,llm3,llm4]

# Evaluation Dataset
*Creating a sample dataset to evaluate the LLMs*

In [3]:
evaluation_dataset = [    
    {"prompt": "Discuss the importance of early childhood education.", "expected_output": "Early childhood education lays the foundation for lifelong learning and development."},
    {"prompt": "Explain the concept of differentiation in education.", "expected_output": "Differentiation in education involves adapting instruction to meet the diverse needs of students."},
    {"prompt": "Discuss the principles of democracy and their significance in modern politics.", "expected_output": "Democracy is a system of government based on the principles of popular sovereignty, political equality, and majority rule, with protections for minority rights."},
    {"prompt": "Describe the rules and objective of soccer.", "expected_output": "Soccer is a team sport played between two teams of eleven players each, with the objective of scoring goals by kicking the ball into the opposing team's goal."},
    {"prompt": "Discuss the health benefits of regular physical activity and sports participation.", "expected_output": "Regular physical activity and sports participation promote cardiovascular health, strength, endurance, and overall well-being."}]

# Calculating Scores
*Calculating and creating a dataframe of scores*
*Evaluation parameters:*
- Bert Score (precision, recall, f1 score)
- Bleu Score

In [4]:
df = pd.DataFrame(columns=["Prompt","Model", "Precision", "Recall", "F1_Score", "Bleu_Score"])
bertscore_metric = load_metric('bertscore')
j=0
i=0
for e in evaluation_dataset:
    for l,model in enumerate(llm,start=1):  
        prompt = e["prompt"]        
        expected_output = e["expected_output"]
        generated_text = model(prompt)
        bleu_score = sentence_bleu([expected_output.split()], generated_text.split(), smoothing_function=SmoothingFunction().method1)
        bert_scores = bertscore_metric.compute(predictions=[generated_text], references=[expected_output], lang="en")
        precision = bert_scores['precision'][0]
        recall = bert_scores['recall'][0]
        f1 = bert_scores['f1'][0]
        df.loc[j] = ["Prompt {}".format(i+1),"LLM {}".format(l), precision, recall, f1, bleu_score]
        j+=1
    i+=1

  bertscore_metric = load_metric('bertscore')



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
binary_path: c:\Python311\Lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll
CUDA SETUP: Loading binary c:\Python311\Lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
df

Unnamed: 0,Prompt,Model,Precision,Recall,F1_Score,Bleu_Score
0,Prompt 1,LLM 1,0.81959,0.871123,0.844571,0.005771
1,Prompt 1,LLM 2,0.81848,0.879037,0.847679,0.005587
2,Prompt 1,LLM 3,0.842658,0.919741,0.879514,0.011076
3,Prompt 1,LLM 4,0.84872,0.930715,0.887828,0.022944
4,Prompt 2,LLM 1,0.819763,0.868085,0.843232,0.003568
5,Prompt 2,LLM 2,0.847417,0.92505,0.884534,0.006
6,Prompt 2,LLM 3,0.842026,0.936841,0.886906,0.017695
7,Prompt 2,LLM 4,0.859235,0.922353,0.889676,0.009524
8,Prompt 3,LLM 1,0.801087,0.830884,0.815713,0.004625
9,Prompt 3,LLM 2,0.861482,0.90573,0.883052,0.017144


# Topsis 
*Weighted normalisation*

In [6]:

def wt_norm(df,weights):
  
  df = df.div(df.apply(lambda x: x*2).apply(sum).apply(lambda x: x*0.5))
  df = df.mul(weights)
  return df

*Topsis Function Definition*

In [7]:

def topsis(input,weights, impacts):
    df=input.drop(['Prompt','Model'],axis=1)
    df = wt_norm(df,weights)
    b=[]
    w=[]
    for i in range(len(impacts)):
        if impacts[i]=='+':
            b.append(max(df.iloc[:,i]))
            w.append(min(df.iloc[:,i]))
        else:
            b.append(min(df.iloc[:,i]))
            w.append(max(df.iloc[:,i]))

    # Calculating performance
    s_best=[]
    s_worst=[]
    for index, row in df.iterrows():
        s_best.append((sum((row - b) ** 2) ** 0.5) * 0.5)
        s_worst.append((sum((row - w) ** 2) ** 0.5) * 0.5)
    s_total = [i + j for i, j in zip(s_worst, s_best)]
    performance = [i / j if j != 0 else 0 for i, j in zip(s_worst, s_total)]
    df['Topsis Score'] = performance
    
    # Ranking
    sorted_array = df.loc[:,'Topsis Score'].argsort()
    ranks = np.empty_like(sorted_array)
    ranks[sorted_array] = np.arange(len(sorted_array))
    n=len(sorted_array)
    ranks = [n-i for i in ranks]
    df.loc[:,'Rank'] = ranks
    
    return df


*Inputs for Topsis*

In [8]:
inputs = [df[0:4 ], df[4:8], df[8:12], df[12:16], df[16:20]]
weights = [1,1,1,1]
impacts = ["+", "+", "+", "+"]

# Output
*Ranking the models  based on their performance score calculated using Topsis*

In [9]:
for i in range(5):
    print("Prompt ",i+1,":",evaluation_dataset[i]['prompt'])
    res = topsis(inputs[i], weights, impacts)
    
    print(res)
    print("\n")

Prompt  1 : Discuss the importance of early childhood education.
   Precision    Recall  F1_Score  Bleu_Score  Topsis Score  Rank
0   0.246164  0.241937  0.244125    0.127179      0.010628     3
1   0.245831  0.244135  0.245023    0.123120      0.006161     4
2   0.253092  0.255440  0.254225    0.244090      0.318706     2
3   0.254913  0.258488  0.256628    0.505612      1.000000     1


Prompt  2 : Explain the concept of differentiation in education.
   Precision    Recall  F1_Score  Bleu_Score  Topsis Score  Rank
4   0.243366  0.237680  0.240625    0.096982      0.000000     4
5   0.251576  0.253277  0.252410    0.163093      0.179213     3
6   0.249975  0.256505  0.253087    0.481029      0.986742     1
7   0.255084  0.252538  0.253878    0.258896      0.424021     2


Prompt  3 : Discuss the principles of democracy and their significance in modern politics.
    Precision    Recall  F1_Score  Bleu_Score  Topsis Score  Rank
8    0.238038  0.236913  0.237495    0.051687      0.000000

| Domain | Best Model | Model Name |
|-----------------|-----------------|-----------------|
| Education    | M4,M3   |  h2oai/h2o-danube-1.8b-chat , HuggingFaceH4/zephyr-7b-beta    |
| Sports    | M4,M3    |  h2oai/h2o-danube-1.8b-chat , HuggingFaceH4/zephyr-7b-beta   |
| Politics    | M3    |  h2oai/h2o-danube-1.8b-chat   |