In [2]:
import re
import nltk

#download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

#NLTK tokenizer and stopwords list
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import pandas as pd

[nltk_data] Downloading package punkt to C:\Users\Marcus
[nltk_data]     Ong\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Marcus
[nltk_data]     Ong\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
swords = set(stopwords.words('english'))

In [4]:
def normalise(text):
    # Lower string cases while removing punctuation using ReGex
    ans_text = text.lower()
    ans_text = re.sub(r'[^A-Za-z0-9]', ' ', text)
    return ans_text


In [5]:
def process_tokens(text):
    # Tokenise using NLTK and swords
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in swords]
    return tokens


In [6]:

def get_tokens(pred,ground_truth):
    #Normalise text
    pred = normalise(pred)
    ground_truth = normalise(ground_truth)

    # Tokenize and remove stopwords using NLTK
    pred = process_tokens(pred)
    ground_truth = process_tokens(ground_truth)

    # Count tokens
    pred_tokens = set(pred)
    truth_tokens = set(ground_truth)

    return pred_tokens, truth_tokens


In [7]:
def get_common_tokens(pred_tokens,truth_tokens):
    #Get tokens in common
    common_tokens = pred_tokens.intersection(truth_tokens)
    return common_tokens

In [8]:
def precision_score(common_tokens,pred_tokens):
    return len(common_tokens) / len(pred_tokens)

def recall_score(common_tokens,truth_tokens):
    return len(common_tokens) / len(truth_tokens)

def f1_score(precision,recall):
    if precision + recall == 0:
        return 0
    return (2 * precision * recall) / (precision + recall)

In [9]:
adv_rag = pd.read_csv('../../data/MainDataset/results/Official/Advanced_RAG.csv')
naive_rag = pd.read_csv('../../data/MainDataset/results/Official/Naive_RAG.csv')
no_rag = pd.read_csv('../../data/MainDataset/results/Official/llama-8b-8192.csv')

In [19]:
df = pd.read_csv('../../data/Metrics/token_eval.csv')

In [24]:
gemma = pd.read_csv('../../data/MainDataset/results/Official/gemma2-9b-it.csv')
gpt = pd.read_csv('../../data/MainDataset/results/Official/GPT4oMini.csv')
mixtral = pd.read_csv('../../data/MainDataset/results/Official/mixtral-8x7b.csv')

In [20]:
total_precision = 0
total_recall = 0
total_f1 = 0
for index, row in gemma.iterrows(): 
    pred_tokens, truth_tokens = get_tokens(row['gen_answer'],row['answers'])
    common_tokens = get_common_tokens(pred_tokens,truth_tokens)
    total_precision += precision_score(common_tokens,pred_tokens)
    total_recall += recall_score(common_tokens,truth_tokens)
    total_f1 += f1_score(precision_score(common_tokens,pred_tokens),recall_score(common_tokens,truth_tokens))
 
 
print("Advanced RAG Metrics")
print(f"Precision : {total_precision/len(adv_rag['gen_answer']):.3f}")
print(f"Recall : {total_recall/len(adv_rag['gen_answer']):.3f}")
print(f"F1 Score : {total_f1/len(adv_rag['gen_answer']):.3f}")

df.loc[len(df)] = ['Gemma2-9B-it',total_precision/len(gemma['gen_answer']),total_recall/len(gemma['gen_answer']),total_f1/len(gemma['gen_answer'])]

Advanced RAG Metrics
Precision : 0.232
Recall : 0.387
F1 Score : 0.257


In [None]:
total_precision = 0
total_recall = 0
total_f1 = 0
for index, row in gemma.iterrows(): 
    pred_tokens, truth_tokens = get_tokens(row['gen_answer'],row['answers'])
    common_tokens = get_common_tokens(pred_tokens,truth_tokens)
    total_precision += precision_score(common_tokens,pred_tokens)
    total_recall += recall_score(common_tokens,truth_tokens)
    total_f1 += f1_score(precision_score(common_tokens,pred_tokens),recall_score(common_tokens,truth_tokens))
 
 
print("Advanced RAG Metrics")
print(f"Precision : {total_precision/len(adv_rag['gen_answer']):.3f}")
print(f"Recall : {total_recall/len(adv_rag['gen_answer']):.3f}")
print(f"F1 Score : {total_f1/len(adv_rag['gen_answer']):.3f}")

df.loc[len(df)] = ['Gemma2-9B-it',total_precision/len(gemma['gen_answer']),total_recall/len(gemma['gen_answer']),total_f1/len(gemma['gen_answer'])]

In [25]:
total_precision = 0
total_recall = 0
total_f1 = 0
for index, row in gpt.iterrows(): 
    pred_tokens, truth_tokens = get_tokens(row['gen_answer'],row['answers'])
    common_tokens = get_common_tokens(pred_tokens,truth_tokens)
    total_precision += precision_score(common_tokens,pred_tokens)
    total_recall += recall_score(common_tokens,truth_tokens)
    total_f1 += f1_score(precision_score(common_tokens,pred_tokens),recall_score(common_tokens,truth_tokens))
 
 
print("Advanced RAG Metrics")
print(f"Precision : {total_precision/len(adv_rag['gen_answer']):.3f}")
print(f"Recall : {total_recall/len(adv_rag['gen_answer']):.3f}")
print(f"F1 Score : {total_f1/len(adv_rag['gen_answer']):.3f}")

df.loc[len(df)] = ['GPT4o-mini',total_precision/len(gpt['gen_answer']),total_recall/len(gpt['gen_answer']),total_f1/len(gpt['gen_answer'])]

Advanced RAG Metrics
Precision : 0.130
Recall : 0.438
F1 Score : 0.182


In [26]:
total_precision = 0
total_recall = 0
total_f1 = 0
for index, row in mixtral.iterrows(): 
    pred_tokens, truth_tokens = get_tokens(row['gen_answer'],row['answers'])
    common_tokens = get_common_tokens(pred_tokens,truth_tokens)
    total_precision += precision_score(common_tokens,pred_tokens)
    total_recall += recall_score(common_tokens,truth_tokens)
    total_f1 += f1_score(precision_score(common_tokens,pred_tokens),recall_score(common_tokens,truth_tokens))
 
 
print("Advanced RAG Metrics")
print(f"Precision : {total_precision/len(adv_rag['gen_answer']):.3f}")
print(f"Recall : {total_recall/len(adv_rag['gen_answer']):.3f}")
print(f"F1 Score : {total_f1/len(adv_rag['gen_answer']):.3f}")

df.loc[len(df)] = ['mixtral-8x7b',total_precision/len(mixtral['gen_answer']),total_recall/len(mixtral['gen_answer']),total_f1/len(mixtral['gen_answer'])]

Advanced RAG Metrics
Precision : 0.077
Recall : 0.442
F1 Score : 0.117


In [None]:
df.to_csv('../../data/MainDataset/Metrics/token_eval.csv')

In [None]:
total_precision = 0
total_recall = 0
total_f1 = 0
for index, row in gemma.iterrows(): 
    pred_tokens, truth_tokens = get_tokens(row['gen_answer'],row['answers'])
    common_tokens = get_common_tokens(pred_tokens,truth_tokens)
    total_precision += precision_score(common_tokens,pred_tokens)
    total_recall += recall_score(common_tokens,truth_tokens)
    total_f1 += f1_score(precision_score(common_tokens,pred_tokens),recall_score(common_tokens,truth_tokens))
 
 
print("Advanced RAG Metrics")
print(f"Precision : {total_precision/len(adv_rag['gen_answer']):.3f}")
print(f"Recall : {total_recall/len(adv_rag['gen_answer']):.3f}")
print(f"F1 Score : {total_f1/len(adv_rag['gen_answer']):.3f}")

df.loc[len(df)] = ['Gemma2-9B-it',total_precision/len(gemma['gen_answer']),total_recall/len(gemma['gen_answer']),total_f1/len(gemma['gen_answer'])]

In [21]:
df

Unnamed: 0,method,precision,recall,f1_score
0,Advanced RAG,0.386138,0.695733,0.44202
1,Naive RAG,0.359226,0.692159,0.41631
2,No RAG,0.239094,0.396227,0.266087
3,Gemma2-9B-it,0.231763,0.387118,0.257008


In [11]:
total_precision = 0
total_recall = 0
total_f1 = 0
for index, row in adv_rag.iterrows(): 
    pred_tokens, truth_tokens = get_tokens(row['gen_answer'],row['answers'])
    common_tokens = get_common_tokens(pred_tokens,truth_tokens)
    total_precision += precision_score(common_tokens,pred_tokens)
    total_recall += recall_score(common_tokens,truth_tokens)
    total_f1 += f1_score(precision_score(common_tokens,pred_tokens),recall_score(common_tokens,truth_tokens))
 
 
print("Advanced RAG Metrics")
print(f"Precision : {total_precision/len(adv_rag['gen_answer']):.3f}")
print(f"Recall : {total_recall/len(adv_rag['gen_answer']):.3f}")
print(f"F1 Score : {total_f1/len(adv_rag['gen_answer']):.3f}")


Advanced RAG Metrics
Precision : 0.351
Recall : 0.662
F1 Score : 0.411


In [12]:
total_precision = 0
total_recall = 0
total_f1 = 0
for index, row in naive_rag.iterrows(): 
    pred_tokens, truth_tokens = get_tokens(row['gen_answer'],row['answers'])
    common_tokens = get_common_tokens(pred_tokens,truth_tokens)
    total_precision += precision_score(common_tokens,pred_tokens)
    total_recall += recall_score(common_tokens,truth_tokens)
    total_f1 += f1_score(precision_score(common_tokens,pred_tokens),recall_score(common_tokens,truth_tokens))
 
 
print("Naive RAG Metrics")
print(f"Precision : {total_precision/len(adv_rag['gen_answer']):.3f}")
print(f"Recall : {total_recall/len(adv_rag['gen_answer']):.3f}")
print(f"F1 Score : {total_f1/len(adv_rag['gen_answer']):.3f}")


Naive RAG Metrics
Precision : 0.332
Recall : 0.631
F1 Score : 0.383


In [13]:
total_precision = 0
total_recall = 0
total_f1 = 0
for index, row in no_rag.iterrows(): 
    pred_tokens, truth_tokens = get_tokens(row['gen_answer'],row['answers'])
    common_tokens = get_common_tokens(pred_tokens,truth_tokens)
    total_precision += precision_score(common_tokens,pred_tokens)
    total_recall += recall_score(common_tokens,truth_tokens)
    total_f1 += f1_score(precision_score(common_tokens,pred_tokens),recall_score(common_tokens,truth_tokens))
 
 
print("Non RAG Metrics")
print(f"Precision : {total_precision/len(adv_rag['gen_answer']):.3f}")
print(f"Recall : {total_recall/len(adv_rag['gen_answer']):.3f}")
print(f"F1 Score : {total_f1/len(adv_rag['gen_answer']):.3f}")


Non RAG Metrics
Precision : 0.212
Recall : 0.340
F1 Score : 0.234
