In [2]:
import re
import nltk
from collections import Counter

# Fetch the NLTK data packages this notebook asks for ('punkt' and 'stopwords').
# Each call prints its own status lines, which appear in the cell output.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

#NLTK tokenizer and stopwords list
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import pandas as pd

[nltk_data] Downloading package punkt to C:\Users\Marcus
[nltk_data]     Ong\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Marcus
[nltk_data]     Ong\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# English stop words from the NLTK corpus, held in a set so the membership
# test in process_tokens is a hash lookup rather than a list scan.
swords = {*stopwords.words('english')}

In [4]:
def normalise(ans_string):
    """Lower-case a string and blank out everything that is not a letter or digit.

    Args:
        ans_string: text to normalise; must provide ``.lower()`` (i.e. a str).

    Returns:
        The lower-cased text in which every character outside ``A-Z``,
        ``a-z`` and ``0-9`` has been replaced by a single space. Characters
        are replaced one-for-one, so the length is unchanged and runs of
        punctuation become runs of spaces.
    """
    return re.sub(r'[^A-Za-z0-9]', ' ', ans_string.lower())


In [5]:
def process_tokens(text):
    """Split text into tokens and discard stop words.

    Args:
        text: string to tokenise with NLTK's ``word_tokenize``.

    Returns:
        A list of the tokens, in their original order, that are not members
        of the module-level ``swords`` set. The membership test is an exact
        (case-sensitive) string comparison.
    """
    return [word for word in word_tokenize(text) if word not in swords]


In [6]:

def get_tokens(pred,ground_truth):
    """Convert a predicted answer and its reference answer into token sets.

    Both strings go through the same pipeline: ``normalise`` (lower-case,
    strip non-alphanumerics), ``process_tokens`` (tokenise, drop stop words),
    then de-duplication into a set.

    Args:
        pred: the generated answer text.
        ground_truth: the reference answer text.

    Returns:
        A tuple ``(pred_tokens, truth_tokens)`` of sets of unique tokens.
        Because sets are used, repeated words count once.
    """
    pred_tokens = set(process_tokens(normalise(pred)))
    truth_tokens = set(process_tokens(normalise(ground_truth)))
    return pred_tokens, truth_tokens


In [47]:
def get_conf_mat(pred_tokens,truth_tokens):
    """Count token-level true positives, false positives and false negatives.

    Args:
        pred_tokens: set of unique tokens from the predicted answer.
        truth_tokens: set of unique tokens from the reference answer.

    Returns:
        A tuple ``(tp, fp, fn)`` of ints where
        ``tp`` is the number of tokens present in both sets,
        ``fp`` is the number of predicted tokens absent from the reference,
        ``fn`` is the number of reference tokens absent from the prediction.
    """
    # True positives: tokens shared by prediction and reference
    tp = len(pred_tokens & truth_tokens)
    # False positives: predicted but not in the reference
    fp = len(pred_tokens - truth_tokens)
    # False negatives: in the reference but not predicted
    fn = len(truth_tokens - pred_tokens)

    # No special case for tp == 0: the fp/fn counts are still meaningful then,
    # and precision_score / recall_score / f1_score already yield 0 for zero
    # overlap, so returning the real counts leaves the metrics unchanged.
    return tp, fp, fn

In [53]:
def precision_score(tp,fp):
    """Return tp / (tp + fp), or 0 when that denominator is zero.

    Args:
        tp: number of true positives.
        fp: number of false positives.

    Returns:
        The fraction of predicted items that are correct; the int ``0`` when
        ``tp + fp`` equals zero (nothing was predicted).
    """
    predicted_total = tp + fp
    return tp / predicted_total if predicted_total != 0 else 0

def recall_score(tp,fn):
    """Return tp / (tp + fn), or 0 when that denominator is zero.

    Args:
        tp: number of true positives.
        fn: number of false negatives.

    Returns:
        The fraction of reference items that were recovered; the int ``0``
        when ``tp + fn`` equals zero (the reference is empty).
    """
    reference_total = tp + fn
    return tp / reference_total if reference_total != 0 else 0

def f1_score(precision,recall):
    """Return the harmonic mean of precision and recall.

    Args:
        precision: precision value for one example.
        recall: recall value for one example.

    Returns:
        ``2 * precision * recall / (precision + recall)``, or ``0.0`` when
        either input equals zero (which also avoids dividing by zero when
        both are zero).
    """
    if precision != 0 and recall != 0:
        numerator = 2 * precision * recall
        denominator = precision + recall
        return numerator / denominator
    return 0.0

In [9]:
# Load the saved result tables for the three systems being compared.
# The paths are relative, so they resolve against the kernel's working directory.
# The evaluation cells further down read the 'gen_answer' and 'answers'
# columns of each frame.
adv_rag = pd.read_csv('../../data/MainDataset/results/Official/Advanced_RAG_3.csv')
naive_rag = pd.read_csv('../../data/MainDataset/results/Official/Naive_RAG.csv')
no_rag = pd.read_csv('../../data/MainDataset/results/Official/llama-8b-8192.csv')

In [10]:
# Remove the three 'Unnamed' columns from the Advanced RAG results
# (presumably empty trailing columns from the CSV export — TODO confirm).
# Reassigning with errors='ignore' instead of inplace=True keeps this cell
# re-runnable: a second execution no longer raises KeyError for columns that
# were already removed.
adv_rag = adv_rag.drop(columns=['Unnamed: 4','Unnamed: 5','Unnamed: 6'], errors='ignore')

In [55]:
# Average the per-row token precision / recall / F1 over the Advanced RAG results.
n_rows = len(adv_rag['gen_answer'])
precision = recall = f1 = 0
for generated, reference in zip(adv_rag['gen_answer'], adv_rag['answers']):
    pred_tokens, truth_tokens = get_tokens(generated, reference)
    tp, fp, fn = get_conf_mat(pred_tokens, truth_tokens)
    # Score the row once and reuse the values for the F1 term
    row_precision = precision_score(tp, fp)
    row_recall = recall_score(tp, fn)
    precision += row_precision
    recall += row_recall
    f1 += f1_score(row_precision, row_recall)

print("Advanced RAG Metrics")
print(f"Precision : {precision/n_rows:.3f}")
print(f"Recall : {recall/n_rows:.3f}")
print(f"F1 Score : {f1/n_rows:.3f}")


Advanced RAG Metrics
Precision : 0.386
Recall : 0.696
F1 Score : 0.442


In [57]:
# Average the per-row token precision / recall / F1 over the Naive RAG results.
n_rows = len(naive_rag['gen_answer'])
precision = recall = f1 = 0
for generated, reference in zip(naive_rag['gen_answer'], naive_rag['answers']):
    pred_tokens, truth_tokens = get_tokens(generated, reference)
    tp, fp, fn = get_conf_mat(pred_tokens, truth_tokens)
    # Score the row once and reuse the values for the F1 term
    row_precision = precision_score(tp, fp)
    row_recall = recall_score(tp, fn)
    precision += row_precision
    recall += row_recall
    f1 += f1_score(row_precision, row_recall)

print("Naive RAG Metrics")
print(f"Precision : {precision/n_rows:.3f}")
print(f"Recall : {recall/n_rows:.3f}")
print(f"F1 Score : {f1/n_rows:.3f}")


Naive RAG Metrics
Precision : 0.359
Recall : 0.692
F1 Score : 0.416


In [56]:
# Average the per-row token precision / recall / F1 over the no-RAG baseline results.
n_rows = len(no_rag['gen_answer'])
precision = recall = f1 = 0
for generated, reference in zip(no_rag['gen_answer'], no_rag['answers']):
    pred_tokens, truth_tokens = get_tokens(generated, reference)
    tp, fp, fn = get_conf_mat(pred_tokens, truth_tokens)
    # Score the row once and reuse the values for the F1 term
    row_precision = precision_score(tp, fp)
    row_recall = recall_score(tp, fn)
    precision += row_precision
    recall += row_recall
    f1 += f1_score(row_precision, row_recall)

print("No RAG Metrics")
print(f"Precision : {precision/n_rows:.3f}")
print(f"Recall : {recall/n_rows:.3f}")
print(f"F1 Score : {f1/n_rows:.3f}")


No RAG Metrics
Precision : 0.239
Recall : 0.396
F1 Score : 0.266
