In [2]:
from nltk.translate.bleu_score import sentence_bleu
from nltk import word_tokenize
import json
from utils import write_jsonl


In [16]:
def calculate_bleu(candidate, reference):
    '''
    candidate, reference: generated and ground-truth sentences
    '''
    weights = [
         (1./2., 1./2.),
         (1./3., 1./3., 1./3.),
         (1./4., 1./4., 1./4., 1./4.)
    ]
    reference = reference.strip().split()
    candidate = candidate.strip().split()
    score = sentence_bleu([reference], candidate, weights=weights)
    return score

In [22]:
llama_output_path = '/Users/raunaksood/Desktop/RAG/deepseek7B_results_bleu.jsonl'
save_path = '/Users/raunaksood/Desktop/RAG/deepseek7B_results_bleu_NEW.jsonl'
with open(llama_output_path) as f:
    llama_outputs = [json.loads(line) for line in f]

responses = [llama_outputs[i]['response'] for i in range(len(llama_outputs))]
answers = [llama_outputs[i]['answer'] for i in range(len(llama_outputs))]
questions = [llama_outputs[i]['question'] for i in range(len(llama_outputs))]
bleus = [llama_outputs[i]['bleu'] for i in range(len(llama_outputs))]

res = []
for i in range(len(responses)):
    question = questions[i]
    response = responses[i]
    answer = answers[i]
    bleu = bleus[i]
    match = response == answer
    res.append({'question' : question, 'response' : response, 'answer' : answer, 'bleu' : bleu, 'match' : match})

    
write_jsonl(save_path, res)

# F-1 score

In [None]:
# following this paper: SQuAD: 100,000+ Questions for Machine Comprehension of Text
#  P = num of correct tokens in predicted answer/ total tokens in predicted anser
#  R = num of correct tokens in predicted answer/ total tokens in correct answer
# F1 score = 2 * (P*R)/(P+R)

# Approach 1 - sliding window (lower f1 score) - DONE
# Approach 2 - logistic regression (higher f1 score)


In [18]:
# Approach 1 - sliding window (lower f1 score)

import json
import re
import os

def tokenize(text):
    text = text.lower()
    text = re.sub(r'(\w)-(\w)', r'\1\2', text)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    tokens = text.split()
    return tokens

def f1_score_per_question(true_ans, pred_ans):
    common = len(set(true_ans) & set(pred_ans))
    P = common / len(pred_ans) if len(pred_ans) > 0 else 0
    R = common / len(true_ans) if len(true_ans) > 0 else 0
    if P + R == 0:
        return 0, common, len(pred_ans), len(true_ans)
    f1_score = 2 * (P * R) / (P + R)
    return f1_score, common, len(pred_ans), len(true_ans)

def calculate_f1_for_file(file_path):
    total_f1_score = 0
    num_questions = 0

    with open(file_path, 'r') as file:
        # print(f"json file name: {file_path}\n")  
        for line in file:
            data = json.loads(line)
            correct_answer = data['answer']
            predicted_response = data['response']
            correct_answer_tokens = tokenize(correct_answer)
            response_tokens = tokenize(predicted_response)
            f1, overlap, total_predicted, total_correct = f1_score_per_question(correct_answer_tokens, response_tokens)
            total_f1_score += f1
            num_questions += 1
            # print(f"Question: {data['question']}")
            # print(f"Correct answer: {correct_answer_tokens}")
            # print(f"Predicted anwer: {response_tokens}")
            # print(f"Correct tokens in predicted answer: {overlap}")
            # print(f"Total tokens in predicted answer: {total_predicted}")
            # print(f"Total tokens in correct answer: {total_correct}")
            # print(f"F1 Score: {f1:.2f}\n")
    average_f1_score = total_f1_score / num_questions if num_questions > 0 else 0
    return average_f1_score

def calculate_f1_for_files(file_paths):
    f1_scores_dict = {}
    for file_path in file_paths:
        file_name = os.path.basename(file_path)
        f1_score = calculate_f1_for_file(file_path)
        f1_scores_dict[file_name] = f1_score  
    return f1_scores_dict

file_paths = ['deepseek7B_results.jsonl', 'llama3_8B_results.jsonl', 'mistral7B_results.jsonl']  
f1_scores = calculate_f1_for_files(file_paths)



In [19]:
print("F1 Scores for each file:")
for file_name, f1_score in f1_scores.items():
    print(f"\t{file_name}: {f1_score:.2f}")

F1 Scores for each file:
	deepseek7B_results.jsonl: 0.26
	llama3_8B_results.jsonl: 0.28
	mistral7B_results.jsonl: 0.24
