In [None]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from pathlib import Path

# Local PDF file path.
# NOTE(review): this is an absolute, machine-specific path; anyone else running
# the notebook has to edit it (consider moving it to a shared config cell).
local_path = r"D:\GitHub\Projetos\Mestrado\EnergyContext\pdf\appendixa_0.pdf"

# Chunking parameters, named so they are easy to find and tune.
# Sizes are measured with the splitter's default length function.
CHUNK_SIZE = 7500
CHUNK_OVERLAP = 100

# Fail fast with a readable message instead of an error raised from deep
# inside the PDF parsing stack.
if not Path(local_path).is_file():
    raise FileNotFoundError(f"PDF not found: {local_path}")

# Load the PDF file
loader = UnstructuredPDFLoader(file_path=local_path)
data = loader.load()

# Split the document into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = text_splitter.split_documents(data)


In [None]:
from transformers import pipeline

# Load a question generation pipeline
question_generator = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl")

# Generate one synthetic (question, source text) pair per chunk.
# NOTE(review): chunks may be much longer than the model's usual input window;
# confirm the generated questions are sensible or reduce the chunk size.
synthetic_data = []
for chunk in chunks:
    # LangChain `Document` objects are not subscriptable; their text lives in
    # the `page_content` attribute.
    text = chunk.page_content
    question = question_generator("generate question: " + text)[0]['generated_text']
    synthetic_data.append((question, text))

# `zip(*[])` would fail below with an opaque unpacking error, so report the
# real problem instead.
if not synthetic_data:
    raise ValueError("No chunks were produced from the document; cannot build an evaluation set.")

# Separate into queries and expected answers
test_queries, expected_answers = zip(*synthetic_data)


In [None]:
from datasets import load_metric
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Load the metrics (module-level objects used by `evaluate_rag_system` below).
# NOTE(review): `datasets.load_metric` is deprecated in favour of the separate
# `evaluate` package and is absent from recent `datasets` releases — confirm
# the installed version still provides it.
bleu_metric = load_metric("bleu")
rouge_metric = load_metric("rouge")

# Load a sentence transformer model for semantic similarity
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to evaluate the RAG system using BLEU, ROUGE, and semantic similarity
def evaluate_rag_system(chain, test_queries, expected_answers):
    """Score a chain's answers against reference answers.

    Each query is passed to ``chain.invoke``; the returned predictions are
    compared with ``expected_answers`` using BLEU, ROUGE and the mean cosine
    similarity of sentence embeddings. The scores are printed and returned.

    Relies on the module-level ``bleu_metric``, ``rouge_metric`` and ``model``
    objects.

    Args:
        chain: Object exposing ``invoke(query)``. Predictions are tokenised
            with ``.split()``, so they are assumed to be strings
            (NOTE(review): confirm the chain ends in a string output).
        test_queries: Iterable of query strings.
        expected_answers: Iterable of reference strings, one per query.

    Returns:
        Tuple ``(bleu_score, rouge_scores, avg_semantic_similarity)``.

    Raises:
        ValueError: If there are no queries, or the number of queries and
            expected answers differ.
    """
    # Materialise the inputs: the notebook passes tuples produced by `zip`, and
    # both are traversed more than once below.
    test_queries = list(test_queries)
    expected_answers = list(expected_answers)

    # Validate before calling the chain so no work is done on bad input.
    # Without these checks an empty set yields a NaN average and a length
    # mismatch is silently truncated by `zip`.
    if not test_queries:
        raise ValueError("test_queries is empty; nothing to evaluate.")
    if len(test_queries) != len(expected_answers):
        raise ValueError(
            f"Got {len(test_queries)} queries but {len(expected_answers)} expected answers."
        )

    predictions = [chain.invoke(query) for query in test_queries]
    print("Predictions:", predictions)  # Debug: Print predictions to see what the system returns

    # BLEU works on tokens: one list of reference token lists per prediction.
    references = [[ref.split()] for ref in expected_answers]
    preds = [pred.split() for pred in predictions]

    # Calculate BLEU score
    bleu_metric.add_batch(predictions=preds, references=references)
    bleu_score = bleu_metric.compute()['bleu']

    # Calculate ROUGE scores (operates on the raw strings)
    rouge_metric.add_batch(predictions=predictions, references=expected_answers)
    rouge_scores = rouge_metric.compute()

    # Calculate semantic similarity. Encode each side in a single batched call
    # rather than once per pair, then compare the embeddings pair by pair.
    pred_embeddings = model.encode(predictions, convert_to_tensor=True)
    ref_embeddings = model.encode(expected_answers, convert_to_tensor=True)
    semantic_similarities = [
        util.pytorch_cos_sim(pred_embedding, ref_embedding).item()
        for pred_embedding, ref_embedding in zip(pred_embeddings, ref_embeddings)
    ]

    avg_semantic_similarity = np.mean(semantic_similarities)

    # Print results
    print(f"BLEU Score: {bleu_score:.2f}")
    print(f"ROUGE-1 Score: {rouge_scores['rouge1'].mid.fmeasure:.2f}")
    print(f"ROUGE-2 Score: {rouge_scores['rouge2'].mid.fmeasure:.2f}")
    print(f"ROUGE-L Score: {rouge_scores['rougeL'].mid.fmeasure:.2f}")
    print(f"Average Semantic Similarity: {avg_semantic_similarity:.2f}")

    return bleu_score, rouge_scores, avg_semantic_similarity

# Evaluate the RAG system on the synthetic queries and their source chunks.
# NOTE(review): `chain` is not defined in the cells visible here — presumably
# it is built elsewhere in the notebook; confirm it exists on a fresh-kernel run.
evaluate_rag_system(chain, test_queries, expected_answers)
