In [1]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_core.runnables import RunnablePassthrough
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util
import numpy as np


In [2]:
local_path = r"D:\GitHub\Projetos\Mestrado\EnergyContext\pdf\appendixa_0.pdf"
if local_path:
    loader = UnstructuredPDFLoader(file_path=local_path)
    data = loader.load()
else:
    raise FileNotFoundError("Upload a PDF file")


In [3]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
chunks = text_splitter.split_documents(data)


In [4]:
vector_db = Chroma.from_documents(
    documents=chunks, 
    embedding=OllamaEmbeddings(model="nomic-embed-text", show_progress=True),
    collection_name="local-rag"
)


OllamaEmbeddings: 100%|██████████| 3/3 [00:08<00:00,  2.96s/it]


In [5]:
'''
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an electrical engineer. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database, and only the database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide an answer combining the most important points of these five versions.
    Original question: {question}"""
)
'''
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an electrical engineer. Your task is to generate a response based on the vector database.Your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide an answer as close as possible to what is in the document.
    Original question: {question}"""
)


In [6]:
local_model = "mistral"
llm = ChatOllama(model=local_model)
retriever = MultiQueryRetriever.from_llm(
    vector_db.as_retriever(), 
    llm,
    prompt=QUERY_PROMPT
)


In [7]:
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)


In [8]:
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


In [9]:
#chain.invoke("What are technologies used to provide indoor environmental comfort?")

OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.95s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.07s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.06s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.06s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.14s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.09s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
OllamaEmbeddings

" The text provided doesn't explicitly mention technologies used to provide indoor environmental comfort, but it does mention some related concepts that can contribute to maintaining indoor comfort. Here are a few examples:\n\n1. High-efficiency heating, ventilating and air conditioning systems or control modifications: These systems help regulate temperature and humidity levels indoors, providing thermal comfort.\n\n2. Advanced electric motor drives: These can be used in various appliances and machinery within buildings, such as fans, pumps, and HVAC equipment, to improve their energy efficiency and thus contribute to maintaining a comfortable indoor environment while reducing energy consumption.\n\n3. Energy efficient lighting: This includes LED or CFL bulbs that use less energy and produce less heat compared to traditional incandescent bulbs, helping to maintain indoor comfort by reducing the amount of heat generated indoors.\n\n4. Combined Heat and Power (CHP) systems: These system

In [10]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd

# Define a set of handwritten test queries and expected answers
test_queries = [
    "What is the relationship between Megawatt (MW) and Megawatt-hour (MWh)?",
    "Who prepared the Annual Energy Outlook?",
    "What is the Clean Air Act?",
    "What is an Electric Generating Unit?",
    "How Electricity Dispatch Models optimize the dispatch of a system?",
    "Who maintains the Emissions & Generation Resource Integrated Database?",
    "Give me examples of Energy Efficiency",
    "What are ISOs?",
    "What are Power Pools?",
    "What does the Public Utilities Commission or Public Service Commission do?",
    "What does Electricity Dispatch Models simulate?",
]

expected_answers = [
    "A megawatt (MW) is a unit of power representing the rate at which energy is used or generated, while a megawatt-hour (MWh) is a unit of energy representing the total amount of energy used or generated over an hour.",
    "The U.S. Department of Energy’s Energy Information Administration (EIA)",
    "The Clean Air Act (CAA) is the law that defines the U.S. Environmental Protection Agency’s responsibilities for protecting and improving the nation's air quality and the stratospheric ozone layer",
    "An entity that supplies electricity to the electricity system relying on a variety of fuels.",
    "These models optimize the dispatch of the system based on the variable costs of each resource and any operational constraints that have been entered into the model.",
    "The U.S. Environmental Protection Agency",
    "High-efficiency appliances; efficient lighting; high-efficiency heating, ventilating and air conditioning systems or control modifications; efficient building design; advanced electric motor drives; combined heat and power; and heat recovery systems.",
    "Independent System Operators",
    "A power pool is an association of two or more interconnected electric systems that agree to coordinate operations and planning for improved reliability and efficiencies.",
    "Regulates the rates and services of a public utility.",
    "Simulate the dynamic operation of the electric system, generally on a least-cost system dispatch",
]

# Function to categorize scores into low, mid, and high
def categorize_score(score, thresholds):
    if score < thresholds['low']:
        return 'low'
    elif score < thresholds['mid']:
        return 'mid'
    else:
        return 'high'

# Function to evaluate the RAG system using BLEU, ROUGE, and semantic similarity
def evaluate_rag_system(chain, test_queries, expected_answers):
    # Get predictions from the RAG system
    predictions = [chain.invoke(query) for query in test_queries]
    print("Predictions:", predictions)  # Debug: Print predictions to see what the system returns

    # Initialize metrics
    bleu_scores = []
    rouge_scores = []
    semantic_similarities = []

    # Initialize ROUGE scorer
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    # Load a sentence transformer model for semantic similarity
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    for pred, ref in zip(predictions, expected_answers):
        # Calculate BLEU score
        bleu_score = sentence_bleu([ref.split()], pred.split(), smoothing_function=SmoothingFunction().method1)
        bleu_scores.append(bleu_score)
        
        # Calculate ROUGE scores
        rouge_score = rouge.score(ref, pred)
        rouge_scores.append(rouge_score)
        
        # Calculate semantic similarity
        pred_embedding = model.encode(pred, convert_to_tensor=True)
        ref_embedding = model.encode(ref, convert_to_tensor=True)
        semantic_similarity = util.pytorch_cos_sim(pred_embedding, ref_embedding).item()
        semantic_similarities.append(semantic_similarity)
    
    # Calculate average scores
    avg_bleu = np.mean(bleu_scores)
    avg_rouge1 = np.mean([score['rouge1'].fmeasure for score in rouge_scores])
    avg_rouge2 = np.mean([score['rouge2'].fmeasure for score in rouge_scores])
    avg_rougeL = np.mean([score['rougeL'].fmeasure for score in rouge_scores])
    avg_semantic_similarity = np.mean(semantic_similarities)

    # Define thresholds for categorizing scores
    thresholds = {
        'bleu': {'low': 0.2, 'mid': 0.5},
        'rouge1': {'low': 0.2, 'mid': 0.5},
        'rouge2': {'low': 0.1, 'mid': 0.3},
        'rougeL': {'low': 0.2, 'mid': 0.5},
        'semantic': {'low': 0.5, 'mid': 0.7}
    }

    # Categorize the average scores
    bleu_category = categorize_score(avg_bleu, thresholds['bleu'])
    rouge1_category = categorize_score(avg_rouge1, thresholds['rouge1'])
    rouge2_category = categorize_score(avg_rouge2, thresholds['rouge2'])
    rougeL_category = categorize_score(avg_rougeL, thresholds['rougeL'])
    semantic_category = categorize_score(avg_semantic_similarity, thresholds['semantic'])

    # Print results
    print(f"Average BLEU Score: {avg_bleu:.2f} ({bleu_category})")
    print(f"Average ROUGE-1 Score: {avg_rouge1:.2f} ({rouge1_category})")
    print(f"Average ROUGE-2 Score: {avg_rouge2:.2f} ({rouge2_category})")
    print(f"Average ROUGE-L Score: {avg_rougeL:.2f} ({rougeL_category})")
    print(f"Average Semantic Similarity: {avg_semantic_similarity:.2f} ({semantic_category})")

    return {
        'avg_bleu': avg_bleu,
        'avg_rouge1': avg_rouge1,
        'avg_rouge2': avg_rouge2,
        'avg_rougeL': avg_rougeL,
        'avg_semantic_similarity': avg_semantic_similarity,
        'categories': {
            'bleu': bleu_category,
            'rouge1': rouge1_category,
            'rouge2': rouge2_category,
            'rougeL': rougeL_category,
            'semantic': semantic_category
        },
        'predictions': predictions  # Return the predictions
    }

# Assuming 'chain' is your RAG system already defined
# For example, chain.invoke("Your query") should return the answer from the RAG system
evaluation_results = evaluate_rag_system(chain, test_queries, expected_answers)

# Create a DataFrame with the results
df = pd.DataFrame({
    'Query': test_queries,
    'Expected Answer': expected_answers,
    'Predicted Answer': evaluation_results['predictions']
})

# Save the DataFrame to an Excel file
df.to_excel('evaluation_results.xlsx', index=False)



OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.71s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.69s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.56s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.05s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.07s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.06s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
OllamaEmbeddings

Predictions: [' A MegaWatt (MW) represents a unit of power, or the rate at which energy is produced or consumed over time. On the other hand, a Megawatt-hour (MWh) is a unit of energy, representing one MegaWatt sustained for one hour. So, to convert power to energy, we multiply power (in watts or MW) by the duration (in hours) and divide the result by 1000 (since there are 1000 hours in a Megawatt-hour). For example, if a wind turbine produces 2 MW of power for an hour, its energy output would be 2 MWh.', ' The Annual Energy Outlook was not explicitly mentioned in the provided text, but it is a report prepared by the United States Energy Information Administration (EIA), which provides projections of energy supply, demand, and prices for the U.S. economy over the next 25 years.', ' The Clean Air Act is a United States legislation that requires the U.S. Environmental Protection Agency to set National Ambient Air Quality Standards for six common air pollutants, often referred to as "crit



Average BLEU Score: 0.03 (low)
Average ROUGE-1 Score: 0.26 (mid)
Average ROUGE-2 Score: 0.11 (mid)
Average ROUGE-L Score: 0.21 (mid)
Average Semantic Similarity: 0.64 (mid)


'\n# Print the answers that the model returns\nfor query, prediction in zip(test_queries, evaluation_results[\'predictions\']):\n    print(f"Query: {query}")\n    print(f"Prediction: {prediction}")'

In [19]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from datasets import load_metric
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd

# --- Data ---

test_queries = [
    "What is the relationship between Megawatt (MW) and Megawatt-hour (MWh)?",
    "Who prepared the Annual Energy Outlook?",
    "What is the Clean Air Act?",
    "What is an Electric Generating Unit?",
    "How Electricity Dispatch Models optimize the dispatch of a system?",
    "Who maintains the Emissions & Generation Resource Integrated Database?",
    "Give me examples of Energy Efficiency",
    "What are ISOs?",
    "What are Power Pools?",
    "What does the Public Utilities Commission or Public Service Commission do?",
    "What does Electricity Dispatch Models simulate?",
]

expected_answers = [
    "A megawatt (MW) is a unit of power representing the rate at which energy is used or generated, while a megawatt-hour (MWh) is a unit of energy representing the total amount of energy used or generated over an hour.",
    "The U.S. Department of Energy’s Energy Information Administration (EIA)",
    "The Clean Air Act (CAA) is the law that defines the U.S. Environmental Protection Agency’s responsibilities for protecting and improving the nation's air quality and the stratospheric ozone layer",
    "An entity that supplies electricity to the electricity system relying on a variety of fuels.",
    "These models optimize the dispatch of the system based on the variable costs of each resource and any operational constraints that have been entered into the model.",
    "The U.S. Environmental Protection Agency",
    "High-efficiency appliances; efficient lighting; high-efficiency heating, ventilating and air conditioning systems or control modifications; efficient building design; advanced electric motor drives; combined heat and power; and heat recovery systems.",
    "Independent System Operators",
    "A power pool is an association of two or more interconnected electric systems that agree to coordinate operations and planning for improved reliability and efficiencies.",
    "Regulates the rates and services of a public utility.",
    "Simulate the dynamic operation of the electric system, generally on a least-cost system dispatch",
]


# --- Evaluation Functions ---

def categorize_score(score, thresholds):
    if score < thresholds['low']:
        return 'low'
    elif score < thresholds['mid']:
        return 'mid'
    else:
        return 'high'

def calculate_scores(prediction, reference):
    bleu_score = sentence_bleu([reference.split()], prediction.split(), smoothing_function=SmoothingFunction().method1)

    rouge_scores = rouge.compute(predictions=[prediction], references=[reference])

    # Optionally print raw rouge scores to console for immediate feedback
    # print(f"Raw ROUGE scores for '{prediction}': {rouge_scores}")

    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode([prediction, reference], convert_to_tensor=True)
    semantic_similarity = util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()

    return {
        'bleu': bleu_score,
        'rouge_scores': rouge_scores,
        'semantic_similarity': semantic_similarity,
    }

def evaluate_rag_system(chain, test_queries, expected_answers):
    results = []
    predictions = []

    for query, expected in zip(test_queries, expected_answers):
        prediction = chain.invoke(query)
        predictions.append(prediction)
        scores = calculate_scores(prediction, expected)
        results.append({'query': query, 'expected': expected, 'predicted': prediction, **scores})

    return results, predictions

# --- Main Evaluation ---

rouge = load_metric("rouge")

# Assuming 'chain' is your RAG system
evaluation_results, predictions = evaluate_rag_system(chain, test_queries, expected_answers)

# --- Create DataFrame and Flatten ROUGE ---

df = pd.DataFrame(evaluation_results)

# Flatten the rouge_scores dictionary into separate columns
for i, row in df.iterrows():
    for rouge_type in ['rouge1', 'rouge2', 'rougeL']:
        for metric, value in row['rouge_scores'][rouge_type].mid._asdict().items():
            df.at[i, f'{rouge_type}_{metric}'] = value
df = df.drop(columns=['rouge_scores'])  # Drop the original rouge_scores column

# --- Calculate and Print Averages ---

avg_bleu = df['bleu'].mean()
avg_semantic_similarity = df['semantic_similarity'].mean()

# Calculate and print average ROUGE scores
avg_rouge_scores = {}
for rouge_type in ['rouge1', 'rouge2', 'rougeL']:
    for metric in ['precision', 'recall', 'fmeasure']:
        avg_score = df[f'{rouge_type}_{metric}'].mean()
        avg_rouge_scores[f'{rouge_type}_{metric}'] = avg_score
        print(f"Average {rouge_type.upper()} {metric.capitalize()}: {avg_score:.2f}")

print(f"\nAverage BLEU Score: {avg_bleu:.2f}")
print(f"Average Semantic Similarity: {avg_semantic_similarity:.2f}\n")

# --- Save Results ---

df.to_excel('rouge_results.xlsx', index=False)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
OllamaEmbeddings: 100%|██████████| 1/1 [00:04<00:00,  4.29s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.73s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.93s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.68s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.06s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.05s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.07s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.08s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.15s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.41s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.02s/it]
OllamaEmbeddings: 100%|████████

Average ROUGE1 Precision: 0.15
Average ROUGE1 Recall: 0.57
Average ROUGE1 Fmeasure: 0.23
Average ROUGE2 Precision: 0.08
Average ROUGE2 Recall: 0.23
Average ROUGE2 Fmeasure: 0.11
Average ROUGEL Precision: 0.13
Average ROUGEL Recall: 0.46
Average ROUGEL Fmeasure: 0.19

Average BLEU Score: 0.03
Average Semantic Similarity: 0.62

