In [29]:
from pathlib import Path

from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_core.runnables import RunnablePassthrough
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util
import numpy as np


In [17]:
# Location of the PDF to index (raw string keeps the Windows backslashes literal).
local_path = r"D:\GitHub\Projetos\Mestrado\EnergyContext\pdf\appendixa_0.pdf"

# Fail fast when the file is missing. A plain truthiness test on the path string
# is always true for a non-empty literal, so it never caught a missing file.
if not Path(local_path).is_file():
    raise FileNotFoundError(f"Upload a PDF file: '{local_path}' was not found")

# Parse the PDF into LangChain documents.
loader = UnstructuredPDFLoader(file_path=local_path)
data = loader.load()


In [18]:
# Chunking parameters handed to the splitter; tune these in one place.
CHUNK_SIZE = 7500
CHUNK_OVERLAP = 100

# Break the loaded documents into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = text_splitter.split_documents(data)


In [19]:
# Embedding model served by Ollama; show_progress prints a progress bar per batch.
embedding_model = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)

# Embed every chunk and index the vectors in the "local-rag" Chroma collection.
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    collection_name="local-rag",
)


OllamaEmbeddings: 100%|██████████| 3/3 [00:08<00:00,  2.98s/it]


In [20]:
# Prompt used by the multi-query retriever to rephrase the user's question.
# The model must return ONLY the alternative questions, one per line: each line
# of its output is used as a separate search query, so asking it for an answer
# here would feed answer prose into the vector search.
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an electrical engineer. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database, and only the database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines,
    and do not answer the question yourself.
    Original question: {question}"""
)


In [21]:
# Chat model served locally by Ollama; used for query rewriting and for answering.
local_model = "mistral"
llm = ChatOllama(model=local_model)

# Wrap the plain vector-store retriever so each question is expanded into
# several rephrasings (driven by QUERY_PROMPT) before searching.
base_retriever = vector_db.as_retriever()
retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=QUERY_PROMPT,
)


In [22]:
# Answering prompt: restrict the model to the retrieved context.
template = (
    "Answer the question based ONLY on the following context:\n"
    "{context}\n"
    "Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(template)


In [23]:
def _format_docs(docs):
    """Join the text of the retrieved documents into a single context string.

    Without this step the prompt receives the Python repr of the document
    list (metadata, escaped newlines) instead of the plain passage text.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# RAG pipeline: retrieve -> format context -> fill prompt -> LLM -> plain string.
# The incoming question is passed through unchanged to the "question" slot.
chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


In [24]:
# Smoke-test the full chain with a single question; the cell displays the answer.
comfort_question = "What are technologies used to provide indoor environmental comfort?"
chain.invoke(comfort_question)

OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.64s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.07s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.06s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.09s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.14s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.10s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.19s/it]


' The technologies used to provide indoor environmental comfort include, but are not limited to:\n1. Heating, Ventilating and Air Conditioning (HVAC) systems: These systems control the temperature, humidity, and air quality in buildings by regulating the movement of air, heating or cooling it as necessary. Examples include furnaces, boilers, heat pumps, air conditioners, and ventilation systems.\n2. Energy Efficient Appliances: High-efficiency appliances such as refrigerators, washing machines, and dryers can help maintain indoor comfort while minimizing energy consumption.\n3. Lighting Systems: Efficient lighting design and control strategies can improve indoor comfort by reducing glare and ensuring adequate light levels for various activities.\n4. Building Envelope Technologies: Insulation, windows, doors, and other building envelope components help to regulate temperature and prevent heat loss or gain, contributing to indoor environmental comfort.\n5. Combined Heat and Power (CHP) S

In [28]:
# Handwritten evaluation set kept as (query, expected answer) pairs so that a
# question and its reference answer cannot drift out of alignment.
qa_pairs = [
    (
        "What is the relationship between Megawatt (MW) and Megawatt-hour (MWh)?",
        "A megawatt (MW) is a unit of power representing the rate at which energy is used or generated, while a megawatt-hour (MWh) is a unit of energy representing the total amount of energy used or generated over an hour.",
    ),
    # Add more handwritten (test query, expected answer) pairs here
]

test_queries = [query for query, _ in qa_pairs]
expected_answers = [answer for _, answer in qa_pairs]


In [31]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Handwritten test queries with their expected answers, written as pairs and
# then split into the two parallel lists the evaluation function takes.
reference_pairs = [
    (
        "What is the relationship between Megawatt (MW) and Megawatt-hour (MWh)?",
        "A megawatt (MW) is a unit of power representing the rate at which energy is used or generated, while a megawatt-hour (MWh) is a unit of energy representing the total amount of energy used or generated over an hour.",
    ),
    # Add more handwritten (test query, expected answer) pairs here
]

test_queries = [pair[0] for pair in reference_pairs]
expected_answers = [pair[1] for pair in reference_pairs]

# Bucket a numeric score into 'low', 'mid' or 'high'
def categorize_score(score, thresholds):
    """Map a score onto a coarse quality label.

    Args:
        score: The numeric value to classify.
        thresholds: Mapping with a 'low' and a 'mid' cut-off. Scores below
            'low' are 'low'; scores from 'low' up to (not including) 'mid'
            are 'mid'; everything else is 'high'.

    Returns:
        One of the strings 'low', 'mid' or 'high'.
    """
    if score < thresholds['low']:
        return 'low'
    # The 'mid' cut-off is only looked up once the 'low' bucket is ruled out.
    return 'mid' if score < thresholds['mid'] else 'high'

# Function to evaluate the RAG system using BLEU, ROUGE, and semantic similarity
def evaluate_rag_system(chain, test_queries, expected_answers, thresholds=None):
    """Score the RAG chain's answers against handwritten reference answers.

    Each query is sent through ``chain.invoke`` and the returned text is
    compared with the matching reference using BLEU (whitespace tokens,
    method-1 smoothing), ROUGE-1/2/L F-measure and the cosine similarity of
    'all-MiniLM-L6-v2' sentence embeddings. The per-query scores are averaged,
    printed with a low/mid/high label and returned.

    Args:
        chain: Object whose ``invoke(query)`` returns the answer as a string.
        test_queries: Queries to send to the chain.
        expected_answers: Reference answers, one per query, in the same order.
        thresholds: Optional mapping of metric name ('bleu', 'rouge1',
            'rouge2', 'rougeL', 'semantic') to a ``{'low': x, 'mid': y}``
            dict. Entries given here replace the built-in cut-offs for that
            metric; metrics not mentioned keep their defaults.

    Returns:
        Dict with 'avg_bleu', 'avg_rouge1', 'avg_rouge2', 'avg_rougeL',
        'avg_semantic_similarity' and a 'categories' dict holding the
        low/mid/high label of each average.

    Raises:
        ValueError: If no queries are given, or if the number of queries and
            expected answers differ.
    """
    queries = list(test_queries)
    references = list(expected_answers)

    # Validate before calling the (slow) chain: zip() would silently drop
    # unmatched items, and averaging nothing yields NaN, which would then be
    # labelled 'high'.
    if not queries:
        raise ValueError("test_queries must contain at least one query")
    if len(queries) != len(references):
        raise ValueError(
            "test_queries and expected_answers must have the same length "
            f"(got {len(queries)} and {len(references)})"
        )

    # Default cut-offs for categorizing scores; caller overrides win per metric.
    score_thresholds = {
        'bleu': {'low': 0.2, 'mid': 0.5},
        'rouge1': {'low': 0.2, 'mid': 0.5},
        'rouge2': {'low': 0.1, 'mid': 0.3},
        'rougeL': {'low': 0.2, 'mid': 0.5},
        'semantic': {'low': 0.5, 'mid': 0.7}
    }
    if thresholds:
        score_thresholds.update(thresholds)

    # Get predictions from the RAG system
    predictions = [chain.invoke(query) for query in queries]
    print("Predictions:", predictions)  # Debug: Print predictions to see what the system returns

    # Build the scorers once, outside the per-sample loop.
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    model = SentenceTransformer('all-MiniLM-L6-v2')
    smoothing = SmoothingFunction().method1

    bleu_scores = []
    rouge_scores = []
    semantic_similarities = []

    for pred, ref in zip(predictions, references):
        # BLEU on whitespace-split tokens, with a single reference
        bleu_scores.append(
            sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothing)
        )

        # ROUGE scores (reference first, prediction second)
        rouge_scores.append(rouge.score(ref, pred))

        # Cosine similarity between the two sentence embeddings
        pred_embedding = model.encode(pred, convert_to_tensor=True)
        ref_embedding = model.encode(ref, convert_to_tensor=True)
        semantic_similarities.append(
            util.pytorch_cos_sim(pred_embedding, ref_embedding).item()
        )

    # Calculate average scores
    avg_bleu = np.mean(bleu_scores)
    avg_rouge1 = np.mean([score['rouge1'].fmeasure for score in rouge_scores])
    avg_rouge2 = np.mean([score['rouge2'].fmeasure for score in rouge_scores])
    avg_rougeL = np.mean([score['rougeL'].fmeasure for score in rouge_scores])
    avg_semantic_similarity = np.mean(semantic_similarities)

    # Categorize the average scores
    bleu_category = categorize_score(avg_bleu, score_thresholds['bleu'])
    rouge1_category = categorize_score(avg_rouge1, score_thresholds['rouge1'])
    rouge2_category = categorize_score(avg_rouge2, score_thresholds['rouge2'])
    rougeL_category = categorize_score(avg_rougeL, score_thresholds['rougeL'])
    semantic_category = categorize_score(avg_semantic_similarity, score_thresholds['semantic'])

    # Print results
    print(f"Average BLEU Score: {avg_bleu:.2f} ({bleu_category})")
    print(f"Average ROUGE-1 Score: {avg_rouge1:.2f} ({rouge1_category})")
    print(f"Average ROUGE-2 Score: {avg_rouge2:.2f} ({rouge2_category})")
    print(f"Average ROUGE-L Score: {avg_rougeL:.2f} ({rougeL_category})")
    print(f"Average Semantic Similarity: {avg_semantic_similarity:.2f} ({semantic_category})")

    return {
        'avg_bleu': avg_bleu,
        'avg_rouge1': avg_rouge1,
        'avg_rouge2': avg_rouge2,
        'avg_rougeL': avg_rougeL,
        'avg_semantic_similarity': avg_semantic_similarity,
        'categories': {
            'bleu': bleu_category,
            'rouge1': rouge1_category,
            'rouge2': rouge2_category,
            'rougeL': rougeL_category,
            'semantic': semantic_category
        }
    }

# Run the evaluation against the RAG chain built earlier in the notebook.
# `chain.invoke("Your query")` is expected to return the answer text.
evaluation_results = evaluate_rag_system(
    chain=chain,
    test_queries=test_queries,
    expected_answers=expected_answers,
)


OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.69s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.05s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.05s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.06s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.04s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.04s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.08s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.08s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.21s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.08s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.08s/it]


Predictions: [' The relationship between Megawatt (MW) and Megawatt-hour (MWh) can be understood by considering the time aspect of power consumption. One MegaWatt (MW) is a measure of power, which represents one million watts. It signifies the rate at which energy is being used or produced per second.\n\nOn the other hand, Megawatt-hour (MWh) is a measure of energy, representing one million watt-hours. An hour (h) represents the time it takes for this much power to be delivered continuously for 3600 seconds (since 1 hour = 3600 seconds). Therefore, if you use or produce 1 MW of power for 1 hour, you have consumed or produced 1 MWh.\n\nIn other words, 1 MW is equivalent to 1 MWh in 1 hour, but when power is consumed or produced over a different amount of time, the number of MWh will change accordingly (e.g., using or producing 1 MW for 30 minutes would be 0.5 MWh).']




Average BLEU Score: 0.06 (low)
Average ROUGE-1 Score: 0.33 (mid)
Average ROUGE-2 Score: 0.21 (mid)
Average ROUGE-L Score: 0.29 (mid)
Average Semantic Similarity: 0.88 (high)
