In [1]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import OnlinePDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [2]:
#local_path = r"D:\GitHub\Projetos\Mestrado\EnergyContext\2599.pdf"
local_path = r"D:\GitHub\Projetos\Mestrado\EnergyContext\pdf\appendixa_0.pdf"

# Local PDF file uploads
if local_path:
  loader = UnstructuredPDFLoader(file_path=local_path)
  data = loader.load()
else:
  print("Upload a PDF file")

In [3]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

In [4]:
# Split and chunk 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
chunks = text_splitter.split_documents(data)

In [5]:
# Add to vector database
vector_db = Chroma.from_documents(
    documents=chunks, 
    embedding=OllamaEmbeddings(model="nomic-embed-text",show_progress=True),
    collection_name="local-rag"
)

OllamaEmbeddings: 100%|██████████| 3/3 [00:07<00:00,  2.58s/it]


In [6]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [7]:
# LLM from Ollama
local_model = "mistral"
llm = ChatOllama(model=local_model)

In [8]:
'''QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
)'''
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an electrical engineer. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database, and only the database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide an answer combining the most important points of these five versions.
    Original question: {question}""",
)

In [9]:
retriever = MultiQueryRetriever.from_llm(
    vector_db.as_retriever(), 
    llm,
    prompt=QUERY_PROMPT
)

# RAG prompt
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [10]:
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [11]:
chain.invoke("What are technologies used to provide indoor environmental comfort?")

OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.77s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.07s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.03s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
OllamaEmbeddings

" The document provided does not explicitly mention technologies for providing indoor environmental comfort. However, it does mention Combined Heat and Power (CHP), which can be a system that provides both heating and cooling for buildings, thus contributing to indoor environmental comfort. Additionally, High-efficiency heating, ventilating and air conditioning systems or control modifications are also mentioned as examples of energy efficiency, which is crucial for maintaining comfortable indoor environments. It's important to note that this is an inference from the information provided and not a direct mention of the technologies in the document."

In [12]:
from transformers import pipeline

# Load a question generation pipeline
question_generator = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl")

# Generate synthetic questions and answers
synthetic_data = []
for chunk in chunks:
    # Assuming the text content is accessed via the 'page_content' attribute
    text = chunk.page_content
    question = question_generator("generate question: " + text)[0]['generated_text']
    synthetic_data.append((question, text))

# Separate into queries and expected answers
test_queries, expected_answers = zip(*synthetic_data)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Token indices sequence length is longer than the specified maximum sequence length for this model (1552 > 512). Running this sequence through the model will result in indexing errors


In [13]:
from datasets import load_metric
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Load the metrics
bleu_metric = load_metric("bleu")
rouge_metric = load_metric("rouge")

# Load a sentence transformer model for semantic similarity
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to evaluate the RAG system using BLEU, ROUGE, and semantic similarity
def evaluate_rag_system(chain, test_queries, expected_answers):
    predictions = [chain.invoke(query) for query in test_queries]
    print("Predictions:", predictions)  # Debug: Print predictions to see what the system returns

    # Initialize lists to hold references and predictions for metrics calculation
    references = [[ref.split()] for ref in expected_answers]  # BLEU expects list of lists of tokens
    preds = [pred.split() for pred in predictions]

    # Calculate BLEU score
    bleu_metric.add_batch(predictions=preds, references=references)
    bleu_score = bleu_metric.compute()['bleu']

    # Calculate ROUGE scores
    rouge_metric.add_batch(predictions=predictions, references=expected_answers)
    rouge_scores = rouge_metric.compute()
    
    # Calculate semantic similarity
    semantic_similarities = []
    for pred, ref in zip(predictions, expected_answers):
        pred_embedding = model.encode(pred, convert_to_tensor=True)
        ref_embedding = model.encode(ref, convert_to_tensor=True)
        semantic_similarity = util.pytorch_cos_sim(pred_embedding, ref_embedding).item()
        semantic_similarities.append(semantic_similarity)
    
    avg_semantic_similarity = np.mean(semantic_similarities)

    # Print results
    print(f"BLEU Score: {bleu_score:.2f}")
    print(f"ROUGE-1 Score: {rouge_scores['rouge1'].mid.fmeasure:.2f}")
    print(f"ROUGE-2 Score: {rouge_scores['rouge2'].mid.fmeasure:.2f}")
    print(f"ROUGE-L Score: {rouge_scores['rougeL'].mid.fmeasure:.2f}")
    print(f"Average Semantic Similarity: {avg_semantic_similarity:.2f}")

    return bleu_score, rouge_scores, avg_semantic_similarity

# Evaluate the RAG system
evaluate_rag_system(chain, test_queries, expected_answers)


  bleu_metric = load_metric("bleu")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.71s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.07s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.07s/it]
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it]
Number

Predictions: [' The name of the roadmap for incorporating energy efficiency into state and tribal implementation plans is not explicitly mentioned in the provided text. However, it can be inferred that such a roadmap might be referred to as part of the "On the books" Energy Efficiency/ Renewable Energy Policies or "On the way" Energy Efficiency/ Renewable Energy Policies, which are policies that have been adopted or planned for adoption by a legislative or regulatory body. The actual name of this roadmap would depend on the specific jurisdiction and context in which it is being implemented.', " The document does not provide specific information about an organization that oversees the Energy Efficiency (EE) program directly. However, the North American Electric Reliability Corporation (NERC) is mentioned as an organization that ensures the reliability of the North American bulk power system, but this doesn't necessarily mean they oversee EE programs. The U.S. Environmental Protection Ag

(2.1556046581504414e-06,
 {'rouge1': AggregateScore(low=Score(precision=0.6818181818181818, recall=0.04904632152588556, fmeasure=0.09183673469387756), mid=Score(precision=0.7154996776273371, recall=0.058032401889632435, fmeasure=0.10725080400075455), high=Score(precision=0.7446808510638298, recall=0.06272401433691756, fmeasure=0.11570247933884296)),
  'rouge2': AggregateScore(low=Score(precision=0.2680452164323132, recall=0.023463923424740175, fmeasure=0.04314606047329369), mid=Score(precision=0.31662456501166175, recall=0.024854668601893126, fmeasure=0.046043153699008306), high=Score(precision=0.3918918918918919, recall=0.026363636363636363, fmeasure=0.04940374787052811)),
  'rougeL': AggregateScore(low=Score(precision=0.4393939393939394, recall=0.03178928247048138, fmeasure=0.059523809523809514), mid=Score(precision=0.4651407693960885, recall=0.03772470848315417, fmeasure=0.06972030942958492), high=Score(precision=0.48936170212765956, recall=0.04121863799283154, fmeasure=0.0760330578

In [14]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Define a set of handwritten test queries and expected answers
test_queries = [
    "What is the relationship between Megawatt (MW) and Megawatt-hour (MWh)?",
    # Add more handwritten test queries here
]

expected_answers = [
    "A megawatt (MW) is a unit of power representing the rate at which energy is used or generated, while a megawatt-hour (MWh) is a unit of energy representing the total amount of energy used or generated over an hour.",
    # Add more expected answers corresponding to the handwritten test queries
]

# Function to evaluate the RAG system using BLEU, ROUGE, and semantic similarity
def evaluate_rag_system(chain, test_queries, expected_answers):
    # Get predictions from the RAG system
    predictions = [chain.invoke(query) for query in test_queries]
    print("Predictions:", predictions)  # Debug: Print predictions to see what the system returns

    # Initialize metrics
    bleu_scores = []
    rouge_scores = []
    semantic_similarities = []

    # Initialize ROUGE scorer
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    # Load a sentence transformer model for semantic similarity
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    for pred, ref in zip(predictions, expected_answers):
        # Calculate BLEU score
        bleu_score = sentence_bleu([ref.split()], pred.split(), smoothing_function=SmoothingFunction().method1)
        bleu_scores.append(bleu_score)
        
        # Calculate ROUGE scores
        rouge_score = rouge.score(ref, pred)
        rouge_scores.append(rouge_score)
        
        # Calculate semantic similarity
        pred_embedding = model.encode(pred, convert_to_tensor=True)
        ref_embedding = model.encode(ref, convert_to_tensor=True)
        semantic_similarity = util.pytorch_cos_sim(pred_embedding, ref_embedding).item()
        semantic_similarities.append(semantic_similarity)
    
    # Calculate average scores
    avg_bleu = np.mean(bleu_scores)
    avg_rouge1 = np.mean([score['rouge1'].fmeasure for score in rouge_scores])
    avg_rouge2 = np.mean([score['rouge2'].fmeasure for score in rouge_scores])
    avg_rougeL = np.mean([score['rougeL'].fmeasure for score in rouge_scores])
    avg_semantic_similarity = np.mean(semantic_similarities)

    # Print results
    print(f"Average BLEU Score: {avg_bleu:.2f}")
    print(f"Average ROUGE-1 Score: {avg_rouge1:.2f}")
    print(f"Average ROUGE-2 Score: {avg_rouge2:.2f}")
    print(f"Average ROUGE-L Score: {avg_rougeL:.2f}")
    print(f"Average Semantic Similarity: {avg_semantic_similarity:.2f}")

    return avg_bleu, avg_rouge1, avg_rouge2, avg_rougeL, avg_semantic_similarity

# Assuming 'chain' is your RAG system already defined
# For example, chain.invoke("Your query") should return the answer from the RAG system
evaluate_rag_system(chain, test_queries, expected_answers)


OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.68s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.07s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.07s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.11s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.20s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.09s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.15s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.17s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.27s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.08s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.10s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.22s/it]


Predictions: [' The Megawatt (MW) is a unit of power, representing one million watts (1 Joule per second). On the other hand, the Megawatt-hour (MWh) is a unit of energy, representing one million watt-hours (1 Joule per second for 3600 seconds or 3.6 kilowatt-hours). In simpler terms, 1 MW is the amount of power that can produce 1 MWh of energy over a period of one hour.']




Average BLEU Score: 0.14
Average ROUGE-1 Score: 0.50
Average ROUGE-2 Score: 0.32
Average ROUGE-L Score: 0.44
Average Semantic Similarity: 0.96


(0.1390357706150399,
 0.4954128440366973,
 0.3177570093457944,
 0.4403669724770642,
 0.9627901315689087)