In [1]:
import json
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
import pandas as pd
import matplotlib as plt
import seaborn as sns
from langchain_community.embeddings import HuggingFaceBgeEmbeddings


  from tqdm.autonotebook import tqdm


# Create questions

In [2]:
data = pd.read_csv("output_512_without_vec.csv", sep=";")
questions = pd.read_csv("questions-512.csv", sep=";")
questions.head(5)

Unnamed: 0,id,question,answer chunk
0,2329bf02-8d29-4b0d-b16b-c12f50314931,What is the function of the RasterPaletteResol...,Name: RasterPaletteResolver Category: Rasters ...
1,fea7a481-edcb-4c3d-9fe8-840ba25c5cb0,What does the RasterGCPExtractor transformer do?,Name: RasterGCPExtractor Category: Coordinates...
2,dd898564-341b-45f5-9619-ee19505ad54e,Describe the purpose of the RasterMosaicker tr...,Name: RasterMosaicker Category: Filters and Jo...
3,92463f76-c130-40cd-9a4a-37a71374fd3d,What does the RasterCellValueReplacer transfor...,Name: RasterCellValueReplacer Category: Raster...
4,db039908-7b39-4ed0-a7de-f62abdcb9400,Explain the function of the Emailer transformer.,"Name: Emailer Category: Integrations, Web Desc..."


# Read data

In [3]:
# Load environment variables from .env file
load_dotenv()
pinecone_api_key = os.getenv("API_KEY")

# Initialize Pinecone
pc = Pinecone(api_key=pinecone_api_key)
index_name = "rag-example-512"

pinecone_index = pc.Index(index_name)

# load model

In [4]:
model_name = "BAAI/bge-m3"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    query_instruction = "search_query:",
    embed_instruction = "search_document:"
)



In [5]:
for index, row in questions.head(1).iterrows():
    print(row.question)

What is the function of the RasterPaletteResolver transformer?


# Query Pinecone

In [14]:
# embed question
hit = 0
for index, row in questions.iterrows():
    vector = hf.embed_query(row["question"])
    answers = pinecone_index.query(
        namespace="",
        vector=vector,
        top_k=3,
        include_values=True
    )
    for match in answers["matches"]:
    # Filter the questions dataframe by the match id
        filtered_questions = questions[questions["id"] == match["id"]]
        
        # Check if the dataframe is not empty
        if not filtered_questions.empty:
            hit += 1
    
    
print(hit/len(questions))

0.4


# Evaluate the RAG

In [7]:
model_name = "BAAI/bge-m3"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    query_instruction = "search_query:",
    embed_instruction = "search_document:"
)

# Load environment variables from .env file
load_dotenv()
pinecone_api_key = os.getenv("API_KEY")

# Initialize Pinecone
pc = Pinecone(api_key=pinecone_api_key)

rag_names = {
    "rag-example-512": {
        "question_key": "questions/questions-512.csv",
        "output": "tokens/output_512_without_vec.csv",
        "pinecone_database": "rag-example-512"
    },
    "rag-example-1024": {
        "question_key": "questions/questions-1024.csv",
        "output": "tokens/output_1024_without_vec.csv",
        "pinecone_database": "rag-example-1024"
    },
    "rag-example-2048": {
        "question_key": "questions/questions-2048.csv",
        "output": "tokens/output_2048_without_vec.csv",
        "pinecone_database": "rag-example-2048"
    }
}



In [42]:
for key in rag_names.keys():
    questions = pd.read_csv(rag_names[key]["question_key"], sep=";")
    data = pd.read_csv(rag_names[key]["output"], sep=";")
    pinecone_index = pc.Index(rag_names[key]["pinecone_database"])

    # evaluate
    hit = 0
    for index, row in questions.iterrows():
        vector = hf.embed_query(row["question"])
        answers = pinecone_index.query(
            namespace="",
            vector=vector,
            top_k=3,
            include_values=True
        )
        
        ids_set = set()
        for match in answers["matches"]:
            ids_set.add(match["id"])
        
        if row["id"] in ids_set:
            hit += 1
            #print(f'HIT! number: {hit} in {rag_names[key]["pinecone_database"]} with id: {questions[questions["id"] == match["id"]].id} and question: {row["question"]}')
    
    rag_names[key]["hit"] = hit

HIT! number: 1 in rag-example-512 with id: Series([], Name: id, dtype: object) and question: What is the function of the RasterPaletteResolver transformer?
HIT! number: 2 in rag-example-512 with id: Series([], Name: id, dtype: object) and question: What does the RasterGCPExtractor transformer do?
HIT! number: 3 in rag-example-512 with id: Series([], Name: id, dtype: object) and question: Describe the purpose of the RasterMosaicker transformer.
HIT! number: 4 in rag-example-512 with id: 3    92463f76-c130-40cd-9a4a-37a71374fd3d
Name: id, dtype: object and question: What does the RasterCellValueReplacer transformer do?
HIT! number: 5 in rag-example-512 with id: Series([], Name: id, dtype: object) and question: Explain the function of the Emailer transformer.
HIT! number: 1 in rag-example-1024 with id: Series([], Name: id, dtype: object) and question: What happens to the cell values when a palette is removed using the RasterPaletteRemover?
HIT! number: 2 in rag-example-1024 with id: Serie

In [43]:
for key in rag_names.keys():
    print(rag_names[key]["hit"])

5
10
10


# Manually evaluate

In [15]:
question = "I would like to send an Email with an HTML in the attachement, is there a transformer to send emails?"
for key in rag_names.keys():
    df_questions = pd.read_csv(rag_names[key]["question_key"], sep=";")
    df_data = pd.read_csv(rag_names[key]["output"], sep=";")
    pinecone_index = pc.Index(rag_names[key]["pinecone_database"])
    
    vector = hf.embed_query(question)
    answers = pinecone_index.query(
        namespace="",
        vector=vector,
        top_k=3,
        include_values=True
    )
    ids_set = set()
    for match in answers["matches"]:
        #ids_set.add(match["id"])
        print(df_data[df_data["id"] == match["id"]].text.iloc(0))
    
    """if row["id"] in ids_set:
        hit += 1"""
    #print(answers)

Series([], Name: text, dtype: object)
4    Name: Emailer\nCategory: Integrations, Web\nDe...
Name: text, dtype: object
31    Name: HTMLToXHTMLConverter\nCategory: \nDescri...
Name: text, dtype: object
107    Name: Emailer\nCategory: Integrations, Web\nDe...
Name: text, dtype: object
185    Name: Emailer\nCategory: Integrations, Web\nDe...
Name: text, dtype: object
44    Name: Emailer\nCategory: Integrations, Web\nDe...
Name: text, dtype: object
182    Name: HTMLExtractor\nCategory: Integrations, S...
Name: text, dtype: object
243    Name: AttributeEncoder\nCategory: Strings, Web...
Name: text, dtype: object
82    Name: HTMLExtractor\nCategory: Integrations, S...
Name: text, dtype: object
