In [None]:
import json
from llama_index.core import VectorStoreIndex, Settings
from llama_index.core.schema import NodeRelationship, RelatedNodeInfo, TextNode
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.elasticsearch import ElasticsearchStore
import torch_directml
from typing import cast

# Retrieval hyper-parameters.
TOP_K = 20  # candidates requested from the retriever for each sub-query
SCORE_THRESHOLD = 0.45  # a hit is kept only when its score is strictly greater

# Benchmark questions (input) and rebuilt documents (output).
input_questions_path = "data/benchmark_query_rewriting.json"
# round() rather than int(): binary floats make e.g. 0.29 * 100 evaluate to
# 28.999999999999996, which int() would truncate to 28 and mislabel the file.
output_retrieved_contexts_path = f"data/retrieved_contexts_{round(SCORE_THRESHOLD * 100)}_temp.json"

# Compute device obtained from torch_directml; the recorded cell output shows
# it printing as "privateuseone:0".
device = torch_directml.device()

# Load embedding model
# The same `device` is handed to the wrapper and reused below; normalize=True
# presumably makes the wrapper return unit-length vectors — confirm against the
# HuggingFaceEmbedding documentation.
model_name = "ibm-granite/granite-embedding-30m-english"
embed_model = HuggingFaceEmbedding(
    model_name=model_name,
    device=device,
    normalize=True
)

# Explicitly move the wrapped model to `device` by reassigning the private
# `_model` attribute.
# NOTE(review): this relies on a private attribute and looks like a workaround
# for the constructor not placing the model on the DirectML device — confirm it
# is still needed with the installed llama-index version.
embed_model._model = embed_model._model.to(device)
print(f"Model moved to {device}")
# Register the model on the global llama-index Settings object; the index
# below is created without an explicit embed_model, so it presumably picks
# this one up — verify.
Settings.embed_model = embed_model

# Connect to the Elasticsearch index that holds the document embeddings.
es_store_options = {
    "index_name": "techqa-index",
    "es_url": "http://localhost:9200",
    "show_progress": True,
}
elastic_search_store = ElasticsearchStore(**es_store_options)

# Build the llama-index index object on top of that vector store.
index = VectorStoreIndex.from_vector_store(elastic_search_store)

# Retriever that asks the index for the TOP_K most similar sections per query.
retriever = index.as_retriever(similarity_top_k=TOP_K)

# Load the benchmark instances (a JSON document with one entry per question).
# JSON is UTF-8 by specification; without an explicit encoding open() falls
# back to the platform locale (e.g. cp1252 on Windows) and would mis-decode
# any non-ASCII character in the questions.
with open(input_questions_path, "r", encoding="utf-8") as file:
    benchmark_instances = json.load(file)


def _split_subqueries(question):
    """Return the ';'-separated fragments of *question* that are not blank.

    Fragments made only of whitespace (e.g. from a trailing "; ") are dropped
    so they are never sent to the retriever. The kept fragments are returned
    with their text unchanged.
    """
    return [fragment for fragment in question.split(";") if fragment.strip()]


def _fetch_node(vector_store, node_id):
    """Fetch the single node with id *node_id* from *vector_store*."""
    return vector_store.get_nodes([node_id])[0]


def _section_record(node):
    """Return the JSON-serialisable ``{section_title, section_text}`` dict for *node*."""
    return {
        "section_title": node.metadata["section_title"],
        "section_text": cast(TextNode, node).text,
    }


def _rebuild_document(vector_store, hit_node, doc_id, doc_title):
    """Rebuild the whole document that contains *hit_node*.

    Follows the PREVIOUS relationships back to the first section, then walks
    the NEXT relationships forward, collecting every section in order.

    Returns a dict with keys ``document_id``, ``document_title`` and
    ``sections`` (a list of ``{section_title, section_text}`` dicts).
    """
    # Rewind to the first node (section) of the document.
    first_node = hit_node
    while NodeRelationship.PREVIOUS in first_node.relationships:
        previous_id = cast(
            RelatedNodeInfo, first_node.relationships[NodeRelationship.PREVIOUS]
        ).node_id
        first_node = _fetch_node(vector_store, previous_id)

    # Walk forward, recording each section; the node without a NEXT
    # relationship is the last one and is recorded before leaving the loop.
    sections = []
    cur_node = first_node
    while True:
        sections.append(_section_record(cur_node))
        if NodeRelationship.NEXT not in cur_node.relationships:
            break
        next_id = cast(
            RelatedNodeInfo, cur_node.relationships[NodeRelationship.NEXT]
        ).node_id
        cur_node = _fetch_node(vector_store, next_id)

    return {
        "document_id": doc_id,
        "document_title": doc_title,
        "sections": sections,
    }


# One entry per benchmark question: the list of full documents rebuilt from
# the sections retrieved for that question's sub-queries.
questions_array = []
quest_count = 0
for benchmark_instance in benchmark_instances:

    # Progress indicator, printed every 50 questions.
    if quest_count % 50 == 0:
        print(str(quest_count))
    quest_count += 1

    question = benchmark_instance["rewrited_question"]

    # Document ids already rebuilt for this question (a set: O(1) membership).
    found_ids = set()
    question_documents = []

    # The rewritten question may hold several sub-queries separated by ';'.
    for query in _split_subqueries(question):

        # Retrieve relevant sections
        for hit in retriever.retrieve(query):
            # Keep only hits whose score is strictly greater than the threshold.
            passes_threshold = hit.score is not None and hit.score > SCORE_THRESHOLD
            if not passes_threshold:
                continue

            doc_id = hit.metadata["document_id"]
            doc_title = hit.metadata["document_title"]

            # Each document is rebuilt at most once per question.
            if doc_id in found_ids:
                continue
            found_ids.add(doc_id)

            question_documents.append(
                _rebuild_document(index.vector_store, hit.node, doc_id, doc_title)
            )

    questions_array.append(question_documents)


# Write the per-question list of rebuilt documents to disk as indented JSON.
with open(output_retrieved_contexts_path, "w") as output_file:
    json.dump(obj=questions_array, fp=output_file, indent=4)

Model moved to privateuseone:0
0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
