# Contextual Chunk Headers

### Imports and configs

In [20]:
from typing import List
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.schema import BaseNode, TransformComponent
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core.text_splitter import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
import faiss
import os
import sys
from dotenv import load_dotenv
import pickle
from utils import TextCleaner, hash_documents


sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

EMBED_DIMENSION = 512
CHUNK_SIZE = 250
CHUNK_OVERLAP = 25

load_dotenv()

os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')

Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small", dimensions=EMBED_DIMENSION)

path = "../data/"
node_parser = SimpleDirectoryReader(input_dir=path, required_exts=['.txt', '.pdf'])
documents = node_parser.load_data()
    
CACHE_DIR = "../cache_contextual_chunk_headers"
VECTOR_STORE_PATH = os.path.join(CACHE_DIR, "faiss_index.pkl")
HASH_PATH = os.path.join(CACHE_DIR, "documents_hash.txt")

### Contextual Chunk Headers

In [21]:
class AddChunkHeader(TransformComponent):
    """
    Transformation to be used within the ingestion pipeline.
    Cleans clutters from texts.
    """
    def __call__(self, nodes, **kwargs) -> List[BaseNode]:
        
        for node in nodes:
            node_title = node.metadata['file_name']
            node.text = f"{node_title}\n{node.text}"
            print(node.text)
            
        return nodes

def load_or_create_vector_store(documents, embed_dim, chunk_size, chunk_overlap):
    os.makedirs(CACHE_DIR, exist_ok=True)
    
    current_hash = hash_documents(documents)
    
    if os.path.exists(HASH_PATH) and os.path.exists(VECTOR_STORE_PATH):
        with open(HASH_PATH, 'r') as f:
            stored_hash = f.read().strip()

        if stored_hash == current_hash:
            print("Loading vector store from cache...")
            with open(VECTOR_STORE_PATH, 'rb') as f:
                return pickle.load(f)
    
    print("Creating new vector store...")
    faiss_index = faiss.IndexFlatL2(embed_dim)
    vector_store = FaissVectorStore(faiss_index=faiss_index)
    
    text_splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    
    pipeline = IngestionPipeline(
        transformations=[
            TextCleaner(),
            text_splitter,
            AddChunkHeader(),
        ],
        vector_store=vector_store,
    )
    
    nodes = pipeline.run(documents=documents)
    vector_store_index = VectorStoreIndex(nodes)
    
    # Save the new vector store and hash
    with open(VECTOR_STORE_PATH, 'wb') as f:
        pickle.dump(vector_store_index, f)
    
    with open(HASH_PATH, 'w') as f:
        f.write(current_hash)
    
    return vector_store_index

vector_store_index = load_or_create_vector_store(documents, EMBED_DIMENSION, CHUNK_SIZE, CHUNK_OVERLAP)
retriever = vector_store_index.as_retriever(similarity_top_k=1)

Creating new vector store...
2024-06-20b-SNP-General-Election-Manifesto-2024_interactive.pdf
Promoted by SNP 3 Jacksons Entry EH8 8PJ. Printed by Saltire 60 Brook Street G40 2AB.“A FUTURE   MADE IN   SCOTLAND.”
VOTE SNP FOR SCOTLAND
2024-06-20b-SNP-General-Election-Manifesto-2024_interactive.pdf

2024-06-20b-SNP-General-Election-Manifesto-2024_interactive.pdf
DECISIONS MADE IN SCOTLAND, FOR SCOTLAND.      01
SNP General Election Manifesto  2024VOTE SNP FOR SCOTLAND TO BECOME AN INDEPENDENT COUNTRY .
2024-06-20b-SNP-General-Election-Manifesto-2024_interactive.pdf
02“ We will put  the interests of the people of Scotland first, and protect public services like  our cherished NHS. ” OUR VISION
2024-06-20b-SNP-General-Election-Manifesto-2024_interactive.pdf
DECISIONS MADE IN SCOTLAND, FOR SCOTLAND.      03
SNP General Election Manifesto  2024I believe people are crying out for principled leadership, which is prepared to argue for what it believes in.
In this election, and in this manifesto,



In [29]:
def get_similarity_score_average_from_context(context):
    similarity_scores = [c.score for c in context]
    return sum(similarity_scores) / len(similarity_scores)

In [30]:
test_query = "Is the labour party planning to increase taxes?"
context = retriever.retrieve(test_query)

In [31]:
new_similarity_average = get_similarity_score_average_from_context(context)

In [32]:
with open('../cache/faiss_index.pkl', 'rb') as f:
    original_query_store = pickle.load(f)

original_retriever = original_query_store.as_retriever(similarity_top_k=1)

In [33]:
context = original_retriever.retrieve(test_query)

In [34]:
old_similarity_average = get_similarity_score_average_from_context(context)

In [35]:
print(f"New similarity average: {new_similarity_average}")
print(f"Old similarity average: {old_similarity_average}")

New similarity average: 0.6926790557182427
Old similarity average: 0.6819604396914913
