### Semantic Chunking

* Semantic chunker is a document splitter that uses embedding similarity between sentences to decide chunk boundaries.
* It ensures that each chunk is semantically coherent and not cut off mid-thought like traditional character/token splitter.

In [2]:
#creating sentences from given document

with open("some_text.txt", "r") as file:
    text = file.read()

text

sentences = [sentence.strip() for sentence in text.split("\n")]
sentences



['Langchain is a framework for building applications with LLMs.',
 'Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.',
 'You can create chains, agents, memory and retrievers.',
 'The Eiffel Tower is located in Paris.',
 'France is a popular tourist destination.',
 '']

In [3]:
#get embeddings for those sentences
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
sentences_vectors = embedding_model.encode(sentences)
sentences_vectors

array([[-0.02109219, -0.04472181,  0.01087082, ..., -0.01217807,
         0.0860565 ,  0.02890728],
       [-0.03418018, -0.10210436,  0.00366995, ..., -0.01398788,
         0.04454358,  0.0055136 ],
       [-0.03057391, -0.05121858, -0.13566265, ...,  0.02557612,
         0.07362268, -0.03177413],
       [ 0.06605352,  0.03884843,  0.01661566, ...,  0.03093833,
         0.07991004,  0.05157553],
       [ 0.1040301 , -0.030977  ,  0.02524889, ...,  0.07805588,
         0.01353773, -0.02684903],
       [-0.11883845,  0.04829879, -0.00254817, ...,  0.12640947,
         0.04654907, -0.01571721]], shape=(6, 384), dtype=float32)

In [None]:
#cosine similarity

from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity(sentences_vectors[0].reshape(1, -1), sentences_vectors[3].reshape(1, -1))
#cosine_similarity([sentences_vectors[0]], [sentences_vectors[1]])


array([[0.8263334]], dtype=float32)

In [20]:
#defining a function to compare cosine similarity between vectors and place them in same chunk
threshold = 0.8
chunks =[]
current_chunk=[sentences[0]]
for i in range(1, len(sentences)):
    sim_score = cosine_similarity([sentences_vectors[i-1]], [sentences_vectors[i]])[0][0]
    if sim_score >= threshold:
        current_chunk.append(sentences[i])
    else:
        chunks.append(" ".join(current_chunk)) 
        current_chunk = [sentences[i]]

    #appedn the last chunk
chunks.append(" ". join(current_chunk))   

#output the chunks
print("\n Semantic Chunks:")
for idx, chunk in enumerate(chunks):
    print(f"\nChunk {idx+1}:\n {chunk}")        

        


 Semantic Chunks:

Chunk 1:
 Langchain is a framework for building applications with LLMs. Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.

Chunk 2:
 You can create chains, agents, memory and retrievers.

Chunk 3:
 The Eiffel Tower is located in Paris.

Chunk 4:
 France is a popular tourist destination.

Chunk 5:
 


In [54]:
#defining class for this advanced semantic chunking
from langchain_core.documents import Document

class SemanticChunking:
    def __init__(self, model="sentence-transformers/all-MiniLM-L6-v2", threshold=0.8):
        self.embedding = SentenceTransformer(model)
        self.threshold = threshold

    def chunk_splitter(self, file_path:str):

        with open(file_path, "r") as file:
            text = file.read()

        sentences = [sentence.strip() for sentence in text.split("\n")]
        embeddings = self.embedding.encode(sentences)
        chunks = []
        current_chunk=[sentences[0]]

        for i in range(1,len(sentences)):
            sim_score = cosine_similarity([embeddings[i]], [embeddings[i-1]])[0][0]

            if sim_score > self.threshold:
                current_chunk.append(sentences[i])
            else:
                chunks.append(" ".join(current_chunk))
                current_chunk =[sentences[i]]

        chunks.append(" ".join(current_chunk))  
        return chunks
    
    def split_documents(self,file_path:str):
        result = []
        for idx, chunk in enumerate(self.chunk_splitter(file_path)):
                result.append(Document(page_content=chunk, metadata={"page": idx}))
        return result        


In [55]:
semantic_chunking = SemanticChunking()
semantic_chunking

<__main__.SemanticChunking at 0x1d008ac50d0>

In [56]:
semantic_chunks=semantic_chunking.chunk_splitter('some_text.txt')
semantic_chunks

['Langchain is a framework for building applications with LLMs. Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.',
 'You can create chains, agents, memory and retrievers.',
 'The Eiffel Tower is located in Paris.',
 'France is a popular tourist destination.',
 '']

In [57]:
final_docs = semantic_chunking.split_documents('some_text.txt')
final_docs


[Document(metadata={'page': 0}, page_content='Langchain is a framework for building applications with LLMs. Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.'),
 Document(metadata={'page': 1}, page_content='You can create chains, agents, memory and retrievers.'),
 Document(metadata={'page': 2}, page_content='The Eiffel Tower is located in Paris.'),
 Document(metadata={'page': 3}, page_content='France is a popular tourist destination.'),
 Document(metadata={'page': 4}, page_content='')]

In [58]:
#vectorstore
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

vectorstore = FAISS.from_documents(documents=semantic_chunking.split_documents('some_text.txt'),
                                   embedding=HuggingFaceEmbeddings(model='all-MiniLM-L6-v2'))

vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x1d0119f4c50>

In [59]:
#retriever
retriever = vectorstore.as_retriever()

In [61]:
#Prompt 
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template(
    template='''Answer the following question based on context.
    context: {context}
    question: {question}
    '''
)
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the following question based on context.\n    context: {context}\n    question: {question}\n    '), additional_kwargs={})])

In [62]:
#model
from langchain_groq import ChatGroq
model = ChatGroq(model="llama-3.1-8b-instant")
model

ChatGroq(profile={'max_input_tokens': 131072, 'max_output_tokens': 8192, 'image_inputs': False, 'audio_inputs': False, 'video_inputs': False, 'image_outputs': False, 'audio_outputs': False, 'video_outputs': False, 'reasoning_output': False, 'tool_calling': True}, client=<groq.resources.chat.completions.Completions object at 0x000001D01514A310>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x000001D01528AC90>, model_name='llama-3.1-8b-instant', model_kwargs={}, groq_api_key=SecretStr('**********'))

In [63]:
#LCEL Chain with Retriever
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

rag_chain = (
    {"context": retriever ,
     "question": RunnablePassthrough()}
     | prompt
     | model
     | StrOutputParser()
)

rag_chain

{
  context: VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001D0119F4C50>, search_kwargs={}),
  question: RunnablePassthrough()
}
| ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the following question based on context.\n    context: {context}\n    question: {question}\n    '), additional_kwargs={})])
| ChatGroq(profile={'max_input_tokens': 131072, 'max_output_tokens': 8192, 'image_inputs': False, 'audio_inputs': False, 'video_inputs': False, 'image_outputs': False, 'audio_outputs': False, 'video_outputs': False, 'reasoning_output': False, 'tool_calling': True}, client=<groq.resources.chat.completions.Completions object at 0x000001D01514A310>, async_client=<groq.resources.chat.completions.AsyncCo

In [66]:
rag_chain.invoke("say about eiffle tower")

"The Eiffel Tower is located in Paris, as stated in the document with id '879b4756-3315-4491-a5f1-a05c212460d4'."