### Semantic Chunking
#### . Semantic chunker is a document splitter that uses similarity between sentences to decide chunk boundaries
#### . It ensures that each chunk is semantically coherent and not cut off mid-thought like tranditional character/token splitters.

In [3]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [7]:
#initialize the model
model = SentenceTransformer('all-MiniLM-L6-v2')

## Sample text
text = """
1 Langchain is a framework for building applications with LLM's.
2 Langchain provides modular abstractions to combine LLM's with tools like OpenAI and Pinecone.
3 You can create chains, agents, memory and retrievers.
4 The Eiffel Tower is located in Paris.
5 France is a popular tourist destination.
"""

## Step 1 : Split into sentences
sentences =[s.strip() for s in text.split("\n") if s.strip()]
## Step 2: Embed each sentence
embeddings = model.encode(sentences)

## Step 3 : Initialize parameters
threshold = 0.7 # control chunk tightness
chunks = []
current_chunk= []
## Step 4: Semantic grouping based on threshold
for i in range(1,len(sentences)):
    sim = cosine_similarity(
        [embeddings[i-1]],
        [embeddings[i]]
    )[0][0]

    if sim>threshold:
        current_chunk.append(sentences[i])
    else:
        chunks.append(" ".join(current_chunk))
        current_chunk=[sentences[i]]

# Append the last chunk
chunks.append(" ".join(current_chunk))

#Output the chunks
print("Semantic Chunks: ")
for idx, chunk in enumerate(chunks):
    print(f"\nChunk {idx+1}:\n{chunk}")


Semantic Chunks: 

Chunk 1:
2 Langchain provides modular abstractions to combine LLM's with tools like OpenAI and Pinecone.

Chunk 2:
3 You can create chains, agents, memory and retrievers.

Chunk 3:
4 The Eiffel Tower is located in Paris.

Chunk 4:
5 France is a popular tourist destination.


### RAG Pipeline Modular Coding

In [8]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import init_chat_model
from langchain.schema.runnable import RunnableLambda,RunnableMap
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
import os
os.environ["GROQ_API_KEY"]=os.getenv("GROQ_API_KEY")


In [None]:
## Custom Semantic Chunker with Threshold

class ThresholdSemanticChunker:
    def __init__(self,model_name ="all-MiniLM-L6-v2",threshold=0.7):
        self.model=SentenceTransformer(model_name)
        self.threshold=threshold
        