### Semantic Chunking

* Semantic chunker is a document splitter that uses embedding similarity between sentences to decide chunk boundaries.
* It ensures that each chunk is semantically coherent and not cut off mid-thought like traditional character/token splitter.

In [2]:
#creating sentences from given document

with open("some_text.txt", "r") as file:
    text = file.read()

text

sentences = [sentence.strip() for sentence in text.split("\n")]
sentences



['Langchain is a framework for building applications with LLMs.',
 'Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.',
 'You can create chains, agents, memory and retrievers.',
 'The Eiffel Tower is located in Paris.',
 'France is a popular tourist destination.',
 '']

In [3]:
#get embeddings for those sentences
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
sentences_vectors = embedding_model.encode(sentences)
sentences_vectors

array([[-0.02109219, -0.04472181,  0.01087082, ..., -0.01217807,
         0.0860565 ,  0.02890728],
       [-0.03418018, -0.10210436,  0.00366995, ..., -0.01398788,
         0.04454358,  0.0055136 ],
       [-0.03057391, -0.05121858, -0.13566265, ...,  0.02557612,
         0.07362268, -0.03177413],
       [ 0.06605352,  0.03884843,  0.01661566, ...,  0.03093833,
         0.07991004,  0.05157553],
       [ 0.1040301 , -0.030977  ,  0.02524889, ...,  0.07805588,
         0.01353773, -0.02684903],
       [-0.11883845,  0.04829879, -0.00254817, ...,  0.12640947,
         0.04654907, -0.01571721]], shape=(6, 384), dtype=float32)

In [None]:
#cosine similarity

from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity(sentences_vectors[0].reshape(1, -1), sentences_vectors[3].reshape(1, -1))
#cosine_similarity([sentences_vectors[0]], [sentences_vectors[1]])


array([[0.8263334]], dtype=float32)

In [20]:
#defining a function to compare cosine similarity between vectors and place them in same chunk
threshold = 0.8
chunks =[]
current_chunk=[sentences[0]]
for i in range(1, len(sentences)):
    sim_score = cosine_similarity([sentences_vectors[i-1]], [sentences_vectors[i]])[0][0]
    if sim_score >= threshold:
        current_chunk.append(sentences[i])
    else:
        chunks.append(" ".join(current_chunk)) 
        current_chunk = [sentences[i]]

    #appedn the last chunk
chunks.append(" ". join(current_chunk))   

#output the chunks
print("\n Semantic Chunks:")
for idx, chunk in enumerate(chunks):
    print(f"\nChunk {idx+1}:\n {chunk}")        

        


 Semantic Chunks:

Chunk 1:
 Langchain is a framework for building applications with LLMs. Langchain provides modular abstractions to combine LLMs with tools like OpenAI and Pinecone.

Chunk 2:
 You can create chains, agents, memory and retrievers.

Chunk 3:
 The Eiffel Tower is located in Paris.

Chunk 4:
 France is a popular tourist destination.

Chunk 5:
 


In [None]:
#defining class for this advanced semantic chunking

class SemanticChunking:
    def __init__(self, model="sentence-transformers/all-MiniLM-L6-v2", threshold=0.8):
        self.embedding = SentenceTransformer(model=model)
        self.threshold = threshold
        