In [1]:
!pip install -U sentence-transformers
!pip install transformers
!pip install langchain_experimental



In [4]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings
from google.colab import userdata
import os

# Load the Hugging Face API key from the Colab secret store. It is not
# actually needed for local models (only useful when downloading private
# models), so a missing or inaccessible secret must not abort the cell.
try:
    hf_token = userdata.get('HuggingFace')
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    hf_token = None
if hf_token:
    # os.environ only accepts strings; skip the assignment when no token exists.
    os.environ["HF_TOKEN"] = hf_token

# Step 1: Create the Hugging Face embeddings wrapper used to embed each sentence.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Step 2: Create the semantic chunker.
# NOTE(review): with this short sample, the recorded output shows a single
# breakpoint (after the first sentence) at a 2-standard-deviation threshold;
# a lower amount or a different threshold type may separate the topics
# better -- confirm experimentally.
text_splitter = SemanticChunker(
    embeddings,
    breakpoint_threshold_type="standard_deviation",
    breakpoint_threshold_amount=2
)

# Step 3: Sample text covering three unrelated topics (farming, cricket, terrorism).
sample = """
Farmers were working hard in the fields, preparing the soil and planting seeds for the next season.
The sun was bright, and the air smelled of earth and fresh grass.
The Indian Premier League (IPL) is the biggest cricket league in the world.
People all over the world watch the matches and cheer for their favourite teams.

Terrorism is a big danger to peace and safety.
It causes harm to people and creates fear in cities and villages.
When such attacks happen, they leave behind pain and sadness.
To fight terrorism, we need strong laws, alert security forces,
and support from people who care about peace and safety.
"""

# Step 4: Split the sample into semantic chunks (one Document per chunk).
docs = text_splitter.create_documents([sample])

# Step 5: Print the chunk count followed by each chunk's text.
print(f"Number of chunks: {len(docs)}\n")
for i, doc in enumerate(docs, 1):
    print(f"--- Chunk {i} ---")
    print(doc.page_content, "\n")


Number of chunks: 2

--- Chunk 1 ---

Farmers were working hard in the fields, preparing the soil and planting seeds for the next season. 

--- Chunk 2 ---
The sun was bright, and the air smelled of earth and fresh grass. The Indian Premier League (IPL) is the biggest cricket league in the world. People all over the world watch the matches and cheer for their favourite teams. Terrorism is a big danger to peace and safety. It causes harm to people and creates fear in cities and villages. When such attacks happen, they leave behind pain and sadness. To fight terrorism, we need strong laws, alert security forces, 
and support from people who care about peace and safety.  



Text structure-based splitting

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Two-paragraph sample; the blank line between the paragraphs is part of the input.
text = """
Space exploration has led to incredible scientific discoveries. From landing on the Moon to exploring Mars, humanity continues to push the boundaries of what’s possible beyond our planet.

These missions have not only expanded our knowledge of the universe but have also contributed to advancements in technology here on Earth. Satellite communications, GPS, and even certain medical imaging techniques trace their roots back to innovations driven by space programs.
"""

# Build the splitter: chunk_size=100 with no overlap between neighbouring chunks.
splitter_options = {"chunk_size": 100, "chunk_overlap": 0}
splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split the sample into a list of strings.
chunks = splitter.split_text(text)

# Show the number of chunks, then the chunks themselves, one item per line.
print(len(chunks), chunks, sep="\n")

5
['Space exploration has led to incredible scientific discoveries. From landing on the Moon to', 'exploring Mars, humanity continues to push the boundaries of what’s possible beyond our planet.', 'These missions have not only expanded our knowledge of the universe but have also contributed to', 'advancements in technology here on Earth. Satellite communications, GPS, and even certain medical', 'imaging techniques trace their roots back to innovations driven by space programs.']
