In [25]:
from langchain_openai.chat_models import ChatOpenAI
from pydantic import BaseModel
from langchain_core.documents import Document
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate

In [26]:
# Load variables from a local .env file into the process environment.
# Presumably this supplies the OpenAI API key that ChatOpenAI reads — confirm.
load_dotenv()

True

In [27]:
# Sample input for the chunker: twelve sentences covering three unrelated
# topics (AI, pasta / Italian cooking, climate change), four consecutive
# sentences per topic, so the expected topic boundaries are unambiguous.
text = """Artificial intelligence is transforming technology and shaping the future.
Machine learning algorithms are becoming more sophisticated every day.
Deep learning models can now process vast amounts of data efficiently.
Neural networks are inspired by the human brain's structure.
The best pasta recipes include fresh ingredients and proper cooking techniques.
Italian cuisine emphasizes quality olive oil and regional cheeses.
Authentic carbonara uses guanciale, eggs, pecorino romano, and black pepper.
Cooking pasta al dente ensures the best texture and flavor.
Climate change is affecting ecosystems worldwide.
Rising temperatures are causing glaciers to melt at unprecedented rates.
Scientists warn that immediate action is needed to reduce carbon emissions.
Renewable energy sources offer hope for a sustainable future."""

In [28]:
# pydantic class for structured output

class Chunk(BaseModel):
    # One piece of the input text together with its short summary.
    # NOTE(review): '#' comments are used here instead of a docstring on
    # purpose — pydantic copies a model's docstring into the generated schema
    # description, which would change what is sent to the LLM.
    chunk_text: str  # text of the chunk
    summary: str  # summary generated for this chunk
    
    
class Chunker(BaseModel):
    # Top-level schema handed to with_structured_output: the complete list of
    # chunks produced for one input text.
    # ('#' comments rather than a docstring, so the generated schema is unchanged.)
    chunks: list[Chunk]

In [29]:
# define model

# Base chat model.
model = ChatOpenAI(model="gpt-5-mini")

# Same model, wrapped so that its replies are parsed into the Chunker schema
# instead of being returned as raw chat messages.
llm_chunker = model.with_structured_output(Chunker)

In [30]:
# prompt for chunking

# Built with from_messages, which derives the input variable ("text") from the
# "{text}" placeholder, so it does not have to be listed by hand.
# The system message is written as adjacent string literals rather than a
# triple-quoted block so the source indentation does not end up in the prompt.
# It also describes the output the structured-output schema actually expects
# (chunk text plus a summary) instead of "a list of strings".
prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "You are an expert Text Chunker that splits the given text into chunks. "
        "You understand the natural topic boundaries of text and you do not "
        "change the existing text; you only split it wherever applicable. "
        "For every chunk you return the chunk text exactly as it appears in the "
        "input, together with a 1-2 line summary of that chunk.",
    ),
    ("human", "Split the given text into chunks\nText: {text}"),
])

In [31]:
# chunking through llm

# Compose the prompt and the structured-output model with the `|` operator:
# the formatted prompt is passed on to llm_chunker.
model_chain = prompt | llm_chunker

# Fill the {text} placeholder with the sample text and run the chain once.
response = model_chain.invoke({"text": text})

In [48]:
# Show the structured result as returned by the chain (a Chunker instance).
response

Chunker(chunks=[Chunk(chunk_text="Artificial intelligence is transforming technology and shaping the future.\nMachine learning algorithms are becoming more sophisticated every day.\nDeep learning models can now process vast amounts of data efficiently.\nNeural networks are inspired by the human brain's structure.", summary="Overview of artificial intelligence topics: AI's impact, machine learning, deep learning, and neural networks."), Chunk(chunk_text='The best pasta recipes include fresh ingredients and proper cooking techniques.\nItalian cuisine emphasizes quality olive oil and regional cheeses.\nAuthentic carbonara uses guanciale, eggs, pecorino romano, and black pepper.\nCooking pasta al dente ensures the best texture and flavor.', summary='Notes on pasta and Italian cuisine: ingredients, authentic carbonara, and cooking pasta al dente.'), Chunk(chunk_text='Climate change is affecting ecosystems worldwide.\nRising temperatures are causing glaciers to melt at unprecedented rates.\n

In [39]:
# The list of Chunk objects carried by the response.
response.chunks

[Chunk(chunk_text="Artificial intelligence is transforming technology and shaping the future.\nMachine learning algorithms are becoming more sophisticated every day.\nDeep learning models can now process vast amounts of data efficiently.\nNeural networks are inspired by the human brain's structure.", summary="Overview of artificial intelligence topics: AI's impact, machine learning, deep learning, and neural networks."),
 Chunk(chunk_text='The best pasta recipes include fresh ingredients and proper cooking techniques.\nItalian cuisine emphasizes quality olive oil and regional cheeses.\nAuthentic carbonara uses guanciale, eggs, pecorino romano, and black pepper.\nCooking pasta al dente ensures the best texture and flavor.', summary='Notes on pasta and Italian cuisine: ingredients, authentic carbonara, and cooking pasta al dente.'),
 Chunk(chunk_text='Climate change is affecting ecosystems worldwide.\nRising temperatures are causing glaciers to melt at unprecedented rates.\nScientists wa

In [33]:
# Number of chunks the model produced for the sample text.
len(response.chunks)

3

In [42]:
# Shorter handle on the chunk list; the display and Document cells below use it.
chunks = response.chunks

In [34]:
from termcolor import COLORS, colored
from random import choice

In [35]:
def display_chunks(chunks):
    """Print each chunk with its length, colouring neighbouring chunks differently.

    Args:
        chunks: Iterable of chunk strings (a list, tuple, generator, ...).

    Prints a "Total Number of Chunks" header, then for every chunk a
    "Chunk <n>: Length <len> chars" line followed by the coloured chunk text
    and a blank line. Returns None.
    """
    # Colours are named explicitly instead of being sliced out of
    # termcolor.COLORS: a positional slice depends on that dict's insertion
    # order, and therefore on the installed termcolor version.
    palette = ("red", "green", "yellow", "blue", "magenta", "cyan")

    # Materialise once so that generators are accepted and len() is available.
    chunks = list(chunks)
    print(f"Total Number of Chunks: {len(chunks)}")

    for num, chunk in enumerate(chunks, 1):
        print(f"Chunk {num}: Length {len(chunk)} chars")
        # Walk the palette in order instead of picking at random: two adjacent
        # chunks can then never share a colour, and re-running the cell always
        # produces the same output.
        color = palette[(num - 1) % len(palette)]
        print(colored(text=chunk, color=color), end="\n\n")

In [43]:
# display_chunks works on plain strings (it takes len() of each item and
# colours it), so pass only the chunk_text of every Chunk.
display_chunks([chunk.chunk_text for chunk in chunks])

Total Number of Chunks: 3
Chunk 1: Length 277 chars
[33mArtificial intelligence is transforming technology and shaping the future.
Machine learning algorithms are becoming more sophisticated every day.
Deep learning models can now process vast amounts of data efficiently.
Neural networks are inspired by the human brain's structure.[0m

Chunk 2: Length 283 chars
[34mThe best pasta recipes include fresh ingredients and proper cooking techniques.
Italian cuisine emphasizes quality olive oil and regional cheeses.
Authentic carbonara uses guanciale, eggs, pecorino romano, and black pepper.
Cooking pasta al dente ensures the best texture and flavor.[0m

Chunk 3: Length 260 chars
[33mClimate change is affecting ecosystems worldwide.
Rising temperatures are causing glaciers to melt at unprecedented rates.
Scientists warn that immediate action is needed to reduce carbon emissions.
Renewable energy sources offer hope for a sustainable future.[0m



In [53]:
# Inspect the third chunk (index 2): its text and its summary.
response.chunks[2]

Chunk(chunk_text='Climate change is affecting ecosystems worldwide.\nRising temperatures are causing glaciers to melt at unprecedented rates.\nScientists warn that immediate action is needed to reduce carbon emissions.\nRenewable energy sources offer hope for a sustainable future.', summary='Discussion of climate change impacts, the urgency of emissions reduction, and the role of renewable energy.')

In [45]:
# create documents from chunks

# One Document per chunk: the chunk text becomes page_content and the
# generated summary is stored under the "summary" metadata key.
docs = [
    Document(metadata={"summary": piece.summary}, page_content=piece.chunk_text)
    for piece in chunks
]

In [46]:
# Dump the whole list; print() of a list shows the repr of every Document.
print(docs)

[Document(metadata={'summary': "Overview of artificial intelligence topics: AI's impact, machine learning, deep learning, and neural networks."}, page_content="Artificial intelligence is transforming technology and shaping the future.\nMachine learning algorithms are becoming more sophisticated every day.\nDeep learning models can now process vast amounts of data efficiently.\nNeural networks are inspired by the human brain's structure."), Document(metadata={'summary': 'Notes on pasta and Italian cuisine: ingredients, authentic carbonara, and cooking pasta al dente.'}, page_content='The best pasta recipes include fresh ingredients and proper cooking techniques.\nItalian cuisine emphasizes quality olive oil and regional cheeses.\nAuthentic carbonara uses guanciale, eggs, pecorino romano, and black pepper.\nCooking pasta al dente ensures the best texture and flavor.'), Document(metadata={'summary': 'Discussion of climate change impacts, the urgency of emissions reduction, and the role of

In [54]:
# print() of a single Document shows its str() form: page_content first, then metadata.
print(docs[1])

page_content='The best pasta recipes include fresh ingredients and proper cooking techniques.
Italian cuisine emphasizes quality olive oil and regional cheeses.
Authentic carbonara uses guanciale, eggs, pecorino romano, and black pepper.
Cooking pasta al dente ensures the best texture and flavor.' metadata={'summary': 'Notes on pasta and Italian cuisine: ingredients, authentic carbonara, and cooking pasta al dente.'}
