In [5]:
from rich import print as rich_print

from langchain.docstore.document import Document
from langchain_community.chat_models import ChatOllama
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain.chains import create_extraction_chain
from typing import Optional, List
from langchain.chains import create_extraction_chain_pydantic
from pydantic import BaseModel
from langchain import hub



In [2]:
# Local chat model served through Ollama; shared by the RAG chain defined in this notebook.
local_llm = ChatOllama(model="mistral")


In [3]:
# RAG
def rag(chunks, collection_name, question="What is the use of Text Splitting?"):
    """Index ``chunks`` in a Chroma collection and print an answer to ``question``.

    Args:
        chunks: Documents to embed and index. They are passed straight to
            ``Chroma.from_documents``, so they must be LangChain ``Document``
            objects.
        collection_name: Name of the Chroma collection to create or reuse.
        question: Question sent through the retrieval chain. Defaults to the
            question this notebook has always asked.

    Returns:
        None. The answer is printed with ``rich_print``.
    """
    # Index the documents handed in by the caller. The previous version ignored
    # ``chunks`` and silently read the notebook-global ``documents`` instead, so
    # the result depended on whichever cell had assigned ``documents`` last.
    vectorstore = Chroma.from_documents(
        documents=chunks,
        collection_name=collection_name,
        embedding=OllamaEmbeddings(model='nomic-embed-text'),
    )
    retriever = vectorstore.as_retriever()

    prompt_template = """Answer the question based only on the following context:
    {context}
    Question: {question}
    """
    prompt = ChatPromptTemplate.from_template(prompt_template)

    # The retriever fills {context}; the raw question is passed through unchanged.
    chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | local_llm
        | StrOutputParser()
    )
    result = chain.invoke(question)
    rich_print(result)


In [4]:
# 1. Character Text Splitting
print("#### Character Text Splitting ####")

text = "Text splitting in LangChain is a critical feature that facilitates the division of large texts into smaller, manageable segments. "

# Manual Splitting: cut the text into consecutive windows of `chunk_size` characters.
chunk_size = 35 # Characters
chunks = [
    text[start:start + chunk_size]
    for start in range(0, len(text), chunk_size)
]
documents = [
    Document(page_content=piece, metadata={"source": "local"})
    for piece in chunks
]
rich_print(documents)

# Automatic Text Splitting: same window size, configured on the splitter instead.
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    chunk_size=35,
    chunk_overlap=0,
    separator='',
    strip_whitespace=False,
)
documents = text_splitter.create_documents([text])
rich_print(documents)

#### Character Text Splitting ####


In [5]:
# 2. Recursive Character Text Splitting
print("#### Recursive Character Text Splitting ####")

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Raw string literal: the Windows path contains "\g", "\A" and "\c", which are
# invalid escape sequences in a normal string (a warning today, an error in
# future Python versions). The resulting path value is unchanged.
with open(r'E:\git\AI-Algorithm-Journey\chunking_text.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Original author's note: ["\n\n", "\n", " ", ""] 65,450
# (presumably the separator list and the chunk sizes tried — TODO confirm).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=65, chunk_overlap=0)
rich_print(text_splitter.create_documents([text]))

#### Recursive Character Text Splitting ####


In [6]:
# 3. Document Specific Splitting
print("#### Document Specific Splitting ####")

# Document Specific Splitting - Markdown
from langchain.text_splitter import MarkdownTextSplitter

# Sample Markdown document with three heading levels.
markdown_text = """
# Fun in California

## Driving

Try driving on the 1 down to San Diego

### Food

Make sure to eat a burrito while you're there

## Hiking

Go to Yosemite
"""

# Chunks of at most 40 characters, with no overlap between neighbours.
splitter = MarkdownTextSplitter(
    chunk_size=40,
    chunk_overlap=0,
)
rich_print(splitter.create_documents([markdown_text]))

#### Document Specific Splitting ####


In [7]:
# Document Specific Splitting - Python
from langchain.text_splitter import PythonCodeTextSplitter

# Chunks of at most 100 characters, with no overlap between neighbours.
python_splitter = PythonCodeTextSplitter(
    chunk_size=100,
    chunk_overlap=0,
)

# Sample Python source: a class, an instantiation and a loop.
python_text = """
class Person:
  def __init__(self, name, age):
    self.name = name
    self.age = age

p1 = Person("John", 36)

for i in range(10):
    print (i)
"""
rich_print(python_splitter.create_documents([python_text]))


In [8]:
# Document Specific Splitting - Javascript
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language

# Recursive splitter configured for JavaScript; 65-character chunks, no overlap.
js_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.JS,
    chunk_size=65,
    chunk_overlap=0,
)

# Sample JavaScript source: a call followed by the function it calls.
javascript_text = """
// Function is called, the return value will end up in x
let x = myFunction(4, 3);

function myFunction(a, b) {
// Function returns the product of a and b
  return a * b;
}
"""
rich_print(js_splitter.create_documents([javascript_text]))

In [1]:
import os

# OpenAI API key read from the environment; None when OPENAI_API_KEY is not set.
openapi_key = os.getenv('OPENAI_API_KEY')


In [10]:
# 4. Semantic Chunking
print("#### Semantic Chunking ####")

from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

# Raw string literal: the Windows path contains "\g", "\A" and "\c", which are
# invalid escape sequences in a normal string. The path value is unchanged.
with open(r'E:\git\AI-Algorithm-Journey\chunking_text.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Percentile - all differences between sentences are calculated, and then any difference greater than the X percentile is split
# Built once: the earlier version constructed a chunker with the API key and
# then immediately replaced it with a second one, discarding the first.
text_splitter = SemanticChunker(
    OpenAIEmbeddings(api_key=openapi_key),
    breakpoint_threshold_type="percentile",  # "standard_deviation", "interquartile"
)
documents = text_splitter.create_documents([text])
rich_print(documents)

#### Semantic Chunking ####


In [6]:
# 5. Agentic Chunking
print("#### Proposition-Based Chunking ####")


# Structured-output schema: a single field holding a list of sentence strings.
class Sentences(BaseModel):
    sentences: List[str]


# The inline sample sentence that used to be assigned to `text` here was
# overwritten by the file read below before any use, so it has been dropped.
# Raw string literal: the Windows path contains "\g", "\A" and "\c", which are
# invalid escape sequences in a normal string. The path value is unchanged.
with open(r'E:\git\AI-Algorithm-Journey\chunking_text.txt', 'r', encoding='utf-8') as file:
    text = file.read()


# https://arxiv.org/pdf/2312.06648.pdf
# Prompt pulled from the hub, piped into a chat model constrained to `Sentences`.
obj = hub.pull("wfh/proposal-indexing")
llm = ChatOpenAI(model='gpt-3.5-turbo').with_structured_output(Sentences)
runnable = obj | llm



# Extraction
# extraction_chain = create_extraction_chain_pydantic(pydantic_schema=Sentences, llm=llm)
def get_propositions(text):
    """Return the sentences the proposition chain extracts from ``text``.

    Args:
        text: A passage to decompose. ``None``, empty and whitespace-only
            values are treated as "nothing to extract".

    Returns:
        The ``sentences`` list from the chain's structured output, or an empty
        list when ``text`` is blank.
    """
    # Splitting a file on "\n" yields empty strings for blank lines; skip them
    # instead of making an LLM call that has no input to work from.
    if not text or not text.strip():
        return []

    # Invoke the module-level chain with the passage under the "input" key.
    runnable_output = runnable.invoke({
        "input": text
    })

    # The structured output exposes the extracted sentences as `.sentences`.
    return runnable_output.sentences
# One paragraph per line of the loaded text.
paragraphs = text.split("\n")

# Collect propositions from the first five paragraphs only.
text_propositions = []
for i, para in enumerate(paragraphs[:5]):
    text_propositions += get_propositions(para)
    print(f"Done with {i}")


rich_print(f"You have {len(text_propositions)} propositions")
rich_print(text_propositions[:10])


#### Proposition-Based Chunking ####




Done with 0
Done with 1
Done with 2
Done with 3


In [13]:
print("#### Agentic Chunking ####")

from agentic_chunker import AgenticChunker

# Feed the extracted propositions to the chunker and show its grouping.
ac = AgenticChunker()
ac.add_propositions(text_propositions)
rich_print(ac.pretty_print_chunks())

# Retrieve the chunks as plain strings and wrap each one in a Document.
chunks = ac.get_chunks(get_type='list_of_strings')
rich_print(chunks)
documents = [
    Document(page_content=chunk_text, metadata={"source": "local"})
    for chunk_text in chunks
]

# Run the RAG demo over the agentic chunks.
rag(documents, "agentic-chunks")

#### Agentic Chunking ####



For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from agentic_chunker import AgenticChunker


  extraction_chain = create_extraction_chain_pydantic(pydantic_schema=ChunkID, llm=self.llm)
