In [1]:
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()

print("‚úÖ Environment ready")

‚úÖ Environment ready


In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader

txt_path = 'sample_data/notes.txt'

if Path(txt_path).exists():
    loader = TextLoader(txt_path)
    doc = loader.load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        separators=["\n\n","\n","."," ",""]
    )
    chunks = splitter.split_documents(doc)
    print(f"Split into {len(chunks)} chunks\n")

    print(chunks[:3])
    for i, chunk in enumerate(chunks,1):
        print("="*70)
        print(f"Chunk {i}: {len(chunk.page_content)} chars")
        print("="*70)
        print(chunk.page_content[:300] if len(chunk.page_content)>300 else chunk.page_content)
        print()
else:
    print(f"File not found: {txt_path}")



Split into 12 chunks

Chunk 1: 991 chars
LANGCHAIN STUDY NOTES - RAG IMPLEMENTATION

Date: January 15, 2025
Topic: Retrieval-Augmented Generation with LangChain 1.0+


CORE CONCEPTS
-------------

1. Document Object Structure
   - page_content: The actual text content
   - metadata: Dictionary wit

Chunk 2: 809 chars
TEXT SPLITTING STRATEGIES
--------------------------

RecursiveCharacterTextSplitter (RECOMMENDED)
- Tries to split on semantic boundaries
- Order: double newline √¢‚Ä†‚Äô newline √¢‚Ä†‚Äô period √¢‚Ä†‚Äô space √¢‚Ä†‚Äô character
- Best for general text and documentation
- Configuration: chunk_size=1000, chunk_overlap=

Chunk 3: 864 chars
TokenTextSplitter
- Splits based on token count, not characters
- More accurate for LLM context window limits
- Uses tiktoken for OpenAI models


CHUNK SIZE GUIDELINES
----------------------

Content Type          | Chunk Size | Overlap | Notes
----------------------|------------|---------|---------

Chunk 4: 911 chars
OpenAI text-embeddi

In [8]:
if Path(txt_path).exists():
    docs = TextLoader(txt_path).load()

    splitter_with_overlap = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100
    )
    chunks = splitter_with_overlap.split_documents(docs)
    print("üîç Examining overlap between chunks:\n")

    if len(chunks)>2:
        chunk_1_end = chunks[0].page_content[-150:]
        chunk_2_start = chunks[1].page_content[:150]

        print("\nChunk 1 ending: ")
        print(f"   ...{chunk_1_end}")
        print("\nChunk 2 starting: ")
        print(f"   {chunk_2_start}...")
        print("\nüí° Notice the overlap? This preserves context!")



else:
    print(f"File not found: {txt_path}")

üîç Examining overlap between chunks:


Chunk 1 ending: 
   ...ontent: The actual text content
   - metadata: Dictionary with additional information (source, page, date, etc.)
   - id: Unique identifier (optional)

Chunk 2 starting: 
   2. LCEL (LangChain Expression Language)
   - Uses pipe operator | to chain components
   - More readable than nested function calls
   - Better error ...

üí° Notice the overlap? This preserves context!


In [11]:
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

python_code = '''
def calculate_total(items):
    """Calculate total price of items."""
    total = 0
    for item in items:
        total += item['price']
    return total

def apply_discount(total, discount_percent):
    """Apply discount to total."""
    discount = total * (discount_percent / 100)
    return total - discount

class ShoppingCart:
    def __init__(self):
        self.items = []
    
    def add_item(self, item):
        self.items.append(item)
'''
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=200,
    chunk_overlap=50
)

code_chunk = python_splitter.split_text(python_code)

print(f"‚úÇÔ∏è Split code into {len(code_chunk)} chunks:\n")
for i, chunk in enumerate(code_chunk,1):
    print(f"Chunk {i}:")
    print(chunk)
    print("-"*50)

‚úÇÔ∏è Split code into 3 chunks:

Chunk 1:
def calculate_total(items):
    """Calculate total price of items."""
    total = 0
    for item in items:
        total += item['price']
    return total
--------------------------------------------------
Chunk 2:
def apply_discount(total, discount_percent):
    """Apply discount to total."""
    discount = total * (discount_percent / 100)
    return total - discount
--------------------------------------------------
Chunk 3:
class ShoppingCart:
    def __init__(self):
        self.items = []

    def add_item(self, item):
        self.items.append(item)
--------------------------------------------------


In [14]:
from langchain_text_splitters import CharacterTextSplitter

# Sample text with clear paragraph breaks
sample_text = """First paragraph about machine learning.
It has multiple sentences. This is important context.

Second paragraph about deep learning.
Neural networks are powerful. They learn from data.

Third paragraph about transformers.
Attention mechanisms are key. They revolutionized NLP.
"""

simple_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=100,
    chunk_overlap=20
)
chunks = simple_splitter.split_text(sample_text)
print(chunks)
print(f"Split into {len(chunks)} chunks\n")
for i, chunk in enumerate(chunks,1):
    print(f"Chunk {i}: \n{chunk}")

['First paragraph about machine learning.\nIt has multiple sentences. This is important context.', 'Second paragraph about deep learning.\nNeural networks are powerful. They learn from data.', 'Third paragraph about transformers.\nAttention mechanisms are key. They revolutionized NLP.']
Split into 3 chunks

Chunk 1: 
First paragraph about machine learning.
It has multiple sentences. This is important context.
Chunk 2: 
Second paragraph about deep learning.
Neural networks are powerful. They learn from data.
Chunk 3: 
Third paragraph about transformers.
Attention mechanisms are key. They revolutionized NLP.


In [15]:
from langchain_text_splitters import HTMLHeaderTextSplitter
html_path = "sample_data/blog_post.html"

if Path(html_path).exists():
    with open(html_path, 'r', encoding='utf-8') as f:
        html_content = f.read()

    headers_to_split = [
        ("h1","Title"),
        ("h2", "Section"),
        ("h3", "Subsection"),
    ]

    html_splitter =HTMLHeaderTextSplitter(
        headers_to_split_on=headers_to_split
    )

    html_chunks = html_splitter.split_text(html_content)
    print(f"‚úÇÔ∏è Split HTML into {len(html_chunks)} sections\n")

    for i, chunk in enumerate(html_chunks,1):
        print("="*70)
        print(f"Section {i}")
        print(f"Metadata: {chunk.metadata}")
        print(f"Content: {chunk.page_content[:200]}...")
else:
    print(f"‚ùå HTML file not found: {html_path}")

‚úÇÔ∏è Split HTML into 48 sections

Section 1
Metadata: {'Title': 'Building Intelligent Applications with RAG'}
Content: Building Intelligent Applications with RAG...
Section 2
Metadata: {'Title': 'Building Intelligent Applications with RAG'}
Content: | |  
By Dr. Amanda Foster  
January 15, 2025  
12 min read...
Section 3
Metadata: {'Title': 'Building Intelligent Applications with RAG', 'Section': 'Introduction'}
Content: Introduction...
Section 4
Metadata: {'Title': 'Building Intelligent Applications with RAG', 'Section': 'Introduction'}
Content: In the rapidly evolving landscape of artificial intelligence, Retrieval-Augmented Generation (RAG) has emerged as a game-changing approach for building intelligent applications. Unlike traditional cha...
Section 5
Metadata: {'Title': 'Building Intelligent Applications with RAG', 'Section': 'What is Retrieval-Augmented Generation?'}
Content: What is Retrieval-Augmented Generation?...
Section 6
Metadata: {'Title': 'Building Intelligent Applica

In [16]:
from langchain_text_splitters import RecursiveJsonSplitter
import json

json_path = "sample_data/api_response.json"

if Path(json_path).exists():
    with open(json_path, 'r') as f:
        json_data = json.load(f)
    
    # Create splitter
    json_splitter = RecursiveJsonSplitter(
        max_chunk_size=1000,
        min_chunk_size=100
    )
    
    # Split
    json_chunks = json_splitter.split_text(
        json_data=json_data,
        convert_lists=True
    )
    
    print(f"‚úÇÔ∏è Split JSON into {len(json_chunks)} chunks\n")
    
    # Show first chunk
    print("First chunk:")
    print(json.dumps(json_chunks[0], indent=2)[:500] + "...")
else:
    print(f"‚ùå JSON file not found: {json_path}")

‚úÇÔ∏è Split JSON into 7 chunks

First chunk:
"{\"api_version\": \"v2.0\", \"timestamp\": \"2025-01-15T10:30:00Z\", \"total_results\": 5, \"articles\": {\"0\": {\"id\": \"article_001\", \"title\": \"Introduction to Retrieval-Augmented Generation (RAG)\", \"author\": \"Dr. Sarah Chen\", \"published_date\": \"2025-01-10\", \"category\": \"Machine Learning\", \"tags\": {\"0\": \"RAG\", \"1\": \"LLM\", \"2\": \"NLP\", \"3\": \"AI\"}, \"summary\": \"Retrieval-Augmented Generation (RAG) is a powerful technique that combines information retrieval ...


In [18]:
from langchain_text_splitters import TokenTextSplitter

# Sample text
text = """The transformer architecture, introduced in the paper 'Attention Is All You Need', 
revolutionized natural language processing. It uses self-attention mechanisms to process 
sequences in parallel, making it much faster than recurrent neural networks."""

# Token-based splitter
token_splitter = TokenTextSplitter(
    chunk_size=10,  # 50 tokens (not characters!)
    chunk_overlap=2,
    encoding_name="cl100k_base"  # GPT-3.5/GPT-4 tokenizer
)

token_chunks = token_splitter.split_text(text)

print(f"Split into {len(token_chunks)} token-based chunks:\n")
for i, chunk in enumerate(token_chunks, 1):
    print(f"Chunk {i}: {chunk}\n")

Split into 6 token-based chunks:

Chunk 1: The transformer architecture, introduced in the paper 'Attention

Chunk 2:  'Attention Is All You Need', 
revolutionized

Chunk 3: revolutionized natural language processing. It uses self-

Chunk 4:  self-attention mechanisms to process 
sequences in parallel

Chunk 5:  in parallel, making it much faster than recurrent neural

Chunk 6:  recurrent neural networks.

