In [3]:
# Introduction to data ingestion

In [4]:
import os
import pandas as pd
from typing import List,Dict,Any

In [15]:
from langchain_core.documents import Document
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)

# Runs only after the imports above succeed, so it doubles as a quick import check.
print("Setup is completed")

  from .autonotebook import tqdm as notebook_tqdm


Setup is completed


In [16]:
# understand document structure in langchain


In [20]:
# Build one Document by hand: the text payload plus a metadata dict describing it.
doc = Document(
    page_content="This is the main content that will be embedded and searched .",
    metadata={
        "source": "example.txt",
        "page": 1,
        "author": "Harshit Kamriya",
        "date_created": "2026-01-01",
        "custom-field": "any value",
    },
)

# Show both parts of the Document, one per line.
print(
    "Document structure",
    f"Content : {doc.page_content}",
    f"Meta data : {doc.metadata}",
    sep="\n",
)

# Why metadata is required: list the reasons, one per line.
print(
    "Meta data is crucial for:",
    "Filtering search results",
    "Tracking document sources",
    "Providing contenxt in responses",
    "Debugging and auditing",
    sep="\n",
)

Document structure
Content : This is the main content that will be embedded and searched .
Meta data : {'source': 'example.txt', 'page': 1, 'author': 'Harshit Kamriya', 'date_created': '2026-01-01', 'custom-field': 'any value'}
Meta data is crucial for:
Filtering search results
Tracking document sources
Providing contenxt in responses
Debugging and auditing


In [21]:
# Text files - reading the text file

In [22]:
# Make sure the folder that will hold the sample text files exists.
# exist_ok=True keeps the call from raising when the folder is already there,
# so the cell can be re-run.
import os

os.makedirs(name="data/text_files", exist_ok=True)

In [None]:
# Sample corpus for the loader demos: maps each target file path to the text
# written into it. The indentation inside the triple-quoted string is part of
# the string, so it ends up in the file verbatim.
sample_text = {
    "data/text_files/python_intro.txt":"""Python is a high-level, 
    interpreted programming language created by Guido van Rossum in 1991.
      It is widely known for its simple, readable syntax that resembles English, making it beginner-friendly while still powerful for professionals. Python supports multiple paradigms — object-oriented, functional, 
      and procedural programming — and automatically manages memory and variable types, reducing boilerplate code. It is used across diverse fields such as web development, data analysis, artificial intelligence,
        automation, and scientific computing. Because of its vast ecosystem of libraries and frameworks,
          Python enables developers to build applications quickly, from small scripts to large-scale systems. Its portability across platforms like Windows, Linux, and macOS further strengthens its role as one of the most versatile and popular programming languages today.
    """
}
for filepath,content in sample_text.items():
    # Create the parent folder here so this cell does not depend on a different
    # cell having been run first; without it open() raises FileNotFoundError
    # when the folder is missing. `or "."` covers a bare file name.
    os.makedirs(os.path.dirname(filepath) or ".", exist_ok=True)
    # Explicit UTF-8: the text contains non-ASCII characters (em dashes).
    with open(filepath,"w",encoding="utf-8") as f:
        f.write(content)

print("sample file got created")


sample file got created


In [26]:
# TextLoader - single file


In [31]:
from langchain_community.document_loaders import TextLoader

# Load a single text file, decoding it as UTF-8.
loader = TextLoader(
    "data/text_files/python_intro.txt",
    encoding="utf-8",
)
documents = loader.load()

# Show the container type returned by load(), then its contents.
print(type(documents), documents, sep="\n")

<class 'list'>
[Document(metadata={'source': 'data/text_files/python_intro.txt'}, page_content='Python is a high-level, \n    interpreted programming language created by Guido van Rossum in 1991.\n      It is widely known for its simple, readable syntax that resembles English, making it beginner-friendly while still powerful for professionals. Python supports multiple paradigms — object-oriented, functional, \n      and procedural programming — and automatically manages memory and variable types, reducing boilerplate code. It is used across diverse fields such as web development, data analysis, artificial intelligence,\n        automation, and scientific computing. Because of its vast ecosystem of libraries and frameworks,\n          Python enables developers to build applications quickly, from small scripts to large-scale systems. Its portability across platforms like Windows, Linux, and macOS further strengthens its role as one of the most versatile and popular programming languages 

In [32]:
# Directory Loader - multiple text files


In [35]:
from langchain_community.document_loaders import DirectoryLoader

# Load all the text files from the directory.
dir_loader = DirectoryLoader(
    "data/text_files",
    glob="**/*.txt",                      # pattern to match files
    loader_cls=TextLoader,                # loader class to use for each match
    loader_kwargs={"encoding": "utf-8"},  # forwarded to the loader class
    show_progress=True,
)
documents = dir_loader.load()

# Summarise what was loaded: a count, then source and size per document.
print(f"loaded ({len(documents)}) documents")
for i, doc in enumerate(documents):
    print(
        f"Docuemnt {i+1}",
        f"source : {doc.metadata['source']}",
        f" Length : {len(doc.page_content)} characters",
        sep="\n",
    )


100%|██████████| 1/1 [00:00<00:00, 395.61it/s]

loaded (1) documents
Docuemnt 1
source : data\text_files\python_intro.txt
 Length : 910 characters





In [36]:
# Text splitting strategies

In [37]:
from langchain_text_splitters import(
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter
)
# Print the documents loaded earlier to confirm what is about to be split.
print(documents)

[Document(metadata={'source': 'data\\text_files\\python_intro.txt'}, page_content='Python is a high-level, \n    interpreted programming language created by Guido van Rossum in 1991.\n      It is widely known for its simple, readable syntax that resembles English, making it beginner-friendly while still powerful for professionals. Python supports multiple paradigms — object-oriented, functional, \n      and procedural programming — and automatically manages memory and variable types, reducing boilerplate code. It is used across diverse fields such as web development, data analysis, artificial intelligence,\n        automation, and scientific computing. Because of its vast ecosystem of libraries and frameworks,\n          Python enables developers to build applications quickly, from small scripts to large-scale systems. Its portability across platforms like Windows, Linux, and macOS further strengthens its role as one of the most versatile and popular programming languages today.\n    '

In [40]:
# Pull the raw text out of the first loaded document; the splitter cells below operate on this string.

text = documents[0].page_content
# Bare expression on the last line so the notebook displays the string.
text

'Python is a high-level, \n    interpreted programming language created by Guido van Rossum in 1991.\n      It is widely known for its simple, readable syntax that resembles English, making it beginner-friendly while still powerful for professionals. Python supports multiple paradigms — object-oriented, functional, \n      and procedural programming — and automatically manages memory and variable types, reducing boilerplate code. It is used across diverse fields such as web development, data analysis, artificial intelligence,\n        automation, and scientific computing. Because of its vast ecosystem of libraries and frameworks,\n          Python enables developers to build applications quickly, from small scripts to large-scale systems. Its portability across platforms like Windows, Linux, and macOS further strengthens its role as one of the most versatile and popular programming languages today.\n    '

In [44]:
# Method 1 : Character-based splitting
# Uses a single separator (newline) with a 200-character target size and a
# 20-character overlap, measured with len.
print("Charcter text splitter")
char_splitter = CharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=20,
    separator="\n",
    length_function=len,
)
char_chunks = char_splitter.split_text(text)

# Report how many chunks came back and preview the first 100 characters.
print(f"Created {len(char_chunks)} chunks")
print(f"First Chunk {char_chunks[0][:100]}...")

Created a chunk of size 215, which is longer than the specified 200
Created a chunk of size 212, which is longer than the specified 200
Created a chunk of size 272, which is longer than the specified 200


Charcter text splitter
Created 5 chunks
First Chunk Python is a high-level, 
    interpreted programming language created by Guido van Rossum in 1991....


In [48]:
# Print the first four character-based chunks, one after another.
for chunk_index in range(4):
    print(char_chunks[chunk_index])

Python is a high-level, 
    interpreted programming language created by Guido van Rossum in 1991.
It is widely known for its simple, readable syntax that resembles English, making it beginner-friendly while still powerful for professionals. Python supports multiple paradigms — object-oriented, functional,
and procedural programming — and automatically manages memory and variable types, reducing boilerplate code. It is used across diverse fields such as web development, data analysis, artificial intelligence,
automation, and scientific computing. Because of its vast ecosystem of libraries and frameworks,


In [52]:
# Method 2 : Recursive character splitting
# Same size/overlap settings as the character splitter, but with a list of
# separators ordered from coarsest (blank line) to finest (empty string).
print("Recursive character text splitter")
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=20,
    separators=["\n\n", "\n", " ", ""],
    length_function=len,
)
recursive_chunks = recursive_splitter.split_text(text)

# Report how many chunks came back and preview the first 100 characters.
print(f"created {len(recursive_chunks)} chunks")
print(f"First chunk: {recursive_chunks[0][:100]} ...")

Recursive character text splitter
created 8 chunks
First chunk: Python is a high-level, 
    interpreted programming language created by Guido van Rossum in 1991. ...


In [53]:
# Print the first four recursive chunks with a divider line between consecutive chunks.
for chunk_index in range(4):
    if chunk_index > 0:
        print("----------------")
    print(recursive_chunks[chunk_index])

Python is a high-level, 
    interpreted programming language created by Guido van Rossum in 1991.
----------------
It is widely known for its simple, readable syntax that resembles English, making it beginner-friendly while still powerful for professionals. Python supports multiple paradigms —
----------------
paradigms — object-oriented, functional,
----------------
and procedural programming — and automatically manages memory and variable types, reducing boilerplate code. It is used across diverse fields such as web development, data analysis, artificial


In [54]:
# Method 3 : Token based splitting
# NOTE(review): chunk_size / chunk_overlap are presumably counted in tokens
# here rather than characters (per the splitter's name) - confirm in the docs.
print("Token text splitter")
token_splitter = TokenTextSplitter(chunk_size=50, chunk_overlap=10)
token_chunks = token_splitter.split_text(text)

# Report how many chunks came back and preview the first 100 characters.
print(f"Created {len(token_chunks)} chunks")
print(f"First chunk {token_chunks[0][:100]} ....")

Token text splitter
Created 5 chunks
First chunk Python is a high-level, 
    interpreted programming language created by Guido van Rossum in 1991.
  ....
