Introduction to Data Ingestion

In [2]:
import os
from typing import List, Dict, Any
import pandas as pd

In [3]:
from langchain_core.documents import Document
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)
print("Setup Completed")


  from .autonotebook import tqdm as notebook_tqdm


Setup Completed


Understanding Document Structure: page_content> string , metadata > Dictionary

In [4]:

#create a Simple document
doc = Document(
    page_content="This is the main text content that will be embedded and searched",
    metadata={
        "source": "example.com",
        "author": "Abhishek Agrahari",
        "date": "2025-12-01"
    }
)
print("Doc Structure") 

print(f"Content: {doc.page_content}")
print(f"Metadata: {doc.metadata}")

#create a list of documents
# docs = [
#     Document(page_content="This is the first document", metadata={"source": "doc1.txt"}),
#     Document(page_content="This is the second document", metadata={"source": "doc2.txt"})
# ]


Doc Structure
Content: This is the main text content that will be embedded and searched
Metadata: {'source': 'example.com', 'author': 'Abhishek Agrahari', 'date': '2025-12-01'}


### Reading a text files


In [5]:
## create simple text file
import os
os.makedirs("data/text_files",exist_ok = True)


In [6]:
sample_text = {
    "data/text_files/python_intro.txt": """Python is a popular, high-level, general-purpose programming language known for its simple, English-like syntax and excellent readability, which makes it a top choice for beginners and experienced developers alike. Developed by Guido van Rossum and first released in 1991, it is an open-source, dynamically typed, and interpreted language. 
Key Features
Readability and Simplicity: Python's design emphasizes clean, uncluttered syntax using indentation to define code blocks, unlike many languages that use curly brackets.
Versatile and Multi-Platform: It runs on various operating systems (Windows, macOS, Linux) and supports multiple programming paradigms, including procedural, object-oriented, and functional programming.
Extensive Libraries and Frameworks: Python has a vast standard library and a massive ecosystem of third-party libraries (like NumPy, Pandas, TensorFlow, Django, Flask) that simplify complex tasks in various fields.
Interpreted and Dynamic: Code is executed line by line at runtime, which speeds up the edit-test-debug cycle and makes prototyping fast. Variables are dynamically typed, meaning you don't need to declare their data types explicitly.
Strong Community Support: A large and active community contributes to its development, provides extensive documentation, and offers help on forums and Q&A sites."""
, 

"data/text_files/ML_intro.txt": """ Machine learning thing- second file"""
}

for filepath,content in sample_text.items():
    with open (filepath, 'w', encoding = 'utf -8') as f:
        f.write(content)

### Text Loader- Read Single text file


In [7]:

from langchain_community.document_loaders import TextLoader

loader = TextLoader("data/text_files/python_intro.txt", encoding="utf-8")

documents = loader.load()
print(type(documents))
print(documents)

print(f"Loaded {len(documents)} documents")
print(f"First document: {documents[0].page_content[:100]}")
print(f"Metadata: {documents[0].metadata}")

### Text Loader- Read Multiple text files





<class 'list'>
[Document(metadata={'source': 'data/text_files/python_intro.txt'}, page_content="Python is a popular, high-level, general-purpose programming language known for its simple, English-like syntax and excellent readability, which makes it a top choice for beginners and experienced developers alike. Developed by Guido van Rossum and first released in 1991, it is an open-source, dynamically typed, and interpreted language. \nKey Features\nReadability and Simplicity: Python's design emphasizes clean, uncluttered syntax using indentation to define code blocks, unlike many languages that use curly brackets.\nVersatile and Multi-Platform: It runs on various operating systems (Windows, macOS, Linux) and supports multiple programming paradigms, including procedural, object-oriented, and functional programming.\nExtensive Libraries and Frameworks: Python has a vast standard library and a massive ecosystem of third-party libraries (like NumPy, Pandas, TensorFlow, Django, Flask) that s

## DirectoryLoader- Multiple Text Files

In [8]:
from langchain_community.document_loaders import DirectoryLoader

dir_loader = DirectoryLoader(
    "data/text_files",
    glob = "**/*.txt",  ## pattern to match files , double * shows folder single one file
    loader_cls = TextLoader,
    loader_kwargs = {"encoding": "utf-8"},
    show_progress = True

)

documents = dir_loader.load()

print(f"Loaded {len(documents)} documents")
for i,doc in enumerate(documents):
    print(f"\nDocument {i+1}:")
    print(f"Content: {doc.page_content[:100]}")
    print(f"Length: {len(doc.page_content)}") 
    print(f"Source: {doc.metadata['source']}")
    print("\n")






  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:00<00:00, 39.54it/s]

Loaded 2 documents

Document 1:
Content:  Machine learning thing- second file
Length: 36
Source: data\text_files\ML_intro.txt



Document 2:
Content: Python is a popular, high-level, general-purpose programming language known for its simple, English-
Length: 1335
Source: data\text_files\python_intro.txt







## Text Splitting Techniques

In [9]:
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)

print(documents)





[Document(metadata={'source': 'data\\text_files\\ML_intro.txt'}, page_content=' Machine learning thing- second file'), Document(metadata={'source': 'data\\text_files\\python_intro.txt'}, page_content="Python is a popular, high-level, general-purpose programming language known for its simple, English-like syntax and excellent readability, which makes it a top choice for beginners and experienced developers alike. Developed by Guido van Rossum and first released in 1991, it is an open-source, dynamically typed, and interpreted language. \nKey Features\nReadability and Simplicity: Python's design emphasizes clean, uncluttered syntax using indentation to define code blocks, unlike many languages that use curly brackets.\nVersatile and Multi-Platform: It runs on various operating systems (Windows, macOS, Linux) and supports multiple programming paradigms, including procedural, object-oriented, and functional programming.\nExtensive Libraries and Frameworks: Python has a vast standard librar

## Character based text splitter

In [10]:
text = documents[0].page_content
text


' Machine learning thing- second file'

In [11]:
print("Character text Splitter")
char_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 200,
    chunk_overlap = 20,
    length_function= len
)

char_chunks = char_splitter.split_text(text)
print(f"Number of chunks: {len(char_chunks)}")
print(char_chunks[0][:100])

Character text Splitter
Number of chunks: 1
Machine learning thing- second file


In [12]:
print(char_chunks[0])
print(char_chunks[1])

Machine learning thing- second file


IndexError: list index out of range

## Recursive character text Splitter

In [13]:
#advantage> Support multiple separator
print("Recursive Text Splitter")
recursive_splitter = RecursiveCharacterTextSplitter(
    separators= ["\n\n", "\n", " ",""],
    chunk_size = 200,
    chunk_overlap = 20,
    length_function= len
)

recursive_chunks = recursive_splitter.split_text(text)
print(f"Number of chunks: {len(recursive_chunks)}")
print(recursive_chunks[0][:100]) # first chunk 100 character

for i in range(len(recursive_chunks)-1):
    print(f"Chunk {i+1}:'{recursive_chunks[i]}'" )
    print(f"Chunk {i+2}:'{recursive_chunks[i+1]}'" )






Recursive Text Splitter
Number of chunks: 1
Machine learning thing- second file


In [None]:
print(recursive_chunks[0])
print("-----------------------")
print(recursive_chunks[1])

Python is a popular, high-level, general-purpose programming language known for its simple, English-like syntax and excellent readability, which makes it a top choice for beginners and experienced
-----------------------
and experienced developers alike. Developed by Guido van Rossum and first released in 1991, it is an open-source, dynamically typed, and interpreted language.
