### Introduction to Data Ingestion

In [25]:
import os
from typing import List, Dict, Any
import pandas as pd

In [26]:
from langchain_core.documents import Document
from langchain.text_splitter import(
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)

### Understanding Document Structure in LangChain

In [27]:
## Build a minimal Document by hand: `page_content` carries the text itself,
## `metadata` carries a dict of descriptive fields stored alongside it.
doc = Document(
    metadata=dict(
        source="example.txt",
        page='1',
        author="Atharva",
        date_created="09-19-2025",
        custom_field="any_value",
    ),
    page_content='This is the main text content that will be embedded and searched.',
)

# Show both parts of the document that was just built.
print(
    "Document Structure",
    f"Content :{doc.page_content}",
    f"Metadata :{doc.metadata}",
    sep="\n",
)

Document Structure
Content :This is the main text content that will be embedded and searched.
Metadata :{'source': 'example.txt', 'page': '1', 'author': 'Atharva', 'date_created': '09-19-2025', 'custom_field': 'any_value'}


### Text Files (.txt)

In [28]:
import os
# Create the folder that will hold the sample .txt files (including any missing
# parent folders). exist_ok=True keeps the cell safe to re-run when the folder
# already exists.
os.makedirs('data/text_files',exist_ok=True)

In [29]:
# Sample corpus: maps each target file path to the text written into it.
# Spaces at the end of lines inside the literals are part of the sample text.
# Both texts end with exactly one newline (no trailing indentation), so the
# files on disk contain only the intended content.
sample_texts = {
    'data/text_files/python_intro.txt':"""Introduction to Python

Python is a high-level, interpreted programming language that is widely 
used for various applications such as web development, scientific 
computing, and data analysis. Created in the late 1980s by Guido van 
Rossum, Python is known for its simplicity, readability, and ease of use, 
making it an ideal language for beginners and experts alike.

Python's syntax is concise and intuitive, with a focus on code 
readability. It has a vast standard library that includes modules for 
tasks such as file I/O, networking, and data structures. The language also 
supports object-oriented programming (OOP) concepts like classes and 
inheritance.
""",
    'data/text_files/ml_intro.txt':"""Introduction to Machine Learning

Machine learning is a subset of artificial intelligence that enables 
systems to learn from data without being explicitly programmed. It 
involves training algorithms on large datasets to enable them to make 
predictions, classify patterns, or make decisions on their own. The goal 
of machine learning is to improve the accuracy and efficiency of tasks 
such as image recognition, natural language processing, and predictive 
modeling. Machine learning algorithms can be broadly classified into 
supervised, unsupervised, and reinforcement learning, each with its unique 
approach to data-driven decision-making. It has numerous applications in 
areas like computer vision, robotics, and healthcare.
"""
}

for filepath, content in sample_texts.items():
    # Create the parent folder on demand so this cell does not fail with
    # FileNotFoundError when the directory-creation cell has not been run
    # (or when a path in a new folder is added to sample_texts).
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # 'w' truncates any existing file, so re-running the cell rewrites the samples.
    with open(filepath,'w', encoding='utf-8') as f:
        f.write(content)

print("Sample file created")

Sample file created


### TextLoader - Read Single File

In [30]:
from langchain.document_loaders import TextLoader
# NOTE(review): newer LangChain releases expose the loaders from
# langchain_community.document_loaders; confirm this import path against the
# installed version.

# Read one UTF-8 text file; load() returns its contents wrapped in Document objects.
loader = TextLoader(
    'data/text_files/python_intro.txt',
    encoding='utf-8',
)
documents = loader.load()


In [31]:
# Check which container type loader.load() handed back.
print(type(documents))

<class 'list'>


In [32]:
# Print the full repr of the loaded documents (metadata plus the complete page_content).
print(documents)

[Document(metadata={'source': 'data/text_files/python_intro.txt'}, page_content="Introduction to Python\n\nPython is a high-level, interpreted programming language that is widely \nused for various applications such as web development, scientific \ncomputing, and data analysis. Created in the late 1980s by Guido van \nRossum, Python is known for its simplicity, readability, and ease of use, \nmaking it an ideal language for beginners and experts alike.\n\nPython's syntax is concise and intuitive, with a focus on code \nreadability. It has a vast standard library that includes modules for \ntasks such as file I/O, networking, and data structures. The language also \nsupports object-oriented programming (OOP) concepts like classes and \ninheritance.\n")]


In [33]:
# Summarise the load: document count, the first 100 characters of the first
# document's text, then that document's metadata.
print(f" Loaded {len(documents)} documents\nContent preview:")
print(documents[0].page_content[:100], "Metadata preview:", sep="\n")
print(documents[0].metadata)

# Loading multiple txt files


 Loaded 1 documents
Content preview:
Introduction to Python

Python is a high-level, interpreted programming language that is widely 
use
Metadata preview:
{'source': 'data/text_files/python_intro.txt'}


### Directory Loader - Read All Files in a Directory

In [34]:
from langchain.document_loaders import DirectoryLoader

# Gather every file matching **/*.txt under data/text_files and read each one
# with TextLoader as UTF-8, showing a progress bar while loading.
dir_loader = DirectoryLoader(
    'data/text_files',
    loader_cls=TextLoader,
    loader_kwargs=dict(encoding='utf-8'),
    glob="**/*.txt",
    show_progress=True,
)
documents = dir_loader.load()

# One summary per loaded file: running number, first 100 characters, metadata.
# NOTE(review): the loop variable `doc` rebinds the example `doc` created
# earlier in the notebook.
for i, doc in enumerate(documents):
    print(
        f"Document {i+1}:",
        f"Content: {doc.page_content[:100]}",
        f"Metadata: {doc.metadata}",
        "\n",
        sep="\n",
    )

# Static notes about the loader, emitted as plain text.
# NOTE(review): the heading below is misspelled ("Dicrectory"); it is kept
# verbatim so the recorded cell output still matches the code.
print(
    "Dicrectory Loader Characteristics",
    "Advantages:",
    "- Loads multiple files at once",
    "- supports glob patterns",
    sep="\n",
)
print(
    "Disadvantage:",
    "- Does not support metadata filtering",
    "- All files must be of the same type",
    "- Can be slow for large directories",
    sep="\n",
)




100%|██████████| 2/2 [00:00<00:00, 881.53it/s]

Document 1:
Content: Introduction to Python

Python is a high-level, interpreted programming language that is widely 
use
Metadata: {'source': 'data/text_files/python_intro.txt'}


Document 2:
Content: Introduction to Machine Learning

Machine learning is a subset of artificial intelligence that enabl
Metadata: {'source': 'data/text_files/ml_intro.txt'}


Dicrectory Loader Characteristics
Advantages:
- Loads multiple files at once
- supports glob patterns
Disadvantage:
- Does not support metadata filtering
- All files must be of the same type
- Can be slow for large directories





## Text Splitting Strategies

In [35]:
from langchain.text_splitter import(
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)
# Print the full repr of every document returned by the directory loader.
print(documents)

[Document(metadata={'source': 'data/text_files/python_intro.txt'}, page_content="Introduction to Python\n\nPython is a high-level, interpreted programming language that is widely \nused for various applications such as web development, scientific \ncomputing, and data analysis. Created in the late 1980s by Guido van \nRossum, Python is known for its simplicity, readability, and ease of use, \nmaking it an ideal language for beginners and experts alike.\n\nPython's syntax is concise and intuitive, with a focus on code \nreadability. It has a vast standard library that includes modules for \ntasks such as file I/O, networking, and data structures. The language also \nsupports object-oriented programming (OOP) concepts like classes and \ninheritance.\n"), Document(metadata={'source': 'data/text_files/ml_intro.txt'}, page_content='Introduction to Machine Learning\n\nMachine learning is a subset of artificial intelligence that enables \nsystems to learn from data without being explicitly pr

In [51]:
### Method 1: Character Text Splitter
# Select the Python intro by its recorded source path instead of by position.
# The directory-loader output earlier in this notebook lists python_intro.txt
# before ml_intro.txt (i.e. not in alphabetical order), so index 0 is not a
# dependable way to pick one specific file across machines.
text = next(
    (
        d.page_content
        for d in documents
        if d.metadata.get("source", "").endswith("python_intro.txt")
    ),
    None,
)
if text is None:
    raise ValueError("python_intro.txt was not found among the loaded documents")
text

"Introduction to Python\n\nPython is a high-level, interpreted programming language that is widely \nused for various applications such as web development, scientific \ncomputing, and data analysis. Created in the late 1980s by Guido van \nRossum, Python is known for its simplicity, readability, and ease of use, \nmaking it an ideal language for beginners and experts alike.\n\nPython's syntax is concise and intuitive, with a focus on code \nreadability. It has a vast standard library that includes modules for \ntasks such as file I/O, networking, and data structures. The language also \nsupports object-oriented programming (OOP) concepts like classes and \ninheritance.\n"

In [52]:
# Split on newline characters only. Sizes are measured with len(), i.e. in
# characters: chunk_size=100 with a chunk_overlap of 20.
char_splitter = CharacterTextSplitter(
    length_function=len,
    chunk_overlap=20,
    chunk_size=100,
    separator="\n",
)
char_chunks = char_splitter.split_text(text)

# Report the chunk count and show the first two chunks, separated by a rule.
print(f"Created {len(char_chunks)} chunks\nFirst chunk:")
print(char_chunks[0], '--------------------------------', "next chunk:", sep="\n")
print(char_chunks[1])





Created 9 chunks
First chunk:
Introduction to Python
Python is a high-level, interpreted programming language that is widely
--------------------------------
next chunk:
used for various applications such as web development, scientific


In [53]:
# Method 2: Recursive Character Text Splitter
# Sizes are measured with len(), i.e. in characters.
# NOTE(review): only one separator (" ") is supplied, so no fallback hierarchy
# of separators is exercised here; confirm this is the intended demonstration.
recursive_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=20,
    chunk_size=100,
    separators=[" "],
)
recursive_chunks = recursive_splitter.split_text(text)

# Report the chunk count, then the first chunk (capped at 100 characters) and
# the second chunk, separated by a rule.
print(f"Created {len(recursive_chunks)} chunks\nFirst chunk:")
print(recursive_chunks[0][:100], '--------------------------------', "Next chunk:", sep="\n")
print(recursive_chunks[1])



Created 9 chunks
First chunk:
Introduction to Python

Python is a high-level, interpreted programming language that is widely
--------------------------------
Next chunk:
that is widely 
used for various applications such as web development, scientific 
computing, and


In [54]:
# A short single-line sample: no newlines or paragraph breaks to split on.
text = "This is a simple text. It doesn't have any natural breaks. This is the third sentence. Now we got to the fourth senetence."

# Recursive character splitter restricted to the space separator;
# sizes are measured with len(), i.e. in characters.
splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=20,
    chunk_size=80,
    separators=[" "],
)
chunks = splitter.split_text(text)
print(f"Simple Text example - {len(chunks)} chunks created")

# Print each pair of neighbouring chunks, followed by a blank line, so the
# text shared between consecutive chunks is easy to spot.
for i in range(len(chunks)-1):
    print(
        f"Chunk {i+1}: '{chunks[i]}'",
        f"Chunk {i+2}: '{chunks[i+1]}'",
        "",
        sep="\n",
    )
   

Simple Text example - 2 chunks created
Chunk 1: 'This is a simple text. It doesn't have any natural breaks. This is the third'
Chunk 2: 'This is the third sentence. Now we got to the fourth senetence.'



In [56]:
### TOKEN TEXT SPLITTER
# TokenTextSplitter is already imported in the imports cell near the top of the
# notebook, so it is not imported again here.

text = "This is a simple text. It doesn't have any natural breaks. This is the third sentence. Now we got to the fourth senetence."

# Token Text Splitter
# chunk_size and chunk_overlap are counted in tokens for this splitter, not in
# characters (the first chunk printed below is far longer than 20 characters).
# A character-based length_function is therefore not passed: it would suggest
# character sizing without affecting how the text is split.
token_splitter = TokenTextSplitter(
    chunk_size = 20,
    chunk_overlap = 5
)

token_chunks = token_splitter.split_text(text)

# Report the chunk count and show the first two chunks, separated by a rule.
print(f"Created {len(token_chunks)} chunks")
print("First chunk:")
print(token_chunks[0])
print('--------------------------------')
print("Next chunk:")
print(token_chunks[1])



Created 2 chunks
First chunk:
This is a simple text. It doesn't have any natural breaks. This is the third sentence.
--------------------------------
Next chunk:
 is the third sentence. Now we got to the fourth senetence.
