### Introduction to Data Ingestion

In [1]:
import os
from typing import List, Dict, Any
import pandas as pd

In [4]:
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter, TokenTextSplitter

#### Understanding Document Structure in LangChain

In [7]:
# Build a minimal Document by hand to show its two parts:
# `page_content` carries the text, `metadata` carries descriptive key/value pairs.

doc = Document(
    page_content="this is the main text content that will be embedded and searched",
    metadata={
        "source": "example.txt",  # typo fixed: previously "exampe.txt"
        "author": "Mounica",
        "created_on": "06 Dec 2025",
    },
)

In [8]:
# Show both parts of the hand-built Document: its text and its metadata.
print("Document Structure")  # heading typo fixed: previously "Document Struture"
print(f'Content: {doc.page_content}')
print(f'Metadata: {doc.metadata}')

Document Struture
Content: this is the main text content that will be embedded and searched
Metadata: {'source': 'exampe.txt', 'author': 'Mounica', 'created_on': '06 Dec 2025'}


In [9]:
# Inspect the concrete class of the `doc` object.
type(doc)

langchain_core.documents.base.Document

#### Text Files (.txt)

In [10]:
# Make sure the directory that will hold the sample text files exists.
# `os` is already imported in the notebook's first cell, so it is not
# re-imported here; exist_ok=True keeps this cell safe to re-run.

os.makedirs("data/text_files", exist_ok=True)

In [11]:
# Sample corpus for the loader demos: maps each target file path to the text
# that will be written there.
# The emoji and typographic punctuation below were previously mis-encoded
# (UTF-8 bytes decoded with the wrong codec); they are restored here so the
# file written to disk contains clean text.
sample_texts = {
    "data/text_files/python_data.txt":
    '''
🐍 Python Programming Overview Python is a high-level, interpreted programming language known for its simplicity and readability. Its clean syntax, which often resembles plain English, makes it an excellent choice for beginners while still being powerful enough for professionals. Python supports multiple programming paradigms, including object-oriented, functional, and procedural styles, giving developers flexibility in how they approach problem-solving.

⚙️ Applications of Python One of Python’s greatest strengths is its versatility. It is widely used in web development with frameworks like Django and Flask, in data science and machine learning with libraries such as Pandas, NumPy, and TensorFlow, and in automation and scripting tasks. Python also plays a major role in scientific computing, artificial intelligence, and even game development. Its extensive ecosystem of libraries and frameworks allows developers to build robust applications quickly and efficiently.

🌍 Community and Growth Python’s popularity has surged in recent years, largely due to its strong community support and open-source nature. Millions of developers contribute to its libraries, tutorials, and forums, making it easy to find help and resources. Organizations across industries—from startups to tech giants—rely on Python for critical projects, ensuring its continued relevance and growth. With its balance of simplicity and power, Python has become one of the most influential programming languages in the world today.
'''
}

In [14]:
# Persist every sample text to its target path as UTF-8.
for filepath in sample_texts:
    content = sample_texts[filepath]
    with open(filepath, mode="w", encoding="utf-8") as f:
        f.write(content)

#### TextLoader: Read a single file

In [16]:
from langchain_community.document_loaders import TextLoader

# Read the single sample file as UTF-8; load() returns a list of Document objects.
loader = TextLoader(
    "data/text_files/python_data.txt",
    encoding="utf-8",
)
documents = loader.load()

In [17]:
# Check which container type loader.load() handed back.
type(documents)

list

In [18]:
# Echo the full list returned by loader.load() to inspect each Document's
# metadata and page_content.
documents

[Document(metadata={'source': 'data/text_files/python_data.txt'}, page_content='\nüêç Python Programming Overview Python is a high-level, interpreted programming language known for its simplicity and readability. Its clean syntax, which often resembles plain English, makes it an excellent choice for beginners while still being powerful enough for professionals. Python supports multiple programming paradigms, including object-oriented, functional, and procedural styles, giving developers flexibility in how they approach problem-solving.\n\n‚öôÔ∏è Applications of Python One of Python‚Äôs greatest strengths is its versatility. It is widely used in web development with frameworks like Django and Flask, in data science and machine learning with libraries such as Pandas, NumPy, and TensorFlow, and in automation and scripting tasks. Python also plays a major role in scientific computing, artificial intelligence, and even game development. Its extensive ecosystem of libraries and frameworks a

In [24]:
# Summarise what TextLoader returned: how many documents, a 100-character
# preview of the first one, and its metadata.
# The leading emoji was previously mis-encoded and is restored here.
print(f"📄 Loaded {len(documents)} Documents")
print(f"Content preview: {documents[0].page_content[:100]}")
print(f"Metadata : {documents[0].metadata}")

üìÑ Loaded 1 Documents
Content preview: 
üêç Python Programming Overview Python is a high-level, interpreted programming language known for it
Metadata : {'source': 'data/text_files/python_data.txt'}


#### DirectoryLoader - Multiple Text Files

In [28]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Load every text file found under the data directory in one call.
dir_loader = DirectoryLoader(
    path="data/text_files",
    glob="**/*.txt",                       # glob pattern selecting the files to load
    loader_cls=TextLoader,                 # loader class applied to each matched file
    loader_kwargs={"encoding": "utf-8"},   # keyword arguments handed to that loader
    show_progress=True,                    # display a progress bar while loading
)

documents = dir_loader.load()

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 594.77it/s]


In [30]:
# Report how many documents DirectoryLoader returned, then describe each one:
# a 100-character preview, its metadata, and its length in characters.
# The leading emoji was previously mis-encoded and is restored here.
print(f"📄 Loaded {len(documents)} Documents\n")
# The loop variable is intentionally not named `doc`: that name is bound to the
# hand-built Document earlier in the notebook, and rebinding it here would make
# the earlier cells show a different object when re-run.
for loaded_doc in documents:
    print(f"Content preview: {loaded_doc.page_content[:100]}")
    print(f"Metadata : {loaded_doc.metadata}")
    print(f"No of characters in document: {len(loaded_doc.page_content)}")

üìÑ Loaded 1 Documents

Content preview: 
üêç Python Programming Overview Python is a high-level, interpreted programming language known for it
Metadata : {'source': 'data\\text_files\\python_data.txt'}
No of characters in document: 1511
