In [2]:
### Document structure
from langchain_core.documents import Document

In [3]:
# Hand-build a single Document: descriptive metadata plus the text payload.
document = Document(
    metadata=dict(
        source="example.txt",
        pages=1,
        author="Navya",
        date_created="2025-01-01",
    ),
    page_content="This is the main text content that goes into the document.",
)

In [4]:
## Create the directory that will hold the sample .txt files
import os
# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs("../data/text_files", exist_ok=True)

In [5]:
# Map each sample file path to the text that should be written into it.
sample_texts = {
    "../data/text_files/python_intro.txt": "Python is a high-level, interpreted programming language known for its readability and versatility. It supports multiple programming paradigms, including procedural, object-oriented, and functional programming. Python has a large standard library and a vibrant ecosystem of third-party packages, making it suitable for a wide range of applications such as web development, data analysis, artificial intelligence, scientific computing, and automation.",
    "../data/text_files/machine_learning.txt": "Machine learning is a subset of artificial intelligence that focuses on the development of algorithms and statistical models that enable computers to perform specific tasks without explicit instructions. Instead, these systems learn from and make predictions or decisions based on data. Machine learning techniques can be broadly categorized into supervised learning, unsupervised learning, and reinforcement learning. It is widely used in various applications, including image and speech recognition, recommendation systems, fraud detection, and natural language processing.",
}

# Write with an explicit UTF-8 encoding. Without `encoding`, open() uses the
# platform's locale encoding (not UTF-8 on a default Windows setup), while the
# loaders in this notebook read these files back with encoding="utf-8".
for file_path, content in sample_texts.items():
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(content)

print("Sample text files created.")

Sample text files created.


In [6]:
### Text Loader
from langchain_community.document_loaders import TextLoader

# Load one of the sample files, decoding it as UTF-8.
# NOTE: `document` is rebound here to whatever loader.load() returns, replacing
# the single hand-built Document assigned earlier in the notebook.
loader = TextLoader(
    "../data/text_files/python_intro.txt",
    encoding="utf-8",
)
document = loader.load()
print(document)

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python is a high-level, interpreted programming language known for its readability and versatility. It supports multiple programming paradigms, including procedural, object-oriented, and functional programming. Python has a large standard library and a vibrant ecosystem of third-party packages, making it suitable for a wide range of applications such as web development, data analysis, artificial intelligence, scientific computing, and automation.')]


In [7]:
### Directory Loader
from langchain_community.document_loaders import DirectoryLoader

# Load the files in ../data/text_files that match *.txt, handing each one to
# TextLoader with UTF-8 decoding.
dir_loader = DirectoryLoader(
    "../data/text_files",
    loader_cls=TextLoader,                # loader class used for every matched file
    loader_kwargs={"encoding": "utf-8"},  # forwarded to the loader class
    glob="*.txt",                         # filename pattern to match
)

documents = dir_loader.load()
documents

[Document(metadata={'source': '..\\data\\text_files\\machine_learning.txt'}, page_content='Machine learning is a subset of artificial intelligence that focuses on the development of algorithms and statistical models that enable computers to perform specific tasks without explicit instructions. Instead, these systems learn from and make predictions or decisions based on data. Machine learning techniques can be broadly categorized into supervised learning, unsupervised learning, and reinforcement learning. It is widely used in various applications, including image and speech recognition, recommendation systems, fraud detection, and natural language processing.'),
 Document(metadata={'source': '..\\data\\text_files\\python_intro.txt'}, page_content='Python is a high-level, interpreted programming language known for its readability and versatility. It supports multiple programming paradigms, including procedural, object-oriented, and functional programming. Python has a large standard libr

In [9]:
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

# Load the PDFs under ../data/pdf that match **/*.pdf, parsing each with
# PyMuPDFLoader. NOTE: this rebinds `dir_loader`, replacing the text-file loader.
dir_loader = DirectoryLoader(
    "../data/pdf",
    show_progress=False,
    loader_cls=PyMuPDFLoader,  # loader class used for every matched file
    glob="**/*.pdf",           # filename pattern to match
)

pdf_documents = dir_loader.load()
pdf_documents

[Document(metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-09-21T06:05:21+00:00', 'source': '..\\data\\pdf\\DSA_Week1_QuestionBank.pdf', 'file_path': '..\\data\\pdf\\DSA_Week1_QuestionBank.pdf', 'total_pages': 7, 'format': 'PDF 1.4', 'title': '(anonymous)', 'author': '(anonymous)', 'subject': '(unspecified)', 'keywords': '', 'moddate': '2025-09-21T06:05:21+00:00', 'trapped': '', 'modDate': "D:20250921060521+00'00'", 'creationDate': "D:20250921060521+00'00'", 'page': 0}, page_content='Day 1 – Arrays Basics\n1. Find the maximum element in an array\n2. Find the minimum element in an array\n3. Reverse an array in-place\n4. Rotate array by k positions\n5. Find the second largest element\n6. Compute prefix sums of an array\n7. Check if array is sorted\n8. Find frequency of each element\n9. Move all zeros to the end\n10. Find leaders in an array\n11. Find missing number in 1..n array\n12. Linear search for a target element\n1

In [11]:
# Show the class of the first element of `document`.
# NOTE(review): this indexes `document`, so it relies on `document` being the
# sequence returned by a loader's .load() rather than a single Document; confirm run order.
type(document[0])

langchain_core.documents.base.Document