In [27]:
#PART 1 – DOCUMENT LOADERS
# TASK 1 : TEXT LOADER


from langchain_community.document_loaders import TextLoader


# Step 1: Load the .txt file
loader = TextLoader("data/sample.txt")
docs = loader.load()

# Step 2: Print required outputs
print("Number of documents loaded:", len(docs))

print("\n--- Page Content Preview ---")
print(docs[0].page_content[:300])

print("\n--- Metadata ---")
print(docs[0].metadata)


Number of documents loaded: 1

--- Page Content Preview ---


--- Metadata ---
{'source': 'data/sample.txt'}


In [28]:

# TASK 2 : CSV LOADER


from langchain_community.document_loaders import CSVLoader

# Step 1: Load CSV file
loader = CSVLoader("data/sample.csv")

# Step 2: Convert rows into documents
csv_docs = loader.load()

# Step 3: Print sample document
print("Total CSV documents:", len(csv_docs))

print("\n--- Sample CSV Document ---")
print(csv_docs[0].page_content)


Total CSV documents: 10

--- Sample CSV Document ---
id: 1
title: LangChain Basics
category: AI
description: LangChain provides document loaders and text splitters for building LLM applications.


In [29]:

#  TASK 3 : PDF LOADER


from langchain_community.document_loaders import PyPDFLoader

# Step 1: Load PDF
loader = PyPDFLoader("data/sample.pdf")
pdf_docs = loader.load()

# Step 2: Print details
print("Total Pages:", len(pdf_docs))

print("\n--- Sample Page Content ---")
print(pdf_docs[0].page_content[:500])


Total Pages: 8

--- Sample Page Content ---
IntroducƟon to LangChain and LLM LimitaƟons 
Large Language Models have transformed the way soŌware applicaƟons interact with human 
knowledge. Instead of wriƟng complex rule-based programs, developers can now rely on models that 
understand natural language and generate meaningful responses.  
However, these models do not automaƟcally know about private documents, company policies, or 
personal notes.  
They are trained on general public data and therefore require addiƟonal mechanisms to access


In [30]:

# PART 1 - TASK 4 : DIRECTORY LOADER (FIXED)


from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader, PyPDFLoader, CSVLoader

loaders = []

# Load TXT files
txt_loader = DirectoryLoader(
    "data/",
    glob="*.txt",
    loader_cls=TextLoader
)
loaders.extend(txt_loader.load())


# Load PDF files
pdf_loader = DirectoryLoader(
    "data/",
    glob="*.pdf",
    loader_cls=PyPDFLoader
)
loaders.extend(pdf_loader.load())


# Load CSV files
csv_loader = DirectoryLoader(
    "data/",
    glob="*.csv",
    loader_cls=CSVLoader
)
loaders.extend(csv_loader.load())


all_docs = loaders

print("Total documents from directory:", len(all_docs))

for d in all_docs:
    print(d.metadata)


Total documents from directory: 19
{'source': 'data\\sample.txt'}
{'producer': 'Microsoft: Print To PDF', 'creator': 'PyPDF', 'creationdate': '2026-02-08T12:09:22+05:30', 'author': 'Kiruthiga Mutharasu', 'moddate': '2026-02-08T12:09:22+05:30', 'title': 'Microsoft Word - sample', 'source': 'data\\sample.pdf', 'total_pages': 8, 'page': 0, 'page_label': '1'}
{'producer': 'Microsoft: Print To PDF', 'creator': 'PyPDF', 'creationdate': '2026-02-08T12:09:22+05:30', 'author': 'Kiruthiga Mutharasu', 'moddate': '2026-02-08T12:09:22+05:30', 'title': 'Microsoft Word - sample', 'source': 'data\\sample.pdf', 'total_pages': 8, 'page': 1, 'page_label': '2'}
{'producer': 'Microsoft: Print To PDF', 'creator': 'PyPDF', 'creationdate': '2026-02-08T12:09:22+05:30', 'author': 'Kiruthiga Mutharasu', 'moddate': '2026-02-08T12:09:22+05:30', 'title': 'Microsoft Word - sample', 'source': 'data\\sample.pdf', 'total_pages': 8, 'page': 2, 'page_label': '3'}
{'producer': 'Microsoft: Print To PDF', 'creator': 'PyPDF'

In [31]:

# PART 1 - TASK 5 : WEB LOADER

from langchain_community.document_loaders import WebBaseLoader

# Step 1: Load webpage
loader = WebBaseLoader("https://en.wikipedia.org/wiki/LangChain")
web_docs = loader.load()

# Step 2: Print first 500 characters
print(web_docs[0].page_content[:500])






LangChain - Wikipedia



























Jump to content







Main menu





Main menu
move to sidebar
hide



		Navigation
	


Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us





		Contribute
	


HelpLearn to editCommunity portalRecent changesUpload fileSpecial pages



















Search











Search






















Appearance
















Donate

Create account

Log in








Personal tools





Donate Create account Log in















# PART 2 - TASK 6 : WHY TEXT SPLITTING REQUIRED

1. Why large docs cannot be passed?
- Token limits of LLM
- High cost
- Irrelevant context
- Slow response

2. Problems chunking solves:
- Fits context window
- Improves retrieval
- Reduces hallucination
- Better search relevance


In [32]:

# PART 2 - TASK 7 : LENGTH BASED SPLITTER


from langchain_text_splitters import CharacterTextSplitter

splitter = CharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=50
)

chunks = splitter.split_documents(all_docs)

print("Number of chunks:", len(chunks))
print("\nSample Chunk:\n", chunks[0].page_content)


Number of chunks: 18

Sample Chunk:
 IntroducƟon to LangChain and LLM LimitaƟons 
Large Language Models have transformed the way soŌware applicaƟons interact with human 
knowledge. Instead of wriƟng complex rule-based programs, developers can now rely on models that 
understand natural language and generate meaningful responses.  
However, these models do not automaƟcally know about private documents, company policies, or 
personal notes.  
They are trained on general public data and therefore require addiƟonal mechanisms to access 
domain speciﬁc informaƟon. LangChain was created to bridge this gap between raw data sources 
and intelligent language models. 
LangChain is an open source framework that provides building blocks for creaƟng GeneraƟve AI 
applicaƟons. It includes modules for loading documents, spliƫ ng long text, generaƟng embeddings, 
storing vectors, and construcƟng prompts.  
Each module solves a pracƟcal engineering problem faced while integraƟng LLMs into real products

In [34]:

# PART 2 - TASK 8 : RECURSIVE SPLITTER


from langchain_text_splitters import RecursiveCharacterTextSplitter

recursive = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=50
)

rec_chunks = recursive.split_documents(all_docs)

print("Chunks from recursive:", len(rec_chunks))


Chunks from recursive: 64


# TASK 9 & 10

Semantic chunking means:
- Splitting based on meaning
- Not fixed length
- Uses embeddings similarity
- Keeps topics together


In [35]:

# PART 3 - TASK 11 : UNIFIED PIPELINE


def load_and_split_documents(path_or_url):

    if path_or_url.startswith("http"):
        loader = WebBaseLoader(path_or_url)

    elif path_or_url.endswith(".pdf"):
        loader = PyPDFLoader(path_or_url)

    elif path_or_url.endswith(".csv"):
        loader = CSVLoader(path_or_url)

    else:
        loader = TextLoader(path_or_url)

    docs = loader.load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=50
    )

    return splitter.split_documents(docs)


1. Loader Mapping
- txt → TextLoader
- csv → CSVLoader
- pdf → PyPDFLoader
- web → WebBaseLoader

2. Best Splitter
- Small text → Character
- Large PDF → Recursive
- Web → Recursive

3. Overlap Importance
- Keeps context
- Prevents cut sentences
