# Chunking with LangChain and Unstructured I/O


This notebook demonstrates how to:
- Extract text from PDF, DOCX, and HTML files using the `unstructured` library (Unstructured.io)
- Chunk the extracted text using LangChain's text splitters
- Compare different chunking strategies
- Prepare data for a basic retrieval-augmented generation (RAG) pipeline
    

In [1]:
# This cell only prints a reminder. The install commands below are commented
# out, so they are NOT executed here; run them yourself before the cells
# that import langchain / unstructured.
print("required installations")
# pip install langchain unstructured pdf2image python-docx lxml nltk
# pip install "unstructured[local-inference]" --quiet


required installations


In [None]:

from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    MarkdownHeaderTextSplitter
)

from unstructured.partition.pdf import partition_pdf
from unstructured.partition.docx import partition_docx
from unstructured.partition.html import partition_html

import os


## 1. Extract and Chunk PDF

In [None]:

# Partition the PDF into a list of Unstructured elements, each exposing `.text`.
pdf_elements = partition_pdf(filename="sample.pdf")

# Join the non-empty element texts with a blank line instead of a single space
# so element boundaries survive in the combined string.
# RecursiveCharacterTextSplitter tries "\n\n" before "\n" and " ", so it can
# now break chunks between elements rather than at arbitrary spaces.
pdf_text = "\n\n".join(el.text for el in pdf_elements if el.text)

# Chunks of at most 500 characters with 50 characters of overlap.
# NOTE: the DOCX and HTML cells below reuse this `text_splitter` instance.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
pdf_chunks = text_splitter.split_text(pdf_text)

# Report how many chunks were produced and preview the first two.
print(f"Total chunks from PDF: {len(pdf_chunks)}")
print(pdf_chunks[:2])


## 2. Extract and Chunk DOCX

In [None]:

# Partition the Word document into a list of Unstructured elements.
docx_elements = partition_docx(filename="sample.docx")

# Join the non-empty element texts with a blank line instead of a single space
# so element boundaries survive in the combined string; the recursive splitter
# prefers "\n\n" as a split point over a plain space.
docx_text = "\n\n".join(el.text for el in docx_elements if el.text)

# Reuses `text_splitter` created in the PDF cell, so that cell must run first.
docx_chunks = text_splitter.split_text(docx_text)

# Report how many chunks were produced and preview the first two.
print(f"Total chunks from DOCX: {len(docx_chunks)}")
print(docx_chunks[:2])


## 3. Extract and Chunk HTML

In [None]:

# Partition the HTML file into a list of Unstructured elements.
html_elements = partition_html(filename="sample.html")

# Join the non-empty element texts with a blank line instead of a single space
# so element boundaries survive in the combined string; the recursive splitter
# prefers "\n\n" as a split point over a plain space.
html_text = "\n\n".join(el.text for el in html_elements if el.text)

# Reuses `text_splitter` created in the PDF cell, so that cell must run first.
html_chunks = text_splitter.split_text(html_text)

# Report how many chunks were produced and preview the first two.
print(f"Total chunks from HTML: {len(html_chunks)}")
print(html_chunks[:2])
