# DOCUMENT SPLITTER

In [2]:
import os
import openai
import sys
sys.path.append('../.env')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [5]:
chunk_size =26
chunk_overlap = 4

In [6]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

In [7]:
# it won't split the text as data size is 26
text1 = 'abcdefghijklmnopqrstuvwxyz'

In [8]:
r_splitter.split_text(text1)

['abcdefghijklmnopqrstuvwxyz']

In [9]:
text2 = 'abcdefghijklmnopqrstuvwxyzabcdefg'

In [10]:
r_splitter.split_text(text2)

['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefg']

In [11]:
text3 = "a b c d e f g h i j k l m n o p q r s t u v w x y z"

In [12]:
r_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

In [13]:
c_splitter.split_text(text3)

['a b c d e f g h i j k l m n o p q r s t u v w x y z']

In [15]:
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separator = ' '
)
c_splitter.split_text(text3)

['abcdefghijklmnopqrstuvwxyzabcdefg']

# RECURSIVE SPLITTING DETAILS

In [16]:
some_text = """When writing documents, writers will use document structure to group content. \
This can convey to the reader, which idea's are related. For example, closely related ideas \
are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  \
Paragraphs are often delimited with a carriage return or two carriage returns. \
Carriage returns are the "backslash n" you see embedded in this string. \
Sentences have a period at the end, but also, have a space.\
and words are separated by space."""

In [17]:
len(some_text)

496

In [18]:
c_splitter = CharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0,
    separator = ' '
)
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0, 
    separators=["\n\n", "\n", " ", ""]
)

In [19]:
c_splitter.split_text(some_text)

['When writing documents, writers will use document structure to group content. This can convey to the reader, which idea\'s are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also,',
 'have a space.and words are separated by space.']

In [20]:
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.",
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [21]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", "\. ", " ", ""]
)
r_splitter.split_text(some_text)

  separators=["\n\n", "\n", "\. ", " ", ""]


["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example,",
 'closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this',
 'string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [22]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", "(?<=\. )", " ", ""]
)
r_splitter.split_text(some_text)

  separators=["\n\n", "\n", "(?<=\. )", " ", ""]


["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example,",
 'closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this',
 'string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [1]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("../docs/pdf/data.pdf")
pages = loader.load()

In [2]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len
)

In [3]:
docs = text_splitter.split_documents(pages)

In [4]:
len(docs)

13

In [5]:
len(pages)

2

In [8]:
print(pages[0].page_content)

Home	|	Education	For	All	
Skip	to	main	content	
Â	Help	DeskÂ		
Â	Screen	Reader	Â	051-111-112-468	
-A	A	+A	
Select	your	language
EnglishØ§Ø±Ø¯Ùˆ	
Admission	Open	For	Semester	Spring	2024	
Reschedule	Exam	(AD,	BS,	B.Ed)	
AJK	on	10	&	11	May,	2024	
MBA
Viva	Voce	Result	March,	2024	
MSc	Viva	Voce	ResultAdministrative	Sciences	February	19	to	20,	2024	
Convocation	Click	Here	For
Registration	
AIOU	SWIFT	CENTERS	
"	Click	for	Details"	
Certificate/Degree	Processing	Requirements	
Examinations	Department
Main	navigation	
Home	
About	
Vice	Chancellor	Message	
AIOU	at	a	Glance	
Organization	Overview	
Center	of	Excellence	
Seerat-un-
Nabi	(ï·º)	Chair	
Iqbal	Chair	on	Tasawwuf	and	Muslim	Thoughts	
Chair	for	Creating	Archives	of	Intelligentsia	of	Pakistan	
Project
Management	Unit	
Administration	
ADMINISTRATIVE	DEPARTMENTS	
Vice	Chancellor	Office	
Registrar	Office	
Treasurer	Office
Directorate	of	Regional	Services	
Directorate	of	Admissions	&	Mailing	
Directorate	of	Planning	&	Development	
Directorate	o

In [9]:
print(pages[1].page_content)

of	national	progress	and	development.	The	university	takes	same	special	initiatives	for	the	underprivileged	and	marginalized
segments	of	the	society	not	only	for	their	economic	empowerment	but	also	for	poverty	alleviation	in	the	country.	
Prof.	Dr	Shah
Mohyuddin	Hashmi	
Faculty	of	Arabic	And	Islamic	Studies	
"Dean	Message"	Islamic	Studies	is	very	important	and	base	of
Education	in	Islamic	Republic	of	Pakistan	and	no	University	can	be	completed	without	it.	This	is	why	from	the	start	of	AIOU,	The
Institute	of	Arabic	&	Islamic	Studies	was	established	in	1974.This	Institute	prepared	and	offered	many	courses	of	Islamic	Studies
and	Arabic	Language	from	Secondary	School	to	Graduation	level.	
Prof.	Dr.	Irshad	Ahmed	Arshad	
Faculty	of	Sciences	
"Dean
Message"	The	Faculty	of	Sciences	at	AIOU	was	first	established	in	1982	with	only	five	departments	and	limited	number	of
undergraduate	and	basic	functional	courses.	Over	last	three	to	four	years	five	new	departments	and	a	range	of	higher	degree
prog

In [28]:
print(docs[0].page_content[:500])

Home	|	Education	For	All	


# TOKKEN SPLITTING

In [15]:
from langchain.text_splitter import TokenTextSplitter

In [30]:
%pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.6.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2024.5.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m857.7 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Downloading tiktoken-0.6.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hDownloading regex-2024.5.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (789 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m789.3/789.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: regex, tiktoken
Successfully installed regex-2024.5.10 tiktoken-0.6.0
Note: yo

In [16]:
text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)

In [17]:
text1 = "foo bar bazzyfoo"
text2="ArithmeticError: foo bar b foo this is not a number and this is not a variable"

In [19]:
# text_splitter.split_text(text1)
text_splitter.split_text(text2)

['Ar',
 'ithmetic',
 'Error',
 ':',
 ' foo',
 ' bar',
 ' b',
 ' foo',
 ' this',
 ' is',
 ' not',
 ' a',
 ' number',
 ' and',
 ' this',
 ' is',
 ' not',
 ' a',
 ' variable']

In [20]:
text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)

In [21]:
text_splitter.split_text(text2)

['ArithmeticError: foo bar b foo this is',
 ' not a number and this is not a variable']

In [22]:
docs = text_splitter.split_documents(pages)

In [29]:
docs[0].page_content

'Home\t|\tEducation\tFor\tAll\t'

In [24]:
pages[0].metadata

{'source': '../docs/pdf/data.pdf', 'page': 0}

# Context aware splitting

In [11]:
from langchain.document_loaders import NotionDirectoryLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter

In [12]:
markdown_document = """# Title\n\n \
## Chapter 1\n\n \
Hi this is Jim\n\n Hi this is Joe\n\n \
### Section \n\n \
Hi this is Lance \n\n 
## Chapter 2\n\n \
Hi this is Molly"""

In [13]:
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

In [14]:
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)
md_header_splits = markdown_splitter.split_text(markdown_document)

In [15]:
md_header_splits[0]

Document(page_content='Hi this is Jim  \nHi this is Joe', metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1'})

In [16]:
md_header_splits[1]

Document(page_content='Hi this is Lance', metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1', 'Header 3': 'Section'})