https://learn.deeplearning.ai/langchain-chat-with-your-data/lesson/2/document-loading

In [1]:
#pip install langchain
# pip install python-dotenv
import os 
import openai 
import sys 
sys.path.append("../..")

In [3]:

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local env.file 

openai.api_key = os.environ["OPENAI_API_KEY"]

Doc splitting

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

# to avoid that one sentence may be splitted appart, we try
# to get semantic meaninng like this: chunkk overlap so that we are sure
# that we get the rigth information

chunk_size =26
chunk_overlap = 4 

In [5]:
#initizalize two different splitters to see what they do

r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

In [6]:
text1 = "abcdefghijklmnopqrstuvwx"
r_splitter.split_text(text1) # ['abcdefghijklmnopqrstuvwx']

# since in text2 there are more characters, r_splitter is going to separate it and give the whole text first, 
# and later the second chunk

text2 = text2 = 'abcdefghijklmnopqrstuvwxyzabcdefg'
r_splitter.split_text(text2) # ['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefg'] 

['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefg']

In [7]:
text3 = "a b c d e f g h i j k l m n o p q r s t u v w x y z"
r_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

In [8]:
c_splitter.split_text(text3)

['a b c d e f g h i j k l m n o p q r s t u v w x y z']

In [9]:
# since it works for each line, you need to add more spaces so that it interpretes
# that there are different characters

c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separator = ' '
)
c_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

In [10]:
# I add some spaces between a and b and it does not work! 
# We need to add the SEPARATOR parameter! as before
text3_otro = "a            b  c  d  e            f g h i  j k l m n o p q   r  s    t u v w x y z"
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
c_splitter.split_text(text3)

['a b c d e f g h i j k l m n o p q r s t u v w x y z']

RECURSIVE SPLITTING DETAILS


In [11]:
# RecursiveCharacterTextSplitter recommended for general texts
some_text = """When writing documents, writers will use document structure to group content. \
This can convey to the reader, which idea's are related. For example, closely related ideas \
are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  \
Paragraphs are often delimited with a carriage return or two carriage returns. \
Carriage returns are the "backslash n" you see embedded in this string. \
Sentences have a period at the end, but also, have a space.\
and words are separated by space."""
len(some_text)

496

In [12]:
c_splitter = CharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0,
    separator = ' '
)
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0, 
    separators=["\n\n", "\n", " ", ""] # list of separators, as default
)

c_splitter.split_text(some_text)

['When writing documents, writers will use document structure to group content. This can convey to the reader, which idea\'s are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also,',
 'have a space.and words are separated by space.']

In [13]:
r_splitter.split_text(some_text)

# there are some problems bc regex is not the best, so we change the regex 



r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", "(?<=\. )", " ", ""]
)
r_splitter.split_text(some_text)




["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related.",
 'For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns.',
 'Carriage returns are the "backslash n" you see embedded in this string.',
 'Sentences have a period at the end, but also, have a space.and words are separated by space.']

NOw, with pdf

In [14]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("MachineLearning-Lecture01.pdf")
pages = loader.load()

In [15]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len
)

docs = text_splitter.split_documents(pages)

print("len of docs" , len(docs))
print("original pages", len(pages))

len of docs 77
original pages 22


Token splitting

In [18]:
from langchain.text_splitter import TokenTextSplitter

#chunk size = 1 to get tokens
text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)
text1 = "foo bar bazzyfoo"
text_splitter.split_text(text1)


['foo', ' bar', ' b', 'az', 'zy', 'foo']

In [21]:
text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)
docs = text_splitter.split_documents(pages)
docs[0]



Document(page_content='MachineLearning-Lecture01  \n', metadata={'source': 'MachineLearning-Lecture01.pdf', 'page': 0})

In [22]:
pages[0].metadata

{'source': 'MachineLearning-Lecture01.pdf', 'page': 0}

CONTEXT AWARE SPLITTING

In [23]:
from langchain.document_loaders import NotionDirectoryLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter

In [24]:
markdown_document = """# Title\n\n \
## Chapter 1\n\n \
Hi this is Jim\n\n Hi this is Joe\n\n \
### Section \n\n \
Hi this is Lance \n\n 
## Chapter 2\n\n \
Hi this is Molly"""


headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

In [25]:
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)
md_header_splits = markdown_splitter.split_text(markdown_document)

In [26]:
md_header_splits[0]

Document(page_content='Hi this is Jim  \nHi this is Joe', metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1'})