In [24]:
from langchain.text_splitter import RecursiveCharacterTextSplitter,CharacterTextSplitter


In [25]:
# chunk_size: number of characters allowed in one chunk.
# chunk_overlap: number of trailing characters of a chunk that are repeated
# at the start of the next one.
chunk_size, chunk_overlap = 26, 4

In [26]:
# Both splitters are built from the same chunk_size / chunk_overlap settings.

# Recursive splitter: adapts better to the structure of the text.
r_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# Plain character splitter (author's note: preferable when paragraphs and
# sentences do not matter and only the chunk size should be fixed).
# No separator is passed here, so the library default applies.
c_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

In [27]:
# Exactly 26 characters, which fits in chunk_size, so the result is 1 chunk
text1 = 'abcdefghijklmnopqrstuvwxyz'
r_splitter.split_text(text1)

['abcdefghijklmnopqrstuvwxyz']

In [28]:
# 33 characters, 2 chunks
text2 = 'abcdefghijklmnopqrstuvwxyzabcdefg'
# The first chunk holds the first 26 characters; the second one starts with
# the last 4 of them (the overlap) and then continues from where the first stopped
r_splitter.split_text(text2)

['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefg']

In [29]:
# We get 3 chunks, because blank spaces count as characters, including inside the overlap
text3 = "a b c d e f g h i j k l m n o p q r s t u v w x y z"
r_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

In [30]:
# c_splitter was built without an explicit separator and the saved output is
# a single chunk, even though text3 is longer than chunk_size.
# NOTE(review): presumably the default separator (a blank line, per the
# LangChain docs) never occurs in text3, so nothing is split — confirm.
c_splitter.split_text(text3)

['a b c d e f g h i j k l m n o p q r s t u v w x y z']

In [31]:
# Rebuild the character splitter with the same size/overlap, this time
# passing a single space as the separator.
c_splitter = CharacterTextSplitter(separator=" ", chunk_size=chunk_size, chunk_overlap=chunk_overlap)

In [32]:
# Same text3, now with the space-separated c_splitter: the saved output shows 3 chunks
c_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

### RecursiveCharacterTextSplitter is recommended for generic text. 

In [34]:
# Sample text: two paragraphs separated by an embedded "\n\n"; the trailing
# backslashes are line continuations inside the literal, not line breaks.
# NOTE(review): the literal is demo data and is kept as-is. The last
# continuation has no space before it, which joins "space." and "and"
# (visible as "space.and" in the saved outputs).
some_text = """When writing documents, writers will use document structure to group content. \
This can convey to the reader, which idea's are related. For example, closely related ideas \
are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  \
Paragraphs are often delimited with a carriage return or two carriage returns. \
Carriage returns are the "backslash n" you see embedded in this string. \
Sentences have a period at the end, but also, have a space.\
and words are separated by space."""

# Total number of characters in the sample text (saved output: 496)
len(some_text)

496

In [35]:
# Larger chunks (450 characters) and no overlap for the paragraph example.

# Character splitter: a single space is its only separator.
c_splitter = CharacterTextSplitter(separator=" ", chunk_size=450, chunk_overlap=0)

# Recursive splitter: explicit separator list, from paragraph break and
# newline down to a space and finally the empty string.
r_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=450,
    chunk_overlap=0,
)

In [41]:
# Produces 2 chunks. In the saved output the first chunk stops mid-sentence
# ("... but also,"): this splitter only knows the space separator, so the
# paragraph break is not used as a boundary.
c_splitter.split_text(some_text)

['When writing documents, writers will use document structure to group content. This can convey to the reader, which idea\'s are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also,',
 'have a space.and words are separated by space.']

In [43]:
# Produces 2 chunks; in the saved output the break falls on the "\n\n" paragraph boundary
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.",
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [44]:
# Reduce the chunk size a little and add "period followed by a space" to the separators.

r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    # Raw string: "\." is not a valid Python string escape, so the non-raw
    # literal raises an invalid-escape warning. r"\. " has the same value.
    separators=["\n\n", "\n", r"\. ", " ", ""]
)
# We get 5 chunks, but the periods are in the wrong place: in the saved output
# each one opens the following chunk instead of closing its own sentence.
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related",
 '. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns',
 '. Carriage returns are the "backslash n" you see embedded in this string',
 '. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [45]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    # Look-behind: matches the empty position right after ". ", so the period
    # stays with the sentence it ends. Written as a raw string because "\."
    # is not a valid Python string escape; the runtime value is unchanged.
    # NOTE(review): this relies on the splitter treating separators as regular
    # expressions, as the saved output shows. Newer LangChain releases may
    # need is_separator_regex=True — confirm against the installed version.
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""]
)
# The periods are now in the right place (at the end of each chunk)
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related.",
 'For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns.',
 'Carriage returns are the "backslash n" you see embedded in this string.',
 'Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [64]:
from langchain.document_loaders import PyPDFLoader

# NOTE(review): absolute, machine-specific path — adjust it (or move it to a
# configurable data directory) before running this notebook elsewhere.
pdf_path = r'C:\AI\Cursos\LangChain_Chat_Data\data\Containers_com_Docker.pdf'

# Build the loader for that PDF, then read it into `pages`. The saved outputs
# show Document objects carrying 'source' and 'page' metadata.
loader = PyPDFLoader(pdf_path)
pages = loader.load()

In [65]:
# Newline-separated splitter targeting 1000-character chunks with a
# 150-character overlap; length is measured with the built-in len.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150,
    separator="\n",
    length_function=len,
)

# Apply the splitter to the loaded PDF pages.
docs = text_splitter.split_documents(pages)

In [68]:
# Author's note: the count is larger here because the split was done in blocks of 1000
# NOTE(review): the saved output under this cell is a list of Document objects,
# not an integer, so it appears to predate this len() call — re-run to refresh it.
len(docs)

[Document(page_content='Containers com  \nDocker  \nDo desenvolvimento à produção', metadata={'source': 'C:\\AI\\Cursos\\LangChain_Chat_Data\\data\\Containers_com_Docker.pdf', 'page': 0}), Document(page_content='© Casa do Código  \nT odos os direitos reservados e protegidos pela Lei nº9.610, de 10/02/1998.  \nNenhuma parte deste livro poderá ser reproduzida, nem transmitida, sem \nautorização prévia por escrito da editora, sejam quais forem os meios: \nfotográficos, eletrônicos, mecânicos, gravação ou quaisquer outros.  \n \n \nCasa do Código  \nLivros para o programador  \nRua Vergueiro, 3185 - 8º andar  \n04101-300 – Vila Mariana – São Paulo – SP – Brasil', metadata={'source': 'C:\\AI\\Cursos\\LangChain_Chat_Data\\data\\Containers_com_Docker.pdf', 'page': 1}), Document(page_content='Casa do Código \n \n  \n \n  \n \n \n \nAgradecimentos \n \n  \n \nDedico esta obra à minha esposa Mychelle. Obrigado por compreender \na minha ausência quando necessário, e pelo apoio em todos os momento

In [69]:
# Size of the original file (number of entries in `pages`)
# NOTE(review): the saved output under this cell is a list of Document objects,
# not an integer, so it appears to predate this len() call — re-run to refresh it.
len(pages)

[Document(page_content=' \nContainers com  \nDocker  \nDo desenvolvimento à produção  ', metadata={'source': 'C:\\AI\\Cursos\\LangChain_Chat_Data\\data\\Containers_com_Docker.pdf', 'page': 0}), Document(page_content='  \n© Casa do Código  \nT odos os direitos reservados e protegidos pela Lei nº9.610, de 10/02/1998.  \nNenhuma parte deste livro poderá ser reproduzida, nem transmitida, sem \nautorização prévia por escrito da editora, sejam quais forem os meios: \nfotográficos, eletrônicos, mecânicos, gravação ou quaisquer outros.  \n \n \nCasa do Código  \nLivros para o programador  \nRua Vergueiro, 3185 - 8º andar  \n04101-300 – Vila Mariana – São Paulo – SP – Brasil  ', metadata={'source': 'C:\\AI\\Cursos\\LangChain_Chat_Data\\data\\Containers_com_Docker.pdf', 'page': 1}), Document(page_content=' Casa do Código \n \n  \n \n  \n \n \n \nAgradecimentos \n \n  \n \nDedico esta obra à minha esposa Mychelle. Obrigado por compreender \na minha ausência quando necessário, e pelo apoio em todo

In [70]:
from langchain.text_splitter import TokenTextSplitter

In [71]:
# Token-based splitter. Author's note: it converts the text into tokens, and
# the tokenizer identifies meaningful units of language such as words,
# punctuation and symbols.
# chunk_size=1 with no overlap; this rebinds text_splitter.
text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)

# Short sample string for the token demo (rebinds text1 from the earlier alphabet example).
text1 = "foo bar bazzyfoo"

In [72]:
# Split the sample string with the chunk_size=1 token splitter; the saved
# output shows sub-word pieces such as ' b', 'az' and 'zy'.
text_splitter.split_text(text1)

['foo', ' bar', ' b', 'az', 'zy', 'foo']

In [73]:
# Token splitter with chunk_size=10 and no overlap (rebinds text_splitter).
text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)

# Re-split the loaded PDF pages; this rebinds docs from the earlier character-based split.
docs = text_splitter.split_documents(pages)

In [74]:
# Display the token-split documents.
# NOTE(review): this renders the entire list; consider showing only a slice
# (e.g. docs[:5]) to keep the saved output short.
docs

[Document(page_content=' \nContainers com  \nDocker', metadata={'source': 'C:\\AI\\Cursos\\LangChain_Chat_Data\\data\\Containers_com_Docker.pdf', 'page': 0}),
 Document(page_content='  \nDo desenvolvimento', metadata={'source': 'C:\\AI\\Cursos\\LangChain_Chat_Data\\data\\Containers_com_Docker.pdf', 'page': 0}),
 Document(page_content=' à produção  ', metadata={'source': 'C:\\AI\\Cursos\\LangChain_Chat_Data\\data\\Containers_com_Docker.pdf', 'page': 0}),
 Document(page_content='  \n© Casa do Códig', metadata={'source': 'C:\\AI\\Cursos\\LangChain_Chat_Data\\data\\Containers_com_Docker.pdf', 'page': 1}),
 Document(page_content='o  \nT odos os direit', metadata={'source': 'C:\\AI\\Cursos\\LangChain_Chat_Data\\data\\Containers_com_Docker.pdf', 'page': 1}),
 Document(page_content='os reservados e protegidos pela', metadata={'source': 'C:\\AI\\Cursos\\LangChain_Chat_Data\\data\\Containers_com_Docker.pdf', 'page': 1}),
 Document(page_content=' Lei nº9.610, de 10/', metadata={'source': 'C:\\AI\