In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
# import kagglehub
# youssef19_how_to_build_a_career_in_ai_pdf_path = kagglehub.dataset_download('youssef19/how-to-build-a-career-in-ai-pdf')
youssef19_how_to_build_a_career_in_ai_pdf_path = "./data/youssef19/how-to-build-a-career-in-ai-pdf"

print('Data source import complete.')


In [None]:
!pip install langchain langchain-openai langchain-community pypdf yt_dlp pydub unstructured docx2txt python-dotenv xlrd openai transformers huggingface-hub


Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting yt_dlp
  Downloading yt_dlp-2025.10.22-py3-none-any.whl.metadata (176 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.0/176.0 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting unstructured
  Downloading unstructured-0.18.15-py3-none-any.whl.metadata (24 kB)
Collecting docx2txt
  Downloading docx2txt-0.9-py3-none-any.whl.metadata (529 bytes)
INFO: pip is looking at multiple versions of langchain-community to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-community
  Downloading langchain_community-0.4-py3-none-any.whl.metadata (3.0 kB)
  Downloading langchain_community-0.3.31-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-core<1.0.0,>=0.3.72 (from langchain)
  Downloading langchain_core-0.3.79-py3-none-any.whl.metadata (3.2 kB)
Collecting python-magic (from unstructured)
 

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableSequence,RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
import os
from dotenv import load_dotenv
import os
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI(
    api_key=openai_api_key,
)
model="gpt-3.5-turbo"


# recursive character text splitter & the character text splitter


## Character Text Splitter
- Splits by number of characters only.
- Does not preserve meaning (may cut sentences).
- Best for short or simple texts.
- Simple and straightforward.

## Recursive Character Text Splitter
- Splits hierarchically using natural separators first.
- Preserves meaning as much as possible.
- Ideal for long or complex texts.
- More advanced and intelligent.

## Key Difference
- Recursive = smarter, preserves meaning.
- Character = simple, may break text.


In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter , CharacterTextSplitter

r_splitter =RecursiveCharacterTextSplitter (chunk_size=25, chunk_overlap=4)

c_splitter =CharacterTextSplitter (chunk_size=25, chunk_overlap=4)

In [None]:
text1 = 'abcdefghijklmnopqrstuvwxyz'
r_splitter.split_text(text1)


['abcdefghijklmnopqrstuvwxy', 'vwxyz']

In [None]:
text2 = 'abcdefghijklmnopqrstuvwxyzabcdefg'
r_splitter.split_text(text2)

['abcdefghijklmnopqrstuvwxy', 'vwxyzabcdefg']

Here we can see that two different chunks are created. The first one ends at Z, so that’s 26 characters. The next one we can see starts with W, X, Y, Z. Those are the four chunk overlaps, And then it continues with the rest of the string

In [None]:
text3 = "a b c d e f g h i j k l m n o p q r s t u v w x y z"
r_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w', 'v w x y z']

In [None]:


chunk_size =26
chunk_overlap = 4

c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separator = ' '
)
c_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

# Diving deep in recursive splitting

In [None]:
text = """As the world grapples with the challenges of climate change, \
renewable energy emerges as a beacon of hope. Solar and wind power, \
in particular, are transforming the energy landscape, offering sustainable \
alternatives to traditional fossil fuels. \n\n  \
Governments and businesses globally are investing in clean energy \
initiatives to reduce carbon footprints and mitigate environmental impact. \
The shift towards renewables not only addresses environmental concerns \
but also fosters innovation, creating a brighter and more sustainable \
future for generations to come."""


len(text)


563

In [None]:
c_splitter = CharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0,
    separator = ' '
)

# Recursive Character Text Splitter

r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0,
    separators=["\n\n", "\n", " ", ""]
)

In [None]:
c_splitter.split_text(text)


['As the world grapples with the challenges of climate change, renewable energy emerges as a beacon of hope. Solar and wind power, in particular, are transforming the energy landscape, offering sustainable alternatives to traditional fossil fuels. \n\n Governments and businesses globally are investing in clean energy initiatives to reduce carbon footprints and mitigate environmental impact. The shift towards renewables not only addresses',
 'environmental concerns but also fosters innovation, creating a brighter and more sustainable future for generations to come.']

We can see that the **Character Text Splitter** splits on the spaces. So, we end up with this weird separation in the middle of the sentence.

The **Recursive Character Text Splitter** first tries to split on double newlines, and so here it splits it up into two paragraphs. Even though the first one is shorter than the 450 characters, we specified, this is probably a better split because now the two paragraphs, each being their own paragraph, are in chunks as opposed to being split in the middle of a sentence.


In [None]:
c_splitter.split_text(text)


['As the world grapples with the challenges of climate change, renewable energy emerges as a beacon of hope. Solar and wind power, in particular, are transforming the energy landscape, offering sustainable alternatives to traditional fossil fuels. \n\n Governments and businesses globally are investing in clean energy initiatives to reduce carbon footprints and mitigate environmental impact. The shift towards renewables not only addresses',
 'environmental concerns but also fosters innovation, creating a brighter and more sustainable future for generations to come.']

In [None]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", "\. ", " ", ""]
)
r_splitter.split_text(text)

['As the world grapples with the challenges of climate change, renewable energy emerges as a beacon of hope. Solar and wind power, in particular, are',
 'transforming the energy landscape, offering sustainable alternatives to traditional fossil fuels.',
 'Governments and businesses globally are investing in clean energy initiatives to reduce carbon footprints and mitigate environmental impact. The',
 'shift towards renewables not only addresses environmental concerns but also fosters innovation, creating a brighter and more sustainable future for',
 'generations to come.']

We can see that it’s split into sentences, but the **periods are actually in the wrong places**. This is because of the **regex** that’s running behind the scenes.

To fix this, we can specify a slightly more **complicated regex with a look behind**. Now, if we run this, we can see that it’s split into sentences properly, with the **periods being in the right places**.


In [None]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", "(?<=\. )", " ", ""]
)
r_splitter.split_text(text)

['As the world grapples with the challenges of climate change, renewable energy emerges as a beacon of hope. Solar and wind power, in particular, are',
 'transforming the energy landscape, offering sustainable alternatives to traditional fossil fuels.',
 'Governments and businesses globally are investing in clean energy initiatives to reduce carbon footprints and mitigate environmental impact. The',
 'shift towards renewables not only addresses environmental concerns but also fosters innovation, creating a brighter and more sustainable future for',
 'generations to come.']

# Separator `(?<=\. )` Explained

- `(?<=\. )` is a **regex lookbehind** used in text splitting.  
- It means: **split after every period followed by a space**.  
- This helps the splitter to **correctly divide sentences** without breaking abbreviations or numbers.  
- Example:


In [None]:
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader(f'{youssef19_how_to_build_a_career_in_ai_pdf_path}/eBook-How-to-Build-a-Career-in-AI.pdf')
pages = loader.load()

print(f"Number of pages loaded: {len(pages)}")
print(pages[0].page_content)

Number of pages loaded: 41
PAGE 1
Founder, DeepLearning.AI
Collected Insights
from Andrew Ng
How to 
Build
Your
Career
in AI
A Simple Guide


In [None]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len
)

In [None]:
docs=r_splitter.split_documents(pages)

In [None]:
print (len(docs))

477


# Token-Based Splitting

In the previous sections, we have done all the splitting based on **characters**.  
But there’s another way to split your text — based on **tokens**.

Since **LLMs (Large Language Models)** often have **context windows** that are defined by the **token count**,  
it’s important to know:
- what the tokens are, and  
- where they appear in the text.

By splitting text based on tokens, we can get a more **representative view** of how the LLM actually perceives the text.

---

### Comparing Token vs Character Splitters

To better understand the difference between **token** and **character** splitters,  
we can apply both to a piece of text and compare the results.

---




In [None]:
from langchain_text_splitters import TokenTextSplitter

text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)



In [None]:
text1 = "foo bar bazzyfoo"
text_splitter.split_text(text1)

['foo', ' bar', ' b', 'az', 'zy', 'foo']

In [None]:
docs = text_splitter.split_documents(pages)
docs[0].page_content

'PAGE 1\nFounder, DeepLearning.'

#  Context-aware splitting

In [None]:
from langchain.text_splitter import MarkdownHeaderTextSplitter


In [None]:
markdown_document = """# Title\n\n \
## Chapter 1\n\n \
Hi this is Chapter 1\n\n Hi this is Section 1\n\n \
### Section \n\n \
Hi this is Section 2 \n\n
## Chapter 2\n\n \
Hi this is Chapter 2"""

### Splitting by Markdown Headers

Next, we will define a **list of headers** that we want to split on — along with the **names of those headers**.

- A single `#` → corresponds to **Header 1**  
- Two `##` → correspond to **Header 2**  
- Three `###` → correspond to **Header 3**

We can then initialize the **Markdown Header Text Splitter** using those headers,  
and apply it to split the example text we have above.

This allows the text splitter to separate content based on the document’s structure (like sections and subsections) —  
making it easier to organize and retrieve information later.


In [None]:
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

In [None]:
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)
md_header_splits = markdown_splitter.split_text(markdown_document)

In [None]:
md_header_splits[0]


Document(metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1'}, page_content='Hi this is Chapter 1  \nHi this is Section 1')

In [None]:
md_header_splits[1]


Document(metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1', 'Header 3': 'Section'}, page_content='Hi this is Section 2')

In [None]:
md_header_splits[2]


Document(metadata={'Header 1': 'Title', 'Header 2': 'Chapter 2'}, page_content='Hi this is Chapter 2')