In [1]:
from dotenv import load_dotenv

from langchain_community.document_loaders import YoutubeLoader, YoutubeAudioLoader
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import (RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter,
                                      CharacterTextSplitter,TokenTextSplitter)
from langchain_experimental.text_splitter import SemanticChunker
from langchain_google_genai import GoogleGenerativeAIEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()

True

In [27]:
video_url = 'https://youtu.be/75uBcITe0gU?si=Ggfa8c8uMb6P-dfR'

loader = YoutubeLoader.from_youtube_url(video_url, add_video_info=True)
text = loader.load()
text



In [28]:
text[0].metadata

{'source': '75uBcITe0gU',
 'title': '6 Langchain Document Loaders to Master (Beginner Friendly)',
 'description': 'Unknown',
 'view_count': 3294,
 'thumbnail_url': 'https://i.ytimg.com/vi/75uBcITe0gU/hq720.jpg',
 'publish_date': '2024-02-15 00:00:00',
 'length': 1280,
 'author': 'Ryan & Matt Data Science'}

### RecursiveCharacterTextSplitter

In [31]:
recursive_text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks_recursive = recursive_text_splitter.split_documents(text)

In [32]:
for chunk in chunks_recursive[:3]:
    print('Page Content : \n', chunk.page_content)

Page Content : 
 a document is a piece of text and Associated metadata for example a text file CSV YouTube video web page file directory or even a PDF and see we can use something called a document loader to upload these documents to utilize them within our large language models and why I gave all those examples I'm going to be showing you guys how to load each of these specific types of documents in this video with the help of some basic python code if you're brand new here to this YouTube channel I am building
Page Content : 
 the help of some basic python code if you're brand new here to this YouTube channel I am building out a Series this year based around Lang chain large language models and open AI API now you don't need a ton of background information to understand all the code that is going on this video but if you want to go over some of the basics I have the playlist linked on my Channel with that being said I'm going to be coding all this in Google collab feel free to use wh

### CharacterTextSplitter

In [35]:
character_text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=20)
chunks_character = character_text_splitter.split_documents(text)

for chunk in chunks_character[:2]:
    print('page content : \n', chunk)

print('\n length :', len(chunks_character[0].page_content)) # this happens because there was no next paragraph there

page content : 

 length : 16749


### SentenceTransformersTokenTextSplitter

In [40]:
sentence_character_splitter = SentenceTransformersTokenTextSplitter(chunk_size=200, chunk_overlap=50)
chunks_sentence = sentence_character_splitter.split_documents(text)

for chunk in chunks_sentence[:3]:
    print('page content: \n', chunk.page_content) # same here is that it tries to find the sentence ending but it didn't do that much better 
    print('\n length :', len(chunk.page_content))


page content: 
 a document is a piece of text and associated metadata for example a text file csv youtube video web page file directory or even a pdf and see we can use something called a document loader to upload these documents to utilize them within our large language models and why i gave all those examples i'm going to be showing you guys how to load each of these specific types of documents in this video with the help of some basic python code if you're brand new here to this youtube channel i am building out a series this year based around lang chain large language models and open ai api now you don't need a ton of background information to understand all the code that is going on this video but if you want to go over some of the basics i have the playlist linked on my channel with that being said i'm going to be coding all this in google collab feel free to use whatever code editor that you would like and let's start coding all right so my google collab notebook let's get going

### TokenTextSplitter

In [45]:
token_splitter = TokenTextSplitter(chunk_size=50, chunk_overlap=10)
chunk_token= token_splitter.split_documents(text)

for chunk in chunk_token[:3]:
    print('page content: \n', chunk.page_content) 
    print(f'length : {len(chunk.page_content)}\n')

page content: 
 a document is a piece of text and Associated metadata for example a text file CSV YouTube video web page file directory or even a PDF and see we can use something called a document loader to upload these documents to utilize them within our large language models and why
length : 270

page content: 
  to utilize them within our large language models and why I gave all those examples I'm going to be showing you guys how to load each of these specific types of documents in this video with the help of some basic python code if you're brand new here
length : 249

page content: 
  some basic python code if you're brand new here to this YouTube channel I am building out a Series this year based around Lang chain large language models and open AI API now you don't need a ton of background information to understand all the code that is
length : 257



### SemanticChunker

In [4]:
file_path = r'C:\Users\HP\Desktop\langchain-crash-course\Practicing\Books\The Art of Being ALONE Solitude Is My HOME, Loneliness Was My Cage.pdf'
pdf_loader = PyMuPDFLoader(file_path=file_path)
pdf_text = pdf_loader.load()

In [5]:
sementic_splitter = SemanticChunker(GoogleGenerativeAIEmbeddings(model="models/text-embedding-004"), 
                                    breakpoint_threshold_type = 'percentile' )
sementic_chunk = sementic_splitter.split_documents(pdf_text) # split the sentence on the base of .?!

In [6]:
sementic_chunk[0].metadata

{'source': 'C:\\Users\\HP\\Desktop\\langchain-crash-course\\Practicing\\Books\\The Art of Being ALONE Solitude Is My HOME, Loneliness Was My Cage.pdf',
 'file_path': 'C:\\Users\\HP\\Desktop\\langchain-crash-course\\Practicing\\Books\\The Art of Being ALONE Solitude Is My HOME, Loneliness Was My Cage.pdf',
 'page': 0,
 'total_pages': 99,
 'format': 'PDF 1.4',
 'title': 'The Art of Being ALONE: Solitude Is My HOME, Loneliness Was My Cage',
 'author': 'Renuka Gavrani',
 'subject': '',
 'keywords': '',
 'creator': 'calibre 6.28.1',
 'producer': 'calibre 6.28.1',
 'creationDate': "D:20231115105259+00'00'",
 'modDate': "D:20231115105259+00'00'",
 'trapped': ''}

In [8]:
len(sementic_chunk)

225

In [10]:
print('page content: \n', sementic_chunk[100]) 
print(f'length : {len(sementic_chunk[100].page_content)}\n')

page content: 
 page_content='the crowds. While Stieglitz, her husband, was a social
butterfly, O’Keeffe preferred being alone. She always said
‘There is a kind of freedom in being alone.’
Finally, in 1929, after much deliberation, she left for New
Mexico for the very first time. This became the first of her
many extended trips in the desert, where she spent months
wandering alone, living in tents, with nothing but her art
supplies for the company. In 1934 O’Keeffe bought a piece of land on a ghost ranch
and permanently moved to the desert. Here she spent four
years restoring the ranch and setting up her solitary abode. Her minimalist house with a view of the Cerro Paranal
mountains became her own little haven, where she lived up
to almost a hundred years ‘deliciously alone’, in blissful
solitude, making art till her body allowed her. Her art and the inspiration behind it were simple as she
used to say, "I had to create an equivalent for what I
felt about what I was looking at – not co

## Using Qdrant vector store

**to do the chunking**