In [1]:
%pip install youtube-transcript-api

Collecting youtube-transcript-api
  Downloading youtube_transcript_api-0.6.2-py3-none-any.whl.metadata (15 kB)
Downloading youtube_transcript_api-0.6.2-py3-none-any.whl (24 kB)
Installing collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-0.6.2
Note: you may need to restart the kernel to use updated packages.


In [3]:
%load_ext dotenv
%dotenv
import os
cohere_api_key = os.getenv('COHERE_API_KEY')

In [4]:
from langchain_cohere import ChatCohere
chat = ChatCohere(cohere_api_key=cohere_api_key)

In [None]:
from langchain_community.document_loaders import YoutubeLoader

In [9]:
loader = YoutubeLoader.from_youtube_url(
    "https://www.youtube.com/watch?v=6Fa91SY9Gnw", add_video_info=False
)

In [27]:
transcript = loader.load()

In [28]:
transcript = transcript[0].page_content

In [29]:
transcript_stripped = transcript.replace(u'\xa0',u'').replace(u'\uf0a7',u'')

In [33]:
len(transcript_stripped)

5853

In [34]:
from langchain_text_splitters.character import CharacterTextSplitter
# to avoid ending them abruptly use . and chunk overlap
char_splitter = CharacterTextSplitter(separator=".",chunk_size=500,chunk_overlap=0)

In [35]:
transcripts_split = char_splitter.split_text(transcript_stripped)

Created a chunk of size 580, which is longer than the specified 500


In [37]:
len(transcripts_split)

15

In [38]:
from langchain_cohere import CohereEmbeddings

embeddings = CohereEmbeddings(
    cohere_api_key=cohere_api_key,
    model="embed-english-v3.0",
)

In [41]:
from langchain_community.vectorstores import Chroma
vectorstore = Chroma.from_texts(texts = transcripts_split,
                                    embedding=embeddings,
                                    persist_directory='./python-projects')

In [42]:
retriever = vectorstore.as_retriever(search_type='mmr',search_kwargs={'k':3,'lambda_mult':0.3})

In [43]:
retriever.invoke("Could you tell me all essential beginner python projects according to the video?")

Number of requested results 20 is greater than number of elements in index 15, updating n_results = 15


[Document(metadata={}, page_content='But if you already have some basic Python experience and\nwant to take it to the next level, in this video you’ll discover 4 interesting Python projects\nfor beginners – all including code for practice! Great! So, let’s start with project\nnumber 1 - Uber Trips Analysis Since it was founded in 2009, Uber has become one\nof the most famous unicorn companies, offering its services to more than 80 countries worldwide'),
 Document(metadata={}, page_content='If you are\naware of the risk to catch a certain disease, you will have the time to think and prepare,\nwhich can save a lot of suffering and money. In this project, you’ll carefully analyze\na dataset of 195 records to predict the likelihood of having Parkinson’s\ndisease using an XGBBoost model'),
 Document(metadata={}, page_content='This\nunreasonable behavior caused many deaths, especially in hospitals where most doctors—unlike\ntoday—never washed their hands before surgeries. In the mid-1800s, t

In [44]:
TEMPLATE = '''
You are a helpful chatbot that answers questions on Youtube videos.
Answer the questions using only the following context:
{context}
'''
TEMPLATE_Q = '''{question}'''

In [45]:
from langchain_core.prompts import PromptTemplate, SystemMessagePromptTemplate,HumanMessagePromptTemplate,ChatPromptTemplate
message_template_1 = SystemMessagePromptTemplate.from_template(template = TEMPLATE)
message_template_2 = HumanMessagePromptTemplate.from_template(template=TEMPLATE_Q)
chat_template = ChatPromptTemplate.from_messages([message_template_1,message_template_2])

In [46]:
chat_template

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\nYou are a helpful chatbot that answers questions on Youtube videos.\nAnswer the questions using only the following context:\n{context}\n'), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], input_types={}, partial_variables={}, template='{question}'), additional_kwargs={})])

In [None]:
from langchain_core.output_parsers import StrOutputParser
parser = StrOutputParser()

In [None]:
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

In [59]:
chain = ({'context':retriever,
         'question':RunnablePassthrough()}
         | chat_template
         | chat
         | parser
)

In [61]:
chain.invoke("Could you tell me all essential beginner python projects according to the video")

Number of requested results 20 is greater than number of elements in index 15, updating n_results = 15


"According to the video, these are the 4 essential beginner Python projects:\n\n1. Uber Trips Analysis: This project involves analyzing Uber trip data from its founding in 2009 to the present, offering insights into the company's operations across 80+ countries.\n2. Parkinson's Disease Prediction: The project focuses on predicting the likelihood of having Parkinson's disease using an XGBoost model and a dataset of 195 records. It aims to raise awareness and provide time for preparation and treatment.\n3. Ignaz Semmelweis and Handwashing: The project explores the mid-1800s story of Dr. Ignaz Semmelweis, who investigated the high death rates in hospitals, especially during childbirth, and linked it to doctors not washing their hands, leading to a significant change in medical practices.\n4. Project Number 4 is not mentioned in the provided text."