## Install all dependencies

In [2]:
import os
from langchain_groq import ChatGroq
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from dotenv import load_dotenv

load_dotenv()
os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
os.environ["HUGGINGFACEHUB_API_TOKEN"] = os.getenv("HUGGINGFACE_API_TOKEN")

  from .autonotebook import tqdm as notebook_tqdm


## Load youtube transcript

In [3]:
try:
    transcript_list = YouTubeTranscriptApi().fetch(video_id = "7xTGNNLPyMI", languages=['en'])
    
    transcript = " ".join(chunk.text for chunk in transcript_list)
    print(transcript)
    
except TranscriptsDisabled:
    print("No captions available for this video.")

hi everyone so I've wanted to make this video for a while it is a comprehensive but General audience introduction to large language models like Chachi PT and what I'm hoping to achieve in this video is to give you kind of mental models for thinking through what it is that this tool is it is obviously magical and amazing in some respects it's uh really good at some things not very good at other things and there's also a lot of sharp edges to be aware of so what is behind this text box you can put anything in there and press enter but uh what should we be putting there and what are these words generated back how does this work and what what are you talking to exactly so I'm hoping to get at all those topics in this video we're going to go through the entire pipeline of how this stuff is built but I'm going to keep everything uh sort of accessible to a general audience so let's take a look at first how you build something like chpt and along the way I'm going to talk about um you know som

In [4]:
transcript_list

FetchedTranscript(snippets=[FetchedTranscriptSnippet(text="hi everyone so I've wanted to make this", start=0.719, duration=4.681), FetchedTranscriptSnippet(text='video for a while it is a comprehensive', start=2.76, duration=5.32), FetchedTranscriptSnippet(text='but General audience introduction to', start=5.4, duration=5.8), FetchedTranscriptSnippet(text='large language models like Chachi PT and', start=8.08, duration=4.519), FetchedTranscriptSnippet(text="what I'm hoping to achieve in this video", start=11.2, duration=3.439), FetchedTranscriptSnippet(text='is to give you kind of mental models for', start=12.599, duration=4.641), FetchedTranscriptSnippet(text='thinking through what it is that this', start=14.639, duration=5.081), FetchedTranscriptSnippet(text='tool is it is obviously magical and', start=17.24, duration=5.199), FetchedTranscriptSnippet(text="amazing in some respects it's uh really", start=19.72, duration=4.2), FetchedTranscriptSnippet(text='good at some things not very

## Indexing & Chunking

In [5]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.create_documents([transcript])
len(chunks)

269

In [6]:
chunks[100]

Document(metadata={}, page_content="is of course hallucinations so you might be familiar with model hallucinations it's when llms make stuff up they just totally fabricate information Etc and it's a big problem with llm assistants it is a problem that existed to a large extent with early models uh from many years ago and I think the problem has gotten a bit better uh because there are some medications that I'm going to go into in a second for now let's just try to understand where these hallucinations come from so here's a specific example of a few uh of three conversations that you might think you have in your training set and um these are pretty reasonable conversations that you could imagine being in the training set so like for example who is Cruz well Tom Cruz is an famous actor American actor and producer Etc who is John baraso this turns out to be a us senetor for example who is genis Khan well genis Khan was blah blah blah and so this is what your conversations could look like 

## Embeddings & Vector store

In [7]:
Embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(chunks, Embeddings, collection_name="youtube-qna-chatbot")