## INSTALL LIBRARIES

In [2]:
!pip install -q \
    youtube-transcript-api \
    langchain \
    langchain-community \
    langchain-huggingface \
    faiss-cpu \
    tiktoken \
    python-dotenv \
    transformers \
    sentence-transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.1/485.1 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h

In [3]:

from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled


from langchain_text_splitters import RecursiveCharacterTextSplitter,Language
from langchain_core.prompts import PromptTemplate
from langchain_core.documents import Document


from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline,HuggingFaceEndpoint,ChatHuggingFace



from langchain_community.vectorstores import FAISS


import os
from dotenv import load_dotenv
load_dotenv()



False

### Step 1a - Indexing (Document Ingestion)

In [4]:
from urllib.parse import urlparse, parse_qs
from youtube_transcript_api import YouTubeTranscriptApi

# Source video; the video id is the value of the `v` query parameter.
url = "https://www.youtube.com/watch?v=MoqgmWV1fm8"
video_id = parse_qs(urlparse(url).query)["v"][0]

# Always (re)bind `transcript` before fetching: if captions are disabled the
# name would otherwise be undefined on a fresh kernel, or silently keep the
# text of a previously fetched video when this cell is re-run.
transcript = ""

try:
    # Only the network fetch can raise TranscriptsDisabled, so only it is guarded.
    transcript_list = YouTubeTranscriptApi().fetch(video_id, languages=['en'])
    transcript_dicts = transcript_list.to_raw_data()

    # Flatten the per-snippet dicts into one plain-text string.
    transcript = " ".join(chunk["text"] for chunk in transcript_dicts)
    print(transcript)
except TranscriptsDisabled:
    print("No captions available for this video")


today we will build an end-to-end llm project that covers a real life industry use case of equity research analysis we will build a news research tool where you can give bunch of news article URLs and then when you ask a question it will retrieve the answer based on those news articles in terms of Technology we have used Lang chain openai and streamlit to make this project more interesting we have added some fun storytelling as well so let's take a look at that story first what if Rocky by lived in the chat GPT era how would he invest all his money would he use chat GPT to find best investments no way he would hire someone for that Rocky boys recruitment team got Peter Pandey the equity research analyst Peter read lengthy stock market articles for his research but Rocky by did not like it Peter promised to create a chatbot like chat GPT for his investment Rocky by liked Peter's grid and he said fasten your seat belt so get ready folks we are going to create a chatbot for Rocky by perha

### Step 1b - Indexing (Text Splitting)

In [None]:
# Break the transcript into chunks of at most 1000 characters, with 200
# characters shared between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = splitter.create_documents(texts=[transcript])

In [None]:
# Number of chunks the splitter produced from the transcript.
len(chunks)

In [None]:
# Inspect the first chunk to sanity-check the split.
chunks[0]

### Step 1c & 1d - Indexing (Embedding Generation and Storing in Vector Store)

In [None]:
# Embedding model used to turn each chunk (and later each query) into a vector.
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# Embed every chunk with `embedding` and build a FAISS vector store from them.
vector_store = FAISS.from_documents(chunks,embedding)

In [None]:
# Show the store's index-position -> document-id mapping.
vector_store.index_to_docstore_id

In [None]:
# Document ids are generated when the store is built, so an id copied from an
# earlier session will not exist after a kernel restart. Look one up from the
# current store instead of hard-coding it.
first_doc_id = vector_store.index_to_docstore_id[0]
vector_store.get_by_ids([first_doc_id])

### Step 2 - Retrieval

In [None]:
# Similarity-search retriever over the vector store; k=4 results per query.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)

In [None]:
# Display the retriever's configuration.
retriever

In [None]:
# Try the retriever on a sample question and inspect the documents it returns.
retriever.invoke("is the topic of dangers of AI discussed in the video")

### Step 3 - Augmentation

In [None]:
# Hosted Llama 3.1 8B Instruct model, configured for text generation.
# NOTE(review): presumably needs a Hugging Face API token in the environment
# (e.g. loaded by load_dotenv) — confirm.
llm = HuggingFaceEndpoint(
    repo_id="meta-llama/Llama-3.1-8B-Instruct",
    task="text-generation",
    temperature=0.5,
)

# Chat wrapper around the endpoint; this is what the prompts are sent to.
chat_model = ChatHuggingFace(llm=llm)

In [None]:
# Grounded-answer prompt: the model is told to answer only from the supplied
# transcript context and to say it doesn't know when that context falls short.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""You are a helpful assistant.
      Answer ONLY from the provided transcript context.
      If the context is insufficient, just say you don't know.

      {context}
      Question: {question}
    """,
)

In [None]:
# The question text is sent to the retriever as-is, so it is kept typo-free
# ("discussed", previously misspelled).
question = "is the topic of dangers of AI discussed in the video"
retrieved_docs = retriever.invoke(question)

In [None]:
# Concatenate the retrieved chunks' text, separated by blank lines, to form the prompt context.
context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)

In [None]:
# Fill the prompt template's {context} and {question} placeholders.
final_prompt = prompt.invoke({"context":context_text , "question":question})

### Step 4 - Generation

In [None]:
# Send the filled-in prompt to the chat model.
answer = chat_model.invoke(final_prompt)

In [None]:
# Print only the text content of the model's reply.
print(answer.content)

## Building a Chain

In [None]:
from langchain_core.runnables import RunnableParallel,RunnablePassthrough,RunnableLambda
from langchain_core.output_parsers import StrOutputParser

In [None]:
def format_docs(retrieved_docs):
    """Merge the text of retrieved documents into a single context string.

    Args:
        retrieved_docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values, in input order, separated by a blank line.
    """
    page_texts = [doc.page_content for doc in retrieved_docs]
    return "\n\n".join(page_texts)

In [None]:
# Fan the incoming question out into the two prompt inputs:
#   context  -> retrieved chunks, merged into one string by format_docs
#   question -> the original question, passed through unchanged
parallel_chain = RunnableParallel(
    context=retriever | RunnableLambda(format_docs),
    question=RunnablePassthrough(),
)

In [None]:
# Check the retrieval half on its own: the result has a 'context' and a 'question' entry.
parallel_chain.invoke("Who is Denis")

In [None]:
# Output parser applied as the last step of the chain, after the chat model.
parser = StrOutputParser()

In [None]:
# Full RAG pipeline: retrieve + pass question -> fill prompt -> model -> parser.
main_chain = (
    parallel_chain
    | prompt
    | chat_model
    | parser
)

In [None]:
# Run the full chain end to end.
# NOTE(review): the retriever is configured with k=4, so the model sees only
# the 4 retrieved chunks; the "summary" covers those, not the whole transcript.
main_chain.invoke("Can you summarise the video ? ")