In [1]:
print("data science RAG")

data science RAG


In [2]:
!pip install openai
!pip install langchain
!pip install langchain-openai
!pip install langchain_pinecone
!pip install langchain[docarray]
!pip install pydantic==1.10.8
!pip install tiktoken
!pip install pinecone-client
!pip install pytube
!pip install scikit-learn
!pip install ruff



In [3]:
!pip install git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-_5p5pwvh
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-_5p5pwvh
  Resolved https://github.com/openai/whisper.git to commit ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [4]:
!pip install langchain-groq



In [5]:
import os
from google.colab import userdata
os.environ['GROQ_API_KEY'] = userdata.get('GROQ_API_KEY')

In [6]:
youtube_video = "https://www.youtube.com/watch?v=cdiD-9MMpb0"

In [60]:
from langchain_groq import ChatGroq

model = ChatGroq(model = "llama-3.1-70b-versatile")

In [8]:
model.invoke("Who is the Prime Minister of India?")

AIMessage(content='As of my last update in 2021, the Prime Minister of India is Narendra Modi. He has been serving in this position since May 26, 2014.', response_metadata={'token_usage': {'completion_tokens': 36, 'prompt_tokens': 18, 'total_tokens': 54, 'completion_time': 0.144, 'prompt_time': 0.005591834, 'queue_time': None, 'total_time': 0.14959183399999998}, 'model_name': 'llama-3.1-70b-versatile', 'system_fingerprint': 'fp_5c5d1b5cfb', 'finish_reason': 'stop', 'logprobs': None}, id='run-8bef3389-20c3-4ced-a878-4e600cd41a36-0', usage_metadata={'input_tokens': 18, 'output_tokens': 36, 'total_tokens': 54})

In [9]:
from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()

chain = model | parser

chain.invoke("Who is the Prime Minister of India?")

'As of my last update in 2023, the Prime Minister of India is Narendra Modi. He has been serving as the Prime Minister since May 26, 2014.'

In [10]:
from langchain.prompts import ChatPromptTemplate

template = """
Answer the question based on the context below. If you can't answer the question, reply I don't know.

Context: {context}

Question: {question}

"""

prompt = ChatPromptTemplate.from_template(template)

prompt.format(context = "Marry's sister is susana", question="Who is Marry's sister?")

"Human: \nAnswer the question based on the context below. If you can't answer the question, reply I don't know.\n\nContext: Marry's sister is susana\n\nQuestion: Who is Marry's sister?\n\n"

In [11]:
chain = prompt | model | parser

chain.invoke({"question":"Who is Marry's sister?", "context": "Marry's sister is susana"}) # Pass a dictionary to the invoke method

'Susana.'

## Combining Chains


In [12]:
translation_prompt  = ChatPromptTemplate.from_template(
    "Translate {answer} to {language}"
)

In [13]:
from operator import itemgetter

translation_chain = (
    {"answer":chain, "language":itemgetter("language")} | translation_prompt | model | parser
)

translation_chain.invoke(
    {"context":"Marry's sister is Susana. She doesn't have any more siblings.",
     "question":"How many sisters does Marry have?",
     "language":"French"}
)

'Seulement une, Susana.'

## Transcribing the YouTube video

In [14]:
!pip install whisper



In [15]:
import tempfile
import whisper
from pytube import YouTube

In [25]:
!pip install yt-dlp whisper


Collecting yt-dlp
  Downloading yt_dlp-2024.8.1-py3-none-any.whl.metadata (170 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/170.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m163.8/170.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m170.1/170.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting brotli (from yt-dlp)
  Downloading Brotli-1.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.5 kB)
Collecting mutagen (from yt-dlp)
  Downloading mutagen-1.47.0-py3-none-any.whl.metadata (1.7 kB)
Collecting pycryptodomex (from yt-dlp)
  Downloading pycryptodomex-3.20.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting requests<3,>=2.32.2 (from yt-dlp)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collectin

In [28]:
import os
import yt_dlp

def download_youtube_audio(youtube_url, download_path):
    try:
        if not os.path.exists(download_path):
            os.makedirs(download_path)

        print("Downloading audio from YouTube...")
        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': os.path.join(download_path, 'audio.%(ext)s'),
            'noplaylist': True,
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }],
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(youtube_url, download=True)
            audio_file = ydl.prepare_filename(info)
            audio_file = audio_file.replace('.webm', '.mp3')  # Adjust if needed

        print(f"Audio downloaded and saved to: {audio_file}")
        return audio_file
    except Exception as e:
        print(f"Error downloading video: {e}")
        raise

# Example usage
download_youtube_audio("https://www.youtube.com/watch?v=cdiD-9MMpb0", "/content/audio")


Downloading audio from YouTube...
[youtube] Extracting URL: https://www.youtube.com/watch?v=cdiD-9MMpb0
[youtube] cdiD-9MMpb0: Downloading webpage
[youtube] cdiD-9MMpb0: Downloading ios player API JSON
[youtube] cdiD-9MMpb0: Downloading tv player API JSON
[youtube] cdiD-9MMpb0: Downloading m3u8 information
[info] cdiD-9MMpb0: Downloading 1 format(s): 251
[download] /content/audio/audio.webm has already been downloaded
[download] 100% of  167.01MiB
[ExtractAudio] Destination: /content/audio/audio.mp3
Deleting original file /content/audio/audio.webm (pass -k to keep)
Audio downloaded and saved to: /content/audio/audio.mp3


'/content/audio/audio.mp3'

In [31]:
!pip install git+https://github.com/openai/whisper.git
!sudo apt update && sudo apt install ffmpeg

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-szcyl7p0
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-szcyl7p0
  Resolved https://github.com/openai/whisper.git to commit ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [908 kB]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelea

In [32]:
!pip install openai-whisper



In [33]:
import whisper

model_audio = whisper.load_model("base.en")
result = model.transcribe("/content/audio/audio.mp3")
print(result)

with open("transcription.txt", "w") as file:
  file.write(result["text"])

100%|███████████████████████████████████████| 139M/139M [00:05<00:00, 24.4MiB/s]


{'text': " I think it's possible that physics has exploits and we should be trying to find them, arranging some kind of a crazy quantum mechanical system that somehow gives you buffer overflow, somehow gives you rounding error in the floating point. Synthetic intelligences are kind of like the next stage of development. And I don't know where it leads to, like at some point I suspect the universe is some kind of a puzzle. These synthetic AIs will uncover that puzzle and solve it. The following is a conversation with Andre Capati. Previously, the director of AI at Tesla. And before that, at OpenAI and Stanford. He is one of the greatest scientist engineers and educators in the history of artificial intelligence. This is the Lex Friedman podcast to support it. Please check out our sponsors. And now, dear friends, here's Andre Capati. What is a neural network? And what does it seem to do such a surprisingly good job of learning? What is a neural network? It's a mathematical abstraction of

In [35]:
with open("transcription.txt") as file:
  transcription = file.read()

transcription[:100]

" I think it's possible that physics has exploits and we should be trying to find them, arranging som"

In [36]:
try:
  chain.invoke({
      "context":transcription,
      "question":"Is reading papers a good idea?"
  })

except Exception as e:
  print(e)


Error code: 400 - {'error': {'message': 'Please reduce the length of the messages or completion.', 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}


In [37]:
!pip install langchain-community

Collecting langchain-community
  Downloading langchain_community-0.2.11-py3-none-any.whl.metadata (2.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.21.3-py3-none-any.whl.metadata (7.1 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Downloading langchain_community-0.2.11-py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dataclasses_json-0.6.7-py3-none-any.whl (

In [38]:
from langchain_community.document_loaders import TextLoader
loader = TextLoader("/content/transcription.txt")
text_documents = loader.load()
text_documents

[Document(metadata={'source': '/content/transcription.txt'}, page_content=" I think it's possible that physics has exploits and we should be trying to find them, arranging some kind of a crazy quantum mechanical system that somehow gives you buffer overflow, somehow gives you rounding error in the floating point. Synthetic intelligences are kind of like the next stage of development. And I don't know where it leads to, like at some point I suspect the universe is some kind of a puzzle. These synthetic AIs will uncover that puzzle and solve it. The following is a conversation with Andre Capati. Previously, the director of AI at Tesla. And before that, at OpenAI and Stanford. He is one of the greatest scientist engineers and educators in the history of artificial intelligence. This is the Lex Friedman podcast to support it. Please check out our sponsors. And now, dear friends, here's Andre Capati. What is a neural network? And what does it seem to do such a surprisingly good job of learn

In [42]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
documents = text_splitter.split_documents(text_documents)

In [43]:
%pip install --upgrade --quiet langchain-google-genai

In [46]:
%pip install langchain-google-genai



In [44]:
import getpass
import os

os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')

In [48]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

vector = embeddings.embed_query("Hello world")
vector[:5]

[0.04703257977962494,
 -0.04019005596637726,
 -0.02902696281671524,
 -0.02680964209139347,
 0.01892058178782463]

In [49]:
len(vector)

768

In [51]:
embedded_query = embeddings.embed_query("Who is Marry's sister?")

In [50]:
sentence1 = embeddings.embed_query("Mary's sister is Susana")
sentence2 = embeddings.embed_query("Pedro's mother is a teacher")

In [52]:
from sklearn.metrics.pairwise import cosine_similarity

query_sentence1_similarity = cosine_similarity([embedded_query], [sentence1])[0][0]
query_sentence2_similarity = cosine_similarity([embedded_query], [sentence2])[0][0]

print("Similarity between embedded_query & sentence1:", query_sentence1_similarity)
print("Similarity between embedded_query & sentence2:", query_sentence2_similarity)

Similarity between embedded_query & sentence1: 0.8061647630010358
Similarity between embedded_query & sentence2: 0.5701709031662558


In [54]:
pip install docarray

Collecting docarray
  Downloading docarray-0.40.0-py3-none-any.whl.metadata (36 kB)
Collecting types-requests>=2.28.11.6 (from docarray)
  Downloading types_requests-2.32.0.20240712-py3-none-any.whl.metadata (1.9 kB)
Downloading docarray-0.40.0-py3-none-any.whl (270 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.2/270.2 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading types_requests-2.32.0.20240712-py3-none-any.whl (15 kB)
Installing collected packages: types-requests, docarray
Successfully installed docarray-0.40.0 types-requests-2.32.0.20240712


In [55]:
from langchain_community.vectorstores import DocArrayInMemorySearch

vectorestore1 = DocArrayInMemorySearch.from_texts(
    ["Mary's sister is Susana",
     "John and tommy are brothers",
     "Pedro's mother is a teacher",
     "Patricia likes white cars",
     "Lucia drives an Audi",
     "Mary has two sublings"],

    embedding = embeddings

)



In [56]:
vectorestore1.similarity_search_with_score("Who is Marry's sister?", k=3)

[(Document(page_content="Mary's sister is Susana"), 0.7105438822816268),
 (Document(page_content='Mary has two sublings'), 0.6466681632533784),
 (Document(page_content='John and tommy are brothers'), 0.5660632168910563)]

In [57]:
retriever1 = vectorestore1.as_retriever()
retriever1.invoke("Who is Marry's sister?")

[Document(page_content="Mary's sister is Susana"),
 Document(page_content='Mary has two sublings'),
 Document(page_content='John and tommy are brothers'),
 Document(page_content='Patricia likes white cars')]

In [58]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

setup = RunnableParallel(
    context = retriever1,
    question = RunnablePassthrough()
)

setup.invoke("What color is Patricia's car?")

# runnable parallel and runnale passthrough allow us to pass the context and question to the prompt as a map with the keys 'context' and 'question'

{'context': [Document(page_content='Patricia likes white cars'),
  Document(page_content='Lucia drives an Audi'),
  Document(page_content="Pedro's mother is a teacher"),
  Document(page_content="Mary's sister is Susana")],
 'question': "What color is Patricia's car?"}

In [61]:
chain = setup | prompt | model | parser

chain.invoke("What color is Patricia's car?")

'White'

In [62]:
chain.invoke("What car does Lucia drive?")

'Lucia drives an Audi.'

## Loading transcription into the Vector Store

In [63]:
vectorestore2 = DocArrayInMemorySearch.from_documents(
    documents,
    embedding = embeddings
)


In [64]:
chain = (
    {"context":vectorestore2.as_retriever(), "question":RunnablePassthrough()} | prompt | model | parser
)

chain.invoke("What is synthetic intelligence?")

'According to the context, synthetic intelligence is "kind of like the next stage of development" and is referred to as "synthetic AIs" that will "uncover" and "solve" the universe, which is thought to be "some kind of a puzzle". However, a clear definition of synthetic intelligence is not provided in the context.'

In [65]:
chain.invoke("Who is the prime minister of India?")

"I don't know."

In [66]:
chain.invoke("How does neural network learn?")

'A neural network learns by adjusting the "knobs" (weights) in its mathematical expression through a process of training on a dataset. The goal is to find the setting of the knobs that makes the neural net perform a specific task, such as classifying images. The network is trained on a dataset, and then deployed to observe its performance. The data collected from the deployment is then used to refine the training set, and the process is repeated in a cycle of improvement.'

## Setting Up PineCone

In [69]:
os.environ['PINECONE_API_KEY'] = userdata.get('PINECONE_API_KEY')

In [71]:
from langchain_pinecone import PineconeVectorStore

index_name = "youtube-rag-index"

pinecone = PineconeVectorStore.from_documents(
    documents = documents,
    embedding = embeddings,
    index_name = index_name,

)

In [72]:
pinecone.similarity_search_with_score("What is synthetic intelligence?")[:3]


[(Document(metadata={'source': '/content/transcription.txt'}, page_content="I think it's possible that physics has exploits and we should be trying to find them, arranging some kind of a crazy quantum mechanical system that somehow gives you buffer overflow, somehow gives you rounding error in the floating point. Synthetic intelligences are kind of like the next stage of development. And I don't know where it leads to, like at some point I suspect the universe is some kind of a puzzle. These synthetic AIs will uncover that puzzle and solve it. The following is a conversation with Andre Capati. Previously, the director of AI at Tesla. And before that, at OpenAI and Stanford. He is one of the greatest scientist engineers and educators in the history of artificial intelligence. This is the Lex Friedman podcast to support it. Please check out our sponsors. And now, dear friends, here's Andre Capati. What is a neural network? And what does it seem to do such a surprisingly good job of learn

In [73]:
pinecone.similarity_search_with_score("How does neural network learn?")[:3]


[(Document(metadata={'source': '/content/transcription.txt'}, page_content="I think it's possible that physics has exploits and we should be trying to find them, arranging some kind of a crazy quantum mechanical system that somehow gives you buffer overflow, somehow gives you rounding error in the floating point. Synthetic intelligences are kind of like the next stage of development. And I don't know where it leads to, like at some point I suspect the universe is some kind of a puzzle. These synthetic AIs will uncover that puzzle and solve it. The following is a conversation with Andre Capati. Previously, the director of AI at Tesla. And before that, at OpenAI and Stanford. He is one of the greatest scientist engineers and educators in the history of artificial intelligence. This is the Lex Friedman podcast to support it. Please check out our sponsors. And now, dear friends, here's Andre Capati. What is a neural network? And what does it seem to do such a surprisingly good job of learn

In [75]:
chain = (
    {"context":pinecone.as_retriever(), "question":RunnablePassthrough()} | prompt | model | parser
)

In [76]:
chain.invoke("What is Hollywood going to start doing?")

'Hollywood will start using AI to generate scenes, which will significantly reduce the cost of content creation.'

In [78]:
chain.invoke("Who is the prime minister of India?")

"I don't know."

In [79]:
chain.invoke("Who is the owner of Audi?")

"I don't know"

In [80]:
chain.invoke("Explain how overfitting works in machine learning")

"I don't know"