# Final ChatBot

# Installation of Pakages

In [1]:
%pip install pytube
%pip install --upgrade pytube
%pip install yt-dlp
%pip install moviepy
%pip install whisper
%pip install chromadb sentence-transformers
%pip install git+https://github.com/openai/whisper.git
%pip install pytubefix
%pip install chromadb
%pip install langchain
%pip install openai
%pip install opencv-python
%pip install langchain_openai
%pip install --upgrade huggingface_hub
%pip install --upgrade sentence-transformers
%pip install langchain_community
%pip install gradio

Collecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl.metadata (5.0 kB)
Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytube
Successfully installed pytube-15.0.0
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
Collecting yt-dlp
  Downloading yt_dlp-2024.11.4-py3-none-any.whl.metadata (172 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.1/172.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading yt_dlp-2024.11.4-py3-none-any.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m97.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: yt-dlp
Successfully installed yt-dlp-2024.11.4
[0mNote: you may need to restart the kernel

# Load API Keys

In [None]:

from dotenv import load_dotenv
import os
import langsmith

# Specify the path to the .env file
dotenv_path = "./notebooks/apikey.env" #Change if your env is in a diffretn folder
load_dotenv(dotenv_path)

# Ensure required environment variables are loaded
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Check if all environment variables are set; raise an error if any are missing
if not all([OPENAI_API_KEY, LANGCHAIN_API_KEY, HUGGINGFACEHUB_API_TOKEN]):
    raise ValueError("Some required API keys are missing in the .env file.")

# Enable LangSmith tracing with environment variables
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "cat_expert_knowledge"

# Initialize LangSmith Client
from langsmith import Client
client = Client(api_key=LANGCHAIN_API_KEY)



# Main

In [None]:
import os
import yt_dlp
import whisper
import chromadb
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.agents import initialize_agent, Tool
from langchain_community.chat_models import ChatOpenAI
import gradio as gr
import asyncio
from pytubefix import YouTube
import hashlib
import uuid
import re

# Load OpenAI API key

# Cat name memory storage with a limit of 3
cat_name_memory = []
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Whisper model initialization for transcription
whisper_model = whisper.load_model("base")

# ChromaDB initialization for vector storage
chroma_client = chromadb.Client()
try:
    chroma_collection = chroma_client.create_collection(name="jackson_galaxy_videos")
except chromadb.errors.UniqueConstraintError:
    chroma_collection = chroma_client.get_collection(name="jackson_galaxy_videos")

# Define the directories for audio and transcriptions
audio_dir = "/notebooks/path/to/m4a"
transcription_dir = "/notebooks/path/to/transcriptions"
os.makedirs(audio_dir, exist_ok=True)  # Ensure the audio directory exists
os.makedirs(transcription_dir, exist_ok=True)  # Ensure the transcription directory exists

# List of YouTube URLs
video_urls = [
    "https://youtu.be/ZUcVUFvmDFE?si=z9GfOAWF1qothiKs",
    "https://youtu.be/4DlJYcfiRu4?si=cUVT9L5dEEdkcSt_",
    "https://youtu.be/rxInrRQLEmM?si=Ai7wHN0dI--cns0x",
    "https://youtu.be/gxlNfh5ukMw?si=naO3n4VZeXx3PlOs",
    "https://youtu.be/ojS7XwtoXtw?si=NpNSef7dCm_LnFPv",
    "https://youtu.be/tsYT7yIOdqQ?si=hdGEpxlmFNMf7NNQ",
    "https://youtu.be/tsYT7yIOdqQ?si=e_Zdh2dGpqempHR8",
    "https://youtu.be/UWohxDOXsl4?si=y1nXlUZYw6uzkc8n",
    "https://youtu.be/gZrwcoiy_gY?si=ksfYE03t6xtuxUL0",
    "https://youtu.be/lSDI5diNu4Y?si=Q-In6zMD4ZpuaPIz",
    "https://youtu.be/8aCGL9GpVUg?si=_0yF1U1thjwJqyPY",
    "https://youtu.be/VjOXvD7OvrE?si=t6xugNxLeMjpsi7E",
    "https://youtu.be/FzifwTnCV5s?si=sR_u4kG-4NoQx5Ux",
    "https://youtu.be/XreeFU7RYeI?si=hsc9WO24dJP6AfV2",
    "https://youtu.be/-4O97jw_8Bc?si=pC14dgZ_f4mXdYPv"
]

# Function to create a unique filename from URL
def generate_filename(url, extension="m4a"):
    """Creates a unique filename for each URL based on its hash."""
    return hashlib.md5(url.encode()).hexdigest() + f".{extension}"

# Download each video as audio and handle errors
failed_downloads = []  # To log any failed downloads

for url in video_urls:
    # Generate filename and check if it exists
    filename = generate_filename(url)
    file_path = os.path.join(audio_dir, filename)
    
    if os.path.exists(file_path):
        print(f"Already downloaded: {url}")
        continue  # Skip downloading if file exists
    
    # Download video if not already downloaded
    try:
        yt = YouTube(url)
        video = yt.streams.filter(only_audio=True).first()
        video.download(output_path=audio_dir, filename=filename)
        print(f"Downloaded: {url}")
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        failed_downloads.append(url)

# Optional: log failed downloads if any
if failed_downloads:
    print("Failed Downloads:", failed_downloads)
    # You could write these to a log file for later review

# Step 1: Video Transcription Using Whisper (Updated)
# Directory containing your .m4a files
audio_files = [f for f in os.listdir(audio_dir) if f.endswith(".m4a")]

# Transcribe each audio file and save it as a .txt file, only if the transcription doesn't already exist
for audio_file in audio_files:
    audio_path = os.path.join(audio_dir, audio_file)
    transcription_filename = os.path.join(transcription_dir, audio_file.replace(".m4a", ".txt"))
    
    # Skip transcription if the file already exists
    if os.path.exists(transcription_filename):
        print(f"Already transcribed: {transcription_filename}")
        continue
    
    try:
        transcription = whisper_model.transcribe(audio_path)
        
        # Save transcription to file
        with open(transcription_filename, "w") as f:
            f.write(transcription['text'])
        print(f"Transcribed and saved: {transcription_filename}")
    except Exception as e:
        print(f"Failed to transcribe {audio_file}: {e}")

# Step 2: Add Transcription to ChromaDB
def add_to_chromadb(transcription, metadata):
    embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    vector = embedding_model.embed_documents([transcription])[0]
    unique_id = str(uuid.uuid4())  # Generate a unique ID for each document
    chroma_collection.add(ids=[unique_id], documents=[transcription], metadatas=[{"source": metadata}], embeddings=[vector])

# Read all transcriptions from .txt files and add to ChromaDB, only if not already added
existing_ids = [metadata['source'] for metadata in chroma_collection.get()['metadatas']] if chroma_collection.count() > 0 else []

for url in video_urls:
    filename = generate_filename(url)
    transcription_path = os.path.join(transcription_dir, filename.replace(".m4a", ".txt"))
    if os.path.exists(transcription_path) and url not in existing_ids:
        with open(transcription_path, "r") as f:
            transcription = f.read()
        add_to_chromadb(transcription, url)

# Step 3: Retrieval-Augmented Generation (RAG) Pipeline (Updated to use Local Transcriptions)
first_time_greeting = True

def generate_response(query):
    global first_time_greeting
    if first_time_greeting:
        first_time_greeting = False
        return "What's up my loving cat people! How can I assist you today?"
    
    # Ensure the conversation history limit is maintained to avoid overflow
    if len(cat_name_memory) > 3:
        cat_name_memory.pop(0)
    # Extract cat's name if mentioned in the query
    cat_name = None
    match = re.search(r"my cat named ([A-Za-z]+)", query, re.IGNORECASE)
    if match:
        cat_name = match.group(1)
        if cat_name not in cat_name_memory:
            cat_name_memory.append(cat_name)
        if len(cat_name_memory) > 3:
            cat_name_memory.pop(0)
    elif cat_name_memory:
        cat_name = cat_name_memory[-1]
    embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    vectordb = Chroma(embedding_function=embedding_model, collection_name="jackson_galaxy_videos", client=chroma_client)
    retriever = vectordb.as_retriever(search_kwargs={"k": 2})  # Limit the number of retrieved documents to avoid context overflow

    # Enhanced prompt to request more detailed and specific responses in Jackson Galaxy's tone.
    if cat_name:
        personalized_message = f"Regarding your cat named {cat_name}, "
    else:
        personalized_message = ""

    prompt = (personalized_message +
        f"As Jackson Galaxy, respond to the following question with empathy, expertise, charisma and detailed advice. Be informative and consider practical tips to help the cat owner understand their cat better:\n"
        f"Provide actionable steps that are easy to follow, explain the reasoning behind them, and end on an encouraging note. \n"
        f"Make sure the user feels understood and supported:\n"
        f"Question: {query}\n"
        f"Advice:"
    )

    # Use the adjusted prompt
    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(openai_api_key=OPENAI_API_KEY, model_name="gpt-3.5-turbo", max_tokens=800),
        retriever=retriever,
        chain_type="map_reduce"
        #chain_type="refine"
    )
    response = qa_chain.run(prompt).strip()  # Stripping whitespace for cleaner output

    return f"{response} Remember, understanding your cat is key to improving your bond!"

# Step 4: Using LangChain Agents to Automate Workflow (Updated)
def setup_agents():
    tools = [
        Tool(
            name="Generate Response",
            func=generate_response,
            description="Use this to generate responses based on transcriptions using RAG"
        )
    ]
    llm = ChatOpenAI(temperature=0.7, openai_api_key=OPENAI_API_KEY)
    agent = initialize_agent(tools, llm, agent_type="zero-shot-react-description", verbose=True)
    return agent

agent = setup_agents()

# Gradio Interface for User Interaction
def ask_question_gradio(question):
    if len(question.strip()) == 0:
        return "Please ask a proper question about your cat."
    try:
            response = asyncio.run(asyncio.to_thread(agent.run, question))
            return response
    except Exception as e:
        return f"An error occurred: {str(e)}"

iface = gr.Interface(fn=ask_question_gradio, inputs="text", outputs="text", title="Jackson Galaxy Chatbot")

if __name__ == "__main__":
    # Launch Gradio interface
    iface.launch(share=True)


Already downloaded: https://youtu.be/ZUcVUFvmDFE?si=z9GfOAWF1qothiKs
Already downloaded: https://youtu.be/4DlJYcfiRu4?si=cUVT9L5dEEdkcSt_
Already downloaded: https://youtu.be/rxInrRQLEmM?si=Ai7wHN0dI--cns0x
Already downloaded: https://youtu.be/gxlNfh5ukMw?si=naO3n4VZeXx3PlOs
Already downloaded: https://youtu.be/ojS7XwtoXtw?si=NpNSef7dCm_LnFPv
Already downloaded: https://youtu.be/tsYT7yIOdqQ?si=hdGEpxlmFNMf7NNQ
Already downloaded: https://youtu.be/tsYT7yIOdqQ?si=e_Zdh2dGpqempHR8
Already downloaded: https://youtu.be/UWohxDOXsl4?si=y1nXlUZYw6uzkc8n
Already downloaded: https://youtu.be/gZrwcoiy_gY?si=ksfYE03t6xtuxUL0
Already downloaded: https://youtu.be/lSDI5diNu4Y?si=Q-In6zMD4ZpuaPIz
Already downloaded: https://youtu.be/8aCGL9GpVUg?si=_0yF1U1thjwJqyPY
Already downloaded: https://youtu.be/VjOXvD7OvrE?si=t6xugNxLeMjpsi7E
Already downloaded: https://youtu.be/FzifwTnCV5s?si=sR_u4kG-4NoQx5Ux
Already downloaded: https://youtu.be/XreeFU7RYeI?si=hsc9WO24dJP6AfV2
Already downloaded: https://youtu.



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI should use the Generate Response tool to get advice on how to stop a cat from scratching furniture.
Action: Generate Response
Action Input: "How to stop a cat from scratching furniture?"[0m
Observation: [36;1m[1;3mWhat's up my loving cat people! How can I assist you today?[0m
Thought:[32;1m[1;3mThis response doesn't directly answer the question, I should try rephrasing the input.
Action: Generate Response
Action Input: "How can I prevent my cat from scratching the furniture?"[0m

  vectordb = Chroma(embedding_function=embedding_model, collection_name="jackson_galaxy_videos", client=chroma_client)



Observation: [36;1m[1;3mHey there, fellow cat lover! I completely understand your concern about your cat scratching the furniture. It's a common behavior that can be frustrating, but fear not, there are ways to address it effectively and create a harmonious environment for both you and your feline companion.

First and foremost, it's essential to realize that scratching is a natural and instinctual behavior for cats. It helps them maintain their claws, stretch their muscles, and mark their territory. So, providing them with appropriate outlets for scratching is key.

Here are some practical steps you can take to prevent your cat from scratching the furniture:

1. **Scratching Posts or Pads**: Cats need to scratch, so offering them a designated area to do so is crucial. Place scratching posts near the furniture they tend to scratch and try different materials like sisal, cardboard, or carpet to see what your cat prefers. By giving them an appealing alternative, you're redirecting the