In [None]:
%pip install pytube
%pip install --upgrade pytube
%pip install yt-dlp
%pip install moviepy
%pip install whisper
%pip install chromadb sentence-transformers
%pip install git+https://github.com/openai/whisper.git
%pip install pytubefix
%pip install chromadb
%pip install langchain
%pip install openai
%pip install opencv-python
%pip install langchain_openai
%pip install --upgrade huggingface_hub
%pip install --upgrade sentence-transformers
%pip install langchain_community
%pip install gradio



In [2]:
from dotenv import load_dotenv
import os
import langsmith

# Specify the path to the .env file
dotenv_path = "./notebooks/apikey.env" #Change if your env is in a diffretn folder
load_dotenv(dotenv_path)

# Ensure required environment variables are loaded
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Check if all environment variables are set; raise an error if any are missing
if not all([OPENAI_API_KEY, LANGCHAIN_API_KEY, HUGGINGFACEHUB_API_TOKEN]):
    raise ValueError("Some required API keys are missing in the .env file.")

# Enable LangSmith tracing with environment variables
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "cat_expert_knowledge"

# Initialize LangSmith Client
from langsmith import Client
client = Client(api_key=LANGCHAIN_API_KEY)


In [None]:
import os
import yt_dlp
import whisper
import chromadb
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.agents import initialize_agent, Tool
from langchain_community.chat_models import ChatOpenAI
import gradio as gr
import asyncio
from pytubefix import YouTube
import hashlib
import uuid

# Load OpenAI API key

# Cat name memory storage with a limit of 3
cat_name_memory = []
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Whisper model initialization for transcription
whisper_model = whisper.load_model("base")

# ChromaDB initialization for vector storage
chroma_client = chromadb.Client()
try:
    chroma_collection = chroma_client.create_collection(name="jackson_galaxy_videos")
except chromadb.errors.UniqueConstraintError:
    chroma_collection = chroma_client.get_collection(name="jackson_galaxy_videos")

# Define the directories for audio and transcriptions
audio_dir = "/notebooks/path/to/m4a"
transcription_dir = "/notebooks/path/to/transcriptions"
os.makedirs(audio_dir, exist_ok=True)  # Ensure the audio directory exists
os.makedirs(transcription_dir, exist_ok=True)  # Ensure the transcription directory exists

# List of YouTube URLs
video_urls = [
    "https://youtu.be/ZUcVUFvmDFE?si=z9GfOAWF1qothiKs",
    "https://youtu.be/4DlJYcfiRu4?si=cUVT9L5dEEdkcSt_",
    "https://youtu.be/rxInrRQLEmM?si=Ai7wHN0dI--cns0x",
    "https://youtu.be/gxlNfh5ukMw?si=naO3n4VZeXx3PlOs",
    "https://youtu.be/ojS7XwtoXtw?si=NpNSef7dCm_LnFPv",
    "https://youtu.be/tsYT7yIOdqQ?si=hdGEpxlmFNMf7NNQ",
    "https://youtu.be/tsYT7yIOdqQ?si=e_Zdh2dGpqempHR8",
    "https://youtu.be/UWohxDOXsl4?si=y1nXlUZYw6uzkc8n",
    "https://youtu.be/gZrwcoiy_gY?si=ksfYE03t6xtuxUL0",
    "https://youtu.be/lSDI5diNu4Y?si=Q-In6zMD4ZpuaPIz",
    "https://youtu.be/8aCGL9GpVUg?si=_0yF1U1thjwJqyPY",
    "https://youtu.be/VjOXvD7OvrE?si=t6xugNxLeMjpsi7E",
    "https://youtu.be/FzifwTnCV5s?si=sR_u4kG-4NoQx5Ux",
    "https://youtu.be/XreeFU7RYeI?si=hsc9WO24dJP6AfV2",
    "https://youtu.be/-4O97jw_8Bc?si=pC14dgZ_f4mXdYPv"
]

# Function to create a unique filename from URL
def generate_filename(url, extension="m4a"):
    """Creates a unique filename for each URL based on its hash."""
    return hashlib.md5(url.encode()).hexdigest() + f".{extension}"

# Download each video as audio and handle errors
failed_downloads = []  # To log any failed downloads

for url in video_urls:
    # Generate filename and check if it exists
    filename = generate_filename(url)
    file_path = os.path.join(audio_dir, filename)
    
    if os.path.exists(file_path):
        print(f"Already downloaded: {url}")
        continue  # Skip downloading if file exists
    
    # Download video if not already downloaded
    try:
        yt = YouTube(url)
        video = yt.streams.filter(only_audio=True).first()
        video.download(output_path=audio_dir, filename=filename)
        print(f"Downloaded: {url}")
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        failed_downloads.append(url)

# Optional: log failed downloads if any
if failed_downloads:
    print("Failed Downloads:", failed_downloads)
    # You could write these to a log file for later review

# Step 1: Video Transcription Using Whisper (Updated)
# Directory containing your .m4a files
audio_files = [f for f in os.listdir(audio_dir) if f.endswith(".m4a")]

# Transcribe each audio file and save it as a .txt file, only if the transcription doesn't already exist
for audio_file in audio_files:
    audio_path = os.path.join(audio_dir, audio_file)
    transcription_filename = os.path.join(transcription_dir, audio_file.replace(".m4a", ".txt"))
    
    # Skip transcription if the file already exists
    if os.path.exists(transcription_filename):
        print(f"Already transcribed: {transcription_filename}")
        continue
    
    try:
        transcription = whisper_model.transcribe(audio_path)
        
        # Save transcription to file
        with open(transcription_filename, "w") as f:
            f.write(transcription['text'])
        print(f"Transcribed and saved: {transcription_filename}")
    except Exception as e:
        print(f"Failed to transcribe {audio_file}: {e}")

# Step 2: Add Transcription to ChromaDB
def add_to_chromadb(transcription, metadata):
    embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    vector = embedding_model.embed_documents([transcription])[0]
    unique_id = str(uuid.uuid4())  # Generate a unique ID for each document
    chroma_collection.add(ids=[unique_id], documents=[transcription], metadatas=[{"source": metadata}], embeddings=[vector])

# Read all transcriptions from .txt files and add to ChromaDB, only if not already added
existing_ids = [metadata['source'] for metadata in chroma_collection.get()['metadatas']] if chroma_collection.count() > 0 else []

for url in video_urls:
    filename = generate_filename(url)
    transcription_path = os.path.join(transcription_dir, filename.replace(".m4a", ".txt"))
    if os.path.exists(transcription_path) and url not in existing_ids:
        with open(transcription_path, "r") as f:
            transcription = f.read()
        add_to_chromadb(transcription, url)

# Step 3: Retrieval-Augmented Generation (RAG) Pipeline (Updated to use Local Transcriptions)
first_time_greeting = True

def generate_response(query):
    global first_time_greeting
    if first_time_greeting:
        first_time_greeting = False
        return "What's up my loving cat people! How can I assist you today?"
    
    # Ensure the conversation history limit is maintained to avoid overflow
    if len(cat_name_memory) > 3:
        cat_name_memory.pop(0)
    # Extract cat's name if mentioned in the query
    cat_name = None
    match = re.search(r"my cat named ([A-Za-z]+)", query, re.IGNORECASE)
    if match:
        cat_name = match.group(1)
        if cat_name not in cat_name_memory:
            cat_name_memory.append(cat_name)
        if len(cat_name_memory) > 3:
            cat_name_memory.pop(0)
    elif cat_name_memory:
        cat_name = cat_name_memory[-1]
    embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    vectordb = Chroma(embedding_function=embedding_model, collection_name="jackson_galaxy_videos", client=chroma_client)
    retriever = vectordb.as_retriever(search_kwargs={"k": 2})  # Limit the number of retrieved documents to avoid context overflow

    # Enhanced prompt to request more detailed and specific responses in Jackson Galaxy's tone.
    if cat_name:
        personalized_message = f"Regarding your cat named {cat_name}, "
    else:
        personalized_message = ""

    prompt = (personalized_message +
        f"As Jackson Galaxy, respond to the following question with empathy, expertise, charisma and detailed advice. Be informative and consider practical tips to help the cat owner understand their cat better:\n"
        f"Question: {query}\n"
        f"Advice:"
    )

    # Use the adjusted prompt
    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(openai_api_key=OPENAI_API_KEY, model_name="gpt-3.5-turbo", max_tokens=800),
        retriever=retriever,
        chain_type="map_reduce"
        #chain_type="refine"
    )
    response = qa_chain.run(prompt).strip()  # Stripping whitespace for cleaner output

    return f"{response} Remember, understanding your cat is key to improving your bond! All light, all love, and all mojo. #ForTheLoveOfCat 😻🐾"

# Step 4: Using LangChain Agents to Automate Workflow (Updated)
def setup_agents():
    tools = [
        Tool(
            name="Generate Response",
            func=generate_response,
            description="Use this to generate responses based on transcriptions using RAG"
        )
    ]
    llm = ChatOpenAI(temperature=0.7, openai_api_key=OPENAI_API_KEY)
    agent = initialize_agent(tools, llm, agent_type="zero-shot-react-description", verbose=True)
    return agent

agent = setup_agents()

# Gradio Interface for User Interaction
def ask_question_gradio(question):
    if len(question.strip()) == 0:
        return "Please ask a proper question about your cat."
    try:
            response = asyncio.run(asyncio.to_thread(agent.run, question))
            return response
    except Exception as e:
        return f"An error occurred: {str(e)}"

iface = gr.Interface(fn=ask_question_gradio, inputs="text", outputs="text", title="Jackson Galaxy Chatbot")

if __name__ == "__main__":
    # Launch Gradio interface
    iface.launch(share=True)


Already downloaded: https://youtu.be/ZUcVUFvmDFE?si=z9GfOAWF1qothiKs
Already downloaded: https://youtu.be/4DlJYcfiRu4?si=cUVT9L5dEEdkcSt_
Already downloaded: https://youtu.be/rxInrRQLEmM?si=Ai7wHN0dI--cns0x
Already downloaded: https://youtu.be/gxlNfh5ukMw?si=naO3n4VZeXx3PlOs
Already downloaded: https://youtu.be/ojS7XwtoXtw?si=NpNSef7dCm_LnFPv
Already downloaded: https://youtu.be/tsYT7yIOdqQ?si=hdGEpxlmFNMf7NNQ
Already downloaded: https://youtu.be/tsYT7yIOdqQ?si=e_Zdh2dGpqempHR8
Already downloaded: https://youtu.be/UWohxDOXsl4?si=y1nXlUZYw6uzkc8n
Already downloaded: https://youtu.be/gZrwcoiy_gY?si=ksfYE03t6xtuxUL0
Already downloaded: https://youtu.be/lSDI5diNu4Y?si=Q-In6zMD4ZpuaPIz
Already downloaded: https://youtu.be/8aCGL9GpVUg?si=_0yF1U1thjwJqyPY
Already downloaded: https://youtu.be/VjOXvD7OvrE?si=t6xugNxLeMjpsi7E
Already downloaded: https://youtu.be/FzifwTnCV5s?si=sR_u4kG-4NoQx5Ux
Already downloaded: https://youtu.be/XreeFU7RYeI?si=hsc9WO24dJP6AfV2
Already downloaded: https://youtu.



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to generate a response based on the input "hello"
Action: Generate Response
Action Input: hello[0m
Observation: [36;1m[1;3mWhat's up my loving cat people! How can I assist you today?[0m
Thought:[32;1m[1;3mThis response is not relevant to the input "hello"
Action: Generate Response
Action Input: hello[0m
Observation: [36;1m[1;3mHello there! I'm thrilled to see your message and I'm here with open arms to assist you with any cat-related questions or concerns you might have. It's wonderful that you're seeking guidance to better understand your feline companion. Let's delve into your inquiry together.

Understanding your cat's behavior and needs is crucial for fostering a strong bond and creating a harmonious environment for both of you. Cats, like humans, have unique personalities and preferences, so it's essential to observe and respect your cat's individuality.

If you have a specific question or situation in mi

V3.1 Remebers the cat name but crashes.

In [8]:
import os
import yt_dlp
import whisper
import chromadb
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.agents import initialize_agent, Tool
from langchain_community.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
import gradio as gr
import asyncio
import hashlib
from concurrent.futures import ThreadPoolExecutor
import re

# Load OpenAI API key
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Whisper model initialization for transcription
whisper_model = whisper.load_model("base")

# ChromaDB initialization for vector storage
chroma_client = chromadb.Client()
try:
    chroma_collection = chroma_client.create_collection(name="jackson_galaxy_videos")
except chromadb.errors.UniqueConstraintError:
    chroma_collection = chroma_client.get_collection(name="jackson_galaxy_videos")

# Define the directories for audio and transcriptions
audio_dir = "./audio"
transcription_dir = "./transcriptions"
os.makedirs(audio_dir, exist_ok=True)  # Ensure the audio directory exists
os.makedirs(transcription_dir, exist_ok=True)  # Ensure the transcription directory exists

# List of YouTube URLs
video_urls = [
    "https://www.youtube.com/watch?v=1QqWkkpfgzk",
    "https://youtu.be/ZUcVUFvmDFE?si=z9GfOAWF1qothiKs",
    # Add more URLs as needed
]

# Function to create a unique filename from URL
def generate_filename(url, extension="m4a"):
    """Creates a unique filename for each URL based on its hash."""
    return hashlib.md5(url.encode()).hexdigest() + f".{extension}"

# Download each video as audio and handle errors (Concurrent downloading for better performance)
failed_downloads = []  # To log any failed downloads

def download_video(url):
    filename = generate_filename(url)
    file_path = os.path.join(audio_dir, filename)
    if os.path.exists(file_path):
        print(f"Already downloaded: {url}")
        return  # Skip downloading if file exists
    try:
        yt_opts = {
            'format': 'bestaudio/best',
            'outtmpl': file_path,
            'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'm4a'}],
        }
        with yt_dlp.YoutubeDL(yt_opts) as ydl:
            ydl.download([url])
        print(f"Downloaded: {url}")
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        failed_downloads.append(url)

with ThreadPoolExecutor() as executor:
    executor.map(download_video, video_urls)

# Optional: log failed downloads if any
if failed_downloads:
    print("Failed Downloads:", failed_downloads)

# Step 1: Video Transcription Using Whisper (Updated to use concurrency)
audio_files = [f for f in os.listdir(audio_dir) if f.endswith(".m4a")]

def transcribe_and_index(audio_file):
    audio_path = os.path.join(audio_dir, audio_file)
    transcription_filename = os.path.join(transcription_dir, audio_file.replace(".m4a", ".txt"))
    if os.path.exists(transcription_filename):
        print(f"Already transcribed: {transcription_filename}")
        return
    try:
        transcription = whisper_model.transcribe(audio_path)
        with open(transcription_filename, "w") as f:
            f.write(transcription['text'])
        print(f"Transcribed and saved: {transcription_filename}")
        # Add transcription to ChromaDB
        add_to_chromadb(transcription['text'], audio_file)
    except Exception as e:
        print(f"Failed to transcribe {audio_file}: {e}")

def add_to_chromadb(transcription, source):
    vector = embedding_model.embed_documents([transcription])[0]
    unique_id = hashlib.md5(source.encode()).hexdigest()  # Unique ID based on audio filename
    if chroma_collection.count() == 0 or source not in [meta['source'] for meta in chroma_collection.get()['metadatas']]:
        chroma_collection.add(ids=[unique_id], documents=[transcription], metadatas=[{"source": source}], embeddings=[vector])

# Transcribe and index audio files concurrently
with ThreadPoolExecutor() as executor:
    executor.map(transcribe_and_index, audio_files)

# Initialize memory globally
memory = ConversationBufferMemory(memory_key="chat_history")

# Cat name memory
cat_name_memory = {}

# Step 3: Retrieval-Augmented Generation (RAG) Pipeline
def generate_response(query):
    # Extract cat's name if mentioned in the query
    cat_name = None
    match = re.search(r"my cat named ([A-Za-z]+)", query, re.IGNORECASE)
    if match:
        cat_name = match.group(1)
        cat_name_memory['cat_name'] = cat_name  # Store cat name in memory
    elif 'cat_name' in cat_name_memory:
        cat_name = cat_name_memory['cat_name']

    if not re.search(r"cat|kitten|feline", query, re.IGNORECASE):
        return "Please ask me a question related to cats. I can only provide guidance on cat-related topics."

    vectordb = Chroma(embedding_function=embedding_model, collection_name="jackson_galaxy_videos", client=chroma_client)
    retriever = vectordb.as_retriever(search_kwargs={"k": 2})

    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(openai_api_key=OPENAI_API_KEY, model_name="gpt-3.5-turbo", max_tokens=800),
        retriever=retriever,
        chain_type="stuff"
    )

    response = qa_chain.run({"query": query, "chat_history": memory.load_memory_variables({})["chat_history"]})
    memory.save_context({"input": query}, {"output": response})

    if cat_name:
        response = f"Regarding your cat named {cat_name}, {response}"
    
    return f"As Jackson Galaxy would suggest: {response} Remember, understanding your cat is key to improving your bond!"

# Step 4: Using LangChain Agents to Automate Workflow
def setup_agents():
    tools = [
        Tool(
            name="Generate Response",
            func=generate_response,
            description="Use this to generate responses based on transcriptions using RAG"
        )
    ]
    llm = ChatOpenAI(temperature=0.7, openai_api_key=OPENAI_API_KEY)
    agent = initialize_agent(tools, llm, agent_type="conversational-react-description", verbose=True, memory=memory)
    return agent

agent = setup_agents()

# Gradio Interface for User Interaction
def ask_question_gradio(question):
    if len(question.strip()) == 0:
        return "Please ask a proper question about your cat."
    response = asyncio.run(asyncio.to_thread(agent.run, question))
    return response

iface = gr.Interface(fn=ask_question_gradio, inputs="text", outputs="text", title="Jackson Galaxy Chatbot")

if __name__ == "__main__":
    iface.launch(share=True)

[youtube] Extracting URL: https://youtu.be/ZUcVUFvmDFE?si=z9GfOAWF1qothiKs
[youtube] ZUcVUFvmDFE: Downloading webpage
[youtube] Extracting URL: https://www.youtube.com/watch?v=1QqWkkpfgzk
[youtube] 1QqWkkpfgzk: Downloading webpage
[youtube] 1QqWkkpfgzk: Downloading ios player API JSON
[youtube] ZUcVUFvmDFE: Downloading ios player API JSON
[youtube] ZUcVUFvmDFE: Downloading mweb player API JSON
[youtube] 1QqWkkpfgzk: Downloading mweb player API JSON
[youtube] 1QqWkkpfgzk: Downloading m3u8 information
[youtube] ZUcVUFvmDFE: Downloading m3u8 information
[info] 1QqWkkpfgzk: Downloading 1 format(s): 251
[info] ZUcVUFvmDFE: Downloading 1 format(s): 251
[download] Destination: ./audio/37088be9aa3da65cf4b8f8dd05c77ed9.m4a
[download]   8.9% of   11.20MiB at   48.10MiB/s ETA 00:00[download] Destination: ./audio/55f9f04d946d68dcac4bff1b1292a8c5.m4a
[download] 100% of   14.30MiB in 00:00:00 at 52.87MiB/s    
[ExtractAudio] Destination: ./audio/55f9f04d946d68dcac4bff1b1292a8c5.m4a.m4a
[download] 10

V3 works but no name of cat.

In [None]:
import os
import yt_dlp
import whisper
import chromadb
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.agents import initialize_agent, Tool
from langchain_community.chat_models import ChatOpenAI
import gradio as gr
import asyncio
from pytubefix import YouTube
import hashlib
import uuid

# Load OpenAI API key

# Cat name memory storage with a limit of 3
cat_name_memory = []
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Whisper model initialization for transcription
whisper_model = whisper.load_model("base")

# ChromaDB initialization for vector storage
chroma_client = chromadb.Client()
try:
    chroma_collection = chroma_client.create_collection(name="jackson_galaxy_videos")
except chromadb.errors.UniqueConstraintError:
    chroma_collection = chroma_client.get_collection(name="jackson_galaxy_videos")

# Define the directories for audio and transcriptions
audio_dir = "/notebooks/path/to/m4a"
transcription_dir = "/notebooks/path/to/transcriptions"
os.makedirs(audio_dir, exist_ok=True)  # Ensure the audio directory exists
os.makedirs(transcription_dir, exist_ok=True)  # Ensure the transcription directory exists

# List of YouTube URLs
video_urls = [
    "https://youtu.be/ZUcVUFvmDFE?si=z9GfOAWF1qothiKs",
    "https://youtu.be/4DlJYcfiRu4?si=cUVT9L5dEEdkcSt_",
    "https://youtu.be/rxInrRQLEmM?si=Ai7wHN0dI--cns0x",
    "https://youtu.be/gxlNfh5ukMw?si=naO3n4VZeXx3PlOs",
    "https://youtu.be/ojS7XwtoXtw?si=NpNSef7dCm_LnFPv",
    "https://youtu.be/tsYT7yIOdqQ?si=hdGEpxlmFNMf7NNQ",
    "https://youtu.be/tsYT7yIOdqQ?si=e_Zdh2dGpqempHR8",
    "https://youtu.be/UWohxDOXsl4?si=y1nXlUZYw6uzkc8n",
    "https://youtu.be/gZrwcoiy_gY?si=ksfYE03t6xtuxUL0",
    "https://youtu.be/lSDI5diNu4Y?si=Q-In6zMD4ZpuaPIz",
    "https://youtu.be/8aCGL9GpVUg?si=_0yF1U1thjwJqyPY",
    "https://youtu.be/VjOXvD7OvrE?si=t6xugNxLeMjpsi7E",
    "https://youtu.be/FzifwTnCV5s?si=sR_u4kG-4NoQx5Ux",
    "https://youtu.be/XreeFU7RYeI?si=hsc9WO24dJP6AfV2",
    "https://youtu.be/-4O97jw_8Bc?si=pC14dgZ_f4mXdYPv"
]

# Function to create a unique filename from URL
def generate_filename(url, extension="m4a"):
    """Creates a unique filename for each URL based on its hash."""
    return hashlib.md5(url.encode()).hexdigest() + f".{extension}"

# Download each video as audio and handle errors
failed_downloads = []  # To log any failed downloads

for url in video_urls:
    # Generate filename and check if it exists
    filename = generate_filename(url)
    file_path = os.path.join(audio_dir, filename)
    
    if os.path.exists(file_path):
        print(f"Already downloaded: {url}")
        continue  # Skip downloading if file exists
    
    # Download video if not already downloaded
    try:
        yt = YouTube(url)
        video = yt.streams.filter(only_audio=True).first()
        video.download(output_path=audio_dir, filename=filename)
        print(f"Downloaded: {url}")
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        failed_downloads.append(url)

# Optional: log failed downloads if any
if failed_downloads:
    print("Failed Downloads:", failed_downloads)
    # You could write these to a log file for later review

# Step 1: Video Transcription Using Whisper (Updated)
# Directory containing your .m4a files
audio_files = [f for f in os.listdir(audio_dir) if f.endswith(".m4a")]

# Transcribe each audio file and save it as a .txt file, only if the transcription doesn't already exist
for audio_file in audio_files:
    audio_path = os.path.join(audio_dir, audio_file)
    transcription_filename = os.path.join(transcription_dir, audio_file.replace(".m4a", ".txt"))
    
    # Skip transcription if the file already exists
    if os.path.exists(transcription_filename):
        print(f"Already transcribed: {transcription_filename}")
        continue
    
    try:
        transcription = whisper_model.transcribe(audio_path)
        
        # Save transcription to file
        with open(transcription_filename, "w") as f:
            f.write(transcription['text'])
        print(f"Transcribed and saved: {transcription_filename}")
    except Exception as e:
        print(f"Failed to transcribe {audio_file}: {e}")

# Step 2: Add Transcription to ChromaDB
def add_to_chromadb(transcription, metadata):
    embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    vector = embedding_model.embed_documents([transcription])[0]
    unique_id = str(uuid.uuid4())  # Generate a unique ID for each document
    chroma_collection.add(ids=[unique_id], documents=[transcription], metadatas=[{"source": metadata}], embeddings=[vector])

# Read all transcriptions from .txt files and add to ChromaDB, only if not already added
existing_ids = [metadata['source'] for metadata in chroma_collection.get()['metadatas']] if chroma_collection.count() > 0 else []

for url in video_urls:
    filename = generate_filename(url)
    transcription_path = os.path.join(transcription_dir, filename.replace(".m4a", ".txt"))
    if os.path.exists(transcription_path) and url not in existing_ids:
        with open(transcription_path, "r") as f:
            transcription = f.read()
        add_to_chromadb(transcription, url)

# Step 3: Retrieval-Augmented Generation (RAG) Pipeline (Updated to use Local Transcriptions)
def generate_response(query):
    # Extract cat's name if mentioned in the query
    cat_name = None
    match = re.search(r"my cat named ([A-Za-z]+)", query, re.IGNORECASE)
    if match:
        cat_name = match.group(1)
        if cat_name not in cat_name_memory:
            cat_name_memory.append(cat_name)
        if len(cat_name_memory) > 3:
            cat_name_memory.pop(0)
    elif cat_name_memory:
        cat_name = cat_name_memory[-1]
    embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    vectordb = Chroma(embedding_function=embedding_model, collection_name="jackson_galaxy_videos", client=chroma_client)
    retriever = vectordb.as_retriever(search_kwargs={"k": 3})  # Limit the number of retrieved documents to avoid context overflow

    # Enhanced prompt to request more detailed and specific responses in Jackson Galaxy's tone.
    if cat_name:
        personalized_message = f"Regarding your cat named {cat_name}, "
    else:
        personalized_message = ""

    prompt = (personalized_message +
        f"As Jackson Galaxy, respond to the following question with empathy, expertise, and detailed advice. Be informative and consider practical tips to help the cat owner understand their cat better:\n"
        f"Question: {query}\n"
        f"Advice:"
    )

    # Use the adjusted prompt
    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(openai_api_key=OPENAI_API_KEY, model_name="gpt-3.5-turbo", max_tokens=800),
        retriever=retriever,
        chain_type="refine"
    )
    response = qa_chain.run(prompt)

    return f"{response} Remember, understanding your cat is key to improving your bond!"

# Step 4: Using LangChain Agents to Automate Workflow (Updated)
def setup_agents():
    tools = [
        Tool(
            name="Generate Response",
            func=generate_response,
            description="Use this to generate responses based on transcriptions using RAG"
        )
    ]
    llm = ChatOpenAI(temperature=0.7, openai_api_key=OPENAI_API_KEY)
    agent = initialize_agent(tools, llm, agent_type="zero-shot-react-description", verbose=True)
    return agent

agent = setup_agents()

# Gradio Interface for User Interaction
def ask_question_gradio(question):
    response = asyncio.run(asyncio.to_thread(agent.run, question))
    return response

iface = gr.Interface(fn=ask_question_gradio, inputs="text", outputs="text", title="Jackson Galaxy Chatbot")

if __name__ == "__main__":
    # Launch Gradio interface
    iface.launch(share=True)


Already downloaded: https://youtu.be/ZUcVUFvmDFE?si=z9GfOAWF1qothiKs
Already downloaded: https://youtu.be/4DlJYcfiRu4?si=cUVT9L5dEEdkcSt_
Already downloaded: https://youtu.be/rxInrRQLEmM?si=Ai7wHN0dI--cns0x
Already downloaded: https://youtu.be/gxlNfh5ukMw?si=naO3n4VZeXx3PlOs
Already downloaded: https://youtu.be/ojS7XwtoXtw?si=NpNSef7dCm_LnFPv
Already downloaded: https://youtu.be/tsYT7yIOdqQ?si=hdGEpxlmFNMf7NNQ
Already downloaded: https://youtu.be/tsYT7yIOdqQ?si=e_Zdh2dGpqempHR8
Already downloaded: https://youtu.be/UWohxDOXsl4?si=y1nXlUZYw6uzkc8n
Already downloaded: https://youtu.be/gZrwcoiy_gY?si=ksfYE03t6xtuxUL0
Already downloaded: https://youtu.be/lSDI5diNu4Y?si=Q-In6zMD4ZpuaPIz
Already downloaded: https://youtu.be/8aCGL9GpVUg?si=_0yF1U1thjwJqyPY
Already downloaded: https://youtu.be/VjOXvD7OvrE?si=t6xugNxLeMjpsi7E
Already downloaded: https://youtu.be/FzifwTnCV5s?si=sR_u4kG-4NoQx5Ux
Already downloaded: https://youtu.be/XreeFU7RYeI?si=hsc9WO24dJP6AfV2
Already downloaded: https://youtu.



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI should generate a response to get advice on how to stop my cat from scratching the furniture.
Action: Generate Response
Action Input: "How can I prevent my cat from scratching the furniture?"[0m
Observation: [36;1m[1;3mThank you for providing the additional context. In order to prevent your cat from scratching the furniture, it's important to understand why cats scratch in the first place. Scratching is a natural behavior for cats that helps them stretch their muscles, mark their territory, and maintain their claws. To prevent your cat from scratching the furniture, here are some tips:

1. Provide appropriate scratching surfaces: Cats need to scratch, so it's important to provide them with suitable scratching posts or pads. These should be sturdy, tall enough for your cat to fully stretch, and placed in areas where your cat likes to scratch.

2. Use positive reinforcement: Encourage your cat to use the scratching posts b

import os
import yt_dlp
import whisper
import chromadb
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.agents import initialize_agent, Tool
from langchain_community.chat_models import ChatOpenAI
import gradio as gr
import asyncio
from pytubefix import YouTube
import hashlib
import uuid

# Load OpenAI API key
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Whisper model initialization for transcription
whisper_model = whisper.load_model("base")

# ChromaDB initialization for vector storage
chroma_client = chromadb.Client()
try:
    chroma_collection = chroma_client.create_collection(name="jackson_galaxy_videos")
except chromadb.errors.UniqueConstraintError:
    chroma_collection = chroma_client.get_collection(name="jackson_galaxy_videos")

# Define the directories for audio and transcriptions
audio_dir = "/notebooks/path/to/m4a"
transcription_dir = "/notebooks/path/to/transcriptions"
os.makedirs(audio_dir, exist_ok=True)  # Ensure the audio directory exists
os.makedirs(transcription_dir, exist_ok=True)  # Ensure the transcription directory exists

# List of YouTube URLs
video_urls = [
    "https://youtu.be/ZUcVUFvmDFE?si=z9GfOAWF1qothiKs",
    "https://youtu.be/4DlJYcfiRu4?si=cUVT9L5dEEdkcSt_",
    "https://youtu.be/rxInrRQLEmM?si=Ai7wHN0dI--cns0x",
    "https://youtu.be/gxlNfh5ukMw?si=naO3n4VZeXx3PlOs",
    "https://youtu.be/ojS7XwtoXtw?si=NpNSef7dCm_LnFPv",
    "https://youtu.be/tsYT7yIOdqQ?si=hdGEpxlmFNMf7NNQ",
    "https://youtu.be/tsYT7yIOdqQ?si=e_Zdh2dGpqempHR8",
    "https://youtu.be/UWohxDOXsl4?si=y1nXlUZYw6uzkc8n",
    "https://youtu.be/gZrwcoiy_gY?si=ksfYE03t6xtuxUL0",
    "https://youtu.be/lSDI5diNu4Y?si=Q-In6zMD4ZpuaPIz",
    "https://youtu.be/8aCGL9GpVUg?si=_0yF1U1thjwJqyPY",
    "https://youtu.be/VjOXvD7OvrE?si=t6xugNxLeMjpsi7E",
    "https://youtu.be/FzifwTnCV5s?si=sR_u4kG-4NoQx5Ux",
    "https://youtu.be/XreeFU7RYeI?si=hsc9WO24dJP6AfV2",
    "https://youtu.be/-4O97jw_8Bc?si=pC14dgZ_f4mXdYPv"
]

# Function to create a unique filename from URL
def generate_filename(url, extension="m4a"):
    """Creates a unique filename for each URL based on its hash."""
    return hashlib.md5(url.encode()).hexdigest() + f".{extension}"

# Download each video as audio and handle errors
failed_downloads = []  # To log any failed downloads

for url in video_urls:
    # Generate filename and check if it exists
    filename = generate_filename(url)
    file_path = os.path.join(audio_dir, filename)
    
    if os.path.exists(file_path):
        print(f"Already downloaded: {url}")
        continue  # Skip downloading if file exists
    
    # Download video if not already downloaded
    try:
        yt = YouTube(url)
        video = yt.streams.filter(only_audio=True).first()
        video.download(output_path=audio_dir, filename=filename)
        print(f"Downloaded: {url}")
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        failed_downloads.append(url)

# Optional: log failed downloads if any
if failed_downloads:
    print("Failed Downloads:", failed_downloads)
    # You could write these to a log file for later review

# Step 1: Video Transcription Using Whisper (Updated)
# Directory containing your .m4a files
audio_files = [f for f in os.listdir(audio_dir) if f.endswith(".m4a")]

# Transcribe each audio file and save it as a .txt file, only if the transcription doesn't already exist
for audio_file in audio_files:
    audio_path = os.path.join(audio_dir, audio_file)
    transcription_filename = os.path.join(transcription_dir, audio_file.replace(".m4a", ".txt"))
    
    # Skip transcription if the file already exists
    if os.path.exists(transcription_filename):
        print(f"Already transcribed: {transcription_filename}")
        continue
    
    try:
        transcription = whisper_model.transcribe(audio_path)
        
        # Save transcription to file
        with open(transcription_filename, "w") as f:
            f.write(transcription['text'])
        print(f"Transcribed and saved: {transcription_filename}")
    except Exception as e:
        print(f"Failed to transcribe {audio_file}: {e}")

# Step 2: Add Transcription to ChromaDB
def add_to_chromadb(transcription, metadata):
    embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    vector = embedding_model.embed_documents([transcription])[0]
    unique_id = str(uuid.uuid4())  # Generate a unique ID for each document
    chroma_collection.add(ids=[unique_id], documents=[transcription], metadatas=[{"source": metadata}], embeddings=[vector])

# Read all transcriptions from .txt files and add to ChromaDB, only if not already added
existing_ids = [metadata['source'] for metadata in chroma_collection.get()['metadatas']] if chroma_collection.count() > 0 else []

for url in video_urls:
    filename = generate_filename(url)
    transcription_path = os.path.join(transcription_dir, filename.replace(".m4a", ".txt"))
    if os.path.exists(transcription_path) and url not in existing_ids:
        with open(transcription_path, "r") as f:
            transcription = f.read()
        add_to_chromadb(transcription, url)

# Step 3: Retrieval-Augmented Generation (RAG) Pipeline (Updated to use Local Transcriptions)
def generate_response(query):
    embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    vectordb = Chroma(embedding_function=embedding_model, collection_name="jackson_galaxy_videos", client=chroma_client)
    retriever = vectordb.as_retriever(search_kwargs={"k": 3})  # Limit the number of retrieved documents to avoid context overflow

    # Enhanced prompt to request more detailed and specific responses in Jackson Galaxy's tone.
    prompt = (
        f"As Jackson Galaxy, respond to the following question with empathy, expertise, and detailed advice. Be informative and consider practical tips to help the cat owner understand their cat better:\n"
        f"Question: {query}\n"
        f"Advice:"
    )

    # Use the adjusted prompt
    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(openai_api_key=OPENAI_API_KEY, model_name="gpt-3.5-turbo", max_tokens=800),
        retriever=retriever,
        chain_type="refine"
    )
    response = qa_chain.run(prompt)

    return f"Observation: {response} Remember, understanding your cat is key to improving your bond!"

# Step 4: Using LangChain Agents to Automate Workflow (Updated)
def setup_agents():
    tools = [
        Tool(
            name="Generate Response",
            func=generate_response,
            description="Use this to generate responses based on transcriptions using RAG"
        )
    ]
    llm = ChatOpenAI(temperature=0.7, openai_api_key=OPENAI_API_KEY)
    agent = initialize_agent(tools, llm, agent_type="zero-shot-react-description", verbose=True)
    return agent

agent = setup_agents()

# Gradio Interface for User Interaction
def ask_question_gradio(question):
    response = asyncio.run(asyncio.to_thread(agent.run, question))
    return response

iface = gr.Interface(fn=ask_question_gradio, inputs="text", outputs="text", title="Jackson Galaxy Chatbot")

if __name__ == "__main__":
    # Launch Gradio interface
    iface.launch(share=True)


# it works ------------------------------------------------

import os
import yt_dlp
import whisper
import chromadb
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.agents import initialize_agent, Tool
from langchain_community.chat_models import ChatOpenAI
import gradio as gr
import asyncio
from pytubefix import YouTube
import hashlib
import uuid

# Load OpenAI API key
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Whisper model initialization for transcription
whisper_model = whisper.load_model("base")

# ChromaDB initialization for vector storage
chroma_client = chromadb.Client()
try:
    chroma_collection = chroma_client.create_collection(name="jackson_galaxy_videos")
except chromadb.errors.UniqueConstraintError:
    chroma_collection = chroma_client.get_collection(name="jackson_galaxy_videos")

# Define the directories for audio and transcriptions
audio_dir = "/notebooks/path/to/m4a"
transcription_dir = "/notebooks/path/to/transcriptions"
os.makedirs(audio_dir, exist_ok=True)  # Ensure the audio directory exists
os.makedirs(transcription_dir, exist_ok=True)  # Ensure the transcription directory exists

# List of YouTube URLs
video_urls = [
    "https://youtu.be/ZUcVUFvmDFE?si=z9GfOAWF1qothiKs",
    "https://youtu.be/4DlJYcfiRu4?si=cUVT9L5dEEdkcSt_",
    "https://youtu.be/rxInrRQLEmM?si=Ai7wHN0dI--cns0x",
    "https://youtu.be/gxlNfh5ukMw?si=naO3n4VZeXx3PlOs",
    "https://youtu.be/ojS7XwtoXtw?si=NpNSef7dCm_LnFPv",
    "https://youtu.be/tsYT7yIOdqQ?si=hdGEpxlmFNMf7NNQ",
    "https://youtu.be/tsYT7yIOdqQ?si=e_Zdh2dGpqempHR8",
    "https://youtu.be/UWohxDOXsl4?si=y1nXlUZYw6uzkc8n",
    "https://youtu.be/gZrwcoiy_gY?si=ksfYE03t6xtuxUL0",
    "https://youtu.be/lSDI5diNu4Y?si=Q-In6zMD4ZpuaPIz",
    "https://youtu.be/8aCGL9GpVUg?si=_0yF1U1thjwJqyPY",
    "https://youtu.be/VjOXvD7OvrE?si=t6xugNxLeMjpsi7E",
    "https://youtu.be/FzifwTnCV5s?si=sR_u4kG-4NoQx5Ux",
    "https://youtu.be/XreeFU7RYeI?si=hsc9WO24dJP6AfV2",
    "https://youtu.be/-4O97jw_8Bc?si=pC14dgZ_f4mXdYPv"
]

# Function to create a unique filename from URL
def generate_filename(url, extension="m4a"):
    """Creates a unique filename for each URL based on its hash."""
    return hashlib.md5(url.encode()).hexdigest() + f".{extension}"

# Download each video as audio and handle errors
failed_downloads = []  # To log any failed downloads

for url in video_urls:
    # Generate filename and check if it exists
    filename = generate_filename(url)
    file_path = os.path.join(audio_dir, filename)
    
    if os.path.exists(file_path):
        print(f"Already downloaded: {url}")
        continue  # Skip downloading if file exists
    
    # Download video if not already downloaded
    try:
        yt = YouTube(url)
        video = yt.streams.filter(only_audio=True).first()
        video.download(output_path=audio_dir, filename=filename)
        print(f"Downloaded: {url}")
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        failed_downloads.append(url)

# Optional: log failed downloads if any
if failed_downloads:
    print("Failed Downloads:", failed_downloads)
    # You could write these to a log file for later review

# Step 1: Video Transcription Using Whisper (Updated)
# Directory containing your .m4a files
audio_files = [f for f in os.listdir(audio_dir) if f.endswith(".m4a")]

# Transcribe each audio file and save it as a .txt file, only if the transcription doesn't already exist
for audio_file in audio_files:
    audio_path = os.path.join(audio_dir, audio_file)
    transcription_filename = os.path.join(transcription_dir, audio_file.replace(".m4a", ".txt"))
    
    # Skip transcription if the file already exists
    if os.path.exists(transcription_filename):
        print(f"Already transcribed: {transcription_filename}")
        continue
    
    try:
        transcription = whisper_model.transcribe(audio_path)
        
        # Save transcription to file
        with open(transcription_filename, "w") as f:
            f.write(transcription['text'])
        print(f"Transcribed and saved: {transcription_filename}")
    except Exception as e:
        print(f"Failed to transcribe {audio_file}: {e}")

# Step 2: Add Transcription to ChromaDB
def add_to_chromadb(transcription, metadata):
    embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    vector = embedding_model.embed_documents([transcription])[0]
    unique_id = str(uuid.uuid4())  # Generate a unique ID for each document
    chroma_collection.add(ids=[unique_id], documents=[transcription], metadatas=[{"source": metadata}], embeddings=[vector])

# Read all transcriptions from .txt files and add to ChromaDB, only if not already added
existing_ids = [metadata['source'] for metadata in chroma_collection.get()['metadatas']] if chroma_collection.count() > 0 else []

for url in video_urls:
    filename = generate_filename(url)
    transcription_path = os.path.join(transcription_dir, filename.replace(".m4a", ".txt"))
    if os.path.exists(transcription_path) and url not in existing_ids:
        with open(transcription_path, "r") as f:
            transcription = f.read()
        add_to_chromadb(transcription, url)

# Step 3: Retrieval-Augmented Generation (RAG) Pipeline (Updated to use Local Transcriptions)
def generate_response(query):
    embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    vectordb = Chroma(embedding_function=embedding_model, collection_name="jackson_galaxy_videos", client=chroma_client)
    retriever = vectordb.as_retriever(search_kwargs={"k": 3})  # Limit the number of retrieved documents to avoid context overflow
    qa_chain = RetrievalQA.from_chain_type(llm=ChatOpenAI(openai_api_key=OPENAI_API_KEY, model_name="gpt-3.5-turbo", max_tokens=250), retriever=retriever, chain_type="stuff")
    response = qa_chain.run(query)
    return f"As Jackson Galaxy, here's my advice: {response} Remember, understanding your cat is key to improving your bond!"

# Step 4: Using LangChain Agents to Automate Workflow (Updated)
def setup_agents():
    tools = [
        Tool(
            name="Generate Response",
            func=generate_response,
            description="Use this to generate responses based on transcriptions using RAG"
        )
    ]
    llm = ChatOpenAI(temperature=0.7, openai_api_key=OPENAI_API_KEY)
    agent = initialize_agent(tools, llm, agent_type="zero-shot-react-description", verbose=True)
    return agent

agent = setup_agents()

# Gradio Interface for User Interaction
def ask_question_gradio(question):
    response = asyncio.run(asyncio.to_thread(agent.run, f"Generate Response: {question}"))
    return response

iface = gr.Interface(fn=ask_question_gradio, inputs="text", outputs="text", title="Jackson Galaxy Chatbot")

if __name__ == "__main__":
    # Launch Gradio interface
    iface.launch(share = True)
