# CatChat bot V.03

In [None]:
#%pip install pytube
#%pip install --upgrade pytube
#%pip install yt-dlp
#%pip install moviepy
#%pip install whisper
#%pip install chromadb sentence-transformers
#%pip install git+https://github.com/openai/whisper.git
#%pip install pytubefix
#%pip install chromadb
#%pip install langchain
#%pip install openai
#%pip install opencv-python
#%pip install langchain_openai
#%pip install --upgrade huggingface_hub
#%pip install --upgrade sentence-transformers
#%pip install langchain_community




In [3]:

from dotenv import load_dotenv
import os
import langsmith

# Specify the path to the .env file
dotenv_path = "apikey.env" #Change if your env is in a diffretn folder
load_dotenv(dotenv_path)

# Ensure required environment variables are loaded
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Check if all environment variables are set; raise an error if any are missing
if not all([OPENAI_API_KEY, LANGCHAIN_API_KEY, HUGGINGFACEHUB_API_TOKEN]):
    raise ValueError("Some required API keys are missing in the .env file.")

# Enable LangSmith tracing with environment variables
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "cat_expert_knowledge"

# Initialize LangSmith Client
from langsmith import Client
client = Client(api_key=LANGCHAIN_API_KEY)


In [5]:
from pytubefix import YouTube
import os
import hashlib

# Define the directory for downloaded audio files
output_path = "/1_Lab_Final_Proyect/Chatbot-dump"
os.makedirs(output_path, exist_ok=True)  # Ensure the directory exists

# Replace with your list of YouTube URLs
video_urls = ["https://youtu.be/ZUcVUFvmDFE?si=z9GfOAWF1qothiKs", 
            "https://youtu.be/4DlJYcfiRu4?si=cUVT9L5dEEdkcSt_", 
            "https://youtu.be/rxInrRQLEmM?si=Ai7wHN0dI--cns0x", 
            "https://youtu.be/gxlNfh5ukMw?si=naO3n4VZeXx3PlOs", 
            "https://youtu.be/ojS7XwtoXtw?si=NpNSef7dCm_LnFPv", 
            "https://youtu.be/tsYT7yIOdqQ?si=hdGEpxlmFNMf7NNQ", 
            "https://youtu.be/tsYT7yIOdqQ?si=e_Zdh2dGpqempHR8", 
            "https://youtu.be/UWohxDOXsl4?si=y1nXlUZYw6uzkc8n", 
            "https://youtu.be/gZrwcoiy_gY?si=ksfYE03t6xtuxUL0", 
            "https://youtu.be/lSDI5diNu4Y?si=Q-In6zMD4ZpuaPIz", 
            "https://youtu.be/8aCGL9GpVUg?si=_0yF1U1thjwJqyPY", 
            "https://youtu.be/VjOXvD7OvrE?si=t6xugNxLeMjpsi7E", 
            "https://youtu.be/FzifwTnCV5s?si=sR_u4kG-4NoQx5Ux", 
            "https://youtu.be/XreeFU7RYeI?si=hsc9WO24dJP6AfV2", 
            "https://youtu.be/-4O97jw_8Bc?si=pC14dgZ_f4mXdYPv" ]

# Function to create a unique filename from URL
def generate_filename(url, extension="m4a"):
    """Creates a unique filename for each URL based on its hash."""
    return hashlib.md5(url.encode()).hexdigest() + f".{extension}"

# Download each video as audio and handle errors
failed_downloads = []  # To log any failed downloads

for url in video_urls:
    # Generate filename and check if it exists
    filename = generate_filename(url)
    file_path = os.path.join(output_path, filename)
    
    if os.path.exists(file_path):
        print(f"Already downloaded: {url}")
        continue  # Skip downloading if file exists
    
    # Download video if not already downloaded
    try:
        yt = YouTube(url)
        video = yt.streams.filter(only_audio=True).first()
        video.download(output_path=output_path, filename=filename)
        print(f"Downloaded: {url}")
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        failed_downloads.append(url)

# Optional: log failed downloads if any
if failed_downloads:
    print("Failed Downloads:", failed_downloads)
    # You could write these to a log file for later review

Downloaded: https://youtu.be/ZUcVUFvmDFE?si=z9GfOAWF1qothiKs
Downloaded: https://youtu.be/4DlJYcfiRu4?si=cUVT9L5dEEdkcSt_
Downloaded: https://youtu.be/rxInrRQLEmM?si=Ai7wHN0dI--cns0x
Downloaded: https://youtu.be/gxlNfh5ukMw?si=naO3n4VZeXx3PlOs
Downloaded: https://youtu.be/ojS7XwtoXtw?si=NpNSef7dCm_LnFPv
Downloaded: https://youtu.be/tsYT7yIOdqQ?si=hdGEpxlmFNMf7NNQ
Downloaded: https://youtu.be/tsYT7yIOdqQ?si=e_Zdh2dGpqempHR8
Downloaded: https://youtu.be/UWohxDOXsl4?si=y1nXlUZYw6uzkc8n
Downloaded: https://youtu.be/gZrwcoiy_gY?si=ksfYE03t6xtuxUL0
Downloaded: https://youtu.be/lSDI5diNu4Y?si=Q-In6zMD4ZpuaPIz
Downloaded: https://youtu.be/8aCGL9GpVUg?si=_0yF1U1thjwJqyPY
Downloaded: https://youtu.be/VjOXvD7OvrE?si=t6xugNxLeMjpsi7E
Downloaded: https://youtu.be/FzifwTnCV5s?si=sR_u4kG-4NoQx5Ux
Downloaded: https://youtu.be/XreeFU7RYeI?si=hsc9WO24dJP6AfV2
Downloaded: https://youtu.be/-4O97jw_8Bc?si=pC14dgZ_f4mXdYPv


# Transcriptions done by Whisper

In [6]:
import whisper
import os

# Load Whisper model
model = whisper.load_model("base")

# Directory containing your .m4a files
audio_dir = "/1_Lab_Final_Proyect/Chatbot-dump"
transcription_dir = "/1_Lab_Final_Proyect/Chatbot-dump/Transcriptions"

# Ensure the transcription directory exists
os.makedirs(transcription_dir, exist_ok=True)

# List all .m4a files in the audio directory
audio_files = [f for f in os.listdir(audio_dir) if f.endswith(".m4a")]

# Transcribe each audio file and save it as a .txt file
for audio_file in audio_files:
    audio_path = os.path.join(audio_dir, audio_file)
    try:
        transcription = model.transcribe(audio_path)

        # Save transcription to a .txt file
        
        transcription_file = os.path.join(transcription_dir, audio_file.replace(".m4a", ".txt"))
        with open(transcription_file, "w") as f:
            f.write(transcription['text'])
        print(f"Transcribed and saved: {transcription_file}")
    except Exception as e:
        print(f"Failed to transcribe {audio_file}: {e}")



100%|███████████████████████████████████████| 139M/139M [00:10<00:00, 13.3MiB/s]
  checkpoint = torch.load(fp, map_location=device)


Failed to transcribe 0ad0281d64cf5d1e366881ec6adfbd9f.m4a: [WinError 2] The system cannot find the file specified
Failed to transcribe 14744c59f76c5a6fb10402ccdd3e280e.m4a: [WinError 2] The system cannot find the file specified
Failed to transcribe 23e9e0c657974c427119ed6e734c704d.m4a: [WinError 2] The system cannot find the file specified
Failed to transcribe 4c4c7f472299b10d74dee0ecdc941b3b.m4a: [WinError 2] The system cannot find the file specified
Failed to transcribe 55340c0bd9e5d16b88d7bdc38908eccc.m4a: [WinError 2] The system cannot find the file specified
Failed to transcribe 55f9f04d946d68dcac4bff1b1292a8c5.m4a: [WinError 2] The system cannot find the file specified
Failed to transcribe 6e43a9a97ff958b094ee86b59d6fa433.m4a: [WinError 2] The system cannot find the file specified
Failed to transcribe 89438d05ab80fbaad4194f97dcf8f786.m4a: [WinError 2] The system cannot find the file specified
Failed to transcribe b81b63d4601ef135d3641d91fa8e5920.m4a: [WinError 2] The system canno

# Check if the transcriptions are on the right path

In [7]:
import os
import glob

# Define the directory for transcription files
transcription_dir = "path/to/transcriptions"

# Check if the directory exists
if not os.path.exists(transcription_dir):
    print(f"The transcription directory {transcription_dir} does not exist.")
else:
    # List transcription files
    transcription_files = glob.glob(os.path.join(transcription_dir, "*.txt"))
    if not transcription_files:
        print(f"No .txt transcription files found in {transcription_dir}.")
    else:
        print(f"Found transcription files: {transcription_files}")


The transcription directory path/to/transcriptions does not exist.
