## PROJECT: Business Case - Building a Multimodal AI Chatbot for YouTube Videos
**Preparing Data**
* Creating a CSV file with YouTube video links (22 records)
* Extracting metadata from YouTube using the CSV file
* Generating audio files from YouTube using the metadata
* Transcribing the audio files using the Whisper model

## Step 1: Load CSV File

In [7]:
# yt-dlp provides the `yt_dlp` module used below for metadata lookup and audio download.
%pip install yt-dlp
# pandas and numpy are imported by the notebook for tabular data handling.
%pip install pandas numpy --quiet
# openai-whisper provides the `whisper` module used for transcription.
%pip install openai-whisper --quiet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
import os
import yt_dlp
import whisper

In [2]:
# Expose the local ffmpeg build on PATH; the FFmpegExtractAudio post-processor
# used for the audio download needs the ffmpeg executable (presumably Whisper's
# audio loading does too -- TODO confirm).
FFMPEG_BIN = r"C:\ffmpeg-7.1.1-essentials_build\bin"

# Notebook cells get re-run: only append when the directory is not already
# present so PATH does not grow with duplicates on every execution.
# A substring test is used instead of splitting on os.pathsep because the
# drive-letter colon would be split apart on platforms where pathsep is ":".
_current_path = os.environ.get("PATH", "")
if FFMPEG_BIN not in _current_path:
    os.environ["PATH"] = (
        f"{_current_path}{os.pathsep}{FFMPEG_BIN}" if _current_path else FFMPEG_BIN
    )

In [3]:
# Load the semicolon-delimited list of videos and preview its first rows.
df = pd.read_csv(
    "../Data/SNOW_YT_Videos.csv",
    delimiter=";",
)
print(df.head(n=5))

   Number                                 Youtube_link  \
0       1  https://www.youtube.com/watch?v=tOaMRG8DX3U   
1       2  https://www.youtube.com/watch?v=vteLoWpNw8Q   
2       3  https://www.youtube.com/watch?v=7WJ6lmxa1WQ   
3       4  https://www.youtube.com/watch?v=fqB-NcZmqXo   
4       5  https://www.youtube.com/watch?v=ZYJqkxGrNiI   

                                             Subject  
0  An AI Agent that knows everything about your P...  
1          What Is Agentic AI and Why Should I Care?  
2                     Agentic AI workflows for AIOps  
3  ServiceNow's agentic AI framework explained: W...  
4  AI and Business Agility: Enhancing Human Intel...  


## Extract Metadata from Videos

In [4]:
# Ensure the directory this cell writes its CSV into exists. The output goes to
# "../Data/", so create that directory rather than a stray "Data" folder in the
# current working directory.
os.makedirs("../Data", exist_ok=True)

def get_metadata_yt_dlp(video_url):
    """Fetch basic metadata for one video via yt-dlp without downloading it.

    Args:
        video_url: URL of the video to inspect.

    Returns:
        On success, a dict with the keys ``title``, ``channel`` (yt-dlp's
        ``uploader``), ``description`` (truncated to 200 characters),
        ``length`` (yt-dlp's ``duration``), ``publish_date`` (yt-dlp's
        ``upload_date``) and ``views`` (yt-dlp's ``view_count``).
        On any failure, a dict with the single key ``error`` holding the
        exception message, so one bad link does not abort a batch run.
    """
    ydl_opts = {'quiet': True, 'skip_download': True}
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        try:
            info = ydl.extract_info(video_url, download=False)
            # `.get(key, "")` only covers a missing key; an explicit None
            # value would make the slice raise and discard all other fields.
            description = info.get("description") or ""
            return {
                "title": info.get("title"),
                "channel": info.get("uploader"),
                "description": description[:200],
                "length": info.get("duration"),
                "publish_date": info.get("upload_date"),
                "views": info.get("view_count")
            }
        except Exception as e:
            # Deliberately broad: this is the per-video boundary of a
            # best-effort batch; the message is kept for later inspection.
            return {"error": str(e)}

# Look up metadata for every link, then place the results side by side with
# the original columns and persist the combined table.
metadata_list = list(map(get_metadata_yt_dlp, df["Youtube_link"]))
metadata_df = pd.DataFrame(metadata_list)
final_df = pd.concat([df, metadata_df], axis="columns")
final_df.to_csv(
    "../Data/ServiceNow_Youtube_Metadata_Clean.csv",
    index=False,
)

ERROR: [youtube] VFGAvNxaK4Q: Private video. Sign in if you've been granted access to this video. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies


## Transcription with Whisper

In [5]:
# Reload the metadata table written by the metadata step. That file is saved
# with pandas' default comma separator, so it must not be read with sep=";"
# (doing so collapses every row into a single column).
df = pd.read_csv("../Data/ServiceNow_Youtube_Metadata_Clean.csv")

# Load the "base" Whisper model once; it is reused for every transcription.
model = whisper.load_model("base")

# download_audio() writes its files under "audio_files/" relative to the
# working directory, so make sure that exact directory exists.
os.makedirs("audio_files", exist_ok=True)

def download_audio(url, video_id, output_dir="audio_files"):
    """Download a video's audio track with yt-dlp and convert it to mp3.

    Args:
        url: URL of the video to download.
        video_id: Identifier used as the output file name (without extension).
        output_dir: Directory the file is written to. Defaults to
            ``"audio_files"``, the location this function has always used.

    Returns:
        The path ``"<output_dir>/<video_id>.mp3"`` on success, or ``None`` if
        the download or conversion raised an exception.
    """
    target_stem = f'{output_dir}/{video_id}'
    ydl_opts = {
        'format': 'bestaudio/best',
        # yt-dlp fills in %(ext)s; the post-processor below then produces .mp3.
        'outtmpl': f'{target_stem}.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'quiet': True
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
    except Exception as e:
        # Best-effort per video: report why it failed instead of hiding it,
        # then signal the failure to the caller with None as before.
        print(f"⚠️ Audio download failed for {url}: {e}")
        return None
    return f'{target_stem}.mp3'

In [7]:
import warnings
from urllib.parse import parse_qs, urlparse

# The FP16 warning is emitted when transcribing on CPU; the FP32 fallback is
# expected here, so keep the cell output readable.
warnings.filterwarnings("ignore", message="FP16 is not supported on CPU; using FP32 instead")

# Same "../Data" directory (same letter case) as the other CSV files of this
# notebook, so the path also resolves on case-sensitive file systems.
output_path = "../Data/video_metadata_with_transcripts.csv"


def _extract_video_id(url):
    """Return the video id of a YouTube URL.

    Reads the ``v`` query parameter, so trailing parameters such as
    ``&t=30s`` are not included in the id. For URLs without a ``v`` parameter
    (e.g. short ``youtu.be/<id>`` links) the last path segment is used; if
    that is empty too, fall back to the original ``split("v=")`` behaviour.
    """
    parsed = urlparse(url)
    query_ids = parse_qs(parsed.query).get("v")
    if query_ids:
        return query_ids[0]
    last_segment = parsed.path.rstrip("/").rsplit("/", 1)[-1]
    return last_segment or url.split("v=")[-1]


# Decide explicitly on every execution. Checking for the variable's mere
# existence would pick up a flag left over from an earlier run of this cell
# and re-transcribe everything even when the transcripts already exist.
run_transcription = True
if os.path.exists(output_path):
    existing_df = pd.read_csv(output_path)
    if "transcript" in existing_df.columns and not existing_df["transcript"].isnull().all():
        print("✅ Transcripts already exist. Skipping transcription step.")
        run_transcription = False
    else:
        print("⚠️ Transcripts missing or empty. Running transcription...")
else:
    print("📂 File not found. Running transcription...")

if run_transcription:
    transcripts = []

    for idx, row in final_df.iterrows():
        url = row['Youtube_link']
        video_id = _extract_video_id(url)
        print(f"🔊 Processing video {idx+1}: {url}")

        audio_path = download_audio(url, video_id)
        if audio_path and os.path.exists(audio_path):
            try:
                result = model.transcribe(audio_path)
                transcripts.append(result['text'])
            except Exception as e:
                # Keep one entry per row so the column stays aligned with
                # final_df; the message records why this video has no text.
                transcripts.append(f"Error during transcription: {str(e)}")
        else:
            transcripts.append("Error: Audio download failed or video may be protected")

    final_df["transcript"] = transcripts
    # Make sure the target directory exists so the results of the long
    # transcription run are not lost to a failing write.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    final_df.to_csv(output_path, index=False)
    print("✅ Transcripts saved to:", output_path)

✅ Transcripts already exist. Skipping transcription step.
