## Get transcripts from videos with caption IDs

In [None]:
import os
import json
from dotenv import load_dotenv
import pandas as pd
from google.auth.transport.requests import Request as AuthRequest
from googleapiclient.discovery import build
import httplib2
from pprint import pprint
from datetime import timedelta
import re
import pandas as pd

# Bring the variables defined in the .env file into the process environment
load_dotenv()

# Read the YouTube API key from the environment (None when the variable is unset)
yt_api_key = os.environ.get("YOUTUBE_API_KEY")

In [None]:
# Build the YouTube Data API v3 client, authenticated with the API key loaded above
youtube = build(
    serviceName='youtube',
    version='v3',
    developerKey=yt_api_key,
)

In [None]:
# Read the list of videos that are candidates for transcription
ready_to_transcribe = pd.read_csv('../resources/video_list/details_complete.csv')

# Resume from the log of already-transcribed videos when the CSV is present;
# otherwise start with an empty log that has the expected columns
transcribed_videos_df = (
    pd.read_csv("../resources/video_list/transcribed_videos.csv")
    if os.path.exists("../resources/video_list/transcribed_videos.csv")
    else pd.DataFrame(columns=["video_id", "title"])
)


In [None]:
# Function to download a caption track and return its contents as text
def download_and_parse_caption(caption_id, tfmt="srt"):
    """Download a caption track and return its contents as a string.

    Despite the name, no parsing beyond UTF-8 decoding is performed: the
    returned text is the caption file as delivered by the API, in the
    requested format.

    Uses the module-level ``youtube`` API client.

    Args:
        caption_id: ID of the caption track, passed as ``id`` to
            ``youtube.captions().download``.
        tfmt: Caption format to request (e.g. "srt", "vtt"). Defaults to
            "srt", which was previously hard-coded.

    Returns:
        The caption file content decoded as UTF-8.

    NOTE(review): the YouTube Data API reference states that
    ``captions.download`` requires OAuth 2.0 authorization -- confirm that
    the credentials the ``youtube`` client was built with are sufficient.
    """
    # Download the caption file in the requested format
    request = youtube.captions().download(
        id=caption_id,
        tfmt=tfmt,
    )
    caption_response = request.execute()

    # The original code treats the response as bytes; decode it as UTF-8 text
    transcript_text = caption_response.decode('utf-8')

    return transcript_text

In [None]:
### MAIN ###

def _safe_filename_part(text):
    """Return *text* with characters that are invalid in file names replaced by "_"."""
    # Path separators, Windows-reserved punctuation and control characters
    # would either break open() or silently point into another directory.
    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", str(text))


# Create a directory to save transcripts (no-op when it already exists)
os.makedirs("transcripts", exist_ok=True)

# Build the set of already-transcribed video IDs once, so the membership
# test inside the loop does not rescan the whole column on every row
done_video_ids = set(transcribed_videos_df['video_id'])

# Rows for videos transcribed during this run; merged into the log afterwards
new_rows = []

try:
    # Download and save transcripts for each caption
    for _, row in ready_to_transcribe.iterrows():
        caption_id = row['caption_id']
        video_id = row['video_id']
        title = row['title']

        # Skip videos already logged (or already handled earlier in this run)
        if video_id in done_video_ids:
            print(f"Transcription for video ID {video_id} already exists. Skipping...")
            continue

        transcript_text = download_and_parse_caption(caption_id)

        # Construct the file name; the title is sanitized because video
        # titles may contain characters that are not valid in a path
        file_name = f"transcripts/{_safe_filename_part(title)}_{video_id}.txt"

        # Save transcript text to a file
        with open(file_name, "w", encoding="utf-8") as file:
            file.write(transcript_text)

        print(f"Transcript saved to {file_name}")

        # Record the video as transcribed
        done_video_ids.add(video_id)
        new_rows.append({"video_id": video_id, "title": title})
finally:
    # Persist progress even if a download or write failed part-way through,
    # so finished videos are not downloaded again on the next run.
    # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
    if new_rows:
        transcribed_videos_df = pd.concat(
            [transcribed_videos_df, pd.DataFrame(new_rows)],
            ignore_index=True,
        )

    # Save the updated transcribed_videos DataFrame to a CSV file
    transcribed_videos_df.to_csv("../resources/video_list/transcribed_videos.csv", index=False)