In [None]:
# This script gets the transcriptions for the videos in the video list

In [6]:
import os
import json
from dotenv import load_dotenv
import pandas as pd
from google.auth.transport.requests import Request as AuthRequest
from googleapiclient.discovery import build
import httplib2
from pprint import pprint
from datetime import timedelta
import re

# Load environment variables from .env file
load_dotenv()

# Access API key.
# Fail fast here: os.getenv returns None for a missing variable, and the
# client would otherwise be built without a key and only fail later on the
# first API request.
yt_api_key = os.getenv("YOUTUBE_API_KEY")
if not yt_api_key:
    raise RuntimeError(
        "YOUTUBE_API_KEY is not set; add it to your .env file or environment"
    )


In [7]:
# Build the YouTube Data API v3 client, authenticated with the API key only
youtube = build(
    serviceName='youtube',
    version='v3',
    developerKey=yt_api_key,
)

In [32]:
# Load videos list
videos_df = pd.read_csv('../resources/video_list/videos.csv')

# NOTE: this cell previously ended with `videos_df = videos_df.sort_values`,
# which rebinds the name to a bound method (the call parentheses were missing)
# and breaks every later cell. The rows are kept in file order here.
# NOTE(review): if a specific ordering was intended, add an explicit
# `videos_df.sort_values(by=...)` — the intended key is not visible here.
videos_df


Unnamed: 0,playlist,title,video_id,transcribed
0,52_weeks,How to set up a reef tank | 52 Weeks of Reefing,E-aoo7Gl2FQ,False
1,52_weeks,Week 1: Our Best Reef Tank Build Yet | 52 Week...,fKEXNIhomGs,False
2,52_weeks,Week 2: Unveiling the tank and custom built st...,OxZ_hJjXwj8,False
3,52_weeks,"Week 3: Sumps - What do they do, and which sho...",z6foHVHg1Rw,False
4,52_weeks,Week 4: Planning a Safer Tank with Redundancy ...,tppr8V13h5U,False
...,...,...,...,...
264,nutrients,"My Beliefs: Nutrients, 17 Years, and How I App...",ZiPJiSKkVqA,False
265,nutrients,A Fish First Approach to Optimizing Fish Food ...,IqLgZDp2Wzo,False
266,nutrients,A Challenge: Three Effective Filters for Organ...,lMzrWkBmX4g,False
267,nutrients,Are Water Changes Worth It? A Hammer Solution ...,I7_Rm39aOwc,False


In [52]:
# Function for video details
def get_video_details(video_id):
    """Fetch the duration of one or more YouTube videos.

    Parameters
    ----------
    video_id : str or iterable of str
        A single video ID, a comma-separated string of IDs, or a list/tuple
        of IDs.

    Returns
    -------
    list of dict
        One ``{'video_id': <id>, 'length': <str(timedelta)>}`` entry per video
        returned by the API. IDs the API does not return are omitted.
    """
    # Normalise the input to a flat list of IDs. A list must be comma-joined
    # before it is handed to the client, otherwise it is stringified verbatim.
    if isinstance(video_id, str):
        ids = [part.strip() for part in video_id.split(',') if part.strip()]
    else:
        ids = [str(single_id) for single_id in video_id]

    details = []

    # Create an HTTP instance; the request is sent manually (instead of
    # request.execute()) so the referer header can be attached.
    http = httplib2.Http()
    headers = {'referer': 'https://youtube.com'}

    # videos.list accepts a limited number of IDs per call, so send batches
    batch_size = 50
    for start in range(0, len(ids), batch_size):
        batch = ids[start:start + batch_size]

        # Retrieve video details
        request = youtube.videos().list(
            part="snippet,contentDetails",  # Include 'contentDetails' to get video duration
            id=",".join(batch)
        )

        # Execute the request
        response, content = http.request(request.uri, method=request.method, body=request.body, headers=headers)
        response_data = json.loads(content)

        # An error payload (quota, bad key, ...) carries no 'items' key
        if 'items' not in response_data:
            print(f"Error: Unexpected response structure: {response_data.get('error', response_data)}")
            continue

        # Extract video details from the response
        for item in response_data['items']:
            duration_str = item['contentDetails']['duration']  # Duration is provided in ISO 8601 format
            video_length = parse_duration(duration_str)
            # Tag each result with the ID reported by the API, not the input,
            # so batched calls map back to the right video.
            details.append({'video_id': item['id'], 'length': str(video_length)})

    return details


# Function to parse duration string into timedelta object
def parse_duration(duration_str):
    """Convert an ISO 8601 duration such as ``PT1H2M3S`` into a timedelta.

    Supports the optional day component (``P1DT2H``) and the day-only form
    (``P0D``) in addition to hours, minutes and seconds.

    Parameters
    ----------
    duration_str : str
        ISO 8601 duration made of days, hours, minutes and/or seconds.

    Returns
    -------
    datetime.timedelta
        The parsed duration.

    Raises
    ------
    ValueError
        If ``duration_str`` is not a duration in the supported format.
    """
    # Days sit between 'P' and 'T'; the whole time part is optional.
    # fullmatch rejects trailing garbage instead of silently ignoring it.
    pattern = r'P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?'
    match = re.fullmatch(pattern, duration_str)
    if match is None:
        raise ValueError(f"Unsupported ISO 8601 duration: {duration_str!r}")

    # Components missing from the string come back as None and count as zero
    days, hours, minutes, seconds = (int(value) if value else 0 for value in match.groups())

    return timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)


# Function for caption ids
def get_caption_ids(video_id):
    """List the caption track IDs available for a video.

    Parameters
    ----------
    video_id : str
        ID of the video whose caption tracks are listed.

    Returns
    -------
    list of dict
        One ``{'id': <caption id>}`` entry per caption track, in the order the
        API returned them. Empty when the video has no tracks or when the API
        answered with an error payload (the error is printed).
    """
    # Retrieve captions for the video
    request = youtube.captions().list(
        part="snippet",
        videoId=video_id
    )

    # Create an HTTP instance; the request is sent manually so the referer
    # header can be attached.
    http = httplib2.Http()
    headers = {'referer': 'https://youtube.com'}

    # Execute the request
    response, content = http.request(request.uri, method=request.method, body=request.body, headers=headers)
    response_data = json.loads(content)

    # An error payload (quota, permissions, unknown video) has no 'items' key;
    # report it instead of failing with an uninformative KeyError.
    if 'items' not in response_data:
        print(f"Error fetching captions for {video_id}: {response_data.get('error', response_data)}")
        return []

    # Extract captions from the response
    return [{'id': item['id']} for item in response_data['items']]


# Function to download and parse caption file to extract transcript text
def download_and_parse_caption(caption_id):
    """Download one caption track as SRT and return it as text.

    Parameters
    ----------
    caption_id : str
        ID of the caption track to download.

    Returns
    -------
    str
        The caption file content decoded as UTF-8.
    """
    # Request the track in SubRip ("srt") format and run the call right away
    raw_caption = youtube.captions().download(id=caption_id, tfmt="srt").execute()

    # The payload is treated as UTF-8 encoded bytes
    return raw_caption.decode('utf-8')

In [54]:
### MAIN

# Test Frame: the first 30 videos only
test_df = videos_df.head(30)

#Set environment variable -- dev is for testing the code against the API in small batches to keep quota usage down
environment = 'dev'

if environment == 'production':
    # Same object as videos_df (not a copy)
    df = videos_df
elif environment == 'dev':
    # Independent copy of the small test frame
    df = test_df.copy()
else:
    # Stop here: merely printing would leave `df` undefined (or stale from a
    # previous run) and the following cells would fail or use the wrong data.
    raise ValueError(
        f"Unknown environment {environment!r}: expected 'production' or 'dev'"
    )



In [55]:
# Fill in the missing video length and caption id, one video at a time.
# Rows are read from `df` (the frame selected for this environment) while the
# results are written to `videos_df` under the same index label.
for index, row in df.iterrows():
    video_id = row['video_id']

    # Series.get returns None when the column does not exist yet (the CSV has
    # no 'length'/'caption_id' columns on a fresh run); pd.isnull(None) is True,
    # so a missing column is handled like a missing value.

    # Get video details if length is null
    if pd.isnull(row.get('length')):
        video_details = get_video_details(video_id)
        if video_details:
            video_length = video_details[0]['length']
            videos_df.loc[index, 'length'] = video_length

    # Get captions if caption_id is null
    if pd.isnull(row.get('caption_id')):
        captions = get_caption_ids(video_id)
        if captions:
            # Only the first caption track returned is recorded
            caption_id = captions[0]['id']
            videos_df.loc[index, 'caption_id'] = caption_id

display(videos_df.head())

Unnamed: 0,playlist,title,video_id,transcribed,length,caption_id
0,52_weeks,How to set up a reef tank | 52 Weeks of Reefing,E-aoo7Gl2FQ,False,0:01:28,AUieDaanATJE3B_sY3g0XdT4d529fpUTwWeQoRNi8_RNyM...
1,52_weeks,Week 1: Our Best Reef Tank Build Yet | 52 Week...,fKEXNIhomGs,False,0:02:51,AUieDaZUPwzNfRgZ8pkyQVjAyPoKqobPj2n8FaNwOUqe
2,52_weeks,Week 2: Unveiling the tank and custom built st...,OxZ_hJjXwj8,False,0:17:38,AUieDaaGEEmWfrg14IXf7M_i6ZmxAgaPbdif7ba6YD67
3,52_weeks,"Week 3: Sumps - What do they do, and which sho...",z6foHVHg1Rw,False,0:15:11,AUieDaaJ5tS6S0lWQ3n0iLGnQupq2Vc6geBy_kXYiyl5
4,52_weeks,Week 4: Planning a Safer Tank with Redundancy ...,tppr8V13h5U,False,0:17:34,AUieDabkVqpY2TKn0EG-plpyrFZjHJVCdSiOFCDIE2mY


In [66]:
# Preview the first 75 rows of videos_df, including the length and caption_id columns
videos_df.head(75)
# TODO: compare the number of caption_ids with the number of unique caption_ids
# TODO: count the null values

Unnamed: 0,playlist,title,video_id,transcribed,length,caption_id
0,52_weeks,How to set up a reef tank | 52 Weeks of Reefing,E-aoo7Gl2FQ,False,0:01:28,AUieDaanATJE3B_sY3g0XdT4d529fpUTwWeQoRNi8_RNyM...
1,52_weeks,Week 1: Our Best Reef Tank Build Yet | 52 Week...,fKEXNIhomGs,False,0:02:51,AUieDaZUPwzNfRgZ8pkyQVjAyPoKqobPj2n8FaNwOUqe
2,52_weeks,Week 2: Unveiling the tank and custom built st...,OxZ_hJjXwj8,False,0:17:38,AUieDaaGEEmWfrg14IXf7M_i6ZmxAgaPbdif7ba6YD67
3,52_weeks,"Week 3: Sumps - What do they do, and which sho...",z6foHVHg1Rw,False,0:15:11,AUieDaaJ5tS6S0lWQ3n0iLGnQupq2Vc6geBy_kXYiyl5
4,52_weeks,Week 4: Planning a Safer Tank with Redundancy ...,tppr8V13h5U,False,0:17:34,AUieDabkVqpY2TKn0EG-plpyrFZjHJVCdSiOFCDIE2mY
...,...,...,...,...,...,...
70,52_FAQ,Reef tank LED lighting: The best mounting heig...,BdlRU34wZSI,False,0:04:43,AUieDaZopQ3mEU4o_dqjfFHpY-PIpux9RSW7SNYiRj1jhGKv
71,52_FAQ,PAR meter rental: Setting your reef tank LED l...,4lJZCZXE2yo,False,0:02:20,
72,52_FAQ,How many LED modules? Selecting and spacing re...,5dYZMoR1cro,False,,
73,52_FAQ,PUR vs PAR? What is more important when settin...,DMW7rc-aFgs,False,,


In [None]:
# Batch alternative to the row-by-row loop: hand every video ID to
# get_video_details at once instead of calling it once per DataFrame row.

# Step 1: collect the video IDs from the working frame
video_ids = df['video_id'].to_list()

# Step 2: fetch the details for the whole list in a single function call
video_details = get_video_details(video_ids)

# Step 3: turn the list of detail records into a DataFrame
video_details_df = pd.DataFrame(data=video_details)

video_details_df.head(5)

In [16]:
# Step 4: Attach the fetched details to the working frame.
# Left join on video_id keeps every row of df, including videos without details.
merged_df = df.merge(video_details_df, how='left', on='video_id')

merged_df

Unnamed: 0,playlist,title,video_id,transcribed,length
0,52_weeks,How to set up a reef tank | 52 Weeks of Reefing,E-aoo7Gl2FQ,False,0:01:28
1,52_weeks,Week 1: Our Best Reef Tank Build Yet | 52 Week...,fKEXNIhomGs,False,0:02:51
2,52_weeks,Week 2: Unveiling the tank and custom built st...,OxZ_hJjXwj8,False,0:17:38


In [None]:
# Download every caption track of one video and save each as a text file.
# NOTE(review): captions.download normally needs OAuth 2.0 authorization rather
# than an API key alone — confirm the client's credentials before running.
video_id = "YOUR_VIDEO_ID_HERE"

# The helper defined in this notebook is get_caption_ids
# (the previously called `get_captions` does not exist).
captions_data = get_caption_ids(video_id)

# Create a directory to save transcripts; exist_ok avoids the separate
# exists-check and makes the cell safe to re-run.
os.makedirs("transcripts", exist_ok=True)

# Download and save transcripts for each caption
for caption in captions_data:
    caption_id = caption['id']
    transcript_text = download_and_parse_caption(caption_id)

    # Save transcript text to a file named after the video and caption track
    file_name = f"transcripts/{video_id}_{caption_id}.txt"
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(transcript_text)

    print(f"Transcript saved to {file_name}")

In [44]:
# Function for video details
# NOTE: this redefines the get_video_details declared earlier in the notebook;
# whichever cell ran last is the version in effect.
def get_video_details(video_id):
    """Fetch the duration of one or more YouTube videos, reporting failures.

    Parameters
    ----------
    video_id : str or iterable of str
        A single video ID, a comma-separated string of IDs, or a list/tuple
        of IDs.

    Returns
    -------
    list of dict
        One ``{'video_id': <id>, 'length': <str(timedelta)>}`` entry per video
        returned by the API. Request errors are printed rather than raised,
        and the IDs that produced no result are printed at the end.
    """
    # Normalise the input to a flat list of IDs. A list must be comma-joined
    # before it is handed to the client, otherwise it is stringified verbatim.
    if isinstance(video_id, str):
        ids = [part.strip() for part in video_id.split(',') if part.strip()]
    else:
        ids = [str(single_id) for single_id in video_id]

    details = []

    # Create an HTTP instance; the request is sent manually (instead of
    # request.execute()) so the referer header can be attached.
    http = httplib2.Http()
    headers = {'referer': 'https://youtube.com'}

    # videos.list accepts a limited number of IDs per call, so send batches
    batch_size = 50
    for start in range(0, len(ids), batch_size):
        batch = ids[start:start + batch_size]

        try:
            # Retrieve video details
            request = youtube.videos().list(
                part="snippet,contentDetails",  # Include 'contentDetails' to get video duration
                id=",".join(batch)  # Convert the list of video IDs to a comma-separated string
            )

            # Execute the request
            response, content = http.request(request.uri, method=request.method, body=request.body, headers=headers)
            response_data = json.loads(content)

            # Check if the response contains the expected structure
            if 'items' in response_data:
                # Extract video details from the response
                for item in response_data['items']:
                    duration_str = item['contentDetails']['duration']  # Duration is provided in ISO 8601 format
                    video_length = parse_duration(duration_str)
                    # Tag each result with the ID reported by the API, not the
                    # input, so batched calls map back to the right video.
                    details.append({'video_id': item['id'], 'length': str(video_length)})
            else:
                # Handle cases where the response does not contain the expected structure
                print(f"Error: Unexpected response structure: {response_data.get('error', response_data)}")

        except Exception as e:
            # Best effort: report the failed batch and carry on with the next one
            print(f"Error fetching video details: {e}")

    # Identify failed video IDs by comparing whole IDs (calling set() on a
    # plain string would compare individual characters). Order is preserved
    # and duplicates are collapsed.
    successful_video_ids = {detail['video_id'] for detail in details}
    failed_video_ids = [
        single_id for single_id in dict.fromkeys(ids)
        if single_id not in successful_video_ids
    ]
    failed_count = len(failed_video_ids)
    if failed_video_ids:
        print(failed_count)
        print(f"Failed video IDs: {failed_video_ids}")

    return details
