In [70]:
!pip install pandas
!pip install yt-dlp
!pip install python-dotenv
!pip install --upgrade google-api-python-client

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [74]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Read the three API keys from the environment in one pass; os.getenv returns
# None for any variable that is not set.
DEVELOPER_KEY, DEVELOPER_KEY_2, DEVELOPER_KEY_3 = (
    os.getenv(key_name)
    for key_name in ("DEVELOPER_KEY", "DEVELOPER_KEY_2", "DEVELOPER_KEY_3")
)

In [1]:
#!/usr/bin/python

# This sample executes a search request for the specified search term.
# Sample usage:
#   python search.py --q=surfing --max-results=10
# NOTE: To use the sample, you must provide a developer key obtained
#       in the Google APIs Console. This notebook reads it from the
#       DEVELOPER_KEY environment variable (loaded from the .env file above).

import argparse

from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from googleapiclient.discovery import build

In [5]:

# Service name and version handed to googleapiclient's build() in youtube_search.
YOUTUBE_API_SERVICE_NAME = 'youtube'
YOUTUBE_API_VERSION = 'v3'

def youtube_search(options):
    """Search YouTube, list caption tracks per video, and print a summary.

    Parameters:
    - options: An object exposing ``q`` (search term) and ``max_results``
      (maximum number of search results), e.g. an ``argparse.Namespace``.

    Returns:
    A list of dicts with keys ``'title'``, ``'url'`` and ``'captions'`` (a list
    of formatted caption descriptions), one per video in the search response.
    The same information is printed to stdout.

    Raises:
    Whatever the API client raises from ``execute()``; nothing is caught here.
    """
    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=DEVELOPER_KEY)

    # Call the search.list method to retrieve results matching the specified
    # query term.
    search_response = youtube.search().list(
        q=options.q,
        part='id,snippet',
        maxResults=options.max_results
    ).execute()

    videos = []
    for search_result in search_response.get('items', []):
        # Only video results carry a videoId; skip every other result kind.
        if search_result['id']['kind'] != 'youtube#video':
            continue

        video_id = search_result['id']['videoId']

        # One captions.list request per video.
        captions_response = youtube.captions().list(part='snippet', videoId=video_id).execute()
        video_captions = [
            f"Caption ID: {item['id']}, Language: {item['snippet']['language']}, Name: {item['snippet']['name']}"
            for item in captions_response.get('items', [])
        ]

        videos.append({
            'title': search_result['snippet']['title'],
            'url': f'https://www.youtube.com/watch?v={video_id}',
            'captions': video_captions
        })

    # Display the videos and their captions
    for video in videos:
        print(f"Video Title: {video['title']} ({video['url']})")
        if video['captions']:
            print("Captions:")
            for caption in video['captions']:
                print(f" - {caption}")
        else:
            print("No captions available")
        print("\n")  # Add extra newline for better readability

    return videos



# parser = argparse.ArgumentParser()
# parser.add_argument('--q', help='Search term', default='Google')
# parser.add_argument('--max-results', help='Max results', default=25)
# args = parser.parse_args()

In [3]:
import argparse

# Demo run: search for two results. An HttpError raised by the API client is
# reported (status code and raw response body) instead of propagating.
args = argparse.Namespace(q='positive affirmations', max_results=2)
try:
    youtube_search(args)
except HttpError as e:
    print('An HTTP error {} occurred:\n{}'.format(e.resp.status, e.content))

Video Title: Positive Morning Affirmations to Start Your Day on the Right Foot (https://www.youtube.com/watch?v=poj98oUJm3U)
Captions:
 - Caption ID: AUieDabv9ig-GJC3ZKCzT9n9a5keLGcIcJGanSJr0eS1RCnTNIs, Language: en, Name: 


Video Title: I AM Affirmations for Manifestation, Positive Thinking, Confidence, Clearing Negative Energy (https://www.youtube.com/watch?v=kdU0iYxmpvY)
No captions available




In [22]:
# Ask yt-dlp for English subtitles converted to SRT, without downloading the video.
# NOTE(review): the recorded run below reports no subtitles for the requested
# language; the next cell enables auto-generated subtitles instead.
!yt-dlp --write-subs --sub-langs en --convert-subs srt --skip-download 'https://www.youtube.com/watch?v=poj98oUJm3U'

[youtube] Extracting URL: https://www.youtube.com/watch?v=poj98oUJm3U
[youtube] poj98oUJm3U: Downloading webpage
[youtube] poj98oUJm3U: Downloading ios player API JSON
[youtube] poj98oUJm3U: Downloading android player API JSON
[youtube] poj98oUJm3U: Downloading m3u8 information
[info] poj98oUJm3U: Downloading 1 format(s): 22
[info] There are no subtitles for the requested languages
[SubtitlesConvertor] There aren't any subtitles to convert


In [27]:
import yt_dlp

video_url = 'https://www.youtube.com/watch?v=poj98oUJm3U'

# Subtitle-only run: request the auto-generated English track and skip the
# video file itself.
ydl_opts = dict(
    writeautomaticsub=True,   # Download automatic subtitles (auto-generated)
    subtitleslangs=['en'],    # Specify subtitle languages
    skip_download=True,       # Skip downloading the video itself
)

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download([video_url])


[youtube] Extracting URL: https://www.youtube.com/watch?v=poj98oUJm3U
[youtube] poj98oUJm3U: Downloading webpage
[youtube] poj98oUJm3U: Downloading ios player API JSON
[youtube] poj98oUJm3U: Downloading android player API JSON
[youtube] poj98oUJm3U: Downloading m3u8 information
[info] poj98oUJm3U: Downloading subtitles: en
[info] poj98oUJm3U: Downloading 1 format(s): 22
Deleting existing file Positive Morning Affirmations to Start Your Day on the Right Foot [poj98oUJm3U].en.vtt
[info] Writing video subtitles to: Positive Morning Affirmations to Start Your Day on the Right Foot [poj98oUJm3U].en.vtt
[download] Destination: Positive Morning Affirmations to Start Your Day on the Right Foot [poj98oUJm3U].en.vtt
[download] 100% of   37.45KiB in 00:00:00 at 286.96KiB/s


In [24]:
def convert_vtt_to_srt(vtt_filename, srt_filename):
    """Convert a WebVTT subtitle file into a SubRip (.srt) file.

    Parameters:
    - vtt_filename: Path of the UTF-8 encoded .vtt file to read.
    - srt_filename: Path of the .srt file to write (overwritten if present).

    Each cue is written as a sequence number, a timing line of the form
    ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` and the cue text. Cue settings that
    follow the timestamps (for example ``align:start position:0%``) and cue
    identifiers are dropped, and timestamps written without hours get ``00:``
    prepended. Blocks without a timing line (the ``WEBVTT`` header, NOTE or
    STYLE blocks) and cues without text are skipped. The cue text itself is
    copied unchanged, including any inline tags.
    """
    import re

    timing = re.compile(
        r'^\s*(?:(\d+):)?(\d{2}):(\d{2})[.,](\d{3})\s+-->\s+'
        r'(?:(\d+):)?(\d{2}):(\d{2})[.,](\d{3})'
    )

    def to_srt_time(hours, minutes, seconds, millis):
        # SRT always carries an hours field and uses ',' before milliseconds.
        return f"{int(hours or 0):02d}:{minutes}:{seconds},{millis}"

    # utf-8-sig transparently drops a leading byte-order mark if there is one.
    with open(vtt_filename, 'r', encoding='utf-8-sig') as vtt_file:
        text = vtt_file.read()

    cues = []
    block = []
    # Blocks are separated by truly empty lines; a line holding only a space is
    # cue text and must not end the block. The extra '' flushes the last block.
    for line in text.split('\n') + ['']:
        if line != '':
            block.append(line)
            continue
        # The timing line is the first line of a cue, or the second when the
        # cue starts with an identifier.
        for position, candidate in enumerate(block[:2]):
            match = timing.match(candidate)
            if match:
                parts = match.groups()
                payload = block[position + 1:]
                if payload:
                    cues.append((to_srt_time(*parts[:4]), to_srt_time(*parts[4:]), payload))
                break
        block = []

    with open(srt_filename, 'w', encoding='utf-8') as srt_file:
        for number, (start, end, payload) in enumerate(cues, start=1):
            srt_file.write(f"{number}\n{start} --> {end}\n")
            srt_file.write('\n'.join(payload) + '\n\n')

# Example usage: convert the .vtt file written by the yt-dlp cell above.
# The .vtt file must already exist in the working directory, otherwise open() fails.
vtt_filename = 'Positive Morning Affirmations to Start Your Day on the Right Foot [poj98oUJm3U].en.vtt'
srt_filename = 'Positive Morning Affirmations to Start Your Day on the Right Foot [poj98oUJm3U].en.srt'
convert_vtt_to_srt(vtt_filename, srt_filename)


In [75]:
from googleapiclient.discovery import build
from datetime import datetime
import yt_dlp
import pandas as pd
import os

# Ensure the subtitles directory exists
os.makedirs('subtitles', exist_ok=True)

# Search settings read by youtube_search_all_videos and process_videos.
options = dict(
    q='positive affirmations',  # User input for search query
    max_results=1,  # Maximum number of results to return
    order='viewCount',  # Order by viewCount, videoCount, date, relevance
    video_duration=['medium'],  # Filter for video duration
    published_after='2010-01-01T00:00:00Z',  # ISO 8601 format
    published_before='2024-12-31T23:59:59Z',  # ISO 8601 format
    relevance_language='en',  # Relevance language set to English
    video_category_id='10',  # Category ID (e.g., '10' for Music)
)

# Parse both date bounds (trailing 'Z' stripped first) as an optional sanity
# check; datetime.fromisoformat raises ValueError on malformed input.
published_after, published_before = (
    datetime.fromisoformat(options[bound].rstrip('Z'))
    for bound in ('published_after', 'published_before')
)

def youtube_search_all_videos(options, DEVELOPER_KEY, max_iterations=1):
    """Run a filtered YouTube video search and collect the result items.

    Parameters:
    - options: A dict with the keys 'q', 'max_results', 'order',
      'video_duration', 'published_after', 'published_before',
      'relevance_language' and 'video_category_id'. 'video_duration' may be a
      single string or a sequence, in which case its first entry is used.
    - DEVELOPER_KEY: API key passed to the API client.
    - max_iterations: Maximum number of result pages to request (default 1).

    Returns:
    A list with the 'items' of every page fetched, in request order.
    """
    # Initialize the YouTube API client
    youtube = build('youtube', 'v3', developerKey=DEVELOPER_KEY)

    # Indexing a plain string with [0] would send only its first character,
    # so only unwrap the value when it is not already a string.
    video_duration = options['video_duration']
    if not isinstance(video_duration, str):
        video_duration = video_duration[0]

    all_videos = []
    page_token = None

    for _ in range(max_iterations):
        # Perform the search with additional parameters
        search_response = youtube.search().list(
            q=options['q'],
            part='id,snippet',
            maxResults=options['max_results'],
            order=options['order'],
            type='video',
            videoDuration=video_duration,
            publishedAfter=options['published_after'],
            publishedBefore=options['published_before'],
            relevanceLanguage=options['relevance_language'],
            videoCategoryId=options['video_category_id'],
            pageToken=page_token  # nextPageToken from the previous request
        ).execute()

        all_videos.extend(search_response.get('items', []))
        page_token = search_response.get('nextPageToken')
        if not page_token:
            # No further pages are available.
            break

    return all_videos


def initialize_dataframe():
    """Return an empty DataFrame that has the columns of the results table."""
    video_fields = ['title', 'url', 'view_count', 'like_count', 'upload_date']
    subtitle_fields = ['caption_id', 'subtitle_file', 'subtitle_text']
    search_fields = [
        'order', 'video_duration', 'published_after', 'published_before',
        'relevance_language', 'video_category_id',
    ]
    return pd.DataFrame(columns=video_fields + subtitle_fields + search_fields)


def fetch_video_details(youtube, video_id):
    """Look up one video through the API client's videos().list call.

    Parameters:
    - youtube: The API client object.
    - video_id: The YouTube video ID.

    Returns:
    The first entry of the response's 'items', or None when 'items' is
    missing or empty.
    """
    response = youtube.videos().list(
        id=video_id,
        part='snippet,statistics,contentDetails'
    ).execute()

    items = response.get('items')
    if not items:
        return None
    return items[0]


def convert_iso8601_duration(duration):
    """
    Converts ISO 8601 duration format to a more readable format.

    Parameters:
    - duration: A string in ISO 8601 duration format built from week, day,
      hour, minute and second designators, e.g. 'PT4M13S', 'P1DT2H' or 'P0D'.
      The time part (everything from 'T' on) is optional.

    Returns:
    A comma-separated string such as '4 minutes, 13 seconds', or 'Varies'
    when the duration carries no components at all.

    Raises:
    ValueError if the string does not match the supported duration format.
    """
    import re

    # fullmatch rejects trailing garbage; the whole time part is optional so
    # date-only durations such as 'P0D' are accepted.
    match = re.fullmatch(
        r'P(?:(\d+)W)?(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?',
        duration,
    )
    if match is None:
        raise ValueError(f"Unsupported ISO 8601 duration: {duration!r}")

    units = ('week', 'day', 'hour', 'minute', 'second')
    # Components absent from the string are None and are skipped; a component
    # that is present is kept verbatim (so 'PT0S' yields '0 seconds').
    readable_parts = [
        f"{value} {unit}{'s' if value != '1' else ''}"
        for unit, value in zip(units, match.groups())
        if value
    ]

    return ', '.join(readable_parts) if readable_parts else 'Varies'
    

def extract_video_details(video_details, video_id):
    """
    Pull the fields used by the results table out of one video resource.

    Parameters:
    - video_details: A dict with 'snippet', 'statistics' and 'contentDetails'
      entries for a single video.
    - video_id: The YouTube video ID, used to build the watch URL.

    Returns:
    A tuple (title, url, view_count, like_count, upload_date, duration), where
    the counts fall back to 'N/A' when absent and duration is the result of
    convert_iso8601_duration on the resource's ISO 8601 duration.
    """
    snippet = video_details['snippet']
    title = snippet['title']

    stats = video_details['statistics']
    views = stats.get('viewCount', 'N/A')
    likes = stats.get('likeCount', 'N/A')

    published = snippet['publishedAt']
    iso_duration = video_details['contentDetails']['duration']

    return (
        title,
        f'https://www.youtube.com/watch?v={video_id}',
        views,
        likes,
        published,
        convert_iso8601_duration(iso_duration),
    )
    

def download_subtitles(video_id, url, lang='en', output_dir='subtitles'):
    """Download subtitles for one video with yt-dlp and return their text.

    Parameters:
    - video_id: The YouTube video ID; used to name the subtitle file.
    - url: The video URL handed to yt-dlp.
    - lang: Subtitle language code to request (default 'en').
    - output_dir: Directory the subtitle file is written to (default 'subtitles').

    Returns:
    A tuple (subtitle_file, subtitle_text) when the expected
    '<output_dir>/<video_id>.<lang>.vtt' file exists after the run, otherwise
    (pd.NA, pd.NA). Failures are reported on stdout and also yield
    (pd.NA, pd.NA); this function is best-effort and does not raise for them.
    """
    # Request uploaded and auto-generated subtitles, without the video itself.
    ydl_opts = {
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitleslangs': [lang],
        'skip_download': True,
        'outtmpl': f'{output_dir}/{video_id}.%(ext)s',
        'quiet': True
    }

    # Expected output path (assuming VTT format); adjust if yt-dlp's actual
    # output naming differs.
    subtitle_file = f"{output_dir}/{video_id}.{lang}.vtt"

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.extract_info(url, download=True)
        if os.path.exists(subtitle_file):
            with open(subtitle_file, 'r', encoding='utf-8') as file:
                return subtitle_file, file.read()
    except Exception as e:
        # Keep processing other videos, but make the failure visible.
        print(f"Could not download subtitles for {video_id}: {e}")

    return pd.NA, pd.NA
        

def append_video_info_to_df(df, video_info):
    """Return a new DataFrame: df followed by one row built from video_info.

    The result is re-indexed from 0; df itself is not modified.
    """
    return pd.concat([df, pd.DataFrame([video_info])], ignore_index=True)


def process_videos(all_videos, options, youtube=None):
    """Build the results table for a list of search results.

    Parameters:
    - all_videos: Search result items; each must provide ['id']['videoId'].
    - options: The search options dict; its 'order', 'video_duration',
      'published_after', 'published_before', 'relevance_language' and
      'video_category_id' values are copied into every row.
    - youtube: Optional API client used to fetch video details. When omitted,
      a client is built with the module-level DEVELOPER_KEY.

    Returns:
    A DataFrame with the columns of initialize_dataframe() and one row per
    video for which details could be fetched.
    """
    if youtube is None:
        # Previously this function read an undefined global `youtube`.
        youtube = build('youtube', 'v3', developerKey=DEVELOPER_KEY)

    rows = []
    for video in all_videos:
        video_id = video['id']['videoId']
        video_details = fetch_video_details(youtube, video_id)
        if not video_details:
            continue

        # NOTE(review): the readable duration is extracted but not stored; the
        # 'video_duration' column records the search filter. Confirm intent.
        title, url, view_count, like_count, upload_date, _readable_duration = extract_video_details(video_details, video_id)

        # Download subtitles
        subtitle_file, subtitle_text = download_subtitles(video_id, url)

        rows.append({
            'title': title,
            'url': url,
            'view_count': view_count,
            'like_count': like_count,
            'upload_date': upload_date,
            'caption_id': pd.NA,  # download_subtitles does not provide a caption ID
            'subtitle_file': subtitle_file,
            'subtitle_text': subtitle_text,
            # Search settings that produced this row
            'order': options['order'],
            'video_duration': options['video_duration'],
            'published_after': options['published_after'],
            'published_before': options['published_before'],
            'relevance_language': options['relevance_language'],
            'video_category_id': options['video_category_id']
        })

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(rows, columns=initialize_dataframe().columns)

# Call the search function with the third API key, then build the results table
all_videos = youtube_search_all_videos(options, DEVELOPER_KEY_3)
df = process_videos(all_videos, options)

# Adjust display options so wide frames and long cells are not truncated
pd.set_option('display.max_columns', 1000)  # show up to 1000 columns (None would mean unlimited)
pd.set_option('display.max_colwidth', 1000)  # show up to 1000 characters per cell (None would mean no limit)


df.head(2)

                                                        

Unnamed: 0,title,url,view_count,like_count,upload_date,caption_id,subtitle_file,subtitle_text,order,video_duration,published_after,published_before,relevance_language,video_category_id
0,"POWERFUL POSITIVE Morning Affirmations for POSITIVE DAY, WAKE UP: 21 Day ""I AM"" Affirmations",https://www.youtube.com/watch?v=ZssjZnsN4Gg,15511727,175397,2018-06-05T02:00:02Z,,subtitles/ZssjZnsN4Gg.en.vtt,WEBVTT\nKind: captions\nLanguage: en\n\n00:00:01.830 --> 00:00:11.580 align:start position:0%\n \n[Music]\n\n00:00:11.580 --> 00:00:11.590 align:start position:0%\n \n \n\n00:00:11.590 --> 00:00:14.560 align:start position:0%\n \nscientific<00:00:12.590><c> prayer</c><00:00:12.980><c> or</c><00:00:13.280><c> affirmations</c><00:00:14.269><c> of</c><00:00:14.450><c> a</c>\n\n00:00:14.560 --> 00:00:14.570 align:start position:0%\nscientific prayer or affirmations of a\n \n\n00:00:14.570 --> 00:00:16.929 align:start position:0%\nscientific prayer or affirmations of a\nharmonious<00:00:15.290><c> interaction</c><00:00:16.129><c> of</c><00:00:16.250><c> the</c><00:00:16.369><c> conscious</c>\n\n00:00:16.929 --> 00:00:16.939 align:start position:0%\nharmonious interaction of the conscious\n \n\n00:00:16.939 --> 00:00:19.260 align:start position:0%\nharmonious interaction of the conscious\nand<00:00:17.090><c> subconscious</c><00:00:17.450><c> levels</c><00:00:18.349><c> of</c><00:00:18.5...,viewCount,[medium],2010-01-01T00:00:00Z,2024-12-31T23:59:59Z,en,10


In [40]:
# Reset display options to their defaults
pd.reset_option('display.max_columns')
pd.reset_option('display.max_colwidth')

# Show the last two rows of the results table
df.tail(2)

Unnamed: 0,title,url,view_count,like_count,upload_date,caption_id,subtitle_file,subtitle_text,order,video_duration,published_after,published_before,relevance_language,video_category_id
1,4 Minutes To Start Your Day Right! MORNING MOT...,https://www.youtube.com/watch?v=HgiiY9TLtX8,6165178,165123,2018-06-20T03:43:43Z,,subtitles/iPm0TmMyhSI.en.vtt,WEBVTT\nKind: captions\nLanguage: en\n\n00:00:...,viewCount,medium/long,2021-01-01,2021-12-31,en,10
2,"I AM Morning Affirmations: Gratitude, Self Lov...",https://www.youtube.com/watch?v=SZr5LDFKn8w,3116451,48100,2018-07-11T19:41:05Z,,subtitles/iPm0TmMyhSI.en.vtt,WEBVTT\nKind: captions\nLanguage: en\n\n00:00:...,viewCount,medium/long,2021-01-01,2021-12-31,en,10
