In [1]:
!pip install boto3
!pip install --upgrade google-api-python-client



In [25]:
import boto3
import hashlib
import json
import os
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import yt_dlp
from datetime import datetime
from dotenv import load_dotenv


def current_timestamp():
    """Returns the current timestamp formatted for readability."""
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

def get_env_variables():
    """Fetch all necessary configurations from environment variables."""
    return {
        'DEVELOPER_KEY': os.getenv('DEVELOPER_KEY'),
        'AWS_ACCESS_KEY_ID': os.getenv('AWS_ACCESS_KEY_ID'),
        'AWS_SECRET_ACCESS_KEY': os.getenv('AWS_SECRET_ACCESS_KEY'),
        'SEARCH_CACHE_TABLE': os.getenv('SEARCH_CACHE_TABLE'),
        'RESULTS_TABLE_NAME': os.getenv('RESULTS_TABLE_NAME'),
        'SEARCH_QUERY': os.getenv('SEARCH_QUERY'),
        'MAX_RESULTS': int(os.getenv('MAX_RESULTS', 3)),
        'ORDER': os.getenv('ORDER', 'viewCount'),
        'VIDEO_DURATION': os.getenv('VIDEO_DURATION', 'medium'),
        'PUBLISHED_AFTER': os.getenv('PUBLISHED_AFTER', '2010-01-01T00:00:00Z'),
        'PUBLISHED_BEFORE': os.getenv('PUBLISHED_BEFORE', '2024-12-31T23:59:59Z'),
        'RELEVANCE_LANGUAGE': os.getenv('RELEVANCE_LANGUAGE', 'en'),
        # 'VIDEO_CATEGORY_ID': os.getenv('VIDEO_CATEGORY_ID', '10'),
        'AWS_REGION': os.getenv('AWS_REGION')  # Add AWS region to the environment variables
    }

def open_aws_dynamodb_session(options):
    """
    Configures a boto3 session using AWS credentials read from environment variable.
    Returns a boto3 DynamoDB resource configured with these credentials.
    """
    try:

        # Configure the boto3 session with the read credentials
        session = boto3.Session(
            aws_access_key_id=options['AWS_ACCESS_KEY_ID'],
            aws_secret_access_key=options['AWS_SECRET_ACCESS_KEY'],
            region_name=options['AWS_REGION']  # Specify your AWS region
        )
        dynamodb_resource = session.resource('dynamodb')
        print(f"Successfully open_aws_dynamodb_session with environment variables. at {current_timestamp()}\n")
        # Return the configured DynamoDB resource
        return dynamodb_resource

    except Exception as e:
        print(f"Error open_aws_dynamodb_session from environment variables: {e} at {current_timestamp()}\n")
        return None


def cache_key(options):
    """Generate a hash key for the given search options."""
    key_str = json.dumps(options, sort_keys=True)
    return hashlib.sha256(key_str.encode('utf-8')).hexdigest()

def check_cache(options, dynamodb):
    """Check if search options are in the cache on AWS DynamoDB."""
    print(f"Checking cache at {current_timestamp()}\n")
    non_search_terms = ['DEVELOPER_KEY', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'SEARCH_CACHE_TABLE', 'RESULTS_TABLE_NAME', 'AWS_REGION', 'MAX_RESULTS']
    search_query_options = {k: v for k, v in options.items() if k not in non_search_terms}
    key = cache_key(search_query_options)
    sort_key_value = current_timestamp()

    try:
        cache_table = dynamodb.Table(options['SEARCH_CACHE_TABLE'])
        response = cache_table.get_item(
        Key={
                'CacheKey': key #,
                # 'Timestamp': sort_key_value  # You'll need to provide the appropriate sort key value here
            }
        )
        if 'Item' in response:
            print("Cache hit.")
            return json.loads(response['Item']['Results'])
    except Exception as e:
        print(f"Error accessing DynamoDB due to 'check_cache' malfunctioniong: {e} at {current_timestamp()}\n")
    print("Cache miss.\n")
    return None

def update_cache(options, dynamodb, results):
    """Update the cache with new search results, including the search time."""
    print(f"Updating cache at {current_timestamp()}\n")
    non_search_terms = ['DEVELOPER_KEY', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'SEARCH_CACHE_TABLE', 'RESULTS_TABLE_NAME', 'AWS_REGION', 'MAX_RESULTS']
    search_query_options = {k: v for k, v in options.items() if k not in non_search_terms}
    key = cache_key(search_query_options)
    timestamp = current_timestamp()  # Capture the search timestamp
    try:
        search_cache_table = options['SEARCH_CACHE_TABLE']
        cache_table = dynamodb.Table(search_cache_table)
        cache_table.put_item(Item={
            'CacheKey': key,
            'Results': json.dumps(results),
            'Timestamp': timestamp  # Store the search time
        })
    except Exception as e:
        print(f"Error updating DynamoDB due to 'update_cache' malfunctioniong: {e} at {current_timestamp()}\n")

def youtube_search_all_videos(options, dynamodb):
    """Perform a search on YouTube Data APIand return all videos based on the options, with caching and paging."""
    print(f"Performing YouTube search at {current_timestamp()}\n")
    # cached_results = check_cache(options, dynamodb)
    # if cached_results is not None:
    #     print(f"Function 'youtube_search_all_videos' was run at {current_timestamp()}\n")
    #     return cached_results

    developer_key = options['DEVELOPER_KEY']
    # CREATE YOUTUBE OBJECT calling the youtbue data api 
    youtube = build('youtube', 'v3', developerKey=developer_key)
    
    all_videos = []
    page_token = None
    max_iterations = 1  # Adjust based on how many pages you want to retrieve (50 results max with 25 per page)

    try:
        for _ in range(max_iterations):
            search_response = youtube.search().list(
                q=options['SEARCH_QUERY'],
                part='id,snippet',
                maxResults=options['MAX_RESULTS'],
                order=options['ORDER'],
                type='video',
                videoDuration=options['VIDEO_DURATION'],
                publishedAfter=options['PUBLISHED_AFTER'],
                publishedBefore=options['PUBLISHED_BEFORE'],
                relevanceLanguage=options['RELEVANCE_LANGUAGE'],
                # videoCategoryId=options['VIDEO_CATEGORY_ID'], # When there is a category ID assigned, if number 10, then results are mostly music.
                pageToken=page_token
            ).execute()

    except HttpError as e:
        print(f"An HTTP error occurred: {e.resp.status} {e.content} at {current_timestamp()}\n")

    all_videos_json = json.dumps(all_videos)
    # update_cache(options, dynamodb, all_videos)
    print(f"Function 'youtube_search_all_videos' was run at {current_timestamp()}\n")
    print(all_videos_json)
    return all_videos_json

def download_subtitles(video_id):
    """Download subtitles for a given YouTube video ID."""
    video_url = f'https://www.youtube.com/watch?v={video_id}'
    ydl_opts = {
        'writeautomaticsub': True,
        'subtitleslangs': ['en'],
        'skip_download': True,
        'outtmpl': f'subtitles/{video_id}.%(ext)s',
        'quiet': True
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([video_url])
            subtitle_file = f'subtitles/{video_id}.en.vtt'
            if os.path.exists(subtitle_file):
                with open(subtitle_file, 'r', encoding='utf-8') as file:
                    subtitle_text = file.read()
                    print(f"Downloaded video subtitles for video ID: {video_id} into the file {subtitle_file} at {current_timestamp()}\n")
                return subtitle_file, subtitle_text
            return None, None
    except Exception as e:
        print(f"Failed to download subtitles for video ID: {video_id}: {e} at {current_timestamp()}\n")
        return None, None

def send_to_dynamodb(options, dynamodb, video_details):   
    """Send Youtube Data API results on the video and subtitle transcripts to Dynamodb."""
    try:
        results_table_name = options['RESULTS_TABLE_NAME']
        results_table = dynamodb.Table(results_table_name)
        response = results_table.put_item(Item=video_details)
        print(f"Successfully inserted api results and converted transcripts into DynamoDB  at {current_timestamp()}\n", response)
    except Exception as e:
        print(f"Error inserting into DynamoDB: {e}")

def process_videos_and_store(options, dynamodb, all_videos):
    """Process each video from the search results and store them."""
    for video in all_videos:
        video_id = video['id']['videoId']
        subtitle_file, subtitle_text = download_subtitles(video_id)
        video_details = {
            'VideoID': video_id,
            'Title': video['snippet']['title'],
            'URL': f'https://www.youtube.com/watch?v={video_id}',
            'PublishedAt': video['snippet']['publishedAt'],
            'SubtitleFile': subtitle_file if subtitle_file else "N/A",
            'SubtitleText': subtitle_text if subtitle_text else "N/A",
            # Including search parameters as part of video_details
            'SearchQuery': options['SEARCH_QUERY'],
            'Order': options['ORDER'],
            'VideoDuration': options['VIDEO_DURATION'],
            'RelevanceLanguage': options['RELEVANCE_LANGUAGE']
            # 'VideoCategoryID': options['VIDEO_CATEGORY_ID']
        }
        send_to_dynamodb(options, dynamodb, video_details)
        print(f"\nFunction 'process_videos_and_store' was run at {current_timestamp()} for video ID: {video_id}, title: {video['snippet']['title']}\n")

# def main():

# Load the .env file
load_dotenv()

# Ensure the subtitles directory exists}
os.makedirs('subtitles', exist_ok=True)

# Get environment variables
options = get_env_variables()

# Only proceed if options were successfully retrieved
if options:
    dynamodb = open_aws_dynamodb_session(options)
    
    # Only proceed if DynamoDB was successfully configured
    if dynamodb:

        all_videos = youtube_search_all_videos(options, dynamodb)
        print(all_videos)
        # # Only proceed if videos were successfully retrieved
        # if all_videos:
        #     process_videos_and_store(options, dynamodb, all_videos)
        # else:
        #     print(f"Failed to retrieve videos at {current_timestamp()}. Exiting...\n")
    else:
        print(f"Failed to configure boto3 with Docker secrets at {current_timestamp()}. Exiting...\n")
else:
    print(f"Failed to retrieve environment variables at {current_timestamp()} Exiting...\n")

# if __name__ == "__main__":
#     main()


Successfully open_aws_dynamodb_session with environment variables. at 2024-02-23 14:29:49

Performing YouTube search at 2024-02-23 14:29:49

Function 'youtube_search_all_videos' was run at 2024-02-23 14:29:50

[]
[]


In [4]:
from dotenv import load_dotenv
from datetime import datetime
import boto3
import os

def current_timestamp():
    """Returns the current timestamp formatted for readability."""
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    
def get_env_variables():
    """Fetch all necessary configurations from environment variables."""
    return {
        'DEVELOPER_KEY': os.getenv('DEVELOPER_KEY'),
        'AWS_ACCESS_KEY_ID': os.getenv('AWS_ACCESS_KEY_ID'),
        'AWS_SECRET_ACCESS_KEY': os.getenv('AWS_SECRET_ACCESS_KEY'),
        'SEARCH_CACHE_TABLE': os.getenv('SEARCH_CACHE_TABLE'),
        'RESULTS_TABLE_NAME': os.getenv('RESULTS_TABLE_NAME'),
        'SEARCH_QUERY': os.getenv('SEARCH_QUERY'),
        'MAX_RESULTS': int(os.getenv('MAX_RESULTS', 3)),
        'ORDER': os.getenv('ORDER', 'viewCount'),
        'VIDEO_DURATION': os.getenv('VIDEO_DURATION', 'medium'),
        'PUBLISHED_AFTER': os.getenv('PUBLISHED_AFTER', '2010-01-01T00:00:00Z'),
        'PUBLISHED_BEFORE': os.getenv('PUBLISHED_BEFORE', '2024-12-31T23:59:59Z'),
        'RELEVANCE_LANGUAGE': os.getenv('RELEVANCE_LANGUAGE', 'en'),
        # 'VIDEO_CATEGORY_ID': os.getenv('VIDEO_CATEGORY_ID', '10'),
        'AWS_REGION': os.getenv('AWS_REGION')  # Add AWS region to the environment variables
    }

def open_aws_dynamodb_session(options):
    """
    Configures a boto3 session using AWS credentials read from environment variable.
    Returns a boto3 DynamoDB resource configured with these credentials.
    """
    try:

        # Configure the boto3 session with the read credentials
        session = boto3.Session(
            aws_access_key_id=options['AWS_ACCESS_KEY_ID'],
            aws_secret_access_key=options['AWS_SECRET_ACCESS_KEY'],
            region_name=options['AWS_REGION']  # Specify your AWS region
        )
        dynamodb_resource = session.resource('dynamodb')
        print(f"Successfully open_aws_dynamodb_session with environment variables. at {current_timestamp()}\n")
        # Return the configured DynamoDB resource
        return dynamodb_resource

    except Exception as e:
        print(f"Error open_aws_dynamodb_session from environment variables: {e} at {current_timestamp()}\n")
        return None


def print_dynamodb_table_items(options):
    """
    Prints all items stored in a specified DynamoDB table.

    :param table_name: The name of the DynamoDB table to print items from.
    :param aws_region: The AWS region where the table is located. Default is 'us-east-1'.
    """
    # Initialize a DynamoDB service resource in the specified region
    aws_region = options['AWS_REGION']
    dynamodb = boto3.resource('dynamodb', region_name=aws_region)
    
    # Connect to the table
    table_name = options['RESULTS_TABLE_NAME']
    # table_name = options['SEARCH_CACHE_TABLE']
    table = dynamodb.Table(table_name)
    # Perform a scan operation to retrieve all items
    response = table.scan()
    
    # Initialize a list to hold all retrieved items
    all_items = []
    
    # Start the scan operation
    response = table.scan()
    all_items.extend(response['Items'])
    
    # Continue scanning if LastEvaluatedKey is present
    while 'LastEvaluatedKey' in response:
        response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
        all_items.extend(response['Items'])
    
    # Now, all_items contains all the items from the DynamoDB table
    for item in all_items:
        print(item)
        print("\n\n\n")
        # cache_key = item.get('CacheKey', 'No CacheKey')
        # timestamp = item.get('Timestamp', 'No Timestamp')
        # # Assuming Results is stored as a JSON string; parse it if necessary
        # results = item.get('Results', 'No Results')
        # # Uncomment the next line if you need to parse the Results from JSON string
        # # results = json.loads(item.get('Results', '{}')) if 'Results' in item else 'No Results'
        
        # print(f"\n\n\nCacheKey: {cache_key}, \n\n\nTimestamp: {timestamp}, \n\n\nResults: {results}")


def main():
    load_dotenv()
    options = get_env_variables()
    if options:
        dynamodb = open_aws_dynamodb_session(options)
        if dynamodb:
            print_dynamodb_table_items(options)
        else:
            print("Error connecting to DynamoDB.")
    else:
        print("Error retrieving environment variables.")


if __name__ == "__main__":
    main()

Successfully open_aws_dynamodb_session with environment variables. at 2024-02-23 12:42:03

{'PublishedAt': '2020-09-16T06:56:10Z', 'Title': 'MORNING MOTIVATIONAL VIDEO - Sandeep Maheshwari | DAILY MORNING AFFIRMATIONS Hindi', 'VideoID': 'fG1oNm2tCro', 'RelevanceLanguage': 'en', 'VideoDuration': 'medium', 'SubtitleText': "WEBVTT\nKind: captions\nLanguage: en\n\n00:00:00.040 --> 00:00:01.550 align:start position:0%\n \n\n\n00:00:01.550 --> 00:00:01.560 align:start position:0%\n \n \n\n00:00:01.560 --> 00:00:09.500 align:start position:0%\n \nhappened <00:00:02.497><c>in </c><00:00:03.434><c>the </c><00:00:04.371><c>morning, </c><00:00:05.308><c>when </c><00:00:06.245><c>I </c><00:00:07.182><c>hear </c><00:00:08.119><c>something </c><00:00:09.056><c>good,</c>\n\n00:00:09.500 --> 00:00:09.510 align:start position:0%\nhappened in the morning, when I hear something good,\n \n\n00:00:09.510 --> 00:00:11.860 align:start position:0%\nhappened in the morning, when I hear something good,\nthen <0

In [None]:
part='id,snippet,contentDetails,fileDetails,player, processingDetails,recordingDetails,statistics,status,suggestions,topicDetails',

In [38]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import json
from datetime import datetime
import os
from dotenv import load_dotenv


# Function to get current timestamp
def current_timestamp():
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")


load_dotenv()
options = get_env_variables()

developer_key = options['DEVELOPER_KEY']
# CREATE YOUTUBE OBJECT calling the youtbue data api 
youtube = build('youtube', 'v3', developerKey=developer_key)

all_videos = []
page_token = None
max_iterations = 1  # Adjust based on how many pages you want to retrieve (50 results max with 25 per page)

try:
    for _ in range(max_iterations):
        search_response = youtube.search().list(
            q=options['SEARCH_QUERY'],
            part='id, snippet',
            maxResults=options['MAX_RESULTS'],
            order=options['ORDER'],
            type='video',
            videoDuration=options['VIDEO_DURATION'],
            publishedAfter=options['PUBLISHED_AFTER'],
            publishedBefore=options['PUBLISHED_BEFORE'],
            relevanceLanguage=options['RELEVANCE_LANGUAGE'],
            # videoCategoryId=options['VIDEO_CATEGORY_ID'], # When there is a category ID assigned, if number 10, then results are mostly music.
            pageToken=page_token
        ).execute()

        all_videos.extend(search_response.get('items', []))

        video_ids = [item['id']['videoId'] for item in search_response.get('items', [])]

        # Retrieve additional details for each video by their IDs, excluding restricted parts
        if video_ids:
            details_response = youtube.videos().list(
                id=','.join(video_ids),
                part='contentDetails,statistics,status,topicDetails,recordingDetails'
            ).execute()

            all_videos.extend(details_response.get('items', []))

except HttpError as e:
    print(f"An HTTP error occurred: {e.resp.status} {e.content} at {current_timestamp()}\n")

all_videos_json = json.dumps(all_videos, indent=4)  # Pretty print JSON
print(f"Script 'youtube_search_all_videos' was run at {current_timestamp()}\n")
print(all_videos_json)

Script 'youtube_search_all_videos' was run at 2024-02-23 14:37:55

[
    {
        "kind": "youtube#searchResult",
        "etag": "jGmLZdvgw4Xyr1nqkM0e_6QWexg",
        "id": {
            "kind": "youtube#video",
            "videoId": "fG1oNm2tCro"
        },
        "snippet": {
            "publishedAt": "2020-09-16T06:56:10Z",
            "channelId": "UCBqFKDipsnzvJdt6UT0lMIg",
            "title": "MORNING MOTIVATIONAL VIDEO - Sandeep Maheshwari | DAILY MORNING AFFIRMATIONS Hindi",
            "description": "Positive daily affirmations are very powerful... when these affirmations are repeated over and over again, they begin to take ...",
            "thumbnails": {
                "default": {
                    "url": "https://i.ytimg.com/vi/fG1oNm2tCro/default.jpg",
                    "width": 120,
                    "height": 90
                },
                "medium": {
                    "url": "https://i.ytimg.com/vi/fG1oNm2tCro/mqdefault.jpg",
                  

In [None]:
            "channelId":
            "duration": "PT7M21S",
            "definition": "hd",
            "licensedContent": false,
            "license": "youtube",
            "viewCount": "24548057",
            "likeCount": "756631",
            "favoriteCount": "0",
            "commentCount": "14302"
            "channelTitle": "Sandeep Maheshwari",
            "liveBroadcastContent": "none",
                "high": {
                    "url":