In [1]:
!pip install boto3
!pip install --upgrade google-api-python-client



In [44]:
import boto3
import hashlib
import json
import os
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from YTsearch_options_cycler import SearchOptionsCycler
import yt_dlp
from datetime import datetime
from dotenv import load_dotenv


def current_timestamp():
    """Return the local wall-clock time as a 'YYYY-MM-DD HH:MM:SS' string."""
    now = datetime.now()
    return format(now, "%Y-%m-%d %H:%M:%S")

def get_env_variables():
    """
    Collect the run configuration from environment variables.

    Credentials, table names and the search query have no default and come back
    as None when unset. MAX_RESULTS is converted to int (default 50); the
    remaining search options fall back to the defaults listed below.

    :return: dict keyed by the environment variable names
    """
    without_default = (
        'DEVELOPER_KEY',
        'AWS_ACCESS_KEY_ID',
        'AWS_SECRET_ACCESS_KEY',
        'SEARCH_CACHE_TABLE',
        'RESULTS_TABLE_NAME',
        'SEARCH_QUERY',
    )
    with_default = (
        ('ORDER', 'viewCount'),
        ('VIDEO_DURATION', 'medium'),
        ('PUBLISHED_AFTER', '2010-01-01T00:00:00Z'),
        ('PUBLISHED_BEFORE', '2024-12-31T23:59:59Z'),
        ('RELEVANCE_LANGUAGE', 'en'),
        # ('VIDEO_CATEGORY_ID', '10'),
    )

    config = {name: os.getenv(name) for name in without_default}
    config['MAX_RESULTS'] = int(os.getenv('MAX_RESULTS', 50))
    for name, fallback in with_default:
        config[name] = os.getenv(name, fallback)
    config['AWS_REGION'] = os.getenv('AWS_REGION')  # AWS region used for the boto3 session
    return config

def open_aws_dynamodb_session(options):
    """
    Build a boto3 DynamoDB resource from the credentials held in ``options``.

    :param options: Mapping providing 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'
                    and 'AWS_REGION'.
    :return: The DynamoDB resource, or None when anything in the setup raises
             (the error is printed, not propagated).
    """
    try:
        session_kwargs = {
            'aws_access_key_id': options['AWS_ACCESS_KEY_ID'],
            'aws_secret_access_key': options['AWS_SECRET_ACCESS_KEY'],
            'region_name': options['AWS_REGION'],
        }
        resource = boto3.Session(**session_kwargs).resource('dynamodb')
        print(f"Successfully open_aws_dynamodb_session with environment variables. at {current_timestamp()}\n")
        return resource
    except Exception as e:
        print(f"Error open_aws_dynamodb_session from environment variables: {e} at {current_timestamp()}\n")
        return None


def first_time_check_cache(options, dynamodb):
    """
    Look up the search options from ``options`` in the DynamoDB search cache.

    The cache key is the JSON dump (sorted keys) of ``options`` with the
    credential, table-name, region and MAX_RESULTS entries removed.

    :param options: Configuration dict as returned by get_env_variables().
    :param dynamodb: DynamoDB resource exposing ``Table(name)``.
    :return: The get_item response on a hit. On a miss, or when the lookup
             raises, a minimal ``{'Item': {'CacheKey': key}}`` dict built from
             the current options, so callers can always read
             ``result['Item']['CacheKey']``.
    """
    print(f"Checking cache at {current_timestamp()}\n")
    non_search_terms = ['DEVELOPER_KEY', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'SEARCH_CACHE_TABLE', 'RESULTS_TABLE_NAME', 'AWS_REGION', 'MAX_RESULTS']
    search_query_options = {k: v for k, v in options.items() if k not in non_search_terms}
    key = json.dumps(search_query_options, sort_keys=True)
    try:
        cache_table = dynamodb.Table(options['SEARCH_CACHE_TABLE'])
        response = cache_table.get_item(Key={'CacheKey': key})
        if 'Item' in response:
            print("Cache hit.")
            return response
    except Exception as e:
        print(f"Error accessing DynamoDB due to 'check_cache' malfunctioniong: {e} at {current_timestamp()}\n")
    print("Cache miss.\n")
    # Fall back to the key derived from the *current* options rather than a
    # canned response captured from an earlier, unrelated search.
    return {'Item': {'CacheKey': key}}

def check_cache(dynamodb, full_youtube_cycled_options_dict, options=None):
    """
    Check whether a set of search options is already in the DynamoDB search cache.

    :param dynamodb: DynamoDB resource exposing ``Table(name)``.
    :param full_youtube_cycled_options_dict: Search options; their JSON dump
           (sorted keys) is the cache key.
    :param options: Optional configuration dict holding 'SEARCH_CACHE_TABLE'.
           When omitted, the table name is read from the SEARCH_CACHE_TABLE
           environment variable instead of relying on a global ``options``.
    :return: The get_item response on a hit; None on a miss or when the lookup
             fails (the error is printed).
    """
    print(f"Checking cache at {current_timestamp()}\n")
    key = json.dumps(full_youtube_cycled_options_dict, sort_keys=True)

    try:
        table_name = options['SEARCH_CACHE_TABLE'] if options else os.getenv('SEARCH_CACHE_TABLE')
        if not table_name:
            raise ValueError("SEARCH_CACHE_TABLE is not configured")
        cache_table = dynamodb.Table(table_name)
        response = cache_table.get_item(Key={'CacheKey': key})
        if 'Item' in response:
            print("Cache hit.")
            return response
    except Exception as e:
        print(f"Error accessing DynamoDB due to 'check_cache' malfunctioniong: {e} at {current_timestamp()}\n")
    print("Cache miss.\n")
    return None

def update_cache(dynamodb, full_youtube_cycled_options_dict, options=None):
    """
    Record a set of search options in the DynamoDB search cache with the current time.

    :param dynamodb: DynamoDB resource exposing ``Table(name)``.
    :param full_youtube_cycled_options_dict: Search options; their JSON dump
           (sorted keys) is stored as 'CacheKey'.
    :param options: Optional configuration dict holding 'SEARCH_CACHE_TABLE'.
           When omitted, the table name is read from the SEARCH_CACHE_TABLE
           environment variable instead of relying on a global ``options``.
    :return: True when the item was written, False when the write failed
             (the error is printed).
    """
    print(f"Updating cache at {current_timestamp()}\n")

    timestamp = current_timestamp()  # Capture the search timestamp
    key = json.dumps(full_youtube_cycled_options_dict, sort_keys=True)
    try:
        search_cache_table = options['SEARCH_CACHE_TABLE'] if options else os.getenv('SEARCH_CACHE_TABLE')
        if not search_cache_table:
            raise ValueError("SEARCH_CACHE_TABLE is not configured")
        cache_table = dynamodb.Table(search_cache_table)
        cache_table.put_item(Item={
            'CacheKey': key,
            'Timestamp': timestamp  # Store the search time
        })
        return True
    except Exception as e:
        print(f"Error updating DynamoDB due to 'update_cache' malfunctioniong: {e} at {current_timestamp()}\n")
        return False

def merge_dicts_return_larger(dict1, dict2):
    """
    Overlay the smaller dictionary's values onto a copy of the larger one.

    The dictionary with more keys is the base (``dict2`` wins a tie). Only keys
    already present in the base are overwritten; keys that exist solely in the
    smaller dictionary are ignored. Neither argument is modified.

    :param dict1: First dictionary.
    :param dict2: Second dictionary.
    :return: A new dict with the base's keys and the overlaid values.
    """
    if len(dict1) > len(dict2):
        larger_dict, smaller_dict = dict1, dict2
    else:
        larger_dict, smaller_dict = dict2, dict1

    # Work on a shallow copy so repeated merges against the same base dict do
    # not accumulate changes in the caller's object.
    merged = dict(larger_dict)
    for key, value in smaller_dict.items():
        if key in merged:
            merged[key] = value
    return merged

def youtube_search_all_videos(options, dynamodb):
    """
    Run a paged YouTube Data API search and return every hit together with its video details.

    Each search page is followed by one videos().list call for the ids on that
    page; the matching detail record is stored on the search item under the
    'details' key. Search items for which no detail record comes back are left
    out. An HttpError stops paging, is printed, and whatever was gathered so
    far is returned.

    :param options: Configuration dict providing DEVELOPER_KEY and the search settings.
    :param dynamodb: Not used by this function.
    :return: List of search result items, each extended with 'details'.
    """
    print(f"Performing YouTube search at {current_timestamp()}\n")

    client = build('youtube', 'v3', developerKey=options['DEVELOPER_KEY'])

    page_limit = 50  # Upper bound on the number of result pages requested
    detail_parts = 'contentDetails,statistics,status,topicDetails,liveStreamingDetails,localizations'
    collected = []
    next_token = None
    pages_fetched = 0

    try:
        while pages_fetched < page_limit:
            pages_fetched += 1
            page = client.search().list(
                q=options['SEARCH_QUERY'],
                part='id,snippet',
                maxResults=options['MAX_RESULTS'],
                order=options['ORDER'],
                type='video',
                videoDuration=options['VIDEO_DURATION'],
                publishedAfter=options['PUBLISHED_AFTER'],
                publishedBefore=options['PUBLISHED_BEFORE'],
                relevanceLanguage=options['RELEVANCE_LANGUAGE'],
                # videoCategoryId=options['VIDEO_CATEGORY_ID'], # With category 10 assigned, results are mostly music.
                pageToken=next_token
            ).execute()

            # Index this page's hits by video id so details can be attached to them
            hits_by_id = {hit['id']['videoId']: hit for hit in page.get('items', [])}

            if hits_by_id:
                details_page = client.videos().list(
                    id=','.join(hits_by_id),
                    part=detail_parts
                ).execute()

                for detail in details_page.get('items', []):
                    detail_id = detail['id']
                    if detail_id not in hits_by_id:
                        continue
                    hit = hits_by_id[detail_id]
                    hit['details'] = detail  # Extra metadata lives under its own key
                    collected.append(hit)

            next_token = page.get('nextPageToken')
            if not next_token:
                break

    except HttpError as e:
        print(f"An HTTP error occurred: {e.resp.status} {e.content} at {current_timestamp()}\n")

    print(f"Function 'youtube_search_all_videos' was run at {current_timestamp()}\n")
    return collected


def send_to_dynamodb(options, dynamodb, flattened_single_video_dict):
    """
    Write one flattened video record to the DynamoDB results table.

    :param options: Configuration dict providing 'RESULTS_TABLE_NAME'.
    :param dynamodb: DynamoDB resource exposing ``Table(name)``.
    :param flattened_single_video_dict: The item to store.
    :return: None. Failures are printed rather than raised.
    """
    try:
        target_table = dynamodb.Table(options['RESULTS_TABLE_NAME'])
        put_result = target_table.put_item(Item=flattened_single_video_dict)
        print(f"Successfully inserted api results into DynamoDB  at {current_timestamp()}\n", put_result)
    except Exception as e:
        print(f"Error inserting into DynamoDB: {e}")
    
def flatten_dict(d):
    """
    Collapse a nested dictionary into a single level keyed by leaf key names.

    Nested dictionaries are expanded recursively and only the innermost key of
    each value is kept; parent keys are NOT prefixed onto it. When the same
    leaf key occurs more than once, the value met last during iteration wins.

    :param d: The dictionary to flatten
    :return: A flattened dictionary
    """
    flat = {}
    for key, value in d.items():
        if isinstance(value, dict):
            flat.update(flatten_dict(value))
        else:
            flat[key] = value
    return flat



def main():
    """
    Pick the next uncached combination of search options, run the YouTube
    search with it, and store every returned video in DynamoDB.

    Steps: load .env, read the configuration, open DynamoDB, derive the base
    search options, cycle option values until a combination is found that is
    not in the cache, record that combination, search, then flatten and upload
    each result.
    """
    # Load the .env file
    load_dotenv()

    # Text file of YouTube search terms to cycle through
    file_path = 'search_queries.txt'

    options = get_env_variables()
    if not options:
        print(f"Failed to retrieve environment variables at {current_timestamp()} Exiting...\n")
        return

    dynamodb = open_aws_dynamodb_session(options)
    if not dynamodb:
        print(f"Failed to configure boto3 from environment variables at {current_timestamp()}. Exiting...\n")
        return

    # Base search options, recovered from the cache key for the env defaults
    full_cache_table_value = first_time_check_cache(options, dynamodb)
    long_dict = json.loads(full_cache_table_value['Item']['CacheKey'])

    # Start by rotating the values of the ORDER search option
    cycler = SearchOptionsCycler(file_path)
    cycler.set_cycling_attribute('ORDER')
    youtube_cycled_options_dict = cycler.get_next_options()

    # Base options updated with the values that were just cycled
    full_search_term_cycled_dict = merge_dicts_return_larger(long_dict, youtube_cycled_options_dict)

    # Keep cycling while the current combination has already been searched
    while check_cache(dynamodb, full_search_term_cycled_dict):
        youtube_cycled_options_dict = cycler.get_next_options()
        full_search_term_cycled_dict = merge_dicts_return_larger(long_dict, youtube_cycled_options_dict)

    # Record the new combination before searching
    update_cache(dynamodb, full_search_term_cycled_dict)

    # Search with the combination that was just cached. Passing the untouched
    # env options here would repeat the default search on every run while the
    # cache recorded a different one.
    search_options = {**options, **full_search_term_cycled_dict}
    video_list = youtube_search_all_videos(search_options, dynamodb)

    for single_video_dict in video_list:
        flattened_single_video_dict = flatten_dict(single_video_dict)
        print(json.dumps(flattened_single_video_dict, indent=4))
        send_to_dynamodb(options, dynamodb, flattened_single_video_dict)

if __name__ == "__main__":
    main()


Successfully open_aws_dynamodb_session with environment variables. at 2024-02-29 00:20:01

Checking cache at 2024-02-29 00:20:01

Cache hit.
Checking cache at 2024-02-29 00:20:01

Cache hit.
Checking cache at 2024-02-29 00:20:01

Cache hit.
Checking cache at 2024-02-29 00:20:01

Cache miss.

Updating cache at 2024-02-29 00:20:02

Performing YouTube search at 2024-02-29 00:20:02

Function 'youtube_search_all_videos' was run at 2024-02-29 00:20:32

{
    "kind": "youtube#video",
    "etag": "7x_E1heuhDUph9WruDaTyJIEUYo",
    "videoId": "RFzZ-xpyfoo",
    "publishedAt": "2011-03-17T23:21:39Z",
    "channelId": "UCocAlqGOU7BqzKc87qx6WUg",
    "title": "Savage Garden - Affirmation",
    "description": "Savage Garden's official music video for 'Affirmation'. Click to listen to Savage Garden on Spotify: http://smarturl.it/SGSpot?",
    "url": "https://i.ytimg.com/vi/RFzZ-xpyfoo/hqdefault.jpg",
    "width": 480,
    "height": 360,
    "channelTitle": "SavageGardenVEVO",
    "liveBroadcastConte

In [33]:
import boto3
from dotenv import load_dotenv
from datetime import datetime
import os
import pandas as pd


def current_timestamp():
    """Give the present local time, rendered as 'YYYY-MM-DD HH:MM:SS'."""
    return f"{datetime.now():%Y-%m-%d %H:%M:%S}"
    
def open_aws_dynamodb_session(options):
    """
    Create a DynamoDB resource through a boto3 session configured from ``options``.

    :param options: Mapping providing 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'
                    and 'AWS_REGION'.
    :return: The DynamoDB resource; None if creating it raised (the error is printed).
    """
    try:
        aws_session = boto3.Session(
            region_name=options['AWS_REGION'],
            aws_secret_access_key=options['AWS_SECRET_ACCESS_KEY'],
            aws_access_key_id=options['AWS_ACCESS_KEY_ID'],
        )
        table_resource = aws_session.resource('dynamodb')
        print(f"Successfully open_aws_dynamodb_session with environment variables. at {current_timestamp()}\n")
    except Exception as e:
        print(f"Error open_aws_dynamodb_session from environment variables: {e} at {current_timestamp()}\n")
        return None
    return table_resource

def get_env_variables():
    """
    Read the notebook's configuration from the process environment.

    Entries without a default are None when the variable is unset; MAX_RESULTS
    is parsed as an int and defaults to 50.

    :return: dict keyed by the environment variable names
    """
    env = os.getenv
    settings = {}
    for name in ('DEVELOPER_KEY', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY',
                 'SEARCH_CACHE_TABLE', 'RESULTS_TABLE_NAME', 'SEARCH_QUERY'):
        settings[name] = env(name)

    settings['MAX_RESULTS'] = int(env('MAX_RESULTS', 50))
    settings.update(
        ORDER=env('ORDER', 'viewCount'),
        VIDEO_DURATION=env('VIDEO_DURATION', 'medium'),
        PUBLISHED_AFTER=env('PUBLISHED_AFTER', '2010-01-01T00:00:00Z'),
        PUBLISHED_BEFORE=env('PUBLISHED_BEFORE', '2024-12-31T23:59:59Z'),
        RELEVANCE_LANGUAGE=env('RELEVANCE_LANGUAGE', 'en'),
        # VIDEO_CATEGORY_ID=env('VIDEO_CATEGORY_ID', '10'),
    )
    settings['AWS_REGION'] = env('AWS_REGION')  # Region for the boto3 session
    return settings

def query_and_sort_videos(table_name, search_keywords, dynamodb):
    """
    Scan a DynamoDB table for videos whose title or description contains any of
    the given keywords and return the three with the highest commentCount.

    The scan follows LastEvaluatedKey until the table is exhausted, so matches
    beyond the first result page are included. Each returned item has its
    'commentCount' replaced by an int (0 when missing or empty).

    :param table_name: The name of the DynamoDB table to search.
    :param search_keywords: A list of keywords to look for in titles and descriptions.
                            An empty or None list matches nothing and returns [].
    :param dynamodb: DynamoDB resource exposing ``Table(name)``.
    :return: Up to three item dicts, sorted by commentCount in descending order.
    """
    keywords = list(search_keywords or [])
    if not keywords:
        # No keyword can match; an empty FilterExpression would also be rejected.
        return []

    table = dynamodb.Table(table_name)

    # One placeholder per keyword, each tested against title and description
    placeholders = {f":kw{i}": kw for i, kw in enumerate(keywords, start=1)}
    filter_expression = " or ".join(
        f"contains(title, {name}) or contains(description, {name})" for name in placeholders
    )
    scan_kwargs = {
        'FilterExpression': filter_expression,
        'ExpressionAttributeValues': placeholders,
    }

    items = []
    while True:
        response = table.scan(**scan_kwargs)
        items.extend(response.get('Items', []))
        last_key = response.get('LastEvaluatedKey')
        if not last_key:
            break
        scan_kwargs['ExclusiveStartKey'] = last_key

    # Ensure 'commentCount' is an integer for sorting purposes
    for item in items:
        item['commentCount'] = int(item.get('commentCount') or 0)

    return sorted(items, key=lambda x: x['commentCount'], reverse=True)[:3]


def query_all_videos(table_name, dynamodb):
    """
    Scan an entire DynamoDB table and return every item, unfiltered and unsorted.

    The scan is repeated with ExclusiveStartKey for as long as a response
    carries a LastEvaluatedKey.

    :param table_name: The name of the DynamoDB table to read.
    :param dynamodb: DynamoDB resource exposing ``Table(name)``.
    :return: A list with the items of all result pages, in scan order.
    """
    table = dynamodb.Table(table_name)

    collected = []
    request = {}
    resume_key = None

    while True:
        if resume_key:
            request['ExclusiveStartKey'] = resume_key
        page = table.scan(**request)
        collected.extend(page.get('Items', []))

        resume_key = page.get('LastEvaluatedKey', None)
        if resume_key is None:
            break

    return collected


def convert_to_dataframe(sorted_items):
    """
    Build a pandas DataFrame from a list of video item dicts.

    Only the fixed set of video columns listed below is kept; keys an item does
    not have become missing values, and other keys are dropped.

    :param sorted_items: A list of item dicts.
    :return: A DataFrame with the fixed columns, or an empty DataFrame (after
             printing a notice) when ``sorted_items`` is empty.
    """
    if not sorted_items:
        print("No items found matching the criteria.")
        return pd.DataFrame()

    video_columns = [
        "kind", "etag", "videoId", "publishedAt", "channelId", "title", "description",
        "url", "width", "height", "channelTitle", "liveBroadcastContent", "publishTime",
        "id", "duration", "dimension", "definition", "caption", "licensedContent",
        "projection", "uploadStatus", "privacyStatus", "license", "embeddable",
        "publicStatsViewable", "madeForKids", "viewCount", "likeCount", "favoriteCount",
        "commentCount", "topicCategories",
    ]
    return pd.DataFrame(sorted_items, columns=video_columns)


def clean_transcript(input_string):
    """
    Strip WebVTT markup from a subtitle file's text and return plain text.

    Removes cue positioning settings, cue timing lines and inline tags, turns
    newlines into spaces, replaces square brackets with ", " and collapses
    every run of spaces into a single space.

    :param input_string: Raw .vtt file content; None or '' yields ''.
    :return: The cleaned transcript as a single-line string.
    """
    import re  # local import: this cell's import block does not bring in `re`

    if not input_string:
        return ''

    cleaned_text = re.sub(r' align:start position:\d+%', '', input_string)  # Remove cue settings
    cleaned_text = re.sub(r'\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}', '', cleaned_text)  # Remove cue timings
    cleaned_text = re.sub(r'\n|-->', ' ', cleaned_text)  # Replace "\n" and "-->" with a space
    cleaned_text = re.sub(r'<.*?>', '', cleaned_text)  # Remove inline tags
    cleaned_text = re.sub(r'\[|\]', ', ', cleaned_text)  # Replace brackets with ", "
    cleaned_text = re.sub(r' {2,}', ' ', cleaned_text)  # Collapse any run of spaces

    return cleaned_text

# Example usage: read every record of the results table into a DataFrame.
# NOTE: `options`, `table_name`, `dynamodb` and `df` are assigned at module
# level here, so they stay in the kernel namespace after this cell runs
# (a later cell reads `df`).
if __name__ == "__main__":
    load_dotenv()
    options = get_env_variables()
    if options:
        table_name = options['RESULTS_TABLE_NAME']
        dynamodb = open_aws_dynamodb_session(options)
        if dynamodb:
            # Only used by the commented-out keyword query below
            search_keywords = ["Self Love", "Confidence"]
        
            # Alternative: keyword-filtered top-3 query instead of the full scan
            # video_attributes_list = query_and_sort_videos(table_name, search_keywords, dynamodb)
            # df = convert_to_dataframe(video_attributes_list)
            all_videos_in_dynamodb = query_all_videos(table_name, dynamodb)
            df = convert_to_dataframe(all_videos_in_dynamodb)
            # df.to_csv('all_latest_records_dynamodb.csv', index=False)
            print(df.shape)
            # Disabled transcript experiment; download_subtitles is not defined in this cell
            # print(video_ids)
            # for video_id in video_ids_list:
            #     # Input string containing transcript
            #     subtitle_file, transcript_text = download_subtitles('OU5vWLaIpnY')
                
            #     # Call the function to clean the transcript text
            #     cleaned_transcript = clean_transcript(transcript_text)
            #     print(cleaned_transcript)


Successfully open_aws_dynamodb_session with environment variables. at 2024-03-01 17:15:15

(608, 31)


In [35]:
# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# `df` is the module-level DataFrame built by the earlier results-table cell.
# Save it to CSV, then display two randomly chosen rows (no seed is set, so
# the sample differs between runs).
df.to_csv('all_latest_records_dynamodb.csv', index=False)
df.sample(2)

Unnamed: 0,kind,etag,videoId,publishedAt,channelId,title,description,url,width,height,channelTitle,liveBroadcastContent,publishTime,id,duration,dimension,definition,caption,licensedContent,projection,uploadStatus,privacyStatus,license,embeddable,publicStatsViewable,madeForKids,viewCount,likeCount,favoriteCount,commentCount,topicCategories
544,youtube#video,Oym55xB1vydEtmoSu3VA60NkfKo,Rgc7L30FmlM,2022-01-02T00:35:32Z,UCjdvLPmF0iUeoX7uLFtWmwg,Affirmation by Randall Standridge - Walsh Midd...,Performed by the Walsh Middle School Honor Ban...,https://i.ytimg.com/vi/Rgc7L30FmlM/hqdefault.jpg,480,360,Walsh Band,none,2022-01-02T00:35:32Z,Rgc7L30FmlM,PT4M57S,2d,hd,False,False,rectangular,processed,public,youtube,True,True,True,3419,,0,0,[https://en.wikipedia.org/wiki/Classical_music...
590,youtube#video,XZbyfDzKz8t0A7VJoZ_6NHjUs8Y,O3nVWBNURFk,2024-02-28T13:04:09Z,UCJnTW8HXgTMkwfAEu_LBFYQ,"Affirmation prayer, You are about to receive a...","Affirmation prayer, You are about to receive a...",https://i.ytimg.com/vi/O3nVWBNURFk/hqdefault.jpg,480,360,Prophetic Prayers_,none,2024-02-28T13:04:09Z,O3nVWBNURFk,PT4M29S,2d,hd,False,False,rectangular,processed,public,youtube,True,True,False,1281,117.0,0,145,[https://en.wikipedia.org/wiki/Christian_music...


In [60]:
import re

def download_subtitles(video_id):
    """
    Fetch the automatic English subtitles of a YouTube video with yt-dlp.

    The video itself is not downloaded; the subtitles are expected at
    'subtitles/<video_id>.en.vtt' afterwards.

    :param video_id: YouTube video id.
    :return: (path, text) of the subtitle file, or (None, None) when the file
             is not there after the download or when anything raises.
    """
    subtitle_file = f'subtitles/{video_id}.en.vtt'
    watch_url = f'https://www.youtube.com/watch?v={video_id}'
    downloader_options = {
        'writeautomaticsub': True,
        'subtitleslangs': ['en'],
        'skip_download': True,
        'outtmpl': f'subtitles/{video_id}.%(ext)s',
        'quiet': True
    }
    try:
        with yt_dlp.YoutubeDL(downloader_options) as downloader:
            downloader.download([watch_url])
            if not os.path.exists(subtitle_file):
                return None, None
            with open(subtitle_file, 'r', encoding='utf-8') as handle:
                subtitle_text = handle.read()
                print(f"Downloaded video subtitles for video ID: {video_id} into the file {subtitle_file} at {current_timestamp()}\n")
            return subtitle_file, subtitle_text
    except Exception as e:
        print(f"Failed to download subtitles for video ID: {video_id}: {e} at {current_timestamp()}\n")
        return None, None


def clean_transcript(input_string):
    """
    Strip WebVTT markup from a subtitle file's text and return plain text.

    Removes cue positioning settings, cue timing lines and inline tags, turns
    newlines into spaces, replaces square brackets with ", " and collapses
    every run of spaces into a single space.

    :param input_string: Raw .vtt file content; None or '' yields ''.
    :return: The cleaned transcript as a single-line string.
    """
    if not input_string:
        # download_subtitles() hands back None when no subtitle file was produced
        return ''

    cleaned_text = re.sub(r' align:start position:\d+%', '', input_string)  # Remove cue settings
    cleaned_text = re.sub(r'\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}', '', cleaned_text)  # Remove cue timings
    cleaned_text = re.sub(r'\n|-->', ' ', cleaned_text)  # Replace "\n" and "-->" with a space
    cleaned_text = re.sub(r'<.*?>', '', cleaned_text)  # Remove inline tags
    cleaned_text = re.sub(r'\[|\]', ', ', cleaned_text)  # Replace brackets with ", "
    cleaned_text = re.sub(r' {2,}', ' ', cleaned_text)  # Collapse any run of spaces

    return cleaned_text



# Ensure the subtitles directory exists
os.makedirs('subtitles', exist_ok=True)

# Download the raw WebVTT transcript for one video
subtitle_file, transcript_text = download_subtitles('OU5vWLaIpnY')

# download_subtitles returns (None, None) when no subtitle file was produced,
# so only clean the text when there is some; `cleaned_transcript` is always
# defined afterwards.
if transcript_text is None:
    cleaned_transcript = ''
    print("No subtitles available; nothing to clean.")
else:
    cleaned_transcript = clean_transcript(transcript_text)
    print(cleaned_transcript)


Downloaded video subtitles for video ID: OU5vWLaIpnY into the file subtitles/OU5vWLaIpnY.en.vtt at 2024-02-29 01:01:41

WEBVTT Kind: captions Language: en  welcome to the seven day money welcome to the seven day money  welcome to the seven day money manifestation challenge the purpose of manifestation challenge the purpose of  manifestation challenge the purpose of this recording is to help you generate this recording is to help you generate  this recording is to help you generate extra cash flow within the next week and extra cash flow within the next week and  extra cash flow within the next week and let me be clear I'm not talking about let me be clear I'm not talking about  let me be clear I'm not talking about just having an abundance mindset don't just having an abundance mindset don't  just having an abundance mindset don't get me wrong your mindset is vitally get me wrong your mindset is vitally  get me wrong your mindset is vitally important but my ultimate goal for you import

In [61]:
import os
import yt_dlp
from datetime import datetime

def current_timestamp():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    return datetime.now().strftime(timestamp_format)

def download_subtitles(video_id):
    """
    Return the automatic English subtitles of a YouTube video, downloading them only if needed.

    When 'subtitles/<video_id>.en.vtt' already exists it is read and returned
    without contacting YouTube; otherwise yt-dlp fetches the subtitles (the
    video itself is skipped).

    :param video_id: YouTube video id.
    :return: (path, text) of the subtitle file, or (None, None) when the file
             is not there after the download or when the download raises.
    """
    subtitle_file = f'subtitles/{video_id}.en.vtt'

    # Reuse a previously downloaded file
    already_present = os.path.exists(subtitle_file)
    if already_present:
        with open(subtitle_file, 'r', encoding='utf-8') as handle:
            subtitle_text = handle.read()
            print(f"Subtitles for video ID: {video_id} already exist. Returning existing subtitles at {current_timestamp()}\n")
        return subtitle_file, subtitle_text

    watch_url = f'https://www.youtube.com/watch?v={video_id}'
    downloader_options = {
        'writeautomaticsub': True,
        'subtitleslangs': ['en'],
        'skip_download': True,
        'outtmpl': f'subtitles/{video_id}.%(ext)s',
        'quiet': True
    }

    try:
        with yt_dlp.YoutubeDL(downloader_options) as downloader:
            downloader.download([watch_url])
            if not os.path.exists(subtitle_file):
                return None, None
            with open(subtitle_file, 'r', encoding='utf-8') as handle:
                subtitle_text = handle.read()
                print(f"Downloaded video subtitles for video ID: {video_id} into the file {subtitle_file} at {current_timestamp()}\n")
            return subtitle_file, subtitle_text
    except Exception as e:
        print(f"Failed to download subtitles for video ID: {video_id}: {e} at {current_timestamp()}\n")
        return None, None

def clean_transcript(input_string):
    """
    Strip WebVTT markup from a subtitle file's text and return plain text.

    Removes cue positioning settings, cue timing lines and inline tags, turns
    newlines into spaces, replaces square brackets with ", " and collapses
    every run of spaces into a single space.

    :param input_string: Raw .vtt file content; None or '' yields ''.
    :return: The cleaned transcript as a single-line string.
    """
    import re  # local import: this cell's import block does not bring in `re`

    if not input_string:
        # download_subtitles() hands back None when no subtitle file was produced
        return ''

    cleaned_text = re.sub(r' align:start position:\d+%', '', input_string)  # Remove cue settings
    cleaned_text = re.sub(r'\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}', '', cleaned_text)  # Remove cue timings
    cleaned_text = re.sub(r'\n|-->', ' ', cleaned_text)  # Replace "\n" and "-->" with a space
    cleaned_text = re.sub(r'<.*?>', '', cleaned_text)  # Remove inline tags
    cleaned_text = re.sub(r'\[|\]', ', ', cleaned_text)  # Replace brackets with ", "
    cleaned_text = re.sub(r' {2,}', ' ', cleaned_text)  # Collapse any run of spaces

    return cleaned_text



# Ensure the subtitles directory exists
os.makedirs('subtitles', exist_ok=True)

# Load the raw WebVTT transcript for one video (downloaded if not on disk yet)
subtitle_file, transcript_text = download_subtitles('2bbHZusVgw0')

# download_subtitles returns (None, None) when no subtitle file was produced,
# so only clean the text when there is some; `cleaned_transcript` is always
# defined afterwards.
if transcript_text is None:
    cleaned_transcript = ''
    print("No subtitles available; nothing to clean.")
else:
    cleaned_transcript = clean_transcript(transcript_text)
    print(cleaned_transcript)


Downloaded video subtitles for video ID: 2bbHZusVgw0 into the file subtitles/2bbHZusVgw0.en.vtt at 2024-02-29 01:09:34

WEBVTT Kind: captions Language: en  morning morning  morning I am affirmations to attract wealth I am affirmations to attract wealth  I am affirmations to attract wealth abundance and prosperity listen to this abundance and prosperity listen to this  abundance and prosperity listen to this powerful recording once a day for 21 powerful recording once a day for 21  powerful recording once a day for 21 days in a row and you just might be days in a row and you just might be  days in a row and you just might be amazed by what shows up in your life amazed by what shows up in your life  amazed by what shows up in your life ready I'll recite each affirmation then ready I'll recite each affirmation then  ready I'll recite each affirmation then you'll repeat the statement back to you'll repeat the statement back to  you'll repeat the statement back to yourself either out loud o

In [62]:
from collections import OrderedDict

def remove_duplicates(input_string):
    """
    Drop every repeated whitespace-separated token, keeping first occurrences in order.

    De-duplication is per token (word) across the whole string, not per
    phrase: a word that appeared anywhere earlier is removed from all later
    positions.

    :param input_string: Text to process.
    :return: The surviving tokens joined by single spaces.
    """
    already_seen = set()
    kept_tokens = []
    for token in input_string.split():
        if token in already_seen:
            continue
        already_seen.add(token)
        kept_tokens.append(token)
    return ' '.join(kept_tokens)


# De-duplicate the whitespace-separated tokens of the cleaned transcript.
# `cleaned_transcript` is not defined in this cell; it is the module-level
# value left behind by an earlier transcript-cleaning cell.
unique_string = remove_duplicates(cleaned_transcript)
print(unique_string)

WEBVTT Kind: captions Language: en morning I am affirmations to attract wealth abundance and prosperity listen this powerful recording once a day for 21 days in row you just might be amazed by what shows up your life ready I'll recite each affirmation then you'll repeat the statement back yourself either out loud or mind more energy emotion certainty that put behind statements engrained they will become let's begin I've vibrate with frequency of my is embodiment success see everywhere money magnet comes me both expected unexpected ways , Music, welcome new streams income into grateful many eagerly flows inspired actions create ongoing aligned making opportunities master creates positive impact on world expands ability serve others all needs are met so i i've worthy receive bring immense value have gifts only uniquely qualified share perfect as deserve financially rewarded birthright way good responsible steward know core truly blessed blessing community people enjoy its forms better pl

In [41]:
import boto3

def count_rows_in_table(table_name, dynamodb=None):
    """
    Counts the number of rows in a specified DynamoDB table.

    Uses a COUNT scan, so DynamoDB returns only the per-page item count and no
    item data has to be transferred or held in memory. The scan is continued
    with ExclusiveStartKey until no LastEvaluatedKey is returned.

    :param table_name: The name of the DynamoDB table.
    :param dynamodb: The DynamoDB resource (optional); a default
                     ``boto3.resource('dynamodb')`` is created when omitted.
    :return: The number of rows in the table.
    """
    if dynamodb is None:
        dynamodb = boto3.resource('dynamodb')

    table = dynamodb.Table(table_name)
    scan_kwargs = {'Select': 'COUNT'}
    total = 0

    while True:
        response = table.scan(**scan_kwargs)
        total += response.get('Count', 0)
        last_key = response.get('LastEvaluatedKey')
        if not last_key:
            break
        scan_kwargs['ExclusiveStartKey'] = last_key

    return total



# Example usage: print the number of rows in the results table.
# NOTE: load_dotenv, get_env_variables and open_aws_dynamodb_session are not
# imported or defined in this cell (it only imports boto3), so an earlier cell
# must already have put them in the kernel namespace.
if __name__ == "__main__":
    load_dotenv()
    options = get_env_variables()
    if options:
        table_name = options['RESULTS_TABLE_NAME']
        dynamodb = open_aws_dynamodb_session(options)
        if dynamodb:
            row_count = count_rows_in_table(table_name, dynamodb)
            print(f"Number of rows in table '{table_name}': {row_count}")

Successfully open_aws_dynamodb_session with environment variables. at 2024-03-01 21:34:32

Number of rows in table 'YoutubeSearchResults': 618
