In [1]:
!pip install pytube
!pip install moviepy
!pip install pydub

Collecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl.metadata (5.0 kB)
Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytube
Successfully installed pytube-15.0.0
Collecting moviepy
  Downloading moviepy-1.0.3.tar.gz (388 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.3/388.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting decorator<5.0,>=4.0.2 (from moviepy)
  Downloading decorator-4.4.2-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting proglog<=1.0.0 (from moviepy)
  Downloading proglog-0.1.10-py3-none-any.whl.metadata (639 bytes)
Collecting imageio<3.0,>=2.5 (from moviepy)
  Downloading imageio-2.34.0-py3-none-any.whl.metadata (4.9 kB)
Collecting imageio_ffmpeg>=0.2.0 (from moviepy)
  Downloading imageio-ffmpe

In [63]:
from pydub import AudioSegment
from pytube import YouTube
from datetime import datetime

In [61]:
import boto3
import os
from datetime import datetime
from dotenv import load_dotenv
import random


def current_timestamp():
    """Return the current local date and time as a ``YYYY-MM-DD HH:MM:SS`` string."""
    now = datetime.now()
    return f"{now:%Y-%m-%d %H:%M:%S}"

def get_env_variables():
    """Collect the notebook's configuration from environment variables.

    Returns:
        dict: One entry per setting. Settings without a default are ``None``
        when the variable is unset; ``MAX_RESULTS`` is converted to ``int``.
    """
    read = os.getenv

    # Settings that have no fallback value.
    settings = {
        name: read(name)
        for name in (
            'DEVELOPER_KEY',
            'AWS_ACCESS_KEY_ID',
            'AWS_SECRET_ACCESS_KEY',
            'SEARCH_CACHE_TABLE',
            'RESULTS_TABLE_NAME',
            'SEARCH_QUERY',
        )
    }

    settings['MAX_RESULTS'] = int(read('MAX_RESULTS', 50))

    # Search settings with their fallback values.
    # ('VIDEO_CATEGORY_ID' with default '10' is intentionally left disabled.)
    fallbacks = (
        ('ORDER', 'viewCount'),
        ('VIDEO_DURATION', 'medium'),
        ('PUBLISHED_AFTER', '2010-01-01T00:00:00Z'),
        ('PUBLISHED_BEFORE', '2024-12-31T23:59:59Z'),
        ('RELEVANCE_LANGUAGE', 'en'),
    )
    for name, fallback in fallbacks:
        settings[name] = read(name, fallback)

    # AWS region used when opening the boto3 session.
    settings['AWS_REGION'] = read('AWS_REGION')
    return settings

def open_aws_dynamodb_session(options):
    """Create a DynamoDB resource from the AWS settings in ``options``.

    Args:
        options: Mapping providing ``'AWS_ACCESS_KEY_ID'``,
            ``'AWS_SECRET_ACCESS_KEY'`` and ``'AWS_REGION'``.

    Returns:
        The DynamoDB resource of a boto3 session built from those settings, or
        ``None`` if anything raised along the way (the error is printed).
    """
    try:
        aws_session = boto3.Session(
            aws_access_key_id=options['AWS_ACCESS_KEY_ID'],
            aws_secret_access_key=options['AWS_SECRET_ACCESS_KEY'],
            region_name=options['AWS_REGION'],
        )
        resource = aws_session.resource('dynamodb')
        print(f"Successfully open_aws_dynamodb_session with environment variables. at {current_timestamp()}\n")
    except Exception as e:
        # Best effort: report the failure and let the caller test for None.
        print(f"Error open_aws_dynamodb_session from environment variables: {e} at {current_timestamp()}\n")
        return None
    else:
        return resource

def convert_duration_to_minutes(duration):
    """Convert an ISO 8601 duration string such as ``'PT1H2M3S'`` to seconds.

    Despite its name, this function returns the total number of SECONDS
    (days * 86400 + hours * 3600 + minutes * 60 + seconds). The name is kept
    so existing callers keep working.

    The previous split-based parsing raised ``ValueError`` on valid inputs
    such as ``'PT1H2M3S'`` or ``'PT45S'``; every component is now optional and
    parsed independently.

    Args:
        duration: Duration text in the form ``P[nD][T[nH][nM][nS]]``. An empty
            string or ``None`` is treated as a zero-length duration.

    Returns:
        int: Total duration in seconds.

    Raises:
        ValueError: If ``duration`` is non-empty but does not match the
            supported format.
    """
    import re  # local import so this cell does not depend on another cell's imports

    if not duration:
        return 0

    match = re.fullmatch(
        r'P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?',
        duration.strip(),
    )
    if match is None:
        raise ValueError(f"Unrecognised ISO 8601 duration: {duration!r}")

    # Components that are absent from the string come back as None -> 0.
    days, hours, minutes, seconds = (int(part) if part else 0 for part in match.groups())
    return days * 86400 + hours * 3600 + minutes * 60 + seconds


def retrieve_video_ids(table_name, search_term, max_duration_seconds=15, dynamodb=None):
    """Return the IDs of stored videos that match a title search and a length limit.

    The table is scanned completely (following pagination), then items are kept
    when their ``title`` contains ``search_term`` (case-insensitive) and their
    ``duration`` is strictly shorter than ``max_duration_seconds``.

    Args:
        table_name: Name of the DynamoDB table to scan.
        search_term: Substring to look for in each item's ``title``.
        max_duration_seconds: Exclusive upper bound on the video length, in seconds.
        dynamodb: DynamoDB resource to use; a default ``boto3`` resource is
            created when omitted.

    Returns:
        list: The ``videoId`` of every matching item, in scan order.
    """
    if dynamodb is None:
        dynamodb = boto3.resource('dynamodb')

    table = dynamodb.Table(table_name)

    # Attribute names are aliased with '#' placeholders in the projection.
    scan_kwargs = {
        'ProjectionExpression': '#videoId, #title, #duration',
        'ExpressionAttributeNames': {
            '#videoId': 'videoId',
            '#title': 'title',
            '#duration': 'duration',
        },
    }

    # A single scan call returns one page only; keep requesting pages until the
    # response no longer carries a LastEvaluatedKey.
    items = []
    while True:
        response = table.scan(**scan_kwargs)
        items.extend(response.get('Items', []))
        last_evaluated_key = response.get('LastEvaluatedKey')
        if not last_evaluated_key:
            break
        scan_kwargs['ExclusiveStartKey'] = last_evaluated_key

    needle = search_term.lower()

    # convert_duration_to_minutes returns seconds (despite its name), so it is
    # compared against max_duration_seconds. The title test comes first so the
    # duration of non-matching items is never parsed.
    return [
        item['videoId']
        for item in items
        if needle in item.get('title', '').lower()
        and convert_duration_to_minutes(item.get('duration', '')) < max_duration_seconds
    ]

def compare_video_ids_with_file(video_ids, file_path):
    """Tell whether this exact set of video IDs is already recorded in a file.

    Every line of the file is read as one comma-separated batch of IDs. A batch
    matches when it holds exactly the same IDs as ``video_ids``, regardless of
    order.

    Args:
        video_ids: Iterable of video ID strings to look for.
        file_path: Path of the text file holding previously written batches.

    Returns:
        bool: ``True`` if some line matches. ``False`` otherwise, including
        when the file does not exist yet (nothing has been recorded so far).
    """
    wanted = set(video_ids)  # built once instead of once per line
    try:
        with open(file_path, 'r') as file:
            return any(wanted == set(line.strip().split(',')) for line in file)
    except FileNotFoundError:
        # First run: no record file yet, so nothing can match.
        return False

def write_video_ids_to_file(video_ids, file_path):
    """Append ``video_ids`` to ``file_path`` as a single comma-separated line."""
    with open(file_path, mode='a') as handle:
        record = ','.join(video_ids)
        handle.write(f'{record}\n')

if __name__ == "__main__":
    # Pick a random batch of video IDs that has not been recorded before and
    # append it to the record file.
    load_dotenv()
    options = get_env_variables()
    if options:
        table_name = options['RESULTS_TABLE_NAME']
        dynamodb = open_aws_dynamodb_session(options)

        if dynamodb:
            file_path = 'batched_videos_list.txt'
            search_term = 'meditation'
            max_duration_seconds = 500
            result_limit = 3
            max_attempts = 10

            # Fetch the comprehensive candidate list once. It is never
            # truncated, so every attempt below can draw a different batch.
            all_video_ids = retrieve_video_ids(table_name, search_term, max_duration_seconds, dynamodb)

            video_ids = []
            for _ in range(max_attempts):
                # Sample from the FULL list. Truncating the list after the
                # first shuffle would make every retry reshuffle the same IDs,
                # which can never produce a new set.
                candidate_ids = random.sample(all_video_ids, min(result_limit, len(all_video_ids)))

                # The record file may not exist on the very first run.
                already_recorded = (
                    os.path.exists(file_path)
                    and compare_video_ids_with_file(candidate_ids, file_path)
                )
                if not already_recorded:
                    video_ids = candidate_ids
                    break
                print('Exact match found in records, running again')

            if video_ids:
                # save record to txt file
                write_video_ids_to_file(video_ids, file_path)
                print(f"Unique video ids: {video_ids} written to file")
            else:
                # Either no video matched the search or every attempt hit an
                # existing record; do not write a duplicate or empty batch.
                print(f"No unique batch of video ids found after {max_attempts} attempts; nothing written")

Successfully open_aws_dynamodb_session with environment variables. at 2024-03-05 21:41:50

Unique video ids: ['ICMfseFhE-E', 'C_xsXnRd_uc', 'xKAYvtoWxGs'] written to file


In [62]:
# Display the batch of video IDs selected by the cell above.
video_ids

['ICMfseFhE-E', 'C_xsXnRd_uc', 'xKAYvtoWxGs']

In [117]:
import asyncio
from pytube import YouTube
import os
import nest_asyncio

nest_asyncio.apply()

def replace_mp4_with_wav(file_name):
    """Return ``file_name`` with a trailing ``.mp4`` extension swapped for ``.wav``.

    Only an extension at the very end of the name is replaced; a name that does
    not end in ``.mp4`` is returned unchanged. Implemented with plain string
    operations so it does not rely on ``re`` being imported by another cell.
    """
    suffix = '.mp4'
    if file_name.endswith(suffix):
        return file_name[:-len(suffix)] + '.wav'
    return file_name

def _download_audio_as_wav(video_id, parent_output_folder, output_folder):
    """Blocking worker: download one video's audio track and convert it to WAV."""
    video_url = f'https://www.youtube.com/watch?v={video_id}'

    # Create a YouTube object
    yt = YouTube(video_url)

    # Filter and select the audio stream
    audio_stream = yt.streams.filter(only_audio=True, file_extension='mp4').first()
    if audio_stream is None:
        raise ValueError(f'No audio-only mp4 stream found for video {video_id}')

    # Download into a scratch folder named after the video ID.
    output_path = f'{parent_output_folder}/{output_folder}/{video_id}'
    # download() returns the path of the file it wrote, so there is no need to
    # guess it from a directory listing.
    downloaded_path = audio_stream.download(output_path=output_path)

    # Move the downloaded file next to the scratch folder, named after the
    # video ID, then remove the now-empty scratch folder.
    resulting_mp4_path = f'{output_path}.mp4'
    os.rename(downloaded_path, resulting_mp4_path)
    os.rmdir(output_path)

    # Load the MP4 file using Pydub and export it as WAV.
    audio = AudioSegment.from_file(resulting_mp4_path, format="mp4")
    wav_file_path = replace_mp4_with_wav(resulting_mp4_path)
    audio.export(wav_file_path, format="wav")

    # Delete .mp4 file
    os.remove(resulting_mp4_path)
    return wav_file_path


async def download_and_process_video(video_id, parent_output_folder, output_folder):
    """Download a YouTube video's audio and store it as ``<video_id>.wav``.

    The file ends up at ``{parent_output_folder}/{output_folder}/{video_id}.wav``.
    All of the work is blocking (network download, file conversion), so it is
    handed to the event loop's default executor. Without that, the coroutine
    never yields and ``asyncio.gather`` would process the videos one by one.

    Args:
        video_id: YouTube video ID (the ``v=`` part of the watch URL).
        parent_output_folder: Top-level folder for downloads.
        output_folder: Sub-folder of ``parent_output_folder`` for this run.

    Returns:
        str: Path of the exported WAV file.

    Raises:
        ValueError: If the video offers no audio-only mp4 stream.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        None, _download_audio_as_wav, video_id, parent_output_folder, output_folder
    )

async def main(video_ids=None, parent_output_folder='audio_mp4s', folder_prefix=None):
    """Download and convert a batch of videos into a fresh timestamped folder.

    Called without arguments it behaves as before: it processes the three
    built-in video IDs and names the folder after the notebook-level
    ``search_term``.

    Args:
        video_ids: IDs of the videos to process. Defaults to the built-in
            example batch.
        parent_output_folder: Folder under which the run's folder is created.
        folder_prefix: First part of the run folder's name. Defaults to the
            notebook-level ``search_term`` variable, which must then exist.
    """
    if video_ids is None:
        video_ids = ['ICMfseFhE-E', 'C_xsXnRd_uc', 'xKAYvtoWxGs']
    if folder_prefix is None:
        # Backward-compatible default: reuse the search term set by an earlier cell.
        folder_prefix = search_term

    # Generate timestamp
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
    output_folder = f'{folder_prefix}_{timestamp}'
    os.makedirs(f'{parent_output_folder}/{output_folder}', exist_ok=True)

    # Create one task per video
    tasks = [download_and_process_video(video_id, parent_output_folder, output_folder) for video_id in video_ids]

    # Run the tasks together. The result is deliberately not returned so that
    # `await main()` at the end of a cell keeps displaying nothing.
    await asyncio.gather(*tasks)

# Run the download pipeline. Top-level `await` relies on the event loop the
# notebook is already running (nest_asyncio.apply() is called earlier in this cell).
await main()


In [87]:
def extract_first_15_seconds(audio_path, output_path, duration_ms=15000):
    """Save the opening part of a WAV file as a new WAV file.

    Args:
        audio_path: Path of the source WAV file.
        output_path: Path the snippet is written to.
        duration_ms: Length of the snippet in milliseconds. Defaults to
            15000 (15 seconds), which is what the function's name refers to.

    Raises:
        ValueError: If ``duration_ms`` is not positive.
    """
    # Validate before touching the file system; a negative slice bound would
    # silently cut from the end instead of the start.
    if duration_ms <= 0:
        raise ValueError(f'duration_ms must be positive, got {duration_ms!r}')

    # Load audio file
    audio = AudioSegment.from_wav(audio_path)

    # Slice from the start; the slice bound is given in milliseconds.
    snippet = audio[:duration_ms]

    # Save the snippet
    snippet.export(output_path, format='wav')

# Example usage.
# NOTE(review): both paths are hard-coded and relative to the working
# directory. No visible cell creates 'video_1vvyyhteIv4/' (the download cell
# writes under 'audio_mp4s/'), so the input WAV has to exist already.
audio_file_path = 'video_1vvyyhteIv4/video_1vvyyhteIv4.wav'
output_file_path = 'video_1vvyyhteIv4/first_15_1vvyyhteIv4.wav'
extract_first_15_seconds(audio_file_path, output_file_path)