In [None]:
import boto3
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta, timezone

# IAM role created by the CDK stack; it grants access to Lambda, DynamoDB, S3 and
# CloudWatch Logs. Paste the role ARN printed in the `cdk deploy` terminal output.
role_arn = "arn:aws:iam::736551082663:role/TranscriptCollectorProjec-SageMakerAccessRole9107AB-ndEtKQgg84zA"
session_name = "SageMakerSession"

# Ask STS for temporary credentials by assuming the role above.
sts_client = boto3.client('sts')
assumed_role_object = sts_client.assume_role(RoleSessionName=session_name, RoleArn=role_arn)

# Build a boto3 session that is backed by those temporary credentials.
credentials = assumed_role_object['Credentials']
session = boto3.Session(
    region_name='eu-central-1',
    aws_session_token=credentials['SessionToken'],
    aws_secret_access_key=credentials['SecretAccessKey'],
    aws_access_key_id=credentials['AccessKeyId'],
)

# Service clients used by the helper functions defined below.
lambda_client, logs_client = (session.client(service) for service in ('lambda', 'logs'))
# Deployment stage; it is the suffix of the Lambda function name.
stage = "development"

print("Assumed role successfully!")

def invoke_lambda(url):
    """Queue one asynchronous run of the transcript collector Lambda for a video.

    Args:
        url: Video URL; sent to the function as the ``video_url`` field of the
            JSON payload.

    Returns:
        int: The ``StatusCode`` of the Invoke API response. For the ``Event``
        invocation type the Lambda API documents 202 as the "accepted" status.
        The code only says the request was queued; the outcome of the run has
        to be read from the function's logs.
    """
    payload = {"video_url": url}
    response = lambda_client.invoke(
        FunctionName=f'TranscriptCollectorFunction-{stage}',
        InvocationType='Event',  # Asynchronous invocation
        Payload=json.dumps(payload)
    )
    # Hand the status back instead of discarding the response, so callers can
    # confirm that the request was accepted.
    return response['StatusCode']

def get_log_events(log_group_name, filter_pattern, start_time, end_time):
    """Fetch all log events from CloudWatch Logs that match a filter pattern.

    Args:
        log_group_name: Name of the CloudWatch Logs log group to search.
        filter_pattern: CloudWatch Logs filter pattern the events must match.
        start_time: ``datetime`` marking the start of the search window.
        end_time: ``datetime`` marking the end of the search window.

    Both datetimes are converted to epoch milliseconds with ``.timestamp()``;
    Python interprets a naive datetime as local time there, so pass
    timezone-aware values.

    Returns:
        list: The event dicts from every result page, in the order returned.
    """
    # filter_log_events returns results in pages linked by nextToken; a single
    # call would silently drop everything after the first page.
    paginator = logs_client.get_paginator('filter_log_events')
    pages = paginator.paginate(
        logGroupName=log_group_name,
        startTime=int(start_time.timestamp() * 1000),
        endTime=int(end_time.timestamp() * 1000),
        filterPattern=filter_pattern
    )

    events = []
    for page in pages:
        # A page may contain no events while more pages still follow.
        events.extend(page.get('events', []))
    return events

def format_log_message(event):
    """Reduce a CloudWatch log event to the fields shown in the notebook.

    Args:
        event: Mapping with ``timestamp`` (epoch milliseconds),
            ``logStreamName`` and ``message`` keys.

    Returns:
        dict: ``timestamp`` as an ISO-8601 string in UTC without an offset
        suffix, plus ``logStreamName`` and ``message`` copied unchanged.

    Raises:
        KeyError: If one of the three keys is missing from ``event``.
    """
    # datetime.utcfromtimestamp() is deprecated since Python 3.12. Convert via
    # an aware UTC datetime, then drop tzinfo so the ISO string keeps the same
    # offset-free form as before.
    utc_moment = datetime.fromtimestamp(event['timestamp'] / 1000, tz=timezone.utc)
    return {
        'timestamp': utc_moment.replace(tzinfo=None).isoformat(),
        'logStreamName': event['logStreamName'],
        'message': event['message'],
    }

def fetch_lambda_logs(log_group_name, log_level, hours_back):
    """Return formatted log events of one level from the last ``hours_back`` hours.

    Args:
        log_group_name: CloudWatch Logs log group to search.
        log_level: ``'error'``, ``'info'`` or ``'warning'`` (case-insensitive).
        hours_back: Length of the search window in hours, counted back from now (UTC).

    Returns:
        list: One dict per matching event, as built by ``format_log_message``.

    Raises:
        ValueError: If ``log_level`` is not one of the supported levels.
    """
    # Supported levels and the text searched for in the log lines.
    level_to_pattern = {'error': 'ERROR', 'info': 'INFO', 'warning': 'WARNING'}

    level_key = log_level.lower()
    if level_key not in level_to_pattern:
        raise ValueError("Invalid log level specified. Choose from 'error', 'info', or 'warning'.")

    window_end = datetime.now(timezone.utc)
    window_start = window_end - timedelta(hours=hours_back)

    raw_events = get_log_events(
        log_group_name, level_to_pattern[level_key], window_start, window_end
    )
    return list(map(format_log_message, raw_events))

def main(urls, max_workers=100):
    """Invoke the collector Lambda once per URL, fanning the calls out over threads.

    Args:
        urls: Iterable of video URLs; each one is passed to ``invoke_lambda``.
        max_workers: Upper bound on concurrent invocation threads. Start high
            and adjust based on testing. The pool is never larger than the
            number of URLs.

    Returns:
        dict: Maps each URL to the value ``invoke_lambda`` returned for it.
        Keys are inserted in completion order and duplicate URLs share one
        entry. Empty input returns an empty dict.

    Raises:
        Exception: The first exception raised by an invocation is re-raised
        after the already submitted calls have finished.
    """
    urls = list(urls)  # allow generators; the length is needed to size the pool
    if not urls:
        # Nothing to do, and ThreadPoolExecutor rejects a pool size of 0.
        return {}

    outcomes = {}
    with ThreadPoolExecutor(max_workers=min(max_workers, len(urls))) as executor:
        future_to_url = {executor.submit(invoke_lambda, url): url for url in urls}
        for future in as_completed(future_to_url):
            # result() re-raises any exception raised during the invocation.
            outcomes[future_to_url[future]] = future.result()
    return outcomes

In [None]:
import random
from pytubefix import Channel

# Channel whose videos are sent to the transcript collector. The commented-out
# lines are alternative channels; keep exactly one assignment to `c` active.
# c = Channel("https://www.youtube.com/@baldandbankrupt")
# c = Channel("https://www.youtube.com/@SabineHossenfelder")
c = Channel("https://www.youtube.com/@primitivetechnology9550")

# Sanity check before invoking anything: show the channel name and how many
# videos the channel object lists.
print(f'Channel name: {c.channel_name}')
print(f'Total videos: {len(c.videos)}')

def construct_video_url(video_id):
    """Return the YouTube watch-page URL for the given video id."""
    watch_url = f'https://youtube.com/watch?v={video_id}'
    return watch_url

# Build one watch URL per channel video. To process only part of the channel,
# slice the listing first, e.g. c.videos[200:300].
video_urls = list(map(lambda video: construct_video_url(video.video_id), c.videos))

# Send every URL to the collector Lambda; `results` holds whatever main() returns.
results = main(video_urls)


In [None]:
# Single-video run (presumably for testing or re-processing one video).
# NOTE: this overwrites `video_urls` and `results` set by the channel-wide cell,
# so the notebook state depends on which of the two cells ran last.
video_urls = [construct_video_url("YVkUvmDQ3HY")]

results = main(video_urls)

In [None]:
# Query parameters for the collector function's CloudWatch log group.
log_group_name = f'/aws/lambda/TranscriptCollectorFunction-{stage}'
log_level = 'error'  # one of 'error', 'info' or 'warning'
hours_back = 0.5  # look-back window, in hours

logs = fetch_lambda_logs(log_group_name, log_level, hours_back)

# Print one record per matching event, each closed by a 60-character rule.
for log in logs:
    print(
        f"Timestamp: {log['timestamp']}",
        f"Log Stream: {log['logStreamName']}",
        f"Message: {log['message']}",
        "="*60,
        sep="\n",
    )