In [30]:
import boto3
import json
from concurrent.futures import ThreadPoolExecutor, as_completed

# IAM role created by the CDK stack; per the deployment it has access to
# Lambda, DynamoDB and S3. Copy the ARN from the `cdk deploy` terminal output.
role_arn = "arn:aws:iam::736551082663:role/TranscriptCollectorProjec-SageMakerAccessRole9107AB-ndEtKQgg84zA"
session_name = "SageMakerSession"

# Assume the role through STS to obtain temporary credentials.
sts_client = boto3.client('sts')
assumed_role_object = sts_client.assume_role(RoleArn=role_arn, RoleSessionName=session_name)
credentials = assumed_role_object['Credentials']

# Every later AWS client is created from this session, so all calls run
# with the assumed role's temporary credentials in eu-central-1.
session = boto3.Session(
    region_name='eu-central-1',
    aws_access_key_id=credentials['AccessKeyId'],
    aws_secret_access_key=credentials['SecretAccessKey'],
    aws_session_token=credentials['SessionToken'],
)

print("Assumed role successfully!")

Assumed role successfully!


In [31]:
# Run Lambda invocations in parallel (against the production stage).
# The client is built from the assumed-role session created above in this
# notebook, so invocations use that role's temporary credentials.
lambda_client = session.client('lambda')

def invoke_lambda(url, invocation_type='RequestResponse'):
    """Invoke the production transcript-collector Lambda for one video URL.

    Args:
        url: Video URL; sent to the function as ``{"video_url": url}``.
        invocation_type: Value passed as ``InvocationType``. The default
            ``'RequestResponse'`` waits for the function's result; pass
            ``'Event'`` for an asynchronous invocation.

    Returns:
        The JSON-decoded response payload, or ``None`` when the response
        body is empty (asynchronous invocations return no payload, which
        would otherwise make ``json.loads`` raise).

    Note:
        The payload is returned as-is; a payload describing a function-side
        error is not turned into an exception here.
    """
    response = lambda_client.invoke(
        FunctionName='TranscriptCollectorFunction-production',
        InvocationType=invocation_type,
        Payload=json.dumps({"video_url": url}),
    )
    raw_payload = response['Payload'].read()
    if not raw_payload:
        # Nothing to decode (e.g. 'Event' invocations).
        return None
    return json.loads(raw_payload.decode('utf-8'))

def main(urls, max_workers=10):
    """Invoke the Lambda for every URL concurrently and collect the outcomes.

    Args:
        urls: Iterable of video URLs; each is passed to ``invoke_lambda``.
        max_workers: Size of the thread pool (default 10).

    Returns:
        A list with one entry per URL, in completion order (not input
        order). A successful invocation contributes whatever
        ``invoke_lambda`` returned. A failed invocation contributes
        ``{"video_url": <url>, "error": "<ExceptionType>: <message>"}`` so
        that one failure does not discard the results already collected.
    """
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(invoke_lambda, url): url for url in urls}
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                results.append(future.result())
            except Exception as exc:
                # Record the failure against its URL instead of aborting the
                # whole batch and losing every completed result.
                results.append({"video_url": url, "error": f"{type(exc).__name__}: {exc}"})
    return results

In [32]:
import random
from pytubefix import Channel

# Channel whose videos are processed below. The commented-out lines are
# alternative channels; uncomment one (and comment the active line) to switch.
# c = Channel("https://www.youtube.com/@baldandbankrupt")
# c = Channel("https://www.youtube.com/@ArtosisCasts")
c = Channel("https://www.youtube.com/@primitivetechnology9550")

# Show which channel was resolved and how many videos it lists before
# any Lambda invocation is started.
print(f'Channel name: {c.channel_name}')
print(f'Total videos: {len(c.videos)}')

def construct_video_url(video_id):
    """Return the youtube.com watch URL for the given video id."""
    return 'https://youtube.com/watch?v={}'.format(video_id)

# Build a watch URL for every video listed on the channel.
# To process only a subset, slice the list instead, e.g. c.videos[200:300].
video_urls = list(map(lambda video: construct_video_url(video.video_id), c.videos))

# Send all URLs to the Lambda function in parallel and keep the responses.
results = main(video_urls)


Channel name: Primitive Technology
Total videos: 80
