# Transcriptions

Reference guide (batch-transcribing audio files with AWS Transcribe): https://medium.com/attest-product-and-technology/how-to-set-up-aws-transcribe-to-batch-transcribe-hundreds-of-audio-files-a75abd5996d3

In [None]:
import boto3
import pandas as pd
import time

# --- Configuration -----------------------------------------------------------
bucket_name = 'sagemaker-us-east-2-058264083825'  # Bucket holding the source videos
prefix = 'Train/'  # Adjust this based on the structure of your S3 bucket
output_bucket_name = 'sagemaker-studio-058264083825-bar3vvoeivv'  # Bucket Transcribe writes results to
region = 'us-east-2'  # e.g., 'us-west-2'
language_code = 'en-US'  # Adjust as needed
batch_identifier = "_trial_2"  # Suffix appended to every transcription job name
max_concurrent_jobs = 10  # Adjust based on your account's limit

# --- AWS clients -------------------------------------------------------------
# Pin both clients to the configured region instead of relying on whatever
# default region the ambient AWS session happens to carry.
s3_client = boto3.client('s3', region_name=region)
transcribe_client = boto3.client('transcribe', region_name=region)

def list_s3_videos(bucket_name, prefix, client=None):
    """Return the keys of all video objects stored under a prefix.

    A key counts as a video when it ends (case-insensitively) with
    ``.mp4``, ``.mov`` or ``.avi``.

    Args:
        bucket_name: Name of the S3 bucket to list.
        prefix: Key prefix that restricts the listing.
        client: Optional S3 client to use; defaults to the module-level
            ``s3_client``.

    Returns:
        list[str]: Matching object keys, in listing order.
    """
    s3 = client if client is not None else s3_client
    video_extensions = ('.mp4', '.mov', '.avi')

    # A single list_objects_v2 call returns at most one page of keys, so walk
    # every page; otherwise large prefixes are silently truncated.
    paginator = s3.get_paginator('list_objects_v2')
    video_files = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # 'Contents' is absent when a page holds no objects.
        for obj in page.get('Contents', []):
            if obj['Key'].lower().endswith(video_extensions):
                video_files.append(obj['Key'])
    return video_files

def start_transcription_job(file_uri, job_name):
    """Submit one transcription job and return the service response.

    Args:
        file_uri: URI of the media file to transcribe.
        job_name: Name to give the transcription job.

    Returns:
        The response returned by ``transcribe_client.start_transcription_job``.
    """
    # Output bucket and language come from the module-level configuration.
    request = {
        'TranscriptionJobName': job_name,
        'Media': {'MediaFileUri': file_uri},
        'OutputBucketName': output_bucket_name,
        'LanguageCode': language_code,
    }
    return transcribe_client.start_transcription_job(**request)

def check_transcription_job(job_name):
    """Look up a transcription job and report its current status.

    Args:
        job_name: Name of the transcription job to query.

    Returns:
        tuple: ``(status, response)`` where ``status`` is the job's
        ``TranscriptionJobStatus`` value and ``response`` is the full
        ``get_transcription_job`` response.
    """
    job_response = transcribe_client.get_transcription_job(TranscriptionJobName=job_name)
    return job_response['TranscriptionJob']['TranscriptionJobStatus'], job_response

def get_transcription_text(transcript_uri):
    """Fetch a transcript object from the output bucket as a UTF-8 string.

    Args:
        transcript_uri: URI of the transcript file; only its final
            ``/``-separated segment is used, as the object key.

    Returns:
        str: The object's body decoded as UTF-8.
    """
    # The object key is taken to be whatever follows the last '/' in the URI.
    object_key = transcript_uri.rsplit('/', 1)[-1]
    s3_object = s3_client.get_object(Bucket=output_bucket_name, Key=object_key)
    raw_body = s3_object['Body'].read()
    return raw_body.decode('utf-8')

def _harvest_finished_jobs(job_names, transcriptions):
    """Poll each pending job once, collect finished ones, return those still pending.

    Completed jobs have their transcript downloaded and appended to
    ``transcriptions``; failed jobs are only reported. A new list is built
    rather than removing from ``job_names`` during iteration, because removing
    from a list while looping over it skips the element after each removal.
    """
    still_pending = []
    for pending_job in job_names:
        status, response = check_transcription_job(pending_job)
        if status == 'COMPLETED':
            transcript_uri = response['TranscriptionJob']['Transcript']['TranscriptFileUri']
            transcriptions.append({
                'JobName': pending_job,
                'Transcript': get_transcription_text(transcript_uri)
            })
            print(f'Completed transcription job for {pending_job}')
        elif status == 'FAILED':
            print(f'Transcription job for {pending_job} failed')
        else:
            still_pending.append(pending_job)
    return still_pending


def process_videos(bucket_name, prefix):
    """Transcribe every video under a prefix and gather the transcripts.

    One transcription job is started per video. Whenever the number of
    in-flight jobs reaches ``max_concurrent_jobs``, submission pauses and the
    jobs are polled every 30 seconds until at least one finishes. After all
    videos are submitted, the remaining jobs are polled until none are pending.

    Args:
        bucket_name: S3 bucket holding the source videos.
        prefix: Key prefix under which the videos live.

    Returns:
        pandas.DataFrame: One row per completed job, with the columns
        ``JobName`` and ``Transcript``. Failed jobs are printed but not
        included.
    """
    video_files = list_s3_videos(bucket_name, prefix)
    transcriptions = []
    job_names = []

    # Start transcription jobs
    for video_file in video_files:
        # Job name = file name with dots replaced, plus the batch suffix.
        job_name = video_file.split('/')[-1].replace('.', '_') + batch_identifier
        file_uri = f's3://{bucket_name}/{video_file}'

        # Retry the same submission until the service accepts it.
        while True:
            try:
                start_transcription_job(file_uri, job_name)
                job_names.append(job_name)
                print(f'Started transcription job for {video_file} with job name {job_name}')
                break
            except transcribe_client.exceptions.LimitExceededException:
                print('LimitExceededException: Too many jobs running. Waiting before retrying.')
                time.sleep(60)  # Wait for 60 seconds before retrying

        # Check if we have reached the maximum number of concurrent jobs
        while len(job_names) >= max_concurrent_jobs:
            time.sleep(30)  # Wait for 30 seconds before checking again
            job_names = _harvest_finished_jobs(job_names, transcriptions)

    # Poll for remaining transcription job completion
    while job_names:
        time.sleep(30)  # Wait for 30 seconds before polling again
        job_names = _harvest_finished_jobs(job_names, transcriptions)

    # Create a DataFrame with the transcriptions
    return pd.DataFrame(transcriptions)

# Run the whole pipeline: start a transcription job for every video under the
# prefix and collect the finished transcripts into a DataFrame.
transcripts_df = process_videos(bucket_name=bucket_name, prefix=prefix)

# Show the resulting table of transcripts
print(transcripts_df)

Started transcription job for Train/1.mp4 with job name 1_mp4_trial_2
Started transcription job for Train/10.mp4 with job name 10_mp4_trial_2
Started transcription job for Train/100.mp4 with job name 100_mp4_trial_2
Started transcription job for Train/101.mp4 with job name 101_mp4_trial_2
Started transcription job for Train/102.mp4 with job name 102_mp4_trial_2
Started transcription job for Train/103.mp4 with job name 103_mp4_trial_2
Started transcription job for Train/104.mp4 with job name 104_mp4_trial_2
Started transcription job for Train/105.mp4 with job name 105_mp4_trial_2
Started transcription job for Train/106.mp4 with job name 106_mp4_trial_2
Started transcription job for Train/107.mp4 with job name 107_mp4_trial_2
Completed transcription job for 1_mp4_trial_2
Completed transcription job for 100_mp4_trial_2
Completed transcription job for 103_mp4_trial_2
Completed transcription job for 105_mp4_trial_2
Started transcription job for Train/108.mp4 with job name 108_mp4_trial_2
St

In [56]:
# Flatten the nested objects held in the 'parsed_transcript' column into
# separate columns.
parsed_transcript_df = pd.json_normalize(transcripts_df['parsed_transcript'])

# Replace the nested column with its flattened form, placed side by side.
# NOTE(review): column-wise concat aligns rows by index label; confirm both
# frames share the same index, otherwise rows will be misaligned.
df = pd.concat(
    [transcripts_df.drop('parsed_transcript', axis=1), parsed_transcript_df],
    axis='columns',
)

df.head(3)

Unnamed: 0,JobName,Transcript,transcript_text,parsed_Transcript,0
0,1_mp4_trial_2,"{""jobName"":""1_mp4_trial_2"",""accountId"":""058264...",,"{'jobName': '1_mp4_trial_2', 'accountId': '058...",{'transcript': 'ST Anselm College. She's a Rep...
1,100_mp4_trial_2,"{""jobName"":""100_mp4_trial_2"",""accountId"":""0582...",ST Anselm College. She's a Republican. This wi...,"{'jobName': '100_mp4_trial_2', 'accountId': '0...",{'transcript': 'The police were not impeding a...
2,103_mp4_trial_2,"{""jobName"":""103_mp4_trial_2"",""accountId"":""0582...",The police were not impeding anybody's religio...,"{'jobName': '103_mp4_trial_2', 'accountId': '0...",{'transcript': 'So y'all wanna hear something ...
