# In [None]:
import boto3
import json
import time
import re

transcribe = boto3.client('transcribe')


def start_transcription_job(input_uri, output_uri, transcription_job_name, bucket_name, transcription_key):
    """Submit a transcription job to the module-level Transcribe client.

    Args:
        input_uri: URI of the media file to transcribe (sent as MediaFileUri).
        output_uri: Accepted for call compatibility; not used by this function.
        transcription_job_name: Name under which the job is registered.
        bucket_name: Bucket that receives the transcription output.
        transcription_key: Object key for the transcription output.

    The request is fixed to MP4 media, US English, and speaker labelling
    with at most two speakers.
    """
    job_request = {
        'TranscriptionJobName': transcription_job_name,
        'Media': {'MediaFileUri': input_uri},
        'MediaFormat': 'mp4',
        'LanguageCode': 'en-US',
        'OutputBucketName': bucket_name,
        'OutputKey': transcription_key,
        'Settings': {'ShowSpeakerLabels': True, 'MaxSpeakerLabels': 2},
    }
    transcribe.start_transcription_job(**job_request)


def sanitize_key(key):
    """Return *key* reduced to word characters and underscores.

    '&' is spelled out as 'and'. Every other character that is neither a
    word character nor whitespace becomes '_', and each run of whitespace
    collapses into a single '_'.

    Args:
        key: The string to sanitize.

    Returns:
        The sanitized string.
    """
    # Expand '&' first: the punctuation pass below would otherwise turn it
    # into '_' before it could be spelled out.
    key = key.replace('&', 'and')
    key = re.sub(r'[^\w\s]', '_', key)
    return re.sub(r'\s+', '_', key)


def _format_transcript(items):
    """Render transcript items as text with one line per speaker turn.

    Words are separated by single spaces, punctuation is attached directly
    to the preceding word, and a new ``"<speaker>: "`` line starts whenever
    an item's 'speaker_label' differs from the previous one.
    """
    parts = []
    current_speaker = None
    for item in items:
        item_type = item['type']
        if item_type == 'punctuation':
            # No separator: punctuation belongs to the word before it.
            parts.append(item['alternatives'][0]['content'])
        elif item_type == 'pronunciation':
            speaker = item.get('speaker_label')
            if speaker is not None and speaker != current_speaker:
                current_speaker = speaker
                # Only break the line when something precedes this turn, so
                # the output does not begin with an empty line.
                line_break = '\n' if parts else ''
                parts.append(f"{line_break}{speaker}: ")
            elif parts:
                parts.append(' ')
            parts.append(item['alternatives'][0]['content'])
    if parts:
        parts.append('\n')
    return ''.join(parts)


def transcribe_video(video_key, bucket_name):
    """Transcribe one video from the bucket and upload a text transcript.

    Starts a transcription job, polls every 10 seconds until the job is
    COMPLETED or FAILED, then downloads the JSON result, renders it as
    speaker-labelled text and uploads that text to the same bucket as
    ``<sanitized key>.txt``.

    Side effects: writes 'transcription.json' and '<sanitized key>.txt' in
    the current working directory and prints a status line.

    Args:
        video_key: Object key of the video. The last four characters are
            dropped as the extension (the caller in this file passes keys
            ending in '.mp4').
        bucket_name: Bucket holding the video; also receives the outputs.
    """
    timestamp = int(time.time())
    transcription_job_name = f'transcription-job-{timestamp}'
    sanitized_video_key = sanitize_key(video_key[:-4])
    transcription_key = f'Transcription_{sanitized_video_key}.json'
    input_uri = f's3://{bucket_name}/{video_key}'
    output_uri = f's3://{bucket_name}/'

    start_transcription_job(input_uri, output_uri, transcription_job_name, bucket_name, transcription_key)

    # Poll until the job reaches one of its two terminal states.
    while True:
        status = transcribe.get_transcription_job(TranscriptionJobName=transcription_job_name)
        job = status['TranscriptionJob']
        job_status = job['TranscriptionJobStatus']
        if job_status in ('COMPLETED', 'FAILED'):
            break
        time.sleep(10)

    if job_status != 'COMPLETED':
        # Surface the service's explanation instead of a bare failure notice.
        reason = job.get('FailureReason', 'no failure reason reported')
        print(f"Transcription job failed: {reason}")
        return

    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucket_name)
    bucket.download_file(transcription_key, 'transcription.json')

    # Explicit UTF-8 so non-ASCII transcript text does not depend on the
    # platform's default encoding.
    with open('transcription.json', 'r', encoding='utf-8') as f:
        transcript_data = json.load(f)

    text_filename = f"{sanitized_video_key}.txt"
    with open(text_filename, 'w', encoding='utf-8') as f:
        f.write(_format_transcript(transcript_data['results']['items']))

    bucket.upload_file(text_filename, text_filename)
    print(f"Transcript saved to s3://{bucket_name}/{sanitized_video_key}.txt")


def get_mp4_files(bucket_name):
    """Return the keys of all objects in the bucket that end in '.mp4'.

    The extension match is case-insensitive. Listing is paginated so
    buckets larger than a single list response are fully covered, and an
    empty bucket yields an empty list.

    Args:
        bucket_name: Name of the S3 bucket to scan.

    Returns:
        A list of object keys, in listing order.
    """
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')
    mp4_files = []
    for page in paginator.paginate(Bucket=bucket_name):
        # 'Contents' is absent from a page that holds no objects.
        for obj in page.get('Contents', []):
            if obj['Key'].lower().endswith('.mp4'):
                mp4_files.append(obj['Key'])
    return mp4_files


# Ask which bucket to process; surrounding whitespace is stripped.
bucket_name = input("Enter the bucket name: ").strip()

# List the .mp4 keys first, then transcribe them one after another.
mp4_files = get_mp4_files(bucket_name)
for video_key in mp4_files:
    print(f"Transcribing video: {video_key}")
    transcribe_video(video_key=video_key, bucket_name=bucket_name)

print("Transcription process complete.")