# In [None]:
import os
import time
import json
from pytube import YouTube
import boto3
import requests
import tempfile


def sanitize_key(key):
    """Return *key* with every unsafe character replaced by an underscore.

    The characters treated as unsafe are ``< > : " / \\ | ? *``, the space,
    the ASCII apostrophe and the typographic apostrophe (’).

    Args:
        key: The string to clean up.

    Returns:
        A copy of ``key`` in which each unsafe character is ``'_'``.
    """
    unsafe_chars = '<>:"/\\|?* \'’'
    # One translation pass instead of one ``replace`` call per character.
    replacements = str.maketrans(dict.fromkeys(unsafe_chars, '_'))
    return key.translate(replacements)


def download_video(video_url, output_path='/tmp'):
    """Download a YouTube video with pytube.

    The stream is the one chosen by pytube's ``get_highest_resolution()``.

    Args:
        video_url: URL of the YouTube video.
        output_path: Directory the file is written to (default ``/tmp``).

    Returns:
        A ``(default_filename, video_path)`` tuple: the stream's default
        file name and that name joined onto ``output_path``.
    """
    stream = YouTube(video_url).streams.get_highest_resolution()
    filename = stream.default_filename
    local_path = os.path.join(output_path, filename)
    stream.download(output_path=output_path)
    return filename, local_path


def upload_video_to_s3(video_path, bucket_name, video_default_filename):
    """Upload a local video file to an S3 bucket and print its S3 location.

    Uses the module-level ``s3`` resource.

    Args:
        video_path: Path of the local file to upload.
        bucket_name: Name of the destination bucket.
        video_default_filename: Object key to store the video under.
    """
    destination = f"s3://{bucket_name}/{video_default_filename}"
    with open(video_path, mode='rb') as source:
        bucket = s3.Bucket(bucket_name)
        bucket.put_object(Body=source, Key=video_default_filename)
    print(f"Video uploaded to {destination}")

    
def start_transcription_job(input_uri, output_uri, transcription_job_name, bucket_name, transcription_key):
    """Start a transcription job and block until it completes or fails.

    Uses the module-level ``transcribe`` client. The job is submitted as
    ``mp4`` media in ``en-US``; the job status is polled every 10 seconds.

    Args:
        input_uri: URI of the media file to transcribe.
        output_uri: Accepted for interface compatibility; not used here.
        transcription_job_name: Name given to the transcription job.
        bucket_name: Bucket the transcription output is written to.
        transcription_key: Object key of the transcription output.

    Returns:
        The last ``get_transcription_job`` response, whose
        ``TranscriptionJobStatus`` is ``'COMPLETED'`` or ``'FAILED'``.
    """
    finished_states = ('COMPLETED', 'FAILED')

    transcribe.start_transcription_job(
        TranscriptionJobName=transcription_job_name,
        Media={'MediaFileUri': input_uri},
        MediaFormat='mp4',
        LanguageCode='en-US',
        OutputBucketName=bucket_name,
        OutputKey=transcription_key,
    )

    response = transcribe.get_transcription_job(TranscriptionJobName=transcription_job_name)
    while response['TranscriptionJob']['TranscriptionJobStatus'] not in finished_states:
        time.sleep(10)
        response = transcribe.get_transcription_job(TranscriptionJobName=transcription_job_name)
    return response


def process_transcription_output(status, bucket_name, transcription_key, s3_client, video_default_filename):
    """Fetch the transcription JSON from S3, format it, and save the transcript.

    The JSON is downloaded to a temporary file that is always removed, even
    when the download or the JSON parsing fails.

    Args:
        status: Accepted for interface compatibility; not used here.
        bucket_name: Bucket holding the transcription output; the formatted
            transcript is saved to the same bucket.
        transcription_key: Object key of the transcription JSON.
        s3_client: Object exposing ``download_file(bucket, key, path)``.
        video_default_filename: Name passed on to ``save_transcript_to_file``.

    Raises:
        ValueError: If the downloaded file is not valid JSON
            (``json.JSONDecodeError``).
        KeyError: If the JSON has no ``results`` / ``items`` entries.
    """
    # Only reserve a path here; the handle is closed before the download
    # writes to it so that no second open handle is held on the file.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_file_path = temp_file.name

    try:
        s3_client.download_file(bucket_name, transcription_key, temp_file_path)
        with open(temp_file_path, 'r', encoding='utf-8') as file:
            transcription_data = json.load(file)
    finally:
        # Clean up regardless of whether download/parsing succeeded.
        os.unlink(temp_file_path)

    # Process and save the transcript
    items = transcription_data["results"]["items"]
    output = format_transcript(items)
    save_transcript_to_file(output, video_default_filename, bucket_name)

    
def save_transcript_to_file(transcript, filename, bucket_name):
    """Upload a transcript to S3 as ``<filename>.txt`` and print its location.

    Uses the module-level ``s3`` resource. The encoded text is handed to
    ``put_object`` directly, so no temporary file is created (and none can be
    left behind if the upload fails).

    Args:
        transcript: Transcript text to store.
        filename: Base name of the object; ``.txt`` is appended.
        bucket_name: Name of the destination bucket.
    """
    object_key = f"{filename}.txt"
    s3.Bucket(bucket_name).put_object(Key=object_key, Body=transcript.encode('utf-8'))
    print(f"Transcript saved to s3://{bucket_name}/{object_key}")


def format_transcript(items):
    """Turn a list of transcription items into a single readable string.

    Words (items of type ``"pronunciation"``) are joined with spaces, and
    every tenth word, starting with the first, is prefixed with a
    ``(MM:SS) `` timestamp taken from its ``start_time``. Items of type
    ``"punctuation"`` are attached to the preceding word; a punctuation item
    that arrives before any word is kept as its own token instead of raising
    ``IndexError``. Items of any other type are ignored.

    Args:
        items: Iterable of dicts with a ``"type"`` key and an
            ``"alternatives"`` list whose first entry has ``"content"``;
            pronunciation items also carry ``"start_time"`` (seconds).

    Returns:
        The formatted transcript, or ``""`` when there are no items.
    """
    output = []
    word_count = 0

    for item in items:
        item_type = item["type"]
        if item_type == "pronunciation":
            word_count += 1
            content = item["alternatives"][0]["content"]
            if word_count % 10 == 1:
                # Whole seconds only: the timestamp has one-second resolution.
                minutes, seconds = divmod(int(float(item["start_time"])), 60)
                content = f"({minutes:02d}:{seconds:02d}) {content}"
            output.append(content)
        elif item_type == "punctuation":
            mark = item["alternatives"][0]["content"]
            if output:
                output[-1] += mark
            else:
                # No preceding word to attach to.
                output.append(mark)

    return " ".join(output)


def process_video(video_url, bucket_name):
    """Run the whole pipeline for one video: download, upload, transcribe, save.

    Steps: download the video, upload it to ``bucket_name``, run a
    transcription job on the uploaded object, then format the transcription
    output and store the transcript in the same bucket. Uses the module-level
    ``s3_client``.

    Args:
        video_url: URL of the YouTube video.
        bucket_name: S3 bucket used for the video, the transcription JSON and
            the final transcript.

    Returns:
        None. When the transcription job fails a message is printed and the
        transcript step is skipped.
    """
    # Download video using pytube
    video_default_filename, video_path = download_video(video_url)

    # Upload video to S3
    upload_video_to_s3(video_path, bucket_name, video_default_filename)

    # Generate a unique transcription job name using the current timestamp
    timestamp = int(time.time())
    transcription_job_name = f'transcription-job-{timestamp}'

    # Set up the transcription output key. splitext removes the extension
    # whatever its length, rather than assuming exactly four characters.
    base_name, _extension = os.path.splitext(video_default_filename)
    transcription_key = f'Transcription_{sanitize_key(base_name)}.json'

    # Set up S3 input and output URIs
    input_uri = f's3://{bucket_name}/{video_default_filename}'
    output_uri = f's3://{bucket_name}/'

    # start_transcription_job blocks until the job is COMPLETED or FAILED and
    # returns the final status, so no second polling loop is needed here.
    status = start_transcription_job(
        input_uri, output_uri, transcription_job_name, bucket_name, transcription_key
    )

    # Check if the transcription job has been completed successfully
    if status['TranscriptionJob']['TranscriptionJobStatus'] == 'FAILED':
        print("Transcription job failed. Please try again.")
        return

    # Process the transcription output
    process_transcription_output(status, bucket_name, transcription_key, s3_client, video_default_filename)


# Set up the boto3 session and the AWS handles the functions in this file
# read as module-level globals: an S3 resource, an S3 client, and an
# Amazon Transcribe client.
session = boto3.Session()
s3 = session.resource('s3')
s3_client = session.client('s3')
transcribe = session.client('transcribe')

# Ask for the S3 bucket that stores the video, the transcription JSON and
# the final transcript.
bucket_name = input("Enter the name of your S3 Bucket")

# Get YouTube video URL from user input
video_url = input("Enter the YouTube video URL: ")

# Run the download -> upload -> transcribe -> save pipeline.
process_video(video_url, bucket_name)

# NOTE(review): process_video returns None whether or not the transcription
# job failed, so this message is printed in both cases.
print('Everything is complete for ' + video_url)
