In [None]:
import os
import re
import whisper
import torch
import subprocess

 
# Run Whisper on the GPU when torch reports CUDA support; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the "base" Whisper checkpoint once at module level; transcribe_audio
# reads this global instead of reloading the model on every call.
model = whisper.load_model("base", device=device)

def download_audio(youtube_url, output_path="audio.mp3"):
    """Download the audio track of a YouTube video as MP3 using yt-dlp.

    Args:
        youtube_url: URL of the video to fetch.
        output_path: Value handed to yt-dlp's ``--output`` option.

    Returns:
        ``output_path`` when yt-dlp exits successfully, otherwise ``None``
        (the error is printed, not raised).
    """
    command = [
        "yt-dlp",
        "--format", "bestaudio",
        "--extract-audio",
        "--audio-format", "mp3",
        "--output", output_path,
        # End option parsing so a URL that starts with "-" is always treated
        # as the positional URL and never as a yt-dlp option.
        "--",
        youtube_url,
    ]
    try:
        # check=True turns a non-zero yt-dlp exit status into an exception.
        subprocess.run(command, check=True)
    except Exception as e:
        # Deliberately best-effort: callers test the return value for None.
        print(f"Error downloading audio: {e}")
        return None
    print(f"Audio downloaded to {output_path}")
    return output_path

def transcribe_audio(audio_path):
    """Transcribe an audio file with the module-level Whisper ``model``.

    Returns whatever ``model.transcribe`` produces (text and timestamps),
    or ``None`` if any exception is raised; the error is printed instead of
    being propagated.
    """
    try:
        print("Extracting transcript")
        transcription = model.transcribe(audio_path)
    except Exception as err:
        print(f"Error transcribing audio: {err}")
        transcription = None
    return transcription

def save_transcript_with_timestamps(transcript, output_file="transcript.txt"):
    """Write a transcript to ``transcripts/<output_file>``, one segment per line.

    Each line has the form ``[start - end] text`` with both times formatted
    to two decimal places.

    Args:
        transcript: Mapping with a ``'segments'`` sequence; every segment must
            provide ``'start'``, ``'end'`` and ``'text'``.
        output_file: File name (relative to the ``transcripts`` directory,
            which is resolved against the current working directory).

    Any error is printed rather than raised.
    """
    output_path = f"transcripts/{output_file}"
    try:
        # Create the target directory on first use instead of failing.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # Explicit UTF-8 so non-ASCII transcripts do not depend on the
        # platform's default encoding.
        with open(output_path, "w", encoding="utf-8") as file:
            for segment in transcript['segments']:
                start_time = segment['start']
                end_time = segment['end']
                text = segment['text']
                file.write(f"[{start_time:.2f} - {end_time:.2f}] {text}\n")
        print(f"Transcript saved to {output_file}")
    except Exception as e:
        print(f"Error saving transcript: {e}")

 
def generate_system_prompt(exclude_terms:list|None=None, custom_terms:dict|None=None):
 
    if exclude_terms is None:
        exclude_terms = []
    if custom_terms is None:
        custom_terms = {}
 
    exclusion_text = ", ".join([f"{term}" for term in exclude_terms])
    exclusion_text.rstrip(",")
    exclusion_text += "."
    custom_terms_text = "\n".join([f"'{eng_term}' => '{myanmar_term}'" for eng_term, myanmar_term in custom_terms.items()])

 
    prompt = f"""
    Translate the following English text to Burmese (Myanmar) while keeping the original English terms for special terms.
    Please exclude the following terms from translation: 
    {exclusion_text}
    """
    
    if custom_terms:
        prompt += f"""Also, translate the following custom terms as specified:
    {custom_terms_text}"""
    return prompt



def save_file(filepath: str, content: str):
    """Write ``content`` to ``filepath`` as UTF-8 text, replacing any existing file."""
    with open(filepath, mode="w", encoding="utf-8") as out:
        out.write(content)
        
def read_file(filepath:str) -> str:
    """Return the full contents of ``filepath`` decoded as UTF-8 text.

    UTF-8 is specified explicitly so files written by ``save_file`` read back
    correctly regardless of the platform's default encoding.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        return f.read()


In [8]:
youtube_url = "https://www.youtube.com/shorts/GgfuaA9ySbU"
output_filename = "transcript.txt"

# download_audio returns None on failure, in which case nothing else runs.
if (audio_file := download_audio(youtube_url)):
    # Only write a transcript file when transcription produced a result.
    if (transcript := transcribe_audio(audio_file)):
        save_transcript_with_timestamps(transcript, output_filename)

    # The downloaded audio is a temporary artifact; remove it whether or not
    # transcription succeeded.
    os.remove(audio_file)

[youtube] Extracting URL: https://www.youtube.com/shorts/GgfuaA9ySbU
[youtube] GgfuaA9ySbU: Downloading webpage
[youtube] GgfuaA9ySbU: Downloading tv client config
[youtube] GgfuaA9ySbU: Downloading player d50f54ef
[youtube] GgfuaA9ySbU: Downloading tv player API JSON
[youtube] GgfuaA9ySbU: Downloading ios player API JSON
[youtube] GgfuaA9ySbU: Downloading m3u8 information
[info] GgfuaA9ySbU: Downloading 1 format(s): 251
[download] Destination: audio.webm
[download] 100% of  872.35KiB in 00:00:00 at 6.90MiB/s   
[ExtractAudio] Destination: audio.mp3
Deleting original file audio.webm (pass -k to keep)
Audio downloaded to audio.mp3
Extracting transcript
Transcript saved to transcript.txt
