<a href="https://colab.research.google.com/github/MoodyMarshmallow/Audio-Transcriber/blob/main/Podcast_Transcriber.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install pydub openai mutagen

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting mutagen
  Downloading mutagen-1.47.0-py3-none-any.whl.metadata (1.7 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Downloading mutagen-1.47.0-py3-none-any.whl (194 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.4/194.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydub, mutagen
Successfully installed mutagen-1.47.0 pydub-0.25.1


In [30]:
import io
from mutagen.mp3 import MP3
from pydub import AudioSegment
import openai
import os
from google.colab import userdata
from tqdm import tqdm
import json

In [27]:
# Load the OpenAI API key from the Colab user-data secret named OPENAI_API_KEY
# (store your key under that name rather than pasting it into the notebook).
api_key = userdata.get('OPENAI_API_KEY')
# Shared API client used by the transcription and speaker-detection helpers below.
client = openai.OpenAI(api_key=api_key)

In [32]:
def split_mp3(input_file, max_size_mb=25):
    """Split an MP3 into in-memory MP3 chunks that each fit under a size cap.

    The chunk length is derived from the bitrate reported for the file so that
    each re-encoded chunk should land just below ``max_size_mb``. Every
    exported chunk is then measured and rejected if it is still too large.

    Args:
        input_file: Path to the MP3, or a seekable binary file object (both
            forms are passed by callers in this notebook).
        max_size_mb: Upper bound for each chunk, in megabytes.

    Returns:
        list[io.BytesIO]: Encoded chunks in playback order, each positioned at
        offset 0. Empty when the decoded audio has zero length.

    Raises:
        ValueError: The file cannot be read or decoded, no usable bitrate is
            reported, or the bitrate is too high for the size limit.
        IOError: A chunk could not be exported.
        RuntimeError: An exported chunk is larger than ``max_size_mb``.
    """
    max_size_bytes = max_size_mb * 1024 * 1024
    buffer_bytes = 100 * 1024  # 100KB buffer for headers/metadata

    # Get the audio file's bitrate using mutagen
    try:
        audio_info = MP3(input_file).info
    except Exception as e:
        raise ValueError(f"Could not read MP3 file: {e}") from e

    bitrate = audio_info.bitrate  # in bits per second
    if not bitrate or bitrate <= 0:
        # Without this guard the division below fails with a bare
        # ZeroDivisionError (or TypeError for None).
        raise ValueError("Could not determine a valid bitrate for the MP3 file.")

    # Calculate the maximum duration per chunk in milliseconds
    max_data_size = max_size_bytes - buffer_bytes
    chunk_duration_ms = int((max_data_size * 8 * 1000) / bitrate)

    if chunk_duration_ms <= 0:
        raise ValueError("Bitrate is too high for the maximum size, cannot split into valid chunks.")

    # When a file object was passed, the metadata read above may have moved its
    # position; rewind so the decoder sees the stream from the beginning.
    if hasattr(input_file, "seek"):
        input_file.seek(0)

    # Load the audio file using pydub
    try:
        audio = AudioSegment.from_mp3(input_file)
    except Exception as e:
        raise ValueError(f"Could not load MP3 file with pydub: {e}") from e

    total_duration_ms = len(audio)
    chunks = []
    # Ceiling division: the last chunk may be shorter than the others.
    total_chunks = (total_duration_ms + chunk_duration_ms - 1) // chunk_duration_ms

    with tqdm(total=total_chunks, desc="Splitting audio", unit="chunk") as pbar:
        for start in range(0, total_duration_ms, chunk_duration_ms):
            end = min(start + chunk_duration_ms, total_duration_ms)

            # Extract the chunk and re-encode it at the source bitrate
            chunk = audio[start:end]
            buffer = io.BytesIO()
            try:
                chunk.export(buffer, format="mp3", bitrate=f"{bitrate//1000}k")
            except Exception as e:
                raise IOError(f"Failed to export chunk: {e}") from e

            # Verify the chunk size against the hard limit (not the estimate)
            buffer_size = buffer.getbuffer().nbytes
            if buffer_size > max_size_bytes:
                raise RuntimeError(
                    f"Chunk {len(chunks)+1} exceeded {max_size_mb}MB. "
                    "Consider reducing the buffer size or checking the bitrate accuracy."
                )

            # Rewind so the consumer can read the chunk from the start
            buffer.seek(0)
            chunks.append(buffer)
            pbar.update(1)

    return chunks

def get_cost_whisper(audio_file, cost_per_minute=0.006):
    """Estimate the Whisper transcription cost for an audio file.

    Args:
        audio_file: Path or file object accepted by ``AudioSegment.from_file``.
        cost_per_minute: Price in dollars per minute of audio. Defaults to
            0.006, the Whisper rate this notebook was written against.

    Returns:
        float: Estimated cost in dollars, proportional to the audio duration.
    """
    audio = AudioSegment.from_file(audio_file)
    duration_minutes = audio.duration_seconds / 60
    return duration_minutes * cost_per_minute

In [14]:
# Smoke-test split_mp3 on a sample file: show the returned list, the number of
# chunks produced, and the type of the first chunk.
split_audio = split_mp3('/content/Edward Mehr talks to S3 about building Machina Labs.mp3')
print(split_audio)
print(len(split_audio))
print(type(split_audio[0]))

Splitting audio: 100%|██████████| 2/2 [00:36<00:00, 18.44s/chunk]

[<_io.BytesIO object at 0x788c1384abb0>, <_io.BytesIO object at 0x788c13849800>]
2
<class '_io.BytesIO'>





In [18]:
def transcribe_audio(audio_file_path):
    """Transcribe an MP3 file with Whisper, one size-limited chunk at a time.

    The file is split with ``split_mp3`` and each chunk is sent to the
    ``whisper-1`` model with segment-level timestamps. The first request is
    prompted with a fixed filler text; every later request is prompted with
    the text returned for the previous chunk.

    Args:
        audio_file_path: Path of the MP3 file to transcribe.

    Returns:
        list | None: One verbose-JSON transcription response per chunk, in
        order, or None (after printing a message) when the file does not exist.
    """
    filler_text = """
    The website doesn't have the theme I was going for. Something summery;
    colorful. This looks perfect. Just Photoshop out the dog, add a baby,
    and make the curtains blue. We also need to add this 2000 line essay.
    Can the black be darker? Can you make it stand out more? I know you've made
    thirty iterations, but can we go back to the first one? That was the best
    version I remember. I'll know it when I see it; that’s not what I saw in my
    head at all. Can you put "find us on facebook" by the facebook logo?
    """
    try:
        with open(audio_file_path, "rb") as mp3_handle:
            results = []
            chunks = split_mp3(mp3_handle)
            progress = tqdm(enumerate(chunks), total=len(chunks), desc="Transcribing", unit="segment")
            for index, chunk in progress:
                # Carry context forward: reuse the previous chunk's text as the prompt.
                if index == 0:
                    context_prompt = filler_text
                else:
                    context_prompt = results[-1].text
                response = client.audio.transcriptions.create(
                    file=("segment.mp3", chunk, "audio/mpeg"),
                    model="whisper-1",
                    response_format="verbose_json",
                    prompt=context_prompt,
                    timestamp_granularities=["segment"],
                )
                results.append(response)
        return results
    except FileNotFoundError:
        print(f"Error: File not found at {audio_file_path}")
        return None

In [48]:
def merge_verbose_json_outputs(verbose_json_list):
    """
    Merges a list of verbose JSON outputs from the transcribe_audio function
    into a single transcript object.

    Segments of every chunk after the first are shifted by the total duration
    of the chunks before them and renumbered so ids keep increasing. The
    merged object is built from copies, so the inputs are left unmodified and
    the function can safely be re-run on the same list.

    Args:
        verbose_json_list (list): Verbose JSON outputs in playback order. Each
            item must expose ``segments``, ``text`` and ``duration``, and each
            segment must expose ``start``, ``end`` and ``id``.

    Returns:
        A copy of the first item whose ``duration``, ``text`` and ``segments``
        cover all items.

    Raises:
        ValueError: If ``verbose_json_list`` is empty or None.
    """
    import copy

    if not verbose_json_list:
        raise ValueError("No transcripts to merge: verbose_json_list is empty or None.")

    merged_transcript = copy.deepcopy(verbose_json_list[0])

    for verbose_json in verbose_json_list[1:]:
        # Offsets are fixed per chunk, before any of its segments are appended.
        time_offset = merged_transcript.duration
        id_offset = len(merged_transcript.segments)
        for segment in copy.deepcopy(verbose_json.segments):
            segment.start += time_offset
            segment.end += time_offset
            segment.id += id_offset
            merged_transcript.segments.append(segment)
        merged_transcript.duration += verbose_json.duration
        merged_transcript.text += verbose_json.text
    return merged_transcript

In [71]:
def detect_speakers(transcript, num_speakers, model="gpt-4o-mini"):
    """Ask a chat model to label which speaker said each part of a transcript.

    Args:
        transcript (str): Timestamped transcript text, sent as the user message.
        num_speakers (int): Number of speakers, interpolated into the system
            prompt.
        model (str): Chat model to use. Defaults to "gpt-4o-mini".

    Returns:
        The message content of the first choice in the chat completion
        response (the speaker-labelled transcript).
    """

    # The system prompt tells the model to keep timestamps and shows the output layout.
    system_prompt = f"""
    You are a helpful AI assistant to a professional transcriber. Your task is to
    identify which of the {num_speakers} speakers spoke which parts of a given
    transcript. Please be careful to leave the timestamps in the transcript.
    Format your response like so:

        Speaker 1 (John Daniels)
        (00:00) How many people are there in your family?

        Speaker 2 (Janet Jones)
        (00:02) There are five people in my family. (0:04) My father, mother, brother, sister, and me.

        Speaker 1 (John Daniels)
        (00:10) Does your family live in a house or an apartment?

        Speaker 2 (Janet Jones)
        (00:13) We live in a house in the countryside.

        Speaker 1 (John Daniels)
        (00:17) What does your father do?

        Speaker 2 (Janet Jones)
        (00:19) My father is a doctor. (00:21) He works at the local hospital.
    """

    response = client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": transcript},
        ],
    )
    return response.choices[0].message.content

In [63]:
def format_transcript(verbose_json, num_speakers=1):
    """
    Converts OpenAI Whisper API's verbose JSON output into a timestamped text format.

    Each segment becomes "(MM:SS) text", or "(H:MM:SS) text" once the start
    time reaches one hour, and the segments are joined with single spaces.

    Args:
        verbose_json: Whisper API response in verbose JSON format; its
            ``segments`` must expose ``start`` (seconds) and ``text``.
        num_speakers: Number of speakers. When greater than 1, the formatted
            text is passed through ``detect_speakers`` to add speaker labels.

    Returns:
        str: Formatted text with timestamps like "(00:10) Your text here",
        with speakers indicated when ``num_speakers`` > 1.
    """
    formatted_segments = []
    for segment in verbose_json.segments:
        # Peel off whole hours first so the minutes field stays within 0-59.
        hours, remainder = divmod(int(segment.start), 3600)
        minutes, seconds = divmod(remainder, 60)
        if hours == 0:
            timestamp = f"({minutes:02d}:{seconds:02d})"
        else:
            timestamp = f"({hours}:{minutes:02d}:{seconds:02d})"
        formatted_segments.append(f"{timestamp} {segment.text.strip()}")

    formatted_output = ' '.join(formatted_segments)

    if num_speakers > 1:
        formatted_output = detect_speakers(formatted_output, num_speakers)
    return formatted_output

In [73]:
audio_file_path = "/content/ed-mehr-machinas-robot-army-transform-manufacturing-interview-ezmp3cc_gct0gfxt.mp3" # Replace with the path to your audio file
num_speakers = 2 # replace with number of speakers in the transcript

# Transcribe every chunk, stitch the per-chunk responses into one transcript,
# then render it as timestamped text (speaker-labelled when num_speakers > 1).
# NOTE(review): transcribe_audio returns None when the file is missing, and that
# None is passed straight into merge_verbose_json_outputs here.
transcript_raw = merge_verbose_json_outputs(transcribe_audio(audio_file_path))
transcript = format_transcript(transcript_raw, num_speakers=num_speakers)
if transcript:
    print("Transcription:\n", transcript)
    # The Whisper cost estimate is computed from the duration of the source audio.
    print(f"Total cost: ${get_cost_whisper(audio_file_path):.4f}")

Splitting audio: 100%|██████████| 3/3 [00:37<00:00, 12.44s/chunk]
Transcribing: 100%|██████████| 3/3 [01:31<00:00, 30.55s/segment]


Transcription:
 Speaker 1 (Interviewer)
(00:00) I really want to start is like early life because when I looked at your LinkedIn when (00:24) we were first preparing for the episode, the original podcast episode, I just couldn't (00:29) figure out why you're doing anything with manufacturing because I'm reading like software, (00:32) software, software, like very software centric person, study computer science. And then we (00:37) were listening to, I forgot which podcast episode it was you did. You're like, actually, (00:41) I went to boarding school. It was like very manufacturing focused. And I'm like, ah, that's (00:46) where the seed might have been planted. So let's start there. Talk about that boarding (00:50) school, the experience, how you got into it.

Speaker 2 (Guest)
(00:55) Yeah, so I went to all boys school. Yeah, (01:01) we had a lot of focus. It was a small school. I think we had a class of 60 people. So there (01:07) was a lot of focus on kind of like all around curri

In [74]:
def export_string_to_file(text, filename="output.txt"):
    """Write ``text`` to ``filename`` as UTF-8, replacing any existing file.

    This is a best-effort helper: failures are reported on stdout instead of
    being raised, so the notebook keeps running.

    Args:
        text (str): Content to write.
        filename (str): Destination path. Defaults to "output.txt".

    Returns:
        None
    """
    try:
        # Explicit encoding: transcripts can contain non-ASCII characters and the
        # platform default encoding is not guaranteed to handle them.
        with open(filename, "w", encoding="utf-8") as file:
            file.write(text)
        print(f"String successfully exported to '{filename}'")
    except Exception as e:
        print(f"An error occurred: {e}")

In [75]:
# `transcript` is already the formatted text returned by format_transcript in the
# run cell above. Passing it to format_transcript again would fail, because that
# function reads `.segments` and a str has no such attribute.
my_string = transcript
export_string_to_file(my_string, "ed-mehr-machinas-robot-army-transform-manufacturing-interview-ezmp3cc_gct0gfxt.txt") # Replace with desired filename


String successfully exported to 'ed-mehr-machinas-robot-army-transform-manufacturing-interview-ezmp3cc_gct0gfxt.txt'
