<a href="https://colab.research.google.com/github/JanEggers-hr/chatgpt-playground/blob/main/whisper_audio_conversion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Whisper-based audio-to-text conversion

Runs OpenAI's "Whisper" speech-to-text (automatic speech recognition) library in a Colab. Nothing is uploaded to OpenAI's servers; everything is processed within the Colab (i.e. in the Google Cloud).

Press the small "Play" triangle button under the headline "Code" - the notebook starts by loading lots and lots of data into the Colab environment. (Don't worry, this does not concern your computer!) As soon as the loading is done, you are asked to select and upload a file; the file is then converted to a .txt file and downloaded to your download folder.

## Tips for running this colab

- Activate the GPU in the colab environment (menu "Runtime"/"Change Runtime type") - this speeds up the Whisper conversion immensely
- Use a browser plugin like [Colab Auto Clicker](https://addons.mozilla.org/en-US/firefox/addon/colab-automatic-clicker/) for Firefox to hold the connection to the Notebook while it's doing the work, and leave the browser tab open

# Code

In [None]:
#@title

# Start by installing the two audio tools needed for conversion:
# ffmpeg (system package, via apt) and pydub (Python package, via pip).
# pydub's AudioSegment is used further down to re-encode the uploaded audio.
!apt install -q ffmpeg
!pip install -q pydub


# ipywidgets for setting the parameters
import ipywidgets as widgets
from IPython.display import display
import pandas as pd
from google.colab import files
from pydub import AudioSegment
from pathlib import Path
import os
import io
import base64

# Preparations: make the work directory and switch into it
output_dir = "/content/audio/"
# makedirs(exist_ok=True) also creates missing parent directories and does
# not fail when the directory already exists (e.g. when the cell is re-run).
os.makedirs(output_dir, exist_ok=True)

os.chdir(output_dir)

# User-selectable settings:
# - the library used for conversion (whisper vs. whisper-jax)
# - the size of the model (fast vs. good)
# - the output format (plain text vs. text with timestamps)
# get_going records whether a model has been loaded for the current selection.

lib, model, textmode = 'whisper-jax', 'medium', 'Text only'
get_going = False


def _make_dropdown(description, options, value):
    """Build a 240px-wide dropdown with the given label, choices and preselection."""
    return widgets.Dropdown(
        options=options,
        value=value,
        description=description,
        layout=widgets.Layout(width='240px'),
    )


dropdown_library = _make_dropdown('Library:', ['whisper', 'whisper-jax'], lib)
dropdown_model = _make_dropdown('Model:', ['small', 'medium', 'large-v3'], model)
dropdown_textmode = _make_dropdown('Output:', ['Text only', 'Text+Timestamp'], textmode)


def usage_text(lib,model,textmode):
    """Return an HTML snippet describing the currently selected options.

    Args:
        lib: Library name; "whisper-jax" gets its own description, any other
            value is described as the plain whisper library.
        model: Model size. "small", "medium" and "large-v3" have dedicated
            descriptions; any other value gets a generic line instead of
            raising an error.
        textmode: "Text only" for plaintext output; any other value is
            described as the timestamped CSV output.

    Returns:
        Three description lines (library, model, output) joined by "<br>".
    """
    if lib == "whisper-jax":
        lib_str = f"<b>Using the {lib} library</b>: Takes longer to load but converts many times faster"
    else:
        lib_str = "<b>Using the whisper library</b>: Takes about 0.5x-2x the audio length to convert"

    model_descriptions = {
        "small": "<b>Using small model</b>: less precise but faster",
        "medium": "<b>Using medium model</b>: average precision and conversion time",
        "large-v3": "<b>Using most recent large model</b>: large file download, long runtime, but more precise",
    }
    # Fall back to a generic line so an unlisted model size cannot leave
    # model_str unassigned.
    model_str = model_descriptions.get(model, f"<b>Using {model} model</b>")

    if textmode == "Text only":
        textmode_str = "<b>Text only</b>: Plaintext file without timestamps"
    else:
        textmode_str = "<b>Text+Timestamps</b>: Runs 2x and produces CSV with timestamps and line breaks"

    return "<br>".join((lib_str, model_str, textmode_str))

# HTML label explaining the currently selected options
text_explainer = widgets.HTML(value=usage_text(lib, model, textmode))

# Button whose click handler (button_clicked) loads the selected model
button_start = widgets.Button(
    layout=widgets.Layout(width='15%'),
    description='Load the model',
)

def update_params(change):
    """Observer for the library and model dropdowns.

    Copies both dropdown selections into the module-level ``model`` and
    ``lib`` settings, refreshes the explanatory text, and resets
    ``get_going`` so that the model is loaded again before the next
    conversion.

    change: the change notification passed by the widget; not used.
    """
    global lib, model, get_going
    model, lib = dropdown_model.value, dropdown_library.value
    # textmode is only read here, so it needs no global declaration
    text_explainer.value = usage_text(lib, model, textmode)
    get_going = False

def update_textmode(change):
    """Observer for the output-mode dropdown.

    Stores the selected mode in the module-level ``textmode`` and refreshes
    the explanatory text. The loaded model is left untouched.

    change: the change notification passed by the widget; not used.
    """
    global textmode
    selected_mode = dropdown_textmode.value
    textmode = selected_mode
    # lib and model are only read here, so they need no global declaration
    text_explainer.value = usage_text(lib, model, selected_mode)

# HTML/CSS snippet for a spinning wheel that shows the notebook is at work.
# It is wrapped in a widgets.HTML and displayed while the model is loading
# or a file is being converted, then closed again.
spinner_html = """
<div class="loader"></div>
<style>
.loader {
  border: 8px solid #f3f3f3;
  border-top: 8px solid #3498db;
  border-radius: 50%;
  width: 25px;
  height: 25px;
  animation: spin 2s linear infinite;
  margin: 20px auto;
}

@keyframes spin {
  0% { transform: rotate(0deg); }
  100% { transform: rotate(360deg); }
}
</style>
"""

def button_clicked(button):
    global get_going
    print("Installing libraries for conversion (may take some time)")
    # Display the spinner animation
    html_spinner = widgets.HTML(spinner_html)
    display(html_spinner)
    global pipeline
    if lib == "whisper-jax":
        !pip install -q git+https://github.com/sanchit-gandhi/whisper-jax > /dev/null 2>&1
        from whisper_jax import FlaxWhisperPipline
        # load model (medium)
        pipeline = FlaxWhisperPipline("openai/whisper-"+model)
    elif lib == "whisper":
        !pip install -q git+https://github.com/openai/whisper.git > /dev/null 2>&1
        import whisper
        pipeline = whisper.load_model(model)
        # Once the tasks are completed, remove the spinner animation
    else:
        raise Exception("No valid library selected")
    html_spinner.close()
    get_going = True

def get_transcript(fname):
    """Transcribe the audio file ``fname`` with the loaded model.

    Uses the module-level ``lib``, ``pipeline`` and ``textmode`` settings.

    Args:
        fname: Path of the audio file to transcribe.

    Returns:
        In 'Text only' mode, the plain transcript text. Otherwise CSV text
        with one ``start,end,"text"`` line per transcribed section; double
        quotes inside the text are doubled so the quoted field stays valid.
    """
    if lib == "whisper-jax":
        result = pipeline(fname,
                          task="transcribe",
                          return_timestamps=True)
    else:
        result = pipeline.transcribe(fname)

    if textmode == 'Text only':
        return result["text"]

    if "chunks" in result:
        # whisper-jax: "chunks" is a list of dicts, each containing
        # - timestamp: pair of timecodes (start, end)
        # - text: string
        rows = [(c["timestamp"][0], c["timestamp"][1], c["text"])
                for c in result["chunks"]]
    else:
        # openai-whisper has no "chunks" key; its result carries "segments",
        # each with "start", "end" and "text".
        rows = [(s["start"], s["end"], s["text"])
                for s in result["segments"]]

    csv_lines = []
    for start, end, text in rows:
        escaped = text.replace('"', '""')
        csv_lines.append(f'{start},{end},"{escaped}"\n')
    return "".join(csv_lines)


###################### Upload file and process ########################
# File picker; several files may be selected in one go
uploader = widgets.FileUpload(multiple=True)

def convert_file(change):
    """Observer for the uploader: transcribe every uploaded file.

    Loads the model first if that has not happened yet for the current
    selection. Each upload is re-encoded with pydub (mp3 for whisper-jax,
    mp4 otherwise), transcribed with get_transcript, written to a .txt or
    .csv file next to it and offered for download. The temporary audio file
    and the per-file spinner are cleaned up even if a step fails.

    change: the change notification from the uploader; nothing is done when
        its ``new`` value is empty.
    """
    if not change.new:
        return
    if not get_going:
        button_clicked(0)
    # Now, convert the files
    for fname, file_info in uploader.value.items():
        html_spinner = widgets.HTML(spinner_html)
        display(html_spinner)
        try:
            audio_bytes = io.BytesIO(file_info['content'])
            # file extension, slicing away the dot
            stem, suffix_raw = os.path.splitext(output_dir + fname)
            suffix = suffix_raw[1:]
            # decode the upload with pydub, using the extension as format hint
            audio = AudioSegment.from_file(audio_bytes, format=suffix)
            # whisper-jax gets mp3, whisper gets mp4 (m4a)
            export_ext = "mp3" if lib == "whisper-jax" else "mp4"
            audio_fname = stem + "." + export_ext
            audio.export(audio_fname, format=export_ext)
            try:
                print("Starting conversion of audio to text file.")
                if textmode == "Text only":
                    txt_fname = stem + ".txt"
                else:
                    txt_fname = stem + ".csv"
                result = get_transcript(audio_fname)
                # Explicit encoding so transcripts with non-ASCII characters
                # are written the same way on every platform.
                with open(txt_fname, 'w', encoding='utf-8') as f:
                    f.write(result)
                files.download(txt_fname)
            finally:
                # Always drop the temporary audio file, even on failure
                os.remove(audio_fname)
        finally:
            # One spinner is shown per file, so close each one here rather
            # than only the last one after the loop.
            html_spinner.close()
    # Remove uploaded files from the uploader
    uploader.value.clear()
    print("Done - files converted. Saving to the download folder.")

### Widget hooks and display ###

# Library and model dropdowns share one handler; the output mode and the
# uploader each have their own.
dropdown_library.observe(update_params, names='value')
dropdown_model.observe(update_params, names='value')
dropdown_textmode.observe(update_textmode, names='value')
uploader.observe(convert_file, names='value')
button_start.on_click(button_clicked)

# Show the controls in the cell output (button_start is not among them)
display(dropdown_library, dropdown_model, dropdown_textmode, text_explainer, uploader)

*v2.1 - based on whisper-jax, and with timestamp output support. Known issue: Downloads CSV twice.*

### The Fine Print:

By default, this notebook uses the medium-sized model (the multilingual model needs about 5GB of memory); for better accuracy, switch to "large-v3" (about 10GB); for faster transcription, use "small" (about 2GB).

Remember to switch on the GPU in Colab - changing the runtime type before executing - or conversion will be really, really slow. **But even with the GPU enabled, the conversion takes some time** - so be patient! If you lose the connection to the Colab VM, reconnect and rerun the cell.

One thing that Whisper does not do for you: insert paragraphs, line breaks, indentations, emphases. Anything that makes the text block more readable is missing. Sorry.