# insanely-fast-whisper notebook

A Jupyter notebook interface for the 'insanely-fast-whisper' command-line tool, providing high-speed transcription and speaker diarization with export capabilities to Word documents.

Visit https://github.com/Vaibhavs10/insanely-fast-whisper for more information.

#### HuggingFace user agreements!
To run this notebook, you will need the following:

A Hugging Face account: [Create an account](https://huggingface.co/join)

A Hugging Face authentication token: [Generate your token here](https://huggingface.co/settings/tokens) 

→ Enter this token in the notebook under `HUGGINGFACE_AUTH_TOKEN`.

Acceptance of the following user agreements:
- [Pyannote Speaker Diarization 3.1](https://huggingface.co/pyannote/speaker-diarization-3.1)
- [Pyannote Segmentation 3.0](https://hf.co/pyannote/segmentation-3.0)

# Quickrun for ease of use

In [None]:
import os, glob, re

# Set these 3 variables and run all!
# To change the transcription language, edit LANGUAGE in section "2. Transcribe".
# NOTE: avoid sharing the notebook with a real token in it; the commented-out
# getpass cell after the install step can prompt for the token instead.
os.environ["HUGGINGFACE_AUTH_TOKEN"] = "hf_dqNMswjtIbBq...."
extensions = ["mp3", "WMA", "wma"]  # audio file extensions to pick up
path_audio = '/work/speech/...'  # path prefix; "*.<extension>" is appended below

# glob has no "one of these extensions" syntax, so run one pattern per
# extension and merge the results.
files = []
for ext in extensions:
    path_audio_regex = f"{path_audio}*.{ext}"  # a glob pattern, not a regex
    files.extend(glob.glob(path_audio_regex))

# Where glob matching is case-insensitive (e.g. on Windows), "WMA" and "wma"
# match the same files; drop duplicates, keeping first-seen order, so that no
# file is transcribed twice.
files = list(dict.fromkeys(files))

length = len(files)
print(f"The following {length} files will be transcribed: \n{files}")

### 1. Install

In [None]:
# pipx is used to install the insanely-fast-whisper command-line tool
# (pinned to version 0.0.15); --force presumably reinstalls over an existing copy.
!pip install pipx
!pipx install insanely-fast-whisper==0.0.15 --force
# python-docx provides the `docx` module used in "3. Convert transcriptions to Word".
!pip install python-docx

# Workaround that was needed for insanely-fast-whisper 0.0.14:
# pin numpy to 1.26.4, because numpy 2.0.0 is not compatible with all modules.
# !pipx inject insanely-fast-whisper numpy==1.26.4 --force

In [None]:
# Alternative: if you prefer not to store your token directly in the notebook, uncomment the code below.
# Run this cell, paste your Hugging Face token into the prompt, and press Enter.

# import getpass
# import os

# def _set_env(var: str):
#     if not os.environ.get(var):
#         os.environ[var] = getpass.getpass(f"{var}: ")

# if not os.environ.get("HUGGINGFACE_AUTH_TOKEN"):
#     _set_env("HUGGINGFACE_AUTH_TOKEN")

### 2. Transcribe

In [None]:
import re
import os

# Settings for the insanely-fast-whisper command built in transcribe() below.
LANGUAGE = "da"  # passed as --language (presumably the code for Danish)
NUM_SPEAKERS = 2  # NOTE(review): defined here but not passed to the command in this cell
TASK = "transcribe"  # passed as --task
DIARIZATION_MODEL = "pyannote/speaker-diarization-3.1"  # passed as --diarization_model
# Passed as --hf-token; read from the environment, so this raises KeyError if
# HUGGINGFACE_AUTH_TOKEN has not been set (e.g. by the Quickrun cell).
HUGGINGFACE_AUTH_TOKEN = os.environ['HUGGINGFACE_AUTH_TOKEN']

def transcribe(file_name):
    TRANSCRIPT_PATH = os.path.splitext(file_name)[0]+"-transcription.json"
    FILE_NAME = re.escape(file_name)
    TRANSCRIPT_PATH = re.escape(TRANSCRIPT_PATH)    
    
    # workaround, current MPLBACKEND default does not work
    os.environ['MPLBACKEND'] = 'svg'
    # !echo $MPLBACKEND
    !insanely-fast-whisper --file-name {FILE_NAME} --transcript-path {TRANSCRIPT_PATH} --language {LANGUAGE} --task {TASK} --diarization_model {DIARIZATION_MODEL} --hf-token {HUGGINGFACE_AUTH_TOKEN}


In [None]:
# Transcribe every audio file found by the Quickrun cell, one after another,
# printing a 1-based progress line before each file.
for idx, file in enumerate(files):
    print(f"file {idx+1} of {length} file={file}")
    transcribe(file)

### 3. Convert transcriptions to Word

In [None]:
import glob
import os
import json
import re

from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH


def diarization_to_docx_edit(audio_file_diarization):
    """Convert one diarized transcription JSON file into a Word document.

    The document starts with the JSON file's base name as a bold, centred
    title. It is followed by one paragraph per entry of the JSON
    ``"speakers"`` list, holding the entry's timestamp, speaker and
    whitespace-stripped text on separate lines.

    The document is saved next to the input, using the input path with its
    extension replaced by ``_edit.docx``.

    Args:
        audio_file_diarization: Path of the transcription JSON file.

    Raises:
        KeyError: If the JSON has no ``"speakers"`` key, or an entry lacks
            ``"timestamp"``, ``"speaker"`` or ``"text"``.
    """
    doc = Document()

    # Title: the JSON file name, bold and centred.
    title = doc.add_paragraph()
    title.add_run(os.path.basename(audio_file_diarization)).bold = True
    title.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # The context manager closes the file even if parsing fails.
    with open(audio_file_diarization, 'r', encoding="utf-8") as f:
        data = json.load(f)

    for entry in data["speakers"]:
        paragraph = doc.add_paragraph()
        paragraph.add_run(str(entry["timestamp"]) + "\n")
        paragraph.add_run(str(entry["speaker"]) + "\n")
        paragraph.add_run(str(entry["text"]).strip())

    # Swap only the final extension. str.replace(".json", ...) would also
    # rewrite ".json" inside directory names, and would leave a path without
    # ".json" unchanged, so the document would overwrite the input file.
    root, _ = os.path.splitext(audio_file_diarization)
    docx_filename = root + "_edit.docx"
    doc.save(docx_filename)


# Collect the JSON transcription files to convert. path_audio is the same
# path prefix that the Quickrun cell uses to find the audio files.
extension = "json"
# glob patterns are not regexes: re.escape() would insert backslashes (e.g.
# before ".", "-" or spaces) that glob matches literally, so a path
# containing such characters would match nothing.
path_json_regex = f"{path_audio}*.{extension}"
files_json = glob.glob(path_json_regex)
length_json = len(files_json)

print(f"The following {length_json} files will be converted to docx {files_json}")

for idx, file in enumerate(files_json):
    # Report progress against the number of JSON files, not the audio count.
    print(f"file {idx+1} of {length_json} file={file}")
    diarization_to_docx_edit(file)
print("Done!")
