In [None]:
#@title Install Whisper and Condacolab
!pip install -U openai-whisper
!pip install ffmpeg
!pip install -q condacolab
import condacolab
condacolab.install()
from IPython.display import clear_output
clear_output()
print("All done!")

In [None]:
#@title Mount Drive
# Make Google Drive visible under /content/drive so that files stored there
# can be addressed with ordinary filesystem paths.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
#@title Unzip corpus
#@markdown Unzip your dataset for transcription stuff. Make sure it is an archive only containing wavs (15-30 seconds in length recommended).

file_location = '/content/drive/MyDrive/wav.zip' #@param {type:"string"}

!7z x "$file_location" -o/content/db
from IPython.display import clear_output
clear_output()
print("Wavs extracted in db folder")

In [None]:
#@title Whisper inference
#@markdown **Make transcriptions** <br/> Note that your singing database shouldn't contain long pauses, *ooh-ing*, lalala-ing, humming etc., otherwise transcription will probably break (Whisper recognises those poorly).
#Implemented from https://github.com/openai/whisper/discussions/1041 by Haru0l

import os

# Directory that receives one .txt transcription per input wav.
os.makedirs('/content/txt/', exist_ok=True)
# A `!cd /content/db` line used to follow here. It was dropped because every
# `!` command runs in its own subshell, so the directory change never reached
# the kernel; the code in this cell addresses its files by absolute path.

# Cache of (model, number_tokens) keyed by model name, so the expensive setup
# is paid once per kernel session instead of once per audio file.
_WHISPER_CACHE = {}


def _load_whisper(model_name="medium"):
    """Return ``(model, number_tokens)`` for ``model_name``, building them on first use."""
    if model_name not in _WHISPER_CACHE:
        import whisper
        from whisper.tokenizer import get_tokenizer

        # use multilingual=True if using multilingual model
        tokenizer = get_tokenizer(multilingual=True)
        # Token ids whose decoded text (minus one leading space) consists only
        # of digits. Suppressing them encourages the model to transcribe
        # numbers as words.
        number_tokens = [
            i
            for i in range(tokenizer.eot)
            if all(c in "0123456789" for c in tokenizer.decode([i]).removeprefix(" "))
        ]
        _WHISPER_CACHE[model_name] = (whisper.load_model(model_name), number_tokens)
    return _WHISPER_CACHE[model_name]


def Transcriber(audiofile):
    """Transcribe one audio file with Whisper and save the text under /content/txt/.

    The transcription is printed and written to
    ``/content/txt/<stem of audiofile>.txt``.

    Args:
        audiofile: Path to the audio file to transcribe.
    """
    model, number_tokens = _load_whisper("medium")
    # -1 is kept in the suppression list as in the original call, alongside the
    # digit tokens.
    answer = model.transcribe(audiofile, suppress_tokens=[-1] + number_tokens)

    print(answer['text'])

    # Derive the output name from the argument itself rather than from a
    # global loop variable, so the function also works when called directly.
    stem = os.path.splitext(os.path.basename(audiofile))[0]
    output_txt = os.path.join('/content/txt/', stem + '.txt')

    # Explicit UTF-8 keeps accented characters intact regardless of locale.
    with open(output_txt, 'w', encoding='utf-8') as f:
        f.write(answer['text'])

# Collect the wav files first: sorted for a reproducible processing order, and
# matched case-insensitively so files named *.WAV are not silently skipped.
wav_names = sorted(
    name for name in os.listdir('/content/db/') if name.lower().endswith('.wav')
)

# Keep the loop variable named `filename`: it is a module-level name that
# other code in this notebook may read.
for filename in wav_names:
    file_path = os.path.join('/content/db/', filename)
    Transcriber(file_path)

from IPython.display import clear_output
clear_output()
if wav_names:
    print("Hopefully everything worked and your transcriptions are in the 'txt' folder!")
else:
    # Do not claim success when there was nothing to transcribe (for example
    # when the archive unpacked its wavs into a sub-folder).
    print("No .wav files were found directly inside /content/db/ - nothing was transcribed.")

In [None]:
#@title (Optional) Zip up text transcriptions `txt` for you to dowload and edit
!zip transcriptions.zip /content/txt/*.txt

In [None]:
#@title Install MFA
!conda create -n aligner kaldi pynini
!source activate aligner
!pip install montreal-forced-aligner
!source activate aligner
!pip install speechbrain
from IPython.display import clear_output
clear_output()
print("All done!")

In [None]:
#@title Download the alignment models
#@markdown Choose the model for your desired language and scroll down to find the name of the model under "Installation"<br>After "mfa model download acoustic/dictionary" (e.g.: italian_cv)<br>Acoustic models: https://mfa-models.readthedocs.io/en/latest/acoustic/index.html<br>Dictionaries: https://mfa-models.readthedocs.io/en/latest/dictionary/index.html
!mv /content/txt/*.txt /content/db
acoustic = 'french_mfa' #@param {type:"string"}
dictionary = 'french_mfa' #@param {type:"string"}

# Download Model
!source activate aligner
!mfa model download acoustic "$acoustic"
# Download G2P
!source activate aligner
!mfa model download dictionary "$dictionary"

In [None]:
#@title **Make alignments**
!source activate aligner
!mfa align /content/db "$acoustic" "$dictionary" /content/alignment --beam 400
#mfa align --custom_mapping_path /content/arpa_cleaners.yaml /content/db english_us_arpa english_us_arpa /content/alignment
# Thank u HAI-D I'd probably die figuring out myself