# Whisper-Powered Automatic Audio Processor (W-PAAP)
Notebook hackily created for Colab by NekoCitrus.

Adapted from the OpenAI Whisper and PPP TalkNet notebooks.

**Do not use this notebook alone to transcribe data.** Please make sure to verify and check your transcriptions before utilizing them for AI datasets/submitting them.

# Initial Setup

In [None]:
#@markdown **Startup:** Check which GPU you've been allocated.

#@markdown Colab should not run out of memory regardless of what GPU is assigned to you.

#@markdown This cell exists in case you want to check anyway, either out of curiosity or to see whether Google has given you a GPU at all in this session.

# Short listing of the GPUs visible to this runtime.
!nvidia-smi -L
# Full nvidia-smi status report.
!nvidia-smi

In [None]:
#@markdown **Google Drive:** Because uploading a giant ZIP directly to Colab takes far too long.
from google.colab import drive

# Mount at an absolute path so the mount point does not depend on the current
# working directory. The path-configuration cell changes into /content/wavs
# and removes that directory with shutil.rmtree on every run, so a relative
# 'drive' mount made after that cell has run would land inside the directory
# that gets wiped. The default `dataset` path also expects Drive to live under
# /content/drive.
drive.mount('/content/drive')

In [None]:
#@markdown **Dependencies:** Install dependencies for OpenAI Whisper and other assorted elements.

# unzip is the tool the path-configuration cell shells out to for extraction.
!apt install unzip

# Install Whisper directly from the upstream GitHub repository.
!python -m pip install git+https://github.com/openai/whisper.git


In [None]:
#@markdown **Path Configuration:** Configure paths of your audio ZIP file.
#@markdown * Make sure all your audio files are all in the same format.
import os
import shutil

dataset = "/content/drive/My Drive/your_zip.zip" #@param {type:"string"}
character_folder_name = "character_name" #@param {type:"string"}

#@markdown Select these options if you're pulling your audio files directly from Audacity's "Export Multiple" Sound Finder features, so that Colab doesn't mess up your filestructure.
add_trailing_zeroes = False #@param {type:"boolean"}
zerostyle = "Numbering after File Name Prefix" #@param ["Using Label/Track Name", "Numbering before Label/Track Name", "Numbering after File Name Prefix"] {allow-input: false}

assert os.path.exists(dataset), "Cannot find your ZIP file."

#Extract data.
os.chdir('/content')
if os.path.exists("/content/wavs"):
    shutil.rmtree("/content/wavs")
os.mkdir("wavs")
os.chdir("wavs")

if dataset[-4:] == ".zip":
    !unzip -q "{dataset}"
#elif dataset[-4:] == ".tar":
#    !tar -xf "{dataset}"
else:
    raise Exception("Unknown extension for dataset.")

print("ZIP file successfully loaded.")

if(add_trailing_zeroes):
  print("Adding trailing zeroes...")
  for r, _, f in os.walk("/content/wavs"):
    for name in f:
      re = name
      if(zerostyle == "Using Label/Track Name"):
        sp = name.rsplit(".")
        re = "".zfill(7-len(sp[0])) + name
      elif(zerostyle == "Numbering before Label/Track Name"):
        sp = name.rsplit("-")
        re = "".zfill(7-len(sp[0])) + name
      elif(zerostyle == "Numbering after File Name Prefix"):
        sp = name.rsplit("-")
        re = sp[0] + "-" + "".zfill(7-sp[1].index('.')) + sp[1]
      else:
        raise Exception("Invalid zerostyle. (How did you even manage to fuck this up?)")
      os.rename(name, re)

  print("...done.")



# Running

In [None]:
#@markdown **Transcribe:** Select your model and start transcription work. If you don't know what each model means, keep it on "small".

#@markdown * Do note that while the multilanguage version of the model will be downloaded, the only language that most relevant voice AI services (15.ai/TalkNet/etc.) support is English.

import os
import re
from tqdm.notebook import tqdm
import whisper

Model = "small" #@param ["tiny", "base", "small", "medium", "large"] {allow-input: false}

#@markdown Select this option if you wish to delete audio files that are knowingly unusable by 15.ai automatically. A file will also be generated with a list of all deleted files.
delete_useless = False #@param {type:"boolean"}

# `mod` is the name the rest of this cell uses for the selected checkpoint.
mod = Model
print(f'Model "{mod}" chosen.')

# Drop any transcript left over from an earlier run; lines are appended to
# this file later on, so stale content would otherwise be kept.
filename = f"transcription_for_{character_folder_name}.txt"
if os.path.exists(filename):
    os.remove(filename)

# The deletion log is only reset when this run is allowed to add to it.
if delete_useless and os.path.exists("deleted_files.txt"):
    os.remove("deleted_files.txt")

print("Loading Whisper model into memory...")
model = whisper.load_model(mod)
# Default decoding options, handed to whisper.decode() by inference().
options = whisper.DecodingOptions()

def inference(audio):
  """Transcribe one audio file and return the recognised text.

  Reads the notebook globals `delete_useless`, `model` and `options`.

  When `delete_useless` is off, the file is handed to `model.transcribe`
  and its "text" entry is returned. When it is on, the language is detected
  first and an empty string is returned for anything not detected as "en",
  so that the caller's filter can discard the file.

  Args:
    audio: Path of the audio file to transcribe.

  Returns:
    The transcription text, or "" for audio not detected as English while
    `delete_useless` is enabled.
  """
  if not delete_useless:
    return model.transcribe(audio)["text"]

  # NOTE(review): pad_or_trim presumably fits the clip to Whisper's fixed
  # input window, so only that window is decoded on this path - confirm
  # against the Whisper documentation.
  samples = whisper.pad_or_trim(whisper.load_audio(audio))
  # Ask for the number of mel bins the loaded checkpoint was built with
  # instead of relying on the function default; a checkpoint with a
  # different mel dimension cannot consume a default-sized spectrogram.
  mel = whisper.log_mel_spectrogram(samples, n_mels=model.dims.n_mels).to(model.device)
  _, probs = model.detect_language(mel)
  if max(probs, key=probs.get) != "en":
    return ""
  return whisper.decode(model, mel, options).text

def destroy_file(filename, strippedFilename):
  """Log a file as deleted, then remove it from disk.

  Args:
    filename: Path of the file to delete.
    strippedFilename: Name written to the log, one entry per line.

  The log is "deleted_files.txt" in the current working directory and is
  opened in append mode, so earlier entries are kept. The entry is written
  before the file is removed.
  """
  entry = strippedFilename + "\n"
  with open("deleted_files.txt", 'a') as log:
    log.write(entry)
  os.remove(filename)
  
def strip_trailing_zeroes(filename):
  """Undo the zero padding that was added to *filename*'s number.

  Reads the notebook global `zerostyle` to know where the number sits.
  Despite the name, the zeroes removed are the leading zeroes of the
  number. For the two "Numbering ..." styles the number is restored to a
  minimum of two digits ("01", "02", ...).

  Args:
    filename: Bare, zero-padded file name.

  Returns:
    The file name with the padding removed.

  Raises:
    IndexError: Style "Numbering after File Name Prefix" and *filename*
      contains no hyphen.
    ValueError: The hyphen (style "Numbering before Label/Track Name") or
      the dot (style "Numbering after File Name Prefix") that delimits the
      number is missing.
    Exception: `zerostyle` is not one of the known values.
  """
  if zerostyle == "Using Label/Track Name":
    # NOTE(review): this also removes zeroes that were part of the original
    # label (e.g. "007.wav"); the original width is not recorded anywhere.
    return filename.lstrip('0')
  if zerostyle == "Numbering before Label/Track Name":
    stripped = filename.lstrip('0')
    digits = stripped.index('-')
    # Restore the two-digit minimum; this also covers a number made only of
    # zeroes, which lstrip reduces to nothing.
    return "0" * (2 - digits) + stripped
  if zerostyle == "Numbering after File Name Prefix":
    # Split on the LAST hyphen only: the prefix may itself contain hyphens,
    # in which case splitting on every hyphen picks the wrong piece as the
    # number and loses the rest of the name.
    parts = filename.rsplit('-', 1)
    numbered = parts[1].lstrip('0')
    digits = numbered.index('.')
    return parts[0] + "-" + "0" * (2 - digits) + numbered
  raise Exception("Invalid zerostyle. (How did you even manage to fuck this up?)")

print("Writing transcriptions...")

wav_root = "/content/wavs"
transcript_path = os.path.join(wav_root, filename)
# Files this notebook writes into the audio tree itself. A deletion log left
# over from an earlier run with delete_useless enabled would otherwise be
# handed to Whisper as if it were audio. (destroy_file writes the log relative
# to the working directory, which the path-configuration cell leaves at
# /content/wavs.)
bookkeeping = {transcript_path, os.path.join(wav_root, "deleted_files.txt")}

for r, _, f in os.walk(wav_root):
    # Sorted so the transcript lines follow file-name order.
    for name in tqdm(sorted(f)):
        audio_path = os.path.join(r, name)
        if audio_path in bookkeeping:
            continue
        scrip = inference(audio_path)

        # The transcript refers to files by their original, unpadded names.
        fname = strip_trailing_zeroes(name) if add_trailing_zeroes else name
        if delete_useless:
            # Judge usability on the text with punctuation removed: too short
            # (fewer than 5 characters) or containing non-ASCII characters.
            nopu = re.sub(r'[^\w\s]', '', scrip)
            if len(nopu) < 5 or not nopu.isascii():
                # Pass the full path: a bare name only resolves for files
                # that sit directly in the working directory.
                destroy_file(audio_path, fname)
                continue
        finScript = character_folder_name + "/" + fname + "|" + scrip.lstrip()
        # Append line by line so finished work is already on disk if a later
        # file fails.
        with open(transcript_path, 'a') as w:
            w.write(finScript + "\n")

# If nothing was kept (no files, or every file was deleted) no transcript has
# been created yet; create an empty one so the move below does not fail.
if not os.path.exists(transcript_path):
    with open(transcript_path, 'a'):
        pass

# Name the destination FILE rather than the /content/ directory: moving into
# a directory raises if a transcript from an earlier run is already there,
# whereas a file destination is simply replaced.
shutil.move(transcript_path, os.path.join("/content", filename))
# NOTE: deleted_files.txt is not moved; it stays where destroy_file wrote it.
#if(delete_useless):
#  shutil.move("content/wavs/deleted_files.txt", "/content/")

print("All done! Be sure to verify!")