# Deepgram-Powered Automatic Audio Processor
Notebook hackily created by NekoCitrus.

Adapted from the Deepgram Python docs and the PPP TalkNet notebooks.

You really shouldn't need this notebook, since Deepgram can easily be run locally. However, it is here for the sake of completeness.

**Do not use this notebook alone to transcribe data.** Please make sure to verify and check your transcriptions before utilizing them for AI datasets/submitting them.

# Initial Setup

In [None]:
#@markdown **Google Drive:** Because uploading a giant ZIP directly to Colab takes far too long.
from google.colab import drive

# Mount at an absolute path. The default `dataset` value in the configuration
# cell points below /content/drive, and a relative mount point would follow the
# working directory, which the extraction cell changes to /content/wavs.
drive.mount('/content/drive')

In [None]:
#@markdown **Dependencies:** Install dependencies for Deepgram and other assorted elements.

!apt install unzip
!pip install deepgram-sdk



In [None]:
#@markdown **Path Configuration:** Configure paths of your audio ZIP file.
#@markdown * Make sure all your audio files are all in the same format.
import os
import shutil

dataset = "/content/drive/My Drive/your_zip.zip" #@param {type:"string"}
character_folder_name = "character_name" #@param {type:"string"}

#@markdown Select these options if you're pulling your audio files directly from Audacity's "Export Multiple" Sound Finder features, so that Colab doesn't mess up your filestructure.
add_trailing_zeroes = False #@param {type:"boolean"}
zerostyle = "Numbering after File Name Prefix" #@param ["Using Label/Track Name", "Numbering before Label/Track Name", "Numbering after File Name Prefix"] {allow-input: false}

assert os.path.exists(dataset), "Cannot find your ZIP file."

# Reject unsupported archives before any previously extracted data is removed.
# The comparison ignores case so that names such as "AUDIO.ZIP" are accepted.
if not dataset.lower().endswith(".zip"):
    raise Exception("Unknown extension for dataset.")

# Start from an empty /content/wavs so files from an earlier run cannot leak in.
os.chdir('/content')
if os.path.exists("/content/wavs"):
    shutil.rmtree("/content/wavs")
os.mkdir("wavs")
# The working directory is left at /content/wavs: the transcription cell writes
# its output files with relative names and later moves them out of this folder.
os.chdir("wavs")

# Extract in-process rather than through a shell command: a corrupt or
# unreadable archive raises here instead of being silently ignored, and the
# path needs no shell quoting.
shutil.unpack_archive(dataset, "/content/wavs", "zip")

print("ZIP file successfully loaded.")

def _pad_number(name, style, width=7):
  """Return `name` with its numeric/label part left-padded with zeroes.

  Args:
    name: Bare file name (no directory part).
    style: One of the three `zerostyle` option strings.
    width: Target length of the padded part. Parts that are already at
      least this long are left as they are.

  Returns:
    The padded file name.

  Raises:
    Exception: If `style` is not a known option, or if the
      "Numbering after File Name Prefix" style is used on a name
      that contains no "-".
  """
  if style == "Using Label/Track Name":
    # The text before the first "." is the part that gets padded.
    head = name.split(".")[0]
    return "0" * (width - len(head)) + name
  if style == "Numbering before Label/Track Name":
    # The text before the first "-" is the part that gets padded.
    head = name.split("-")[0]
    return "0" * (width - len(head)) + name
  if style == "Numbering after File Name Prefix":
    # Split on the LAST "-" so that prefixes which themselves contain
    # hyphens ("my-char-01.wav") keep every segment.
    prefix, dash, numbered = name.rpartition("-")
    if not dash:
      raise Exception("Cannot find a '-' separated number in file name: " + name)
    digits = numbered.split(".")[0]
    return prefix + "-" + "0" * (width - len(digits)) + numbered
  raise Exception("Invalid zerostyle. (How did you even manage to fuck this up?)")

if(add_trailing_zeroes):
  print("Adding trailing zeroes...")
  for r, _, f in os.walk("/content/wavs"):
    for name in f:
      padded = _pad_number(name, zerostyle)
      if padded != name:
        # Rename relative to the directory being walked. A bare file name
        # only resolves for files sitting directly in the working directory,
        # which fails as soon as the archive contains a folder.
        os.rename(os.path.join(r, name), os.path.join(r, padded))

  print("...done.")

# Running

In [None]:
#@markdown **Transcribe:** Start transcription work.

import mimetypes
import os
import re
import shutil

from deepgram import Deepgram
from tqdm.notebook import tqdm

#@markdown You will need to provide your Deepgram API key here. You can obtain a free API key worth 12,000 minutes at https://console.deepgram.com/signup. Use burner emails if you're paranoid or you have a lot of data to transcribe.
deepgram_api_key = "API_KEY_HERE" #@param {type:"string"}

#@markdown Select this option if you wish to delete audio files that are knowingly unusable by 15.ai automatically. A file will also be generated with a list of all deleted files.
delete_useless = False #@param {type:"boolean"}

mimetypes.init()

dg_client = Deepgram(deepgram_api_key)
print("Deepgram connection successfully created.")

# Name of the transcript file. It is written relative to the working directory.
filename = "transcription_for_"+character_folder_name+".txt"

# Remove leftovers from an earlier run so that results are not appended to old
# output. The deletion log is removed even when `delete_useless` is off: when
# the working directory is /content/wavs (where the extraction cell leaves it),
# a leftover log would otherwise be found by the walk over that folder and be
# submitted for transcription like an audio file.
for stale in (filename, "deleted_files.txt"):
  if os.path.exists(stale):
    os.remove(stale)

async def process(audio):
  """Transcribe one audio file through the Deepgram client.

  Args:
    audio: Path of the audio file to submit.

  Returns:
    The transcript of the first alternative of the first channel, or an
    empty string when `delete_useless` is set and the detected language
    is anything other than "en".
  """
  # Only the MIME type is used; the encoding half of the guess is ignored.
  guessed_type, _ = mimetypes.guess_type(audio)
  request_options = {'punctuate': True, 'profanity_filter': False, 'detect_language': True}

  # The request is awaited while the file is still open, because the open
  # handle itself is what is passed as the buffer.
  with open(audio, 'rb') as stream:
    payload = {'buffer': stream, 'mimetype': guessed_type}
    response = await dg_client.transcription.prerecorded(payload, request_options)

  first_channel = response['results']['channels'][0]
  lang = first_channel['detected_language']
  if lang != "en" and delete_useless:
    # An empty transcript is how the caller recognises a clip to discard.
    return ""
  return first_channel['alternatives'][0]['transcript']

def destroy_file(filename, strippedFilename):
  """Log a file as deleted, then delete it.

  Args:
    filename: Path of the file to remove from disk.
    strippedFilename: Name recorded in "deleted_files.txt" (one per line).
  """
  # Record the name first so the log entry exists before the file is gone.
  log = open("deleted_files.txt", 'a')
  try:
    print(strippedFilename, file=log)
  finally:
    log.close()
  os.remove(filename)

def strip_trailing_zeroes(filename, style=None):
  """Undo the zero padding that the configuration cell added to a file name.

  The padding zeroes are stripped from the numbered part. For the two
  "Numbering ..." styles a number that ends up one character long gets a
  single "0" put back, so "0000001" becomes "01" (presumably mirroring the
  two-digit numbering of the exported files -- TODO confirm).

  Args:
    filename: Bare, zero-padded file name.
    style: One of the `zerostyle` option strings. Defaults to the
      notebook-level `zerostyle` setting when omitted.

  Returns:
    The file name with the padding removed. Names without the expected
    "-" separator are returned with no number restoration.

  Raises:
    Exception: If the style is not one of the known options.
  """
  if style is None:
    style = zerostyle

  if style == "Using Label/Track Name":
    return filename.lstrip('0')

  if style == "Numbering before Label/Track Name":
    stripped = filename.lstrip('0')
    # find() rather than index(): a name without "-" is simply left as is.
    if stripped.find('-') == 1:
      stripped = "0" + stripped
    return stripped

  if style == "Numbering after File Name Prefix":
    # Split on the LAST "-" so that prefixes containing hyphens
    # ("my-char-0000001.wav") are kept whole.
    prefix, dash, numbered = filename.rpartition('-')
    if not dash:
      return filename
    numbered = numbered.lstrip('0')
    if numbered.find('.') == 1:
      numbered = "0" + numbered
    return prefix + "-" + numbered

  raise Exception("Invalid zerostyle. (How did you even manage to fuck this up?)")

print("Writing transcriptions...")

for r, _, f in os.walk("/content/wavs"):
    for name in tqdm(sorted(f)):
      scrip = await process(os.path.join(r, name))

      fname = name
      if(add_trailing_zeroes):
        fname = strip_trailing_zeroes(name)
      if(delete_useless):
        nopu = re.sub(r'[^\w\s]', '', scrip)
        if(nopu == "" or (not nopu.isascii()) or len(nopu)<5):
          destroy_file(name, fname)
          continue

      finScript = character_folder_name + "/" + fname + "|" + scrip.lstrip()
      with open(filename, 'a') as w:
        w.write(finScript+"\n")

shutil.move("/content/wavs/"+filename, "/content/")

print("All done! Be sure to verify!")


# Debug Bullshit
You should not need anything in these cells.

In [None]:
#@markdown **DEBUG:** Cell meant for debugging the Deepgram API.

import os
import re
import mimetypes
from tqdm.notebook import tqdm
from deepgram import Deepgram

# Debug-only settings. Running this cell rebinds `character_folder_name`,
# `deepgram_api_key` and `dg_client`, replacing any values set by earlier cells.
character_folder_name = "dummy"
deepgram_api_key = "api-key-here" #@param {type:"string"}

mimetypes.init()

# Client used by `inference` below.
dg_client = Deepgram(deepgram_api_key)
print("DeepGram connection successfully created.")

async def inference(audio):
  """Debug variant of the transcription call that prints what it sees.

  Prints the guessed MIME type/encoding pair before the request and the
  detected language after it.

  Args:
    audio: Path of the audio file to submit.

  Returns:
    The transcript of the first alternative of the first channel.
  """
  guessed = mimetypes.guess_type(audio)
  print(guessed)
  request_options = {'punctuate': True, 'profanity_filter': False, 'detect_language': True}

  # The open handle is the request buffer, so the call is awaited inside `with`.
  with open(audio, 'rb') as stream:
    response = await dg_client.transcription.prerecorded(
        {'buffer': stream, 'mimetype': guessed[0]},
        request_options,
    )

  first_channel = response['results']['channels'][0]
  lang = first_channel['detected_language']
  transcription = first_channel['alternatives'][0]['transcript']
  print(lang)
  return transcription

# Run `inference` on every file below /content/wavs, in sorted order per
# directory, and print each transcript. Nothing is written to disk here.
for r, _, f in os.walk("/content/wavs"):
    for name in tqdm(sorted(f)):
      # Top-level `await` relies on the notebook kernel's support for it.
      response = await inference(os.path.join(r, name))
      print(response)


print("All done! Be sure to verify!")


In [None]:
#@markdown **DEBUG:** Create fake variables.
# Stand-ins for the values normally set by the Path Configuration cell.
# NOTE(review): `zerostyle` is not set here; the transcription cell only reads
# it when `add_trailing_zeroes` is True, which this cell sets to False.
dataset = "dummy"
character_folder_name = "dummy"
add_trailing_zeroes = False
