# dataset_configs/armenian/audio_books/config.yaml

documentation: |
  Audio books (Armenian)
  ######################

  This config can be used as an example of how to process audiobooks in the Armenian language
  and prepare a dataset in the NeMo format.

  This config performs the following data processing.

  1. Create the initial manifest by collecting all available files with the mp3 extension in the ``raw_data_dir`` folder.

  2. Convert mp3 into wav format using the FFmpeg suite, resampling to a 16000 Hz sample rate and downmixing all audio channels into a mono track.

  3. Compute the duration of each audio file in seconds and save it into the ``duration`` field.

  4. Filter out broken files whose duration is not greater than 0 seconds. You can directly change the config file to control this.

  5. Predict transcription using a Whisper ASR model and save results into text field.

  6. Predict transcription using a ``distil-whisper`` ASR model and save results into pred_text field.

  7. Normalise some text examples with SubRegex.

  8. Drop every utterance that contains non-Armenian characters.

  9. Convert the ``text`` field to lowercase.

  **Required arguments**:
  
  * **workspace_dir**: specify the workspace folder where all audio files will be stored.

  Note that you can customize any part of this config either directly or from command-line.

  **Output format**:
  
  * ``${workspace_dir}/final_manifest.json`` - final manifest with all the data.

  The output manifest contains the following fields:

  * **audio_filepath (str)**: relative path to the audio files.

  * **text (str)**: transcription predicted by Whisper and cleaned up by the text-processing steps of this config (with punctuation).

  * **pred_text (str)**: transcription predicted by Transformers (Upper-case without punctuation).

  * **duration (float)**: audio duration in seconds.

# Selects which entries of `processors` are executed. "0:" presumably reads as
# a slice starting at the first processor, i.e. all of them — confirm against
# the SDP documentation.
processors_to_run: "0:"
# Required argument: workspace folder for the audio files and the manifests.
# `???` is presumably OmegaConf's mandatory-value marker, so a value has to be
# supplied (by editing this file or from the command line).
workspace_dir: ???
# Location of the manifest documented above as the final output of this config.
final_manifest: ${workspace_dir}/final_manifest.json

processors:
  # Step 1: build the initial manifest from the mp3 files found under
  # ${workspace_dir}/mp3; each file path is stored under `audio_filepath`.
  - _target_: sdp.processors.CreateInitialManifestByExt
    output_manifest_file: "${workspace_dir}/manifest0.json"
    output_file_key: "audio_filepath"
    raw_data_dir: "${workspace_dir}/mp3"
    extension: "mp3"

  # Step 2: convert every mp3 with FFmpeg to 16000 Hz, single-channel audio
  # stored under ${workspace_dir}/audio. Input and output use the same key, so
  # `audio_filepath` is overwritten with the converted file's path.
  - _target_: sdp.processors.FfmpegConvert
    input_file_key: audio_filepath
    output_file_key: audio_filepath
    id_key: null
    target_nchannels: 1
    target_samplerate: 16000
    converted_audio_dir: "${workspace_dir}/audio"
    output_manifest_file: "${workspace_dir}/manifest1.json"

  # Step 3: measure each file referenced by `audio_filepath` and store its
  # length (seconds, per the documentation above) under `duration`.
  - _target_: sdp.processors.GetAudioDuration
    output_manifest_file: "${workspace_dir}/manifest2.json"
    duration_key: "duration"
    audio_filepath_key: "audio_filepath"

  # Step 4: keep only the entries whose `duration` is greater than 0, which
  # removes broken audio files. Adjust `target_value` to change the threshold.
  - _target_: sdp.processors.PreserveByValue
    input_value_key: "duration"
    operator: "gt"
    target_value: 0
    output_manifest_file: "${workspace_dir}/manifest3.json"
    
  # Step 5: transcribe with the Whisper "small" model; the transcript is saved
  # under `text` and the language output under `lid`.
  # Requires: pip install -U openai-whisper
  # NOTE(review): pad_or_trim_length is presumably in samples
  # (480000 / 16000 Hz = 30 s) — confirm.
  - _target_: sdp.processors.ASRWhisper
    pretrained_model: small
    pad_or_trim_length: 480000
    output_lang_key: lid
    output_text_key: text
    output_manifest_file: "${workspace_dir}/manifest4.json"

  # Step 6: second transcription pass with a ``distil-whisper`` model; the
  # result is saved under `pred_text`.
  # Requires: pip install accelerate transformers
  # Checkpoints noted by the author: distil-whisper/distil-large-v2,
  # openai/whisper-large-v2
  - _target_: sdp.processors.ASRTransformers
    generate_task: transcribe
    generate_language: armenian
    pretrained_model: distil-whisper/distil-large-v2
    output_text_key: pred_text
    output_manifest_file: "${workspace_dir}/manifest5.json"

  # Normalise the Whisper transcript with a list of regex substitutions
  # (presumably applied top to bottom — confirm against the SubRegex docs).
  # This writes an intermediate manifest: the two clean-up processors below
  # still have to run before the data is final.
  - _target_: sdp.processors.SubRegex
    output_manifest_file: ${workspace_dir}/manifest6.json
    regex_params_list:
      # replace bracketed spans and the Unicode replacement character by a space
      - {"pattern": '\[(.*?)\]', "repl": ' '}
      - {"pattern": '�', "repl": ' '}
      # map the Latin letter "n" to the Armenian letter "ո"
      - {"pattern": 'n', "repl": "ո"}
      - {"pattern": '1', "repl": "մեկ"}  # dummy text normalization

      # spelling fixes exercised by the first test case
      - {"pattern": 'անտար', "repl": "անտառ"}
      - {"pattern": 'թնակ', "repl": "տնակ"}
      - {"pattern": 'Ռուսերենիս', "repl": "Ռուսերենից"}
      - {"pattern": 'ամալիահ', "repl": "Ամալիյա"}

      # spelling fixes exercised by the second test case
      - {"pattern": 'Էտկարպո', "repl": "Էդգար Պո"}
      # NOTE(review): this pattern also matches inside an already-correct
      # "թարգմանությունը" and would then append a second "ը" — confirm this is
      # acceptable for the data being processed.
      - {"pattern": 'թարգմանություն', "repl": "թարգմանությունը"}
      - {"pattern": 'արտաշ է սեմինի', "repl": "Արտաշես Էմինի"}
      # double space to single space
      - {"pattern": "\\s+", "repl": " "}
    test_cases:
      - {input: {text: "Գրիմ եղբայրներ, անտարի թնակը, Ռուսերենիս թարգմանեց, ամալիահ Ուկասյանը."}, output: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}}
      - {input: {text: "Էտկարպո, Մատնիչ սիրտը, թարգմանություն արտաշ է սեմինի."}, output: {text: "Էդգար Պո, Մատնիչ սիրտը, թարգմանությունը Արտաշես Էմինի."}}

  # Drop every entry whose text contains a character outside `alphabet`
  # (first test case: input with Latin and Cyrillic words yields no output).
  - _target_: sdp.processors.DropNonAlphabet
    output_manifest_file: ${workspace_dir}/manifest7.json
    alphabet: "ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖՈՒԵ աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆուև.,!?"
    test_cases:
      - {input: {text: "test тест Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: null}
      - {input: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}}

  # Lowercase the `text` field. This is the last processor in the list, so it
  # is the one that writes ${final_manifest}; previously SubRegex wrote that
  # file and the filtered, lowercased data ended up only in manifest7.json.
  - _target_: sdp.processors.SubMakeLowercase
    output_manifest_file: ${final_manifest}
    text_key: "text"