In [None]:
!pip install coqui-stt-model-manager

Collecting coqui-stt-model-manager
  Downloading coqui_stt_model_manager-0.0.21-py3-none-any.whl (600 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m600.5/600.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting Jinja2==3.0.1 (from coqui-stt-model-manager)
  Downloading Jinja2-3.0.1-py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.7/133.7 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting Flask==2.0.1 (from coqui-stt-model-manager)
  Downloading Flask-2.0.1-py3-none-any.whl (94 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.8/94.8 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting Flask-Cors==3.0.10 (from coqui-stt-model-manager)
  Downloading Flask_Cors-3.0.10-py2.py3-none-any.whl (14 kB)
Collecting pyxdg==0.27 (from coqui-stt-model-manager)
  Downloading pyxdg-0.27-py2.py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function

import argparse
import json
import shlex
import subprocess
import sys
import wave
from timeit import default_timer as timer

import numpy as np
from stt import Model, version

try:
    from shlex import quote
except ImportError:
    from pipes import quote


def convert_samplerate(audio_path, desired_sample_rate):
    sox_cmd = "sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - ".format(
        quote(audio_path), desired_sample_rate
    )
    try:
        output = subprocess.check_output(shlex.split(sox_cmd), stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        raise RuntimeError("SoX returned non-zero status: {}".format(e.stderr))
    except OSError as e:
        raise OSError(
            e.errno,
            "SoX not found, use {}hz files or install it: {}".format(
                desired_sample_rate, e.strerror
            ),
        )

    return desired_sample_rate, np.frombuffer(output, np.int16)


def metadata_to_string(metadata):
    return "".join(token.text for token in metadata.tokens)


def words_from_candidate_transcript(metadata):
    word = ""
    word_list = []
    word_start_time = 0
    # Loop through each character
    for i, token in enumerate(metadata.tokens):
        # Append character to word if it's not a space
        if token.text != " ":
            if len(word) == 0:
                # Log the start time of the new word
                word_start_time = token.start_time

            word = word + token.text
        # Word boundary is either a space or the last character in the array
        if token.text == " " or i == len(metadata.tokens) - 1:
            word_duration = token.start_time - word_start_time

            if word_duration < 0:
                word_duration = 0

            each_word = dict()
            each_word["word"] = word
            each_word["start_time"] = round(word_start_time, 4)
            each_word["duration"] = round(word_duration, 4)

            word_list.append(each_word)
            # Reset
            word = ""
            word_start_time = 0

    return word_list


def metadata_json_output(metadata):
    json_result = dict()
    json_result["transcripts"] = [
        {
            "confidence": transcript.confidence,
            "words": words_from_candidate_transcript(transcript),
        }
        for transcript in metadata.transcripts
    ]
    return json.dumps(json_result, indent=2)


class VersionAction(argparse.Action):
    def __init__(self, *args, **kwargs):
        super(VersionAction, self).__init__(nargs=0, *args, **kwargs)

    def __call__(self, *args, **kwargs):
        print("Coqui STT ", version())
        exit(0)


def main():
    parser = argparse.ArgumentParser(description="Running Coqui STT inference.")
    parser.add_argument(
        "--model", required=True, help="Path to the model (protocol buffer binary file)"
    )
    parser.add_argument(
        "--scorer", required=False, help="Path to the external scorer file"
    )
    parser.add_argument(
        "--audio", required=True, help="Path to the audio file to run (WAV format)"
    )
    parser.add_argument("--beam_width", type=int, help="Beam width for the CTC decoder")
    parser.add_argument(
        "--lm_alpha",
        type=float,
        help="Language model weight (lm_alpha). If not specified, use default from the scorer package.",
    )
    parser.add_argument(
        "--lm_beta",
        type=float,
        help="Word insertion bonus (lm_beta). If not specified, use default from the scorer package.",
    )
    parser.add_argument(
        "--version", action=VersionAction, help="Print version and exits"
    )
    parser.add_argument(
        "--extended",
        required=False,
        action="store_true",
        help="Output string from extended metadata",
    )
    parser.add_argument(
        "--json",
        required=False,
        action="store_true",
        help="Output json from metadata with timestamp of each word",
    )
    parser.add_argument(
        "--candidate_transcripts",
        type=int,
        default=3,
        help="Number of candidate transcripts to include in JSON output",
    )
    parser.add_argument("--hot_words", type=str, help="Hot-words and their boosts.")
    args = parser.parse_args()

    print("Loading model from file {}".format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(args.model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    print("Loaded model in {:.3}s.".format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    desired_sample_rate = ds.sampleRate()

    if args.scorer:
        print("Loading scorer from files {}".format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
        print("Loaded scorer in {:.3}s.".format(scorer_load_end), file=sys.stderr)

        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    if args.hot_words:
        print("Adding hot-words", file=sys.stderr)
        for word_boost in args.hot_words.split(","):
            word, boost = word_boost.split(":")
            ds.addHotWord(word, float(boost))

    fin = wave.open(args.audio, "rb")
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print(
            "Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.".format(
                fs_orig, desired_sample_rate
            ),
            file=sys.stderr,
        )
        fs_new, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1 / fs_orig)
    fin.close()

    print("Running inference.", file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    elif args.json:
        print(
            metadata_json_output(ds.sttWithMetadata(audio, args.candidate_transcripts))
        )
    else:
        print(ds.stt(audio))
    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
    print(
        "Inference took %0.3fs for %0.3fs audio file." % (inference_end, audio_length),
        file=sys.stderr,
    )


if __name__ == "__main__":
    main()


In [None]:



 ds = Model(args.model)

if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    elif args.json:
        print(
            metadata_json_output(ds.sttWithMetadata(audio, args.candidate_transcripts))
        )
    else:
        print(ds.stt(audio))

NameError: name 'Model' is not defined

#Tensorflow 2.x (with 2.x >= 2.3)
!pip install -U "TensorFlowASR[tf2.x-gpu]" # or pip3 install -U "TensorFlowASR[tf2.x-gpu]"  
## -- WARNING: tensorflowasr 1.0.3 does not provide the extra 'tf2.x-gpu'
'''
error: subprocess-exited-with-error
  
  × Getting requirements to build wheel did not run successfully.
  │ exit code: 1
  ╰─> See above for output.
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
  Getting requirements to build wheel ... error
error: subprocess-exited-with-error

× Getting requirements to build wheel did not run successfully.
│ exit code: 1
╰─> See above for output.
'''


In [None]:

!pip install git+https://github.com/TensorSpeech/TensorFlowASR.git || echo "Installation failed, but continuing."


In [3]:
!python --version
#Python 3.10.12



Python 3.10.12


In [4]:
!pip install TensorFlowTTS

Collecting TensorFlowTTS
  Downloading TensorFlowTTS-1.8-py3-none-any.whl (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.5/128.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hINFO: pip is looking at multiple versions of tensorflowtts to determine which version is compatible with other requirements. This could take a while.
  Downloading TensorFlowTTS-1.6.1-py3-none-any.whl (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.1/126.1 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Downloading TensorFlowTTS-1.6-py3-none-any.whl (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.1/126.1 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Downloading TensorFlowTTS-1.1-py3-none-any.whl (121 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.0/122.0 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Downloading TensorFlowTTS-0.11-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
!ln -s /usr/local/cuda-10.1 /usr/local/cuda

# check if installed successfully
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [7]:
import os
import sys
from pathlib import Path
from pprint import pprint

%tensorflow_version 2.x

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [8]:
%cd /content
!git clone --depth 1 https://github.com/sce-tts/g2pK.git
!git clone --depth 1 https://github.com/sce-tts/glow-tts.git
!git clone --depth 1 https://github.com/sce-tts/TensorflowTTS.git -b r0.7
!pip install -q --no-cache-dir "torch==1.5.1" -f https://download.pytorch.org/whl/cu101/torch_stable.html
!pip install -q --no-cache-dir "cython==0.29.12" "tensorflow-gpu>=2.2.0" "tensorflow-addons>=0.9.1" "setuptools>=38.5.1" "librosa>=0.7.0" "soundfile>=0.10.2" "matplotlib>=3.1.0" "PyYAML>=3.12" "tqdm>=4.26.1" "h5py>=2.10.0" "pathos>=0.2.5" "unidecode>=1.1.1" "inflect>=4.1.0" "scikit-learn>=0.22.0" "pyworld>=0.2.10" "numba<=0.48" "numpy" "scipy" "pillow" "future" "konlpy" "jamo" "nltk" "python-mecab-ko"

/content
Cloning into 'g2pK'...
remote: Enumerating objects: 20, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 20 (delta 0), reused 14 (delta 0), pack-reused 0[K
Receiving objects: 100% (20/20), 35.24 KiB | 11.75 MiB/s, done.
Cloning into 'glow-tts'...
remote: Enumerating objects: 41, done.[K
remote: Counting objects: 100% (41/41), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 41 (delta 1), reused 38 (delta 1), pack-reused 0[K
Receiving objects: 100% (41/41), 1.60 MiB | 10.67 MiB/s, done.
Resolving deltas: 100% (1/1), done.
Cloning into 'TensorflowTTS'...
remote: Enumerating objects: 132, done.[K
remote: Counting objects: 100% (132/132), done.[K
remote: Compressing objects: 100% (121/121), done.[K
remote: Total 132 (delta 17), reused 99 (delta 3), pack-reused 0[K
Receiving objects: 100% (132/132), 7.60 MiB | 11.73 MiB/s, done.
Resolving deltas: 100% (17/17), done.
[31mERRO

In [9]:
%cd /content/glow-tts/monotonic_align
!python setup.py build_ext --inplace

/content/glow-tts/monotonic_align
Compiling core.pyx because it changed.
[1/1] Cythonizing core.pyx
  tree = Parsing.p_module(s, pxd, full_module_name)
performance hint: core.pyx:9:5: Exception check on 'maximum_path_each' will always require the GIL to be acquired.
Possible solutions:
	1. Declare 'maximum_path_each' as 'noexcept' if you control the definition and you're sure you don't want the function to raise exceptions.
	2. Use an 'int' return type on 'maximum_path_each' to allow an error code to be returned.
performance hint: core.pyx:40:6: Exception check on 'maximum_path_c' will always require the GIL to be acquired.
Possible solutions:
	1. Declare 'maximum_path_c' as 'noexcept' if you control the definition and you're sure you don't want the function to raise exceptions.
	2. Use an 'int' return type on 'maximum_path_c' to allow an error code to be returned.
performance hint: core.pyx:45:21: Exception check after calling 'maximum_path_each' will always require the GIL to be acqu

In [10]:
os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # USE CPU

import yaml
import json
import numpy as np
import torch
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

In [11]:
sys.path.append('/content/TensorflowTTS')
from tensorflow_tts.processor.ljspeech import LJSpeechProcessor
from tensorflow_tts.processor.ljspeech import symbols as tensorflowtts_symbols
from tensorflow_tts.processor.ljspeech import _symbol_to_id

from tensorflow_tts.configs import MultiBandMelGANGeneratorConfig
from tensorflow_tts.models import TFMelGANGenerator
from tensorflow_tts.models import TFPQMF
sys.path.remove('/content/TensorflowTTS')

ModuleNotFoundError: No module named 'unidecode'

# 여기서 부터 입니다  위 내용은 아님

In [12]:
!pip install nemo_toolkit['all']


Collecting nemo_toolkit[all]
  Downloading nemo_toolkit-1.23.0-py3-none-any.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
Collecting onnx>=1.7.0 (from nemo_toolkit[all])
  Downloading onnx-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
Collecting ruamel.yaml (from nemo_toolkit[all])
  Downloading ruamel.yaml-0.18.6-py3-none-any.whl (117 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.8/117.8 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collecting wget (from nemo_toolkit[all])
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting black==19.10b0 (from nemo_toolkit[all])
  Downloading black-19.10b0-py36-none-any.whl (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [2]:
import nemo.collections.asr as nemo_asr
asr_model = nemo_asr.models.ASRModel.from_pretrained("eesungkim/stt_kr_conformer_transducer_large")


      self.pid = os.fork()
    
    The secret `HF_TOKEN` does not exist in your Colab secrets.
    To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
    You will be able to reuse this secret in all of your notebooks.
    Please note that authentication is recommended but still optional to access public models or datasets.
    


stt_kr_conformer_transducer_large.nemo:   0%|          | 0.00/489M [00:00<?, ?B/s]

[NeMo I 2024-04-24 08:10:41 mixins:172] Tokenizer SentencePieceTokenizer initialized with 2261 tokens


[NeMo W 2024-04-24 08:10:43 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /data/public/rw/chris/manifests/ksponspeech//train.json
    sample_rate: 16000
    batch_size: 8
    shuffle: true
    num_workers: 8
    pin_memory: true
    use_start_end_token: true
    trim_silence: false
    max_duration: 17
    min_duration: 0.1
    shuffle_n: 2048
    
[NeMo W 2024-04-24 08:10:43 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: /data/public/rw/chris/manifests/ksponspeech//dev.json
    sample_rate: 16000
    batch_size: 16
    shuffle: false
    num_workers: 8
    pin_memory: true
    use_start_e

[NeMo I 2024-04-24 08:10:43 features:289] PADDING: 0


    


[NeMo I 2024-04-24 08:10:45 rnnt_models:217] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': 5}
[NeMo I 2024-04-24 08:10:46 save_restore_connector:249] Model EncDecRNNTBPEModel was successfully restored from /root/.cache/huggingface/hub/models--eesungkim--stt_kr_conformer_transducer_large/snapshots/fdc8412fe0d089913524767b20ff244ff1007ed0/stt_kr_conformer_transducer_large.nemo.


In [1]:
!wget https://dldata-public.s3.us-east-2.amazonaws.com/sample-kor.wav
#https://huggingface.co/eesungkim/stt_kr_conformer_transducer_large

--2024-04-24 08:07:52--  https://dldata-public.s3.us-east-2.amazonaws.com/sample-kor.wav
Resolving dldata-public.s3.us-east-2.amazonaws.com (dldata-public.s3.us-east-2.amazonaws.com)... 52.219.94.234, 3.5.131.124, 3.5.128.13, ...
Connecting to dldata-public.s3.us-east-2.amazonaws.com (dldata-public.s3.us-east-2.amazonaws.com)|52.219.94.234|:443... connected.
HTTP request sent, awaiting response... 403 Forbidden
2024-04-24 08:07:53 ERROR 403: Forbidden.



In [4]:
asr_model.transcribe(['/content/coro.wav'])


#python [NEMO_GIT_FOLDER]/examples/asr/transcribe_speech.py  pretrained_name="eesungkim/stt_kr_conformer_transducer_large"  audio_dir="<DIRECTORY CONTAINING AUDIO FILES>"


Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

TypeError: Output shape mismatch occured for audio_signal in module AudioToBPEDataset : 
Output shape expected = (batch, time) | 
Output shape found : torch.Size([1, 406273, 2])