In [3]:
# Loading the libraries
import os

import librosa
import numpy as np
from datasets import load_dataset
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
from IPython.display import Audio as IPythonAudio
from transformers import pipeline

In [4]:
# Load the MP3 file.
# The clip location is machine-specific, so it can be overridden through the
# SENSEVOICE_AUDIO_PATH environment variable; the original path stays the default.
audio_path = os.environ.get(
    "SENSEVOICE_AUDIO_PATH",
    r'C:\Users\Administrator\Desktop\LLM_work\SenseVoiceSmall\example\zh.mp3',
)
# Fail fast with a clear message instead of a decoder error from deep inside librosa.
if not os.path.isfile(audio_path):
    raise FileNotFoundError(f"Audio file not found: {audio_path}")

# sr=None keeps the file's native sampling rate (no implicit resampling), so the
# real rate can be inspected before converting the audio for the models.
audio_array, sample_rate = librosa.load(audio_path, sr=None)

In [5]:
# Show the native sampling rate, a preview of the samples, and confirm that
# librosa returned a NumPy array.
print(f"Sample Rate: {sample_rate}")
print(f"Audio Array: {audio_array}")
print(f"Audio Array datatype is Array:  {isinstance(audio_array, np.ndarray)}")

Sample Rate: 48000
Audio Array: [ 0.0000000e+00 -1.0005779e-14 -7.1443096e-15 ...  2.2035025e-10
  2.1887764e-10  2.0613053e-10]
Audio Array datatype is Array:  True


In [6]:
# Convert the clip from its native sampling rate to 16 kHz, the rate required by
# the models used below.
audio_16KHz = librosa.resample(y=audio_array, orig_sr=sample_rate, target_sr=16000)

In [7]:
# Repository id for the SenseVoiceSmall model (presumably the Hugging Face Hub id — confirm).
# NOTE(review): no visible cell reads model_dir; the SenseVoice cells pass
# 'iic/SenseVoiceSmall' directly. Confirm whether this variable is still needed.
model_dir = "FunAudioLLM/SenseVoiceSmall"

# SenseVoiceSmall 


In [8]:
# Speech recognition with SenseVoiceSmall, requested by model name so that
# AutoModel resolves and loads it.
model = AutoModel(model='iic/SenseVoiceSmall')

# Transcribe the 16 kHz clip.
res = model.generate(
    input=audio_16KHz,
    language="auto",  # or an explicit code: "zh", "en", "yue", "ja", "ko", "nospeech"
    use_itn=True,
    cache={},
    # Batching / segment-merging options.
    merge_vad=True,
    merge_length_s=15,
    batch_size_s=60,
)

# Post-process the raw transcript of the first (and only) result, then show it.
text = rich_transcription_postprocess(res[0]["text"])
print(text)

funasr version: 1.1.6.
Check update of funasr, and it would cost few times. You may disable it by set `disable_update=True` in AutoModel
You are using the latest version of funasr-1.1.6


  src_state = torch.load(path, map_location=map_location)
rtf_avg: 0.044: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  4.01it/s]                                                                                          

开饭时间早上9点至下午5点。





# paraformer-zh

In [9]:
# Speech recognition with the Mandarin Paraformer model.
model = AutoModel(
    model='paraformer-zh',  # model name (not a local directory); fetched from the hub below
    device="cuda:0",  # "cuda:0" for GPU (if CUDA is available) or "cpu" for CPU.
    hub="hf",   # download the model from the Hugging Face Hub
)

# Transcribe the 16 kHz Mandarin clip.
# NOTE(review): `language` and `use_itn` mirror the SenseVoice cell; confirm
# whether paraformer-zh actually consumes them.
res = model.generate(
    input=audio_16KHz,
    cache={},
    language="zh",  # fixed: was "zn", which is not a valid language code
    use_itn=True,
    batch_size_s=60,
    merge_vad=True,
    merge_length_s=15,
)

# Post-process the raw transcript of the first (and only) result, then show it.
text = rich_transcription_postprocess(res[0]["text"])
print(text)



funasr version: 1.1.6.
Check update of funasr, and it would cost few times. You may disable it by set `disable_update=True` in AutoModel
You are using the latest version of funasr-1.1.6


Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 9993.58it/s]
  with autocast(False):
  with autocast(False):
rtf_avg: 0.042: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  4.16it/s]                                                                                          

开放时间早上九点至下午五点





# ct-punc

In [10]:
# Punctuation restoration: ct-punc adds punctuation marks to unpunctuated text.
model = AutoModel(
    model="ct-punc",
    model_revision="v2.0.4",
)

# Run it on a sample sentence that mixes Chinese and English and has no punctuation.
res = model.generate(
    input="那今天的会就到这里吧 happy new year 明年见",
)
print(res)

funasr version: 1.1.6.
Check update of funasr, and it would cost few times. You may disable it by set `disable_update=True` in AutoModel
You are using the latest version of funasr-1.1.6


2024-09-06 14:10:22,487 - modelscope - INFO - Use user-specified model revision: v2.0.4
Downloading [README.md]: 100%|██████████| 10.6k/10.6k [00:02<00:00, 5.29kB/s]
Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Administrator\AppData\Local\Temp\jieba.cache
DEBUG:jieba:Loading model from cache C:\Users\Administrator\AppData\Local\Temp\jieba.cache
Loading model cost 0.393 seconds.
DEBUG:jieba:Loading model cost 0.393 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.
rtf_avg: -0.026: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 37.55it/s]                                                                                  

[{'key': 'rand_key_2yW4Acq9GFz6Y', 'text': '那今天的会就到这里吧，happy new year,明年见。', 'punc_array': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 3])}]





# fsmn-vad

In [11]:
# Voice-activity detection on the 16 kHz clip.
# AutoModel comes from the notebook's import cell, so it is not re-imported here.
model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")

# Each result's 'value' holds the detected speech segments as [start, end] pairs
# (milliseconds, per the FunASR documentation).
res = model.generate(input=audio_16KHz)
print(res)

funasr version: 1.1.6.
Check update of funasr, and it would cost few times. You may disable it by set `disable_update=True` in AutoModel
You are using the latest version of funasr-1.1.6


2024-09-06 14:10:44,028 - modelscope - INFO - Use user-specified model revision: v2.0.4
rtf_avg: 0.005: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 31.15it/s]                                                                                          

[{'key': 'rand_key_2yW4Acq9GFz6Y', 'value': [[420, 5600]]}]





# All models combined

In [12]:
# SenseVoiceSmall with the ct-punc punctuation model attached.
# The VAD stage is currently disabled; to enable it, also pass
#     vad_model="fsmn-vad", vad_kwargs={"max_single_segment_time": 30000}
# NOTE(review): the recorded output is identical to the SenseVoice-only cell,
# so confirm that punc_model takes effect when no vad_model is configured.
model = AutoModel(model='iic/SenseVoiceSmall', device="cuda:0", punc_model="ct-punc")

# Transcribe the 16 kHz clip.
res = model.generate(
    input=audio_16KHz,
    language="auto",  # or an explicit code: "zh", "en", "yue", "ja", "ko", "nospeech"
    use_itn=True,
    cache={},
    # Batching / segment-merging options.
    merge_vad=True,
    merge_length_s=15,
    batch_size_s=60,
)

# Post-process the raw transcript of the first (and only) result, then show it.
text = rich_transcription_postprocess(res[0]["text"])
print(text)

funasr version: 1.1.6.
Check update of funasr, and it would cost few times. You may disable it by set `disable_update=True` in AutoModel
You are using the latest version of funasr-1.1.6


Downloading [README.md]: 100%|██████████| 10.6k/10.6k [00:01<00:00, 6.34kB/s]
rtf_avg: 0.048: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  3.67it/s]                                                                                          

开饭时间早上9点至下午5点。



