In [88]:
"""
The following is the output format of a vad model, the model is a streaming vad model, audio is processed in chunks, for each chunk, we will get a list of list of integers
as vad results. to analyze the vad results, we need to merge vad results from different chunks.

[[beg1, end1], [beg2, end2], .., [begN, endN]]：The same as the offline VAD output result mentioned above.
[[beg, -1]]：Indicates that only a starting point has been detected.
[[-1, end]]：Indicates that only an ending point has been detected.
[]：Indicates that neither a starting point nor an ending point has been detected.
The output is measured in milliseconds and represents the absolute time from the starting point.

When processing audio, we need know the speech intervals and silence intervals.
each speech interval is a time period starting with user's voice and ending with user's silece.

I want to process the results, do you have any recommendations?
"""



"\nThe following is the output format of a vad model, the model is a streaming vad model, audio is processed in chunks, for each chunk, we will get a list of list of integers\nas vad results. to analyze the vad results, we need to merge vad results from different chunks.\n\n[[beg1, end1], [beg2, end2], .., [begN, endN]]：The same as the offline VAD output result mentioned above.\n[[beg, -1]]：Indicates that only a starting point has been detected.\n[[-1, end]]：Indicates that only an ending point has been detected.\n[]：Indicates that neither a starting point nor an ending point has been detected.\nThe output is measured in milliseconds and represents the absolute time from the starting point.\n\nWhen processing audio, we need know the speech intervals and silence intervals.\neach speech interval is a time period starting with user's voice and ending with user's silece.\n\nI want to process the results, do you have any recommendations?\n"

# Streaming VAD Result Processor

Below is a solution to process streaming VAD results. This implementation handles the various output formats from the VAD model and maintains speech segments across multiple chunks.

In [153]:
class StreamingVADProcessor:
    
    def __init__(self, 
                 max_silence_ms=1000, 
                 min_speech_ms=300
            ):
        self.speech_segments = []  # segments 列表（时间点） [start_ms, end_ms]
        self.pending_segment = None  # 当前不完整的segment [start_ms, -1]
        self.max_silence_ms = max_silence_ms  # 结束一个段落的最大沉默时间
        self.min_speech_ms = min_speech_ms  # 有效语音段的最小长度
        self.last_end_time = 0  # 记录上一个ending的时间
    
    def process_chunk(self, chunk_result, chunk_time_ms):
        """
        Process one chunk of VAD results
        
        Args:
            chunk_result: List of VAD results in the format [[beg1, end1], [beg, -1], [-1, end], ...]
            chunk_time_ms: Current chunk time in milliseconds from the start
            
        Returns:
            List of any completed speech segments in this chunk
        """
        completed_segments = []
        
        # 未检测到任何端点
        if not chunk_result:
            
            # 检查是否有一个挂起的段落已经沉默了太久
            vad_status = "empty" if not self.pending_segment else "continue"
            
            if self.pending_segment and (chunk_time_ms - self.last_end_time) > self.max_silence_ms:
                # 将挂起的segment做ennding
                segment = [self.pending_segment[0], self.last_end_time]
                
                # 如果段落足够长，将其添加到结果中
                if segment[1] - segment[0] >= self.min_speech_ms:
                    self.speech_segments.append(segment)
                    completed_segments.append(segment)
                self.pending_segment = None
                
            return completed_segments
        
        # 如果检测到端点，处理每个segment
        for segment in chunk_result:
            
            # Case 1: 收到完整的segment
            if len(segment) == 2 and segment[0] >= 0 and segment[1] > 0:
                
                # 将挂起的段落结束
                if self.pending_segment:
                    # 检查当前segment是一个继续还是一个新的段落
                    if abs(segment[0] - self.last_end_time) <= self.max_silence_ms:
                        # 如果segment是之前片段的继续，用pending的开始时间，组成一个大的，新的segment
                        new_segment = [self.pending_segment[0], segment[1]]
                        # 如果新的段落足够长，将其添加到结果中
                        if new_segment[1] - new_segment[0] >= self.min_speech_ms:
                            self.speech_segments.append(new_segment)
                            completed_segments.append(new_segment)
                    else: 
                        
                        # 如果当前segment不是之前片段的继续，结束之前的片段
                        prev_segment = [self.pending_segment[0], self.last_end_time]
                        
                        # 检查之前的片段是否足够长
                        if prev_segment[1] - prev_segment[0] >= self.min_speech_ms:
                            self.speech_segments.append(prev_segment)
                            completed_segments.append(prev_segment)
                        
                        # 加入新的片段
                        if segment[1] - segment[0] >= self.min_speech_ms:
                            self.speech_segments.append(segment)
                            completed_segments.append(segment)
                    
                    self.pending_segment = None
                    
                else:
                    # 如果没有挂起的段落，直接添加当前段落
                    if segment[1] - segment[0] >= self.min_speech_ms:
                        self.speech_segments.append(segment)
                        completed_segments.append(segment)
                
                self.last_end_time = segment[1]
            
            # Case 2: 只检测到一个开始 [beg, -1]
            elif len(segment) == 2 and segment[0] >= 0 and segment[1] == -1:
                # 如果当前没有pending，则开始一个新的pending
                if not self.pending_segment:
                    self.pending_segment = [segment[0], -1]
                # 如果我们已经有一个pending的段落，我们保留较早的开始时间
            
            # Case 3: 只检测到一个结束 [-1, end]
            elif len(segment) == 2 and segment[0] == -1 and segment[1] > 0:
                if self.pending_segment:
                    # 此时必有一个挂起的段落
                    completed_segment = [self.pending_segment[0], segment[1]]
                    # 检查是否要添加到speech段落
                    if completed_segment[1] - completed_segment[0] >= self.min_speech_ms:
                        self.speech_segments.append(completed_segment)
                        completed_segments.append(completed_segment)
                    self.pending_segment = None
                    self.last_end_time = segment[1]
        
        return completed_segments
    
    def get_speech_segments(self):
        """Return all completed speech segments"""
        return self.speech_segments
    
    def get_pending_segment(self):
        """Return the current pending segment if any"""
        return self.pending_segment
    
    def finalize(self):
        """Finalize processing and return all segments"""
        # If we have a pending segment, complete it with the last known end time
        if self.pending_segment:
            segment = [self.pending_segment[0], self.last_end_time]
            if segment[1] - segment[0] >= self.min_speech_ms:
                self.speech_segments.append(segment)
            self.pending_segment = None
        
        return self.speech_segments

In [154]:
1024, 10

(1024, 10)

In [None]:
# Example of how to use the StreamingVADProcessor with your current VAD model
def process_audio_with_vad(audio_path, chunk_size_ms=200, vad_model=None):
    """Process an audio file using the StreamingVADProcessor"""
    import soundfile
    from funasr import AutoModel
    
    # Load the VAD model if not provided
    if vad_model is None:
        vad_model = AutoModel(model="fsmn-vad")
    
    # Read the audio file
    speech, sample_rate = soundfile.read(audio_path)
    chunk_stride = int(chunk_size_ms * sample_rate / 1000)
    
    # Initialize the VAD processor
    vad_processor = StreamingVADProcessor(max_silence_ms=400, min_speech_ms=300)
    
    # Process the audio in chunks
    cache = {}
    total_chunk_num = int(len(speech-1)/chunk_stride+1)
    all_completed_segments = []
    
    for i in range(total_chunk_num):
        # Get the current chunk of audio
        speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
        is_final = i == total_chunk_num - 1
        
        # Get VAD results for this chunk
        import time
        res = vad_model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size_ms)
        # Process VAD results if there are any
        if len(res) > 0 and len(res[0]["value"]) > 0:
            # Current chunk time in ms from the start
            current_time_ms = i * chunk_size_ms
            
            # Process the chunk results
            print(f"Passing {res[0]['value']}")
            completed_segments = vad_processor.process_chunk(res[0]["value"], current_time_ms)
            all_completed_segments.extend(completed_segments)
            if completed_segments:
                print(f"Completed segment at chunk {i}: {completed_segments}")
    
    # Finalize to get any pending segments
    final_segments = vad_processor.finalize()
    
    return final_segments

# Example usage:
# segments = process_audio_with_vad("/path/to/audio.wav")

In [164]:
from funasr import AutoModel

model = AutoModel(model="fsmn-vad", disable_update = True)

funasr version: 1.2.6.
Downloading Model from https://www.modelscope.cn to directory: /Users/mac/.cache/modelscope/hub/models/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch




In [165]:
# Test with the example file from your VAD model
if 'model' in locals() and hasattr(model, 'model_path'):  # Check if model is defined
    test_wav_file = f"/Users/mac/Desktop/audio-services/cache/recording.wav"
    speech_segments = process_audio_with_vad(test_wav_file, vad_model=model)
    
    print("\nFinal speech segments (start_ms, end_ms):")
    for i, segment in enumerate(speech_segments):
        duration_ms = segment[1] - segment[0]
        print(f"Segment {i+1}: {segment} (duration: {duration_ms}ms)")
        
    # Calculate statistics
    total_duration = sum(seg[1] - seg[0] for seg in speech_segments)
    print(f"\nTotal speech duration: {total_duration}ms ({total_duration/1000:.2f}s)")
    print(f"Number of speech segments: {len(speech_segments)}")
    if speech_segments:
        avg_duration = total_duration / len(speech_segments)
        print(f"Average segment duration: {avg_duration:.2f}ms ({avg_duration/1000:.2f}s)")

rtf_avg: 0.132: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 44.55it/s]                                                                                          
rtf_avg: 0.024: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 180.28it/s]                                                                                          
rtf_avg: 0.025: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 160.51it/s]                                                                                          
rtf_avg: 0.030: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 140.51it/s]                                                                                          
rtf_avg: 0.024: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 178.19it/s]                                                                                          
rtf_avg: 0.097: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 49.24it/s]                                                                                          


Passing [[740, -1]]


rtf_avg: 0.024: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 170.56it/s]                                                                                          
rtf_avg: 0.025: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 171.15it/s]                                                                                          
rtf_avg: 0.062: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 75.01it/s]                                                                                          
rtf_avg: 0.021: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 196.89it/s]                                                                                          
rtf_avg: 0.021: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 190.73it/s]                                                                                          
rtf_avg: 0.038: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 119.63it/s]                                                                                          
rtf_avg: 0.021: 100%|[34m███

Passing [[-1, 2060]]
Completed segment at chunk 13: [[740, 2060]]


rtf_avg: 0.043: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 101.55it/s]                                                                                          
rtf_avg: 0.021: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 202.11it/s]                                                                                          
rtf_avg: 0.029: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 151.73it/s]                                                                                          


Passing [[2940, -1]]


rtf_avg: 0.039: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 101.10it/s]                                                                                          
rtf_avg: 0.074: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 64.11it/s]                                                                                          
rtf_avg: 0.023: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 180.56it/s]                                                                                          
rtf_avg: 0.021: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 196.04it/s]                                                                                          
rtf_avg: 0.176: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 27.53it/s]                                                                                          
rtf_avg: 0.028: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 114.35it/s]                                                                                          
rtf_avg: 0.025: 100%|[34m████

Passing [[-1, 4400]]
Completed segment at chunk 24: [[2940, 4400]]


rtf_avg: 0.022: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 197.63it/s]                                                                                          
rtf_avg: 0.030: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 146.22it/s]                                                                                          


Passing [[4850, -1]]


rtf_avg: 0.041: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 111.91it/s]                                                                                          
rtf_avg: 0.021: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 202.43it/s]                                                                                          
rtf_avg: 0.076: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 62.39it/s]                                                                                          
rtf_avg: 0.021: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 205.89it/s]                                                                                          
rtf_avg: 0.026: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 169.50it/s]                                                                                          
rtf_avg: 0.075: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 62.72it/s]                                                                                          
rtf_avg: 0.023: 100%|[34m████

Passing [[-1, 6640]]
Completed segment at chunk 36: [[4850, 6640]]


rtf_avg: 0.031: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 124.83it/s]                                                                                          
rtf_avg: 0.021: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 200.57it/s]                                                                                          
rtf_avg: 0.032: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 138.45it/s]                                                                                          
rtf_avg: 0.073: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 62.97it/s]                                                                                          
rtf_avg: 0.022: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 192.87it/s]                                                                                          


Passing [[7780, -1]]


rtf_avg: 0.044: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 103.68it/s]                                                                                          
rtf_avg: 0.310: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 15.77it/s]                                                                                          
rtf_avg: 0.022: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 180.56it/s]                                                                                          
rtf_avg: 0.020: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 206.15it/s]                                                                                          
rtf_avg: 0.033: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 136.17it/s]                                                                                          
rtf_avg: 0.024: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 164.10it/s]                                                                                          
rtf_avg: 0.686: 100%|[34m███

Passing [[-1, 9580]]
Completed segment at chunk 48: [[7780, 9580]]

Final speech segments (start_ms, end_ms):
Segment 1: [740, 2060] (duration: 1320ms)
Segment 2: [2940, 4400] (duration: 1460ms)
Segment 3: [4850, 6640] (duration: 1790ms)
Segment 4: [7780, 9580] (duration: 1800ms)

Total speech duration: 6370ms (6.37s)
Number of speech segments: 4
Average segment duration: 1592.50ms (1.59s)





In [None]:
from funasr import AutoModel

vader = AutoModel(model = 'fsmn-vad', disable_update = True)

funasr version: 1.2.6.
Downloading Model from https://www.modelscope.cn to directory: /Users/mac/.cache/modelscope/hub/models/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch




In [185]:
from funasr_onnx import Fsmn_vad_online

vader = Fsmn_vad_online(model_dir = "/Users/mac/.cache/modelscope/hub/models/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch")

In [187]:
test_wav_file = f"/Users/mac/Desktop/audio-services/cache/recording.wav"
cache = {}
import numpy as np
import soundfile
speech, sample_rate = soundfile.read(test_wav_file)
chunks = np.array([speech[i:i+3200] for i in range(0, len(speech), 3200)])
for i, chunk in enumerate(chunks):
    is_final = i == len(chunks) - 1
    print(vader(chunk, cache = cache, chunk_size = 200, is_final = is_final))

[]
[]
[]
[[[-1, 9720]]]
[]
[]
[]
[]
[[[10880, -1]]]
[]
[]
[]
[[[-1, 11520]]]
[]
[]
[]
[[[12520, -1]]]
[]
[]
[]
[]
[]
[]
[]
[[[-1, 13850]]]
[]
[[[14440, -1]]]
[]
[]
[]
[]
[]
[]
[]
[]
[[[-1, 16120]]]
[]
[]
[]
[]
[]
[[[17380, -1]]]
[]
[]
[]
[]
[]
[]


In [169]:
!open /Users/mac/.cache/modelscope/hub/models/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch

In [168]:
vader.export(quantize = True)



Quantizing model from /Users/mac/.cache/modelscope/hub/models/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/model.onnx to /Users/mac/.cache/modelscope/hub/models/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/model_quant.onnx


'/Users/mac/.cache/modelscope/hub/models/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch'

In [3]:
from funasr import AutoModel

asr = AutoModel(model = 'paraformer-zh-streaming', disable_update = True)

funasr version: 1.2.6.
Downloading Model from https://www.modelscope.cn to directory: /Users/mac/.cache/modelscope/hub/models/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online


  ori_state = torch.load(path, map_location=map_location)


In [4]:
asr.export(quantize = True)

  _C._jit_pass_onnx_node_shape_type_inference(node, params_dict, opset_version)
  _C._jit_pass_onnx_graph_shape_type_inference(
  _C._jit_pass_onnx_graph_shape_type_inference(


Quantizing model from /Users/mac/.cache/modelscope/hub/models/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/model.onnx to /Users/mac/.cache/modelscope/hub/models/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/model_quant.onnx




Quantizing model from /Users/mac/.cache/modelscope/hub/models/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/decoder.onnx to /Users/mac/.cache/modelscope/hub/models/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/decoder_quant.onnx


'/Users/mac/.cache/modelscope/hub/models/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online'

In [None]:
from funasr_onnx.paraformer_online_bin import Paraformer



In [18]:
import soundfile
import os
chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
model1 = Paraformer(model_dir = "/Users/mac/.cache/modelscope/hub/models/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online",
                    quantize = True, 
                    chunk_size = chunk_size)
wav_file = "/Users/mac/Desktop/audio-services/cache/asr_example.wav"
speech, sample_rate = soundfile.read(wav_file)
chunk_stride = chunk_size[1] * 960 # 600ms

cache = {}
total_chunk_num = int(len((speech)-1)/chunk_stride+1)
for i in range(total_chunk_num):
    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
    is_final = i == total_chunk_num - 1
    res = model1(audio_in=speech_chunk, param_dict = dict(
        is_final=is_final,cache = cache, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
    )
    print(res)

[]
[]
[{'preds': ('欢迎大', ['欢', '迎', '大'])}]
[{'preds': ('家来体', ['家', '来', '体'])}]
[{'preds': ('验达摩', ['验', '达', '摩'])}]
[{'preds': ('院推出', ['院', '推', '出'])}]
[{'preds': ('的语音', ['的', '语', '音'])}]
[{'preds': ('识别模', ['识', '别', '模'])}]
[{'preds': ('型儿么', ['型', '儿', '么'])}]
[{'preds': ('呢', ['呢'])}]


In [56]:
from dataclasses import dataclass
from functools import lru_cache


@lru_cache(maxsize=None)
def load_model():
    return Paraformer(
        model_dir="/Users/mac/Desktop/audio-services/resources/modelscope/hub/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online", 
        device = "cuda",
        quantize = True,
        chunk_size = [0,10,5],
        disable_update = True)


@dataclass
class ParaformerStreaming:
    chunk_ms:int = 600 
    encoder_chunk_look_back:int = 4
    decoder_chunk_look_back:int = 1
    
    def __post_init__(self):
        self.model = load_model()
        self.cache = {}
        self.chunk_size = [0, int(self.chunk_ms / 60), int(self.chunk_ms / 120)]
        

    def new(self):
        return ParaformerStreaming(
            chunk_ms=self.chunk_ms,
            encoder_chunk_look_back=self.encoder_chunk_look_back, 
            decoder_chunk_look_back=self.decoder_chunk_look_back)

    def run(self, speech_chunk, sampling_rate:int =16000, is_final = False):
            
        speech_chunk = np.array(speech_chunk).astype("float32")
        
        res = self.model(
            audio_in=speech_chunk, 
            param_dict = dict(
                cache = self.cache,
                is_final = is_final,
                encoder_chunk_look_back = self.encoder_chunk_look_back,
                decoder_chunk_look_back = self.decoder_chunk_look_back
            ))
        if len(res) == 0:
            text = " "
        elif "preds" not in res[0]:
            text = " "
        else:
            try:
                text = res[0]['preds'][0]
            except:
                text = " "
        return [{"text":text}]
    

In [57]:
p = ParaformerStreaming(chunk_ms=600, encoder_chunk_look_back=4, decoder_chunk_look_back=1)

In [58]:
import numpy as np

In [59]:
import time
for i in range(total_chunk_num):
    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
    is_final = i == total_chunk_num - 1
    start = time.time()
    res = p.run(speech_chunk = speech_chunk, is_final = is_final)
    end = time.time()
    if len(res) == 0:
        text = " "
    elif "preds" not in res[0]:
        text = " "
    else:
        try:
            text = res[0]['preds'][0]
        except:
            text = " "
    print(text)
    print(f"chunk {i} time: {end - start:.2f}s")
    print(res)

 
chunk 0 time: 0.11s
[{'text': ' '}]
 
chunk 1 time: 0.04s
[{'text': ' '}]
 
chunk 2 time: 0.04s
[{'text': '欢迎大'}]
 
chunk 3 time: 0.05s
[{'text': '家来体'}]
 
chunk 4 time: 0.05s
[{'text': '验达摩'}]
 
chunk 5 time: 0.05s
[{'text': '院推出'}]
 
chunk 6 time: 0.05s
[{'text': '的语音'}]
 
chunk 7 time: 0.05s
[{'text': '识别模'}]
 
chunk 8 time: 0.05s
[{'text': '型儿一'}]
 
chunk 9 time: 0.03s
[{'text': '是'}]


In [47]:
text

'嗯'