In [None]:
import queue
import threading
import tempfile
import time
from collections import deque

import numpy as np
import sounddevice as sd
from scipy.io import wavfile

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks


# --- Streaming configuration -------------------------------------------------
SAMPLE_RATE = 16000            # model expects 16 kHz
CHANNELS = 1                   # channel count requested from the input stream
CHUNK_SEC = 1.0                # record chunk length
HOP_SEC = 0.5                  # seconds of audio discarded after each emitted chunk
PRINT_INTERVAL_SEC = 0.5       # how often to print an update
SMOOTHING_ALPHA = 0.6          # NOTE(review): presumably the EWMA weight for score smoothing; not referenced in the code visible here

MODEL_ID = "iic/emotion2vec_plus_base"

# Build the ModelScope emotion-recognition pipeline once, at module load, so
# every inference call reuses the same pipeline object.
emo = pipeline(
    task=Tasks.emotion_recognition,
    model=MODEL_ID
)

# Declared `global` inside infer_consumer(); no assignment to it is visible in
# this section, so it stays None here.
MODEL_LABELS = None

# Bounded hand-off between the audio callback (producer) and the inference
# loop (consumer). The producer uses put_nowait() and drops the chunk when the
# queue is full.
audio_q = queue.Queue(maxsize=10)
# Set this event to ask the producer and consumer loops to exit.
stop_flag = threading.Event()

def audio_producer():
    """Capture input audio and enqueue overlapping int16 chunks on ``audio_q``.

    Opens a ``sounddevice.InputStream`` at ``SAMPLE_RATE`` and keeps the most
    recent ``CHUNK_SEC`` seconds of samples in a ring buffer. Every time the
    buffer is full, a copy is pushed onto ``audio_q`` and the oldest
    ``HOP_SEC`` seconds are discarded, so consecutive chunks overlap by
    ``CHUNK_SEC - HOP_SEC`` seconds.

    Blocks until ``stop_flag`` is set, then closes the stream and returns
    ``None``.
    """
    frame_len = int(SAMPLE_RATE * CHUNK_SEC)
    hop_len = int(SAMPLE_RATE * HOP_SEC)
    # maxlen makes the deque drop its oldest samples automatically, so it can
    # never hold more than one chunk worth of audio.
    ring = deque(maxlen=frame_len)

    def callback(indata, frames, time_info, status):
        """Stream callback: convert the block to int16 and emit full chunks."""
        if status:
            print(f"[audio status] {status}")
        mono = indata[:, 0] if indata.ndim > 1 else indata
        # Clip to [-1, 1] before scaling: a float sample outside that range
        # would overflow the int16 cast and show up as a loud click.
        pcm = (np.clip(mono, -1.0, 1.0) * 32767).astype(np.int16)
        ring.extend(pcm)

        # Whenever we have enough samples for one chunk, emit and keep overlap
        if len(ring) == frame_len:
            chunk = np.array(ring, dtype=np.int16)
            try:
                audio_q.put_nowait(chunk)
            except queue.Full:
                # Deliberate best-effort: when the consumer is behind, drop
                # this chunk rather than block the audio callback.
                pass
            # Simulate the hop by discarding the oldest hop_len samples
            # (bounded by what the buffer actually holds).
            for _ in range(min(hop_len, len(ring))):
                ring.popleft()

    with sd.InputStream(
        samplerate=SAMPLE_RATE,
        channels=CHANNELS,
        dtype="float32",
        callback=callback,
        blocksize=int(SAMPLE_RATE * 0.05),  # ~50 ms callback
    ):
        # Event.wait() returns as soon as the flag is set, so shutdown does not
        # have to wait out a sleep; the short timeout keeps the loop
        # interruptible.
        while not stop_flag.wait(timeout=0.1):
            pass

def run_model_on_wav_bytes(int16_pcm):
    """Write a temp wav and call the ModelScope pipeline with frame granularity.

    Args:
        int16_pcm: Array of int16 PCM samples, written as a WAV file at
            ``SAMPLE_RATE``.

    Returns:
        Whatever the ``emo`` pipeline returns for the written file.
    """
    import os  # local import: the module's import block does not provide os

    # Write into a private temporary directory instead of re-opening an
    # already-open NamedTemporaryFile by name: that second open is not
    # possible on Windows. The directory and the file inside it are removed
    # when the with-block exits.
    with tempfile.TemporaryDirectory() as tmp_dir:
        wav_path = os.path.join(tmp_dir, "chunk.wav")
        wavfile.write(wav_path, SAMPLE_RATE, int16_pcm)
        return emo(wav_path, granularity="frame", extract_embedding=False)

def infer_consumer() -> None:
    """Pull audio chunks from ``audio_q`` and run the emotion model on each.

    Loops until ``stop_flag`` is set. Each chunk is passed to
    ``run_model_on_wav_bytes`` and the ``"labels"`` / ``"scores"`` entries of
    the result are read.

    NOTE(review): the body visible here stops after fetching the fallback
    ``detail`` value; ``ewma_scores``, ``last_print`` and ``MODEL_LABELS`` are
    set up but not used yet, so smoothing and printing appear to be
    unfinished.
    """
    global MODEL_LABELS

    # Placeholders for smoothing / print throttling; not used in the visible code.
    ewma_scores = None
    last_print = 0.0

    while not stop_flag.is_set():
        try:
            # Short timeout so the loop re-checks stop_flag regularly.
            pcm = audio_q.get(timeout=0.2)
        except queue.Empty:
            continue

        res = run_model_on_wav_bytes(pcm)

        # NOTE(review): .get() assumes the pipeline result is a mapping;
        # confirm the actual return type of the pipeline call.
        labels = res.get("labels")
        scores = res.get("scores")

        if labels is None or scores is None:
            # Fallback lookup; `detail` is not used further in the visible code.
            detail = res.get("detail") or {}


In [None]:
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Stand-alone check: build the emotion-recognition pipeline for the
# emotion2vec+ base model.
inference_pipeline = pipeline(
    task=Tasks.emotion_recognition,
    model="iic/emotion2vec_plus_base",
)

# Run it on one audio file, asking for a single result for the whole
# utterance, and show the raw output.
rec_result = inference_pipeline(
    "SOME.wav",
    granularity="utterance",
    extract_embedding=False,
)
print(rec_result)