# Speaker Verification Inference
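This notebook demonstrates speaker verification inference with ModelScope pipelines: scoring pairs of utterances, reproducing the score by hand from speaker embeddings, and stepping through a pipeline's `preprocess` / `forward` / `postprocess` stages.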

In [6]:
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

In [7]:
# Build the speaker-verification pipeline for the model ID given below.
inference_sv_pipeline = pipeline(
    task=Tasks.speaker_verification,
    model="damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch",
)

# Sample utterances: one enrollment clip plus two probe clips to compare against it.
enroll_wav_url = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav"
same_wav_url = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_same.wav"
different_wav_url = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_different.wav"

# Enrollment clip vs. the "same" probe. The returned dict pairs its "scores"
# list with a "label" list (see the cell that displays rec_result_same).
rec_result_same = inference_sv_pipeline(audio_in=(enroll_wav_url, same_wav_url))
print("Same speaker similarity:", rec_result_same["scores"])

# Enrollment clip vs. the "different" probe.
rec_result_diff = inference_sv_pipeline(audio_in=(enroll_wav_url, different_wav_url))
print("Different speaker similarity:", rec_result_diff["scores"])

2025-05-08 05:17:16,573 - modelscope - INFO - initiate model from /home/maojia/.cache/modelscope/hub/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch
2025-05-08 05:17:16,574 - modelscope - INFO - initiate model from location /home/maojia/.cache/modelscope/hub/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch.
2025-05-08 05:17:16,575 - modelscope - INFO - initialize model from /home/maojia/.cache/modelscope/hub/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch
2025-05-08 05:17:16,803 - modelscope - INFO - Speaker Verification Processing: ('https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav', 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_same.wav') ...
2025-05-08 05:17:20,467 - modelscope - INFO - Speaker Verification Processing: ('https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav', 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_au

Same speaker similarity: [0.8540488358969999, 0.14595116410300013]
Different speaker similarity: [0.0, 1.0]


In [9]:
import numpy as np

# Passing a single clip (instead of a pair) yields a result whose
# "spk_embedding" entry is read here for each utterance.
enroll = inference_sv_pipeline(
    audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav'
)["spk_embedding"]
same = inference_sv_pipeline(
    audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_same.wav'
)["spk_embedding"]

# Cosine similarity between the two same-speaker embeddings.
# NOTE(review): 0.9465 presumably mirrors the threshold the pipeline applies
# internally, since the printed value equals 100x the pipeline's "Same" score
# from the pair-wise call -- confirm against the model configuration.
sv_threshold = 0.9465
dot_product = np.sum(enroll * same)
norm_product = np.linalg.norm(enroll) * np.linalg.norm(same)
cosine = dot_product / norm_product

# Rescale to a 0-100 score: anything at or below the threshold maps to 0,
# a cosine of exactly 1.0 maps to 100.
margin = max(cosine - sv_threshold, 0.0)
same_cos = margin / (1.0 - sv_threshold) * 100.0
print(same_cos)

2025-05-08 05:18:03,530 - modelscope - INFO - Speaker Verification Processing: https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav ...
2025-05-08 05:18:07,414 - modelscope - INFO - Speaker Verification Processing: https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_same.wav ...


85.40488358969999


In [None]:
# Display the complete result dict for the same-speaker pair, not just its "scores" entry.
rec_result_same

{'label': ['Same', 'Different'],
 'scores': [0.8540488358969999, 0.14595116410300013]}

In [None]:
from modelscope.pipelines import pipeline

# NOTE: the name `sv_pipline` (sic) is referenced by later cells, so the
# spelling is kept as-is.
sv_pipline = pipeline(
    task='speaker-verification',
    model='iic/speech_eres2net_large_sv_zh-cn_3dspeaker_16k',
    model_revision='v1.0.0',
)

# Example clips: two recordings labelled speaker1 and one labelled speaker2.
speaker1_a_wav = 'https://modelscope.cn/api/v1/models/damo/speech_campplus_sv_zh-cn_16k-common/repo?Revision=master&FilePath=examples/speaker1_a_cn_16k.wav'
speaker1_b_wav = 'https://modelscope.cn/api/v1/models/damo/speech_campplus_sv_zh-cn_16k-common/repo?Revision=master&FilePath=examples/speaker1_b_cn_16k.wav'
speaker2_a_wav = 'https://modelscope.cn/api/v1/models/damo/speech_campplus_sv_zh-cn_16k-common/repo?Revision=master&FilePath=examples/speaker2_a_cn_16k.wav'

# Each trial is (pair of clips, extra keyword arguments for the pipeline call).
trials = (
    # Utterances from the same speaker.
    ([speaker1_a_wav, speaker1_b_wav], {}),
    # Utterances from different speakers.
    ([speaker1_a_wav, speaker2_a_wav], {}),
    # A custom score threshold can be supplied for the decision.
    ([speaker1_a_wav, speaker2_a_wav], {'thr': 0.262}),
)
for wav_pair, call_options in trials:
    result = sv_pipline(wav_pair, **call_options)
    print(result)

2025-05-08 05:03:29,525 - modelscope - INFO - Use user-specified model revision: v1.0.0
Downloading: 100%|██████████| 407/407 [00:00<00:00, 2.92MB/s]
Downloading: 100%|██████████| 143k/143k [00:00<00:00, 235kB/s]
Downloading: 100%|██████████| 123k/123k [00:00<00:00, 202kB/s]
Downloading: 100%|██████████| 111M/111M [12:00<00:00, 162kB/s] 
Downloading: 100%|██████████| 5.84k/5.84k [00:00<00:00, 16.1MB/s]
Downloading: 100%|██████████| 116k/116k [00:00<00:00, 178kB/s]
Downloading: 100%|██████████| 153k/153k [00:00<00:00, 239kB/s]
Downloading: 100%|██████████| 166k/166k [00:00<00:00, 261kB/s]
2025-05-08 05:15:44,668 - modelscope - INFO - initiate model from /home/maojia/.cache/modelscope/hub/iic/speech_eres2net_large_sv_zh-cn_3dspeaker_16k
2025-05-08 05:15:44,668 - modelscope - INFO - initiate model from location /home/maojia/.cache/modelscope/hub/iic/speech_eres2net_large_sv_zh-cn_3dspeaker_16k.
2025-05-08 05:15:44,669 - modelscope - INFO - initialize model from /home/maojia/.cache/modelsc

{'score': 0.63359, 'text': 'yes'}
{'score': 0.08289, 'text': 'no'}
{'score': 0.08289, 'text': 'no'}


In [25]:
# Stage 1 of the pipeline, invoked by hand on the different-speaker pair.
# The recorded output is a list holding one tensor per input clip
# (presumably the loaded audio samples -- confirm).
a = sv_pipline.preprocess(inputs=[speaker1_a_wav, speaker2_a_wav])
a

[tensor([ 0.0000,  0.0000,  0.0000,  ..., -0.0002, -0.0012, -0.0009]),
 tensor([0., 0., 0.,  ..., 0., 0., 0.])]

In [None]:
# Stage 2: feed the preprocessed inputs through the pipeline's forward step.
# The recorded output is a tensor with one row per input clip
# (presumably the speaker embeddings -- confirm).
b = sv_pipline.forward(a)
b

tensor([[ 0.2183,  1.0513, -1.0930,  ..., -0.1855, -0.5929,  0.5581],
        [ 0.3999, -0.9875, -0.1107,  ...,  0.1198,  1.7506, -0.5841]])

In [38]:
# Stage 3: turn the forward output into the final score/decision dict.
# The recorded output matches what the end-to-end pipeline call printed for
# this same pair of clips.
sv_pipline.postprocess(inputs=b, in_audios=[speaker1_a_wav, speaker2_a_wav])

{'score': 0.08289, 'text': 'no'}