<a href="https://colab.research.google.com/github/pyannote/pyannote-audio/blob/develop/tutorials/intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[`pyannote.audio`](https://github.com/pyannote/pyannote-audio) is an open-source toolkit written in Python for **speaker diarization**.

Based on the [`PyTorch`](https://pytorch.org) machine learning framework, it provides a set of trainable end-to-end neural building blocks that can be combined and jointly optimized to build speaker diarization pipelines.

`pyannote.audio` also comes with pretrained [models](https://huggingface.co/models?other=pyannote-audio-model) and [pipelines](https://huggingface.co/models?other=pyannote-audio-pipeline) covering a wide range of domains for voice activity detection, speaker segmentation, overlapped speech detection, and speaker embedding, reaching state-of-the-art performance for most of them.

**This notebook will teach you how to apply those pretrained pipelines on your own data.**

Make sure you run it using a GPU (or it might otherwise be slow...)

## Installation

In [None]:
!pip install -qq https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
!pip install -qq ipython==7.34.0

[2K     [32m|[0m [32m16.6 MB[0m [31m5.3 MB/s[0m [33m0:00:04[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.5/58.5 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.1/48.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.4/51.4 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━

In [None]:
! pip install pydub

In [None]:
from pydub import AudioSegment
from scipy.spatial.distance import cdist

# Source MP3 clips; they are concatenated in the order listed here.
mp3_file_paths = [
    "/content/AudioCutter_mmexport1692876162085 [vocals](2).mp3",
    "/content/AudioCutter_mmexport1692876162085 [vocals](3).mp3"
]
output_wav_path = "/content/original_voice.wav"

# Start from a zero-length segment and append each decoded clip to it.
combined_audio = AudioSegment.silent(duration=0)

for mp3_file_path in mp3_file_paths:
    audio = AudioSegment.from_mp3(mp3_file_path)
    combined_audio += audio

# export() returns the open output file object. Close it so the handle is not
# leaked and its repr is not echoed as the cell's output.
combined_audio.export(output_wav_path, format="wav").close()


<_io.BufferedRandom name='/content/original_voice.wav'>

In [None]:
# Decode the MP3 recording and re-encode it as WAV.
mp3_file_path = "/content/AudioCutter_mmexport1692869397949[music+vocals].mp3"
# Written under /content explicitly: the comparison cells further down read
# "/content/supershay.wav", so the output must not depend on the working directory.
wav_file_path = "/content/supershay.wav"

audio = AudioSegment.from_mp3(mp3_file_path)
# export() returns the open output file object. Close it so the handle is not
# leaked and its repr is not echoed as the cell's output.
audio.export(wav_file_path, format="wav").close()

<_io.BufferedRandom name='supershay.wav'>

In [None]:
# Decode the MP3 recording and re-encode it as WAV.
mp3_file_path = "/content/zuodehenhao .mp3"
# Written under /content explicitly: the comparison cells further down read
# "/content/zuodehenhao.wav", so the output must not depend on the working directory.
wav_file_path = "/content/zuodehenhao.wav"

audio = AudioSegment.from_mp3(mp3_file_path)
# export() returns the open output file object. Close it so the handle is not
# leaked and its repr is not echoed as the cell's output.
audio.export(wav_file_path, format="wav").close()

<_io.BufferedRandom name='zuodehenhao.wav'>

In [None]:
# Decode the MP3 recording and re-encode it as WAV.
mp3_file_path = "/content/zuodehenhao2.mp3"
# Written under /content explicitly: the comparison cells further down read
# "/content/zuodehenhao2.wav", so the output must not depend on the working directory.
wav_file_path = "/content/zuodehenhao2.wav"

audio = AudioSegment.from_mp3(mp3_file_path)
# export() returns the open output file object. Close it so the handle is not
# leaked and its repr is not echoed as the cell's output.
audio.export(wav_file_path, format="wav").close()

<_io.BufferedRandom name='zuodehenhao2.wav'>

In [None]:
pip install torchaudio



In [None]:
import torch
import torchaudio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from scipy.spatial.distance import cdist

# Paths of the two recordings to compare.
output_wav_path = "/content/original_voice.wav"
compared_voice_path = "/content/supershay.wav"

# Load the pretrained speaker-embedding model, using the GPU when one is
# available and falling back to CPU otherwise (slower, but it still runs).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=device)

# Load both recordings and keep their native sample rates.
waveform1, sample_rate1 = torchaudio.load(output_wav_path)
waveform2, sample_rate2 = torchaudio.load(compared_voice_path)

# Downmix to a single channel by averaging over the channel axis.
waveform1 = waveform1.mean(dim=0, keepdim=True)
waveform2 = waveform2.mean(dim=0, keepdim=True)

# Resample to the rate the embedding model reports. The WAV files keep the
# rate of the MP3s they were converted from, and feeding audio at another
# rate skews the embeddings. This is a no-op when the rates already match.
waveform1 = torchaudio.functional.resample(waveform1, sample_rate1, model.sample_rate)
waveform2 = torchaudio.functional.resample(waveform2, sample_rate2, model.sample_rate)

# Add the leading batch dimension.
waveform1 = waveform1.unsqueeze(0)
waveform2 = waveform2.unsqueeze(0)

# Extract one embedding per recording.
embedding1 = model(waveform1)
embedding2 = model(waveform2)

# Compare the embeddings with the cosine distance.
distance = cdist(embedding1, embedding2, metric="cosine")

# The smaller the distance, the more similar the two speakers.
distance

Downloading (…)ain/hyperparams.yaml:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading embedding_model.ckpt:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

Downloading (…)an_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading classifier.ckpt:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

Downloading (…)in/label_encoder.txt:   0%|          | 0.00/129k [00:00<?, ?B/s]

array([[0.6931781]])

In [None]:
import torch
import torchaudio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from scipy.spatial.distance import cdist

# Paths of the two recordings to compare.
output_wav_path = "/content/original_voice.wav"
compared_voice_path = "/content/zuodehenhao.wav"

# Load the pretrained speaker-embedding model, using the GPU when one is
# available and falling back to CPU otherwise (slower, but it still runs).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=device)

# Load both recordings and keep their native sample rates.
waveform1, sample_rate1 = torchaudio.load(output_wav_path)
waveform2, sample_rate2 = torchaudio.load(compared_voice_path)

# Downmix to a single channel by averaging over the channel axis.
waveform1 = waveform1.mean(dim=0, keepdim=True)
waveform2 = waveform2.mean(dim=0, keepdim=True)

# Resample to the rate the embedding model reports. The WAV files keep the
# rate of the MP3s they were converted from, and feeding audio at another
# rate skews the embeddings. This is a no-op when the rates already match.
waveform1 = torchaudio.functional.resample(waveform1, sample_rate1, model.sample_rate)
waveform2 = torchaudio.functional.resample(waveform2, sample_rate2, model.sample_rate)

# Add the leading batch dimension.
waveform1 = waveform1.unsqueeze(0)
waveform2 = waveform2.unsqueeze(0)

# Extract one embedding per recording.
embedding1 = model(waveform1)
embedding2 = model(waveform2)

# Compare the embeddings with the cosine distance.
distance = cdist(embedding1, embedding2, metric="cosine")

# The smaller the distance, the more similar the two speakers.
distance

array([[0.42989582]])

In [None]:
import numpy as np
from scipy.spatial.distance import euclidean

# Re-run the model on the two waveforms left in the kernel by the preceding
# cell, stripping every size-1 dimension from each embedding as it is produced.
embedding1, embedding2 = (
    np.squeeze(model(waveform))
    for waveform in (waveform1, waveform2)
)

# Euclidean distance between the two squeezed embeddings.
distance = euclidean(embedding1, embedding2)

# Report the result.
print("Euclidean Distance:", distance)

Euclidean Distance: 279.2568359375


In [None]:
import torch
import torchaudio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from scipy.spatial.distance import cdist

# Paths of the two recordings to compare.
output_wav_path = "/content/original_voice.wav"
compared_voice_path = "/content/zuodehenhao2.wav"

# Load the pretrained speaker-embedding model, using the GPU when one is
# available and falling back to CPU otherwise (slower, but it still runs).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=device)

# Load both recordings and keep their native sample rates.
waveform1, sample_rate1 = torchaudio.load(output_wav_path)
waveform2, sample_rate2 = torchaudio.load(compared_voice_path)

# Downmix to a single channel by averaging over the channel axis.
waveform1 = waveform1.mean(dim=0, keepdim=True)
waveform2 = waveform2.mean(dim=0, keepdim=True)

# Resample to the rate the embedding model reports. The WAV files keep the
# rate of the MP3s they were converted from, and feeding audio at another
# rate skews the embeddings. This is a no-op when the rates already match.
waveform1 = torchaudio.functional.resample(waveform1, sample_rate1, model.sample_rate)
waveform2 = torchaudio.functional.resample(waveform2, sample_rate2, model.sample_rate)

# Add the leading batch dimension.
waveform1 = waveform1.unsqueeze(0)
waveform2 = waveform2.unsqueeze(0)

# Extract one embedding per recording.
embedding1 = model(waveform1)
embedding2 = model(waveform2)

# Compare the embeddings with the cosine distance.
distance = cdist(embedding1, embedding2, metric="cosine")

# The smaller the distance, the more similar the two speakers.
distance

array([[0.36684647]])

In [None]:
import numpy as np
from scipy.spatial.distance import euclidean

# Re-run the model on the two waveforms left in the kernel by the preceding
# cell, stripping every size-1 dimension from each embedding as it is produced.
embedding1, embedding2 = (
    np.squeeze(model(waveform))
    for waveform in (waveform1, waveform2)
)

# Euclidean distance between the two squeezed embeddings.
distance = euclidean(embedding1, embedding2)

# Report the result.
print("Euclidean Distance:", distance)




Euclidean Distance: 274.4893798828125


In [None]:
import torch
import torchaudio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from scipy.spatial.distance import cdist

# Paths of the two recordings to compare.
output_wav_path = "/content/supershay.wav"
compared_voice_path = "/content/zuodehenhao2.wav"

# Load the pretrained speaker-embedding model, using the GPU when one is
# available and falling back to CPU otherwise (slower, but it still runs).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=device)

# Load both recordings and keep their native sample rates.
waveform1, sample_rate1 = torchaudio.load(output_wav_path)
waveform2, sample_rate2 = torchaudio.load(compared_voice_path)

# Downmix to a single channel by averaging over the channel axis.
waveform1 = waveform1.mean(dim=0, keepdim=True)
waveform2 = waveform2.mean(dim=0, keepdim=True)

# Resample to the rate the embedding model reports. The WAV files keep the
# rate of the MP3s they were converted from, and feeding audio at another
# rate skews the embeddings. This is a no-op when the rates already match.
waveform1 = torchaudio.functional.resample(waveform1, sample_rate1, model.sample_rate)
waveform2 = torchaudio.functional.resample(waveform2, sample_rate2, model.sample_rate)

# Add the leading batch dimension.
waveform1 = waveform1.unsqueeze(0)
waveform2 = waveform2.unsqueeze(0)

# Extract one embedding per recording.
embedding1 = model(waveform1)
embedding2 = model(waveform2)

# Compare the embeddings with the cosine distance.
distance = cdist(embedding1, embedding2, metric="cosine")

# The smaller the distance, the more similar the two speakers.
distance

array([[0.60504768]])

In [None]:
import torch
import torchaudio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from scipy.spatial.distance import cdist

# Paths of the two recordings to compare.
output_wav_path = "/content/supershay.wav"
compared_voice_path = "/content/zuodehenhao.wav"

# Load the pretrained speaker-embedding model, using the GPU when one is
# available and falling back to CPU otherwise (slower, but it still runs).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=device)

# Load both recordings and keep their native sample rates.
waveform1, sample_rate1 = torchaudio.load(output_wav_path)
waveform2, sample_rate2 = torchaudio.load(compared_voice_path)

# Downmix to a single channel by averaging over the channel axis.
waveform1 = waveform1.mean(dim=0, keepdim=True)
waveform2 = waveform2.mean(dim=0, keepdim=True)

# Resample to the rate the embedding model reports. The WAV files keep the
# rate of the MP3s they were converted from, and feeding audio at another
# rate skews the embeddings. This is a no-op when the rates already match.
waveform1 = torchaudio.functional.resample(waveform1, sample_rate1, model.sample_rate)
waveform2 = torchaudio.functional.resample(waveform2, sample_rate2, model.sample_rate)

# Add the leading batch dimension.
waveform1 = waveform1.unsqueeze(0)
waveform2 = waveform2.unsqueeze(0)

# Extract one embedding per recording.
embedding1 = model(waveform1)
embedding2 = model(waveform2)

# Compare the embeddings with the cosine distance.
distance = cdist(embedding1, embedding2, metric="cosine")

# The smaller the distance, the more similar the two speakers.
distance

array([[0.58349131]])

In [None]:
import librosa
import numpy as np
from scipy.spatial.distance import euclidean

# Load the two recordings to compare.
audio1, sr1 = librosa.load("/content/original_voice.wav")
audio2, sr2 = librosa.load("/content/zuodehenhao2.wav")

# Compute the MFCCs. The signal and sample rate are passed by keyword:
# librosa >= 0.10 made these arguments keyword-only, so the positional form
# raises a TypeError there.
mfcc1 = librosa.feature.mfcc(y=audio1, sr=sr1)
mfcc2 = librosa.feature.mfcc(y=audio2, sr=sr2)

# Average each MFCC matrix over axis 1 to get a fixed-size representation.
# This is a simplification; a more elaborate approach could take the whole
# MFCC sequence into account.
avg_mfcc1 = np.mean(mfcc1, axis=1)
avg_mfcc2 = np.mean(mfcc2, axis=1)

# Use the Euclidean distance between the averaged MFCC vectors as the
# (dis)similarity measure.
distance = euclidean(avg_mfcc1, avg_mfcc2)

print("Euclidean Distance between MFCCs:", distance)
