In [54]:
import librosa
import numpy as np

def extract_mfcc(audio_path, n_mfcc=30, sr=48000, hop_length=512, n_fft=1024):
    """
    Extract MFCC features from the audio file at ``audio_path``.

    The signal is loaded with ``librosa.load`` (using ``sr`` as the target
    sampling rate), leading/trailing silence is removed with
    ``librosa.effects.trim``, and the waveform is peak-normalised before the
    MFCCs are computed.

    Parameters:
    - audio_path: path of the audio file to load.
    - n_mfcc: number of MFCC coefficients to extract per frame.
    - sr: sampling rate passed to ``librosa.load``.
    - hop_length: frame shift, i.e. number of samples between successive frames.
    - n_fft: FFT window size in samples.

    Returns:
    - mfccs: MFCC matrix transposed to shape (time steps, n_mfcc).

    Raises:
    - ValueError: if no samples are left after trimming silence.
    """
    # Load the audio file.
    audio, sr = librosa.load(audio_path, sr=sr)
    # Strip leading and trailing silence.
    audio, _ = librosa.effects.trim(audio)
    if audio.size == 0:
        # np.max on an empty array raises an unhelpful reduction error; fail clearly instead.
        raise ValueError(f"No audio samples left after trimming silence: {audio_path}")
    # Peak-normalise; skip the division for an all-zero signal so it does not turn into NaNs.
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak
    # Compute the MFCCs.
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length, n_fft=n_fft)
    # Transpose so that rows are time steps and columns are coefficients.
    return mfccs.T

In [35]:
import numpy as np

def _nearest_code_indices(features, codebook):
    """Return, for every row of ``features``, the index of the closest row of ``codebook``."""
    distances = np.sqrt(((features[:, np.newaxis, :] - codebook[np.newaxis, :, :]) ** 2).sum(axis=2))
    return np.argmin(distances, axis=1)


def lbg(features, M):
    """
    Build a vector-quantisation codebook with the LBG (split-and-refine) algorithm.

    The codebook starts as the global mean of ``features``. Each round splits
    codewords into a ``(1 + eps)`` / ``(1 - eps)`` pair and then refines the
    codebook with nearest-neighbour assignment and centroid updates until the
    codebook moves by less than ``eps``. When fewer new codewords are needed
    than there are existing ones (``M`` not a power of two), only the most
    populated cells are split, so exactly ``M`` codewords are returned.

    Parameters:
    - features: feature matrix of shape (N, D); N samples of dimension D.
    - M: number of codewords in the codebook (integer >= 1).

    Returns:
    - codebook: array of shape (M, D).

    Raises:
    - ValueError: if ``features`` is not a non-empty 2-D array or ``M`` is not
      a positive integer.
    """
    eps = 0.01  # perturbation used for splitting; also the convergence threshold

    features = np.asarray(features)  # dtype is deliberately left untouched
    if features.ndim != 2 or features.shape[0] == 0:
        raise ValueError("features must be a non-empty 2-D array of shape (N, D)")
    if M < 1 or int(M) != M:
        raise ValueError("M must be a positive integer")
    M = int(M)

    # Start from a single codeword: the mean of all feature vectors.
    codebook = np.mean(features, axis=0).reshape(1, -1)

    while codebook.shape[0] < M:
        # Split step: never create more codewords than requested.
        n_codes = codebook.shape[0]
        n_split = min(n_codes, M - n_codes)
        if n_split == n_codes:
            split_mask = np.ones(n_codes, dtype=bool)
        else:
            # Split only the cells holding the most samples (stable order for ties).
            counts = np.bincount(_nearest_code_indices(features, codebook), minlength=n_codes)
            split_mask = np.zeros(n_codes, dtype=bool)
            split_mask[np.argsort(-counts, kind="stable")[:n_split]] = True

        new_codebook = []
        for code, do_split in zip(codebook, split_mask):
            if do_split:
                new_codebook.append(code * (1 + eps))
                new_codebook.append(code * (1 - eps))
            else:
                new_codebook.append(code)
        codebook = np.array(new_codebook)

        i = 0
        while True:
            i += 1
            # Assignment step: nearest codeword for every sample.
            closest_code_indices = _nearest_code_indices(features, codebook)

            # Update step: move each codeword to the centroid of its cell.
            new_codebook = []
            for j in range(codebook.shape[0]):
                members = closest_code_indices == j
                if np.any(members):
                    new_codebook.append(features[members].mean(axis=0))
                else:
                    # An empty cell keeps its previous codeword.
                    new_codebook.append(codebook[j])
            new_codebook = np.array(new_codebook)

            # Stop once the whole codebook moved by less than eps; the codebook
            # from before this last (sub-eps) update is the one that is kept.
            if np.linalg.norm(codebook - new_codebook) < eps:
                print(f'Converged in {i} iterations.')
                break
            codebook = new_codebook

    return codebook

In [3]:
# Score how well a codebook fits an MFCC matrix: for every feature vector take the distance to its nearest codeword, then return the mean of these minimum distances over all vectors.
def calculate_distortion(features, codebook):
    """
    Compute the average vector-quantisation distortion of ``features`` against ``codebook``.

    For every feature vector the Euclidean distance to its nearest codeword is
    taken; the distortion is the mean of these minimum distances.

    Parameters:
    - features: feature matrix of shape (N, D); N samples of dimension D.
    - codebook: codebook of shape (M, D).

    Returns:
    - distortion: mean nearest-codeword distance (lower means a better fit).

    Raises:
    - ValueError: if either input is not a non-empty 2-D array, or the feature
      dimensions differ.
    """
    features = np.asarray(features)
    codebook = np.asarray(codebook)
    if features.ndim != 2 or codebook.ndim != 2:
        raise ValueError("features and codebook must both be 2-D arrays")
    if features.shape[0] == 0 or codebook.shape[0] == 0:
        # An empty feature matrix would otherwise yield 0/0 = NaN.
        raise ValueError("features and codebook must both be non-empty")
    if features.shape[1] != codebook.shape[1]:
        # A width-1 codebook would otherwise broadcast silently and give a meaningless score.
        raise ValueError("features and codebook must have the same feature dimension")

    # Distance from every sample to every codeword, shape (N, M).
    distances = np.sqrt(((features[:, np.newaxis, :] - codebook[np.newaxis, :, :]) ** 2).sum(axis=2))
    # Distance from every sample to its nearest codeword.
    min_distances = np.min(distances, axis=1)
    # Average over all samples.
    distortion = min_distances.sum() / features.shape[0]

    return distortion



In [58]:
# Demo: train a codebook on recording 1, then score recording 9 against it.
# NOTE(review): absolute, machine-specific paths — this cell only runs where these files exist.
path1 = r"C:\Users\paihui\Downloads\Compressed\2024集创赛紫光同创杯测试音频\E 声纹分类测试音频\1.m4a"
path2 = r"C:\Users\paihui\Downloads\Compressed\2024集创赛紫光同创杯测试音频\E 声纹分类测试音频\9.m4a"
mfccs = extract_mfcc(path1)
print(mfccs.shape)
# Train a 32-entry VQ codebook on the MFCC frames of the first recording.
codebook1 = lbg(mfccs, 32)
print(codebook1.shape)

mfccs2 = extract_mfcc(path2)
print(mfccs2.shape)
# NOTE(review): repeats the previous print, so the same shape is shown twice.
print(mfccs2.shape)
# Average distortion of the second recording's frames against the first recording's codebook.
# Despite the variable name this is a distance: lower values mean a closer match.
similarity = calculate_distortion(mfccs2, codebook1)
print(similarity)


(373, 30)
Converged in 5 iterations.
Converged in 8 iterations.
Converged in 9 iterations.
Converged in 16 iterations.
Converged in 8 iterations.
(32, 30)
(411, 30)
(411, 30)
72.44593027676399


In [60]:
import os
import librosa
import numpy as np

def main(audio_dir, distortion_threshold=73, codebook_size=32):
    """
    Greedily group the audio files in ``audio_dir`` by codebook.

    Files are processed in sorted name order. Each file's MFCCs are scored
    against every codebook created so far; if the best (lowest) distortion is
    above ``distortion_threshold`` (or no codebook exists yet) a new codebook
    is trained on that file, otherwise the file is assigned to the
    best-matching existing codebook.

    Parameters:
    - audio_dir: directory containing the audio files.
    - distortion_threshold: a file whose lowest distortion exceeds this value
      gets its own new codebook.
    - codebook_size: number of codewords in each newly trained codebook.

    Returns:
    - audio_to_codebook: list of (audio file path, codebook index) tuples, in
      processing order.
    """
    supported_formats = ('.wav', '.mp3', '.m4a')
    # Match extensions case-insensitively (e.g. ".WAV") and sort the listing:
    # os.listdir order is unspecified, and the greedy grouping depends on it.
    audio_files = [
        os.path.join(audio_dir, f)
        for f in sorted(os.listdir(audio_dir))
        if f.lower().endswith(supported_formats)
    ]
    codebooks = []  # every codebook created so far
    audio_to_codebook = []
    for audio_file in audio_files:
        mfcc = extract_mfcc(audio_file)
        min_distortion = float('inf')
        min_codebook_index = -1

        # Find the existing codebook with the lowest distortion for this file.
        for i, codebook in enumerate(codebooks):
            distortion = calculate_distortion(mfcc, codebook)
            if distortion < min_distortion:
                min_distortion = distortion
                min_codebook_index = i

        # No codebook fits well enough: train a new one on this file.
        if min_distortion > distortion_threshold:
            codebooks.append(lbg(mfcc, codebook_size))
            min_codebook_index = len(codebooks) - 1
            print(f"为 {audio_file} 创建新码本")

        # Record which codebook (new or existing) the file belongs to.
        audio_to_codebook.append((audio_file, min_codebook_index))
    return audio_to_codebook

if __name__ == "__main__":
    # Folder holding the recordings to be grouped.
    audio_dir = r"C:\Users\paihui\Downloads\Compressed\2024集创赛紫光同创杯测试音频\E 声纹分类测试音频"
    # Group the recordings, then order the (file, codebook index) pairs by codebook index.
    audio_to_codebook = sorted(main(audio_dir), key=lambda pair: pair[1])

Converged in 5 iterations.
Converged in 8 iterations.
Converged in 9 iterations.
Converged in 16 iterations.
Converged in 8 iterations.
为 C:\Users\paihui\Downloads\Compressed\2024集创赛紫光同创杯测试音频\E 声纹分类测试音频\1.m4a 创建新码本
Converged in 4 iterations.
Converged in 6 iterations.
Converged in 12 iterations.
Converged in 15 iterations.
Converged in 10 iterations.
为 C:\Users\paihui\Downloads\Compressed\2024集创赛紫光同创杯测试音频\E 声纹分类测试音频\10.m4a 创建新码本
Converged in 6 iterations.
Converged in 9 iterations.
Converged in 13 iterations.
Converged in 9 iterations.
Converged in 8 iterations.
为 C:\Users\paihui\Downloads\Compressed\2024集创赛紫光同创杯测试音频\E 声纹分类测试音频\100.m4a 创建新码本
Converged in 5 iterations.
Converged in 20 iterations.
Converged in 17 iterations.
Converged in 17 iterations.
Converged in 12 iterations.
为 C:\Users\paihui\Downloads\Compressed\2024集创赛紫光同创杯测试音频\E 声纹分类测试音频\14.mp3 创建新码本
Converged in 8 iterations.
Converged in 11 iterations.
Converged in 7 iterations.
Converged in 11 iterations.
Converged in 7 iterat