Speaker Verification Using Adapted Gaussain Mixture Models

In [1]:
from sklearn.mixture import GaussianMixture
import pylab as pl
from scipy import linalg
import numpy as np
import matplotlib as mpl
from joblib import dump, load
import pandas as pd

# Data - Train

Se extraen los MFCCS de 5 audios de una duración promedio de 7 segundo.
- Angelower p225 
- Carlos p226
- Gabriel p227
- Jose p228
- Juan p229
- Leon p230

In [142]:
PATH_MFCS ='./MFCCS_Train/'
speakers = [225,226,227,228,229,230]
audios_jose = pd.read_hdf(PATH_MFCS+'/p'+'228'+'.hd5').values
audios_leo = pd.read_hdf(PATH_MFCS+'/p'+'230'+'.hd5').values
audios_angelower = pd.read_hdf(PATH_MFCS+'/p'+'225'+'.hd5').values
audios_gabriel = pd.read_hdf(PATH_MFCS+'/p'+'227'+'.hd5').values
audios_juan=pd.read_hdf(PATH_MFCS+'/p'+'229'+'.hd5').values
audios_carlos= pd.read_hdf(PATH_MFCS+'/p'+'226'+'.hd5').values

In [143]:
audios_jose.shape

(2588, 20)

In [None]:
audios_juan.shape

# Load UBM Model

GMM de 300 componentes, entrado con 20 speakers, cada uno con un promedio de 350 audios.

In [153]:
gmm_UBM = load('GMM_300_NC.joblib')

In [154]:
gmm_UBM

GaussianMixture(covariance_type='diag', init_params='kmeans', max_iter=15,
                means_init=None, n_components=300, n_init=1,
                precisions_init=None, random_state=None, reg_covar=1e-06,
                tol=0.001, verbose=1, verbose_interval=1, warm_start=False,
                weights_init=None)

In [155]:
gmm_UBM.means_

array([[-3.72092763e+02,  7.90531104e+01,  5.75087773e+01, ...,
         1.95589305e+00, -5.42388895e+00,  1.31865869e+01],
       [-6.40186012e+02,  6.51182066e+01,  4.77050119e+01, ...,
         1.02605349e+01,  1.03537936e+01,  1.04918980e+01],
       [-4.42519519e+02,  1.62691426e+02,  5.65444069e+01, ...,
         3.48107743e+00,  3.47658404e+00,  5.90776896e+00],
       ...,
       [-4.18289128e+02, -6.26675689e-01, -1.00534455e+01, ...,
         2.05060145e+01,  5.20795478e+00,  1.31309701e+01],
       [-4.08525935e+02, -6.49790849e+01,  7.48685844e+00, ...,
         1.85930022e+01,  7.63451773e+00,  1.93786638e+01],
       [-6.66046679e+02,  4.18304648e+01,  4.05385905e+01, ...,
         1.28114425e+01,  1.19357271e+01,  1.11988715e+01]])

In [156]:
gmm_UBM.covariances_.shape

(300, 20)

# Adaptation of Speaker Model 

Bayesian learning or maximum a posteriori (MAP) estimation

# 1 Determine the probabilistic alignment of the training vectors into the UBM mixture components
Given a UBM and training vectors from the hypothesized speaker, X = {x 1 , . . . , x T }, we first determine
the probabilistic alignment of the training vectors into the UBM mixture components. That is, for mixture i in the UBM, we compute

In [157]:
def map_adaptation(gmm,data,max_iterations=5000,likelihood_threshold= 1e-20, relevance_factor = 16):
    T = data.shape[0] #Número de ventanas
    D = data.shape[1]  # Dimensiones (Número de coeficientes MFCCS)
    M = gmm.n_components #Número de componentes
    
    #Parametros del UBM
    mu_k = gmm.means_
    cov_k = gmm.covariances_
    pi_k = gmm.weights_
    
    #Inicializamos los parametros del nuevo modelo, los cuales se actualizarán usando el Algoritmo de EM
    mu_new = np.zeros((M,D)) #Vector de medias (300,20) # cada compoenente tiene un vector de medias (1,20)
    cov_new = np.ones((M,D))#Matriz de covarianza
    weights_new = np.zeros((M,1))#Vector de pesos
    
    #Variables auxiliares
    n_i = np.zeros((T,M)) #Vector de probabilidades por compoenentes (300,1) probabilidad de cada componente
    
    #Probabilidades
    # Estas son usadas como condiciona de parada de la adaptación
    old_likelihood = gmm.score(data)
    new_likelihood = 0 # Ya que no se ha creado el modelo nuevo(Adaptado)
    iterations = 0
    # likelihood ratios (LLR) 
    while(abs(old_likelihood - new_likelihood) > likelihood_threshold and iterations < max_iterations):
        iterations +=1
        old_likelihood = new_likelihood # Actualizamos la probabilidad antigua, que va ser 
        
    #1 Paso: first determine the probabilistic alignment of the training vectors into the UBM mixture components
        
        z_n_k = gmm.predict_proba(data) # (T,300) probabilidad de que un vector pertenezca a la componente M.
        n_i = np.sum(z_n_k,axis=0)   # Suma de las probabilidades (300,1) de todos los tiempos en las componentes
        
    #2 Paso: Use Pr(i | x t ) and x t to compute the sufficient statistics for the weight, mean, and variance parameters  
              
        
        for i in range(M): # Por cada componente
            Ei = np.zeros((1,D))
            #Eii = np.zeros((1,D))
            for t in range(T): #Por cada vector t=0 ,.... T=N_Windows
                xt=data[t,:]
                #xtt=xt*xt
                Ei =Ei+ np.dot(z_n_k[t][i],xt)
                #Eii+=z_n_k[t][i]*xtt
            #Normalizamos, dividiendo por n_i
            mu_new[i]=(1/n_i[i])*Ei
        #Calculamos el coeficiente de adaptación 
        adaptation_coefficient = n_i/(n_i+relevance_factor)
    
    #3 Paso: Calculo el nuevo valor de los parametros usando el coeficiente de adaptación
        for i in range(M):
            #Actualizo vector de medias
            mu_k[i] = (adaptation_coefficient[i]*mu_new[i])+((1-adaptation_coefficient[i])*mu_k[i])
            #Actualizo vector de pesos, pero necesito el factor de escala.
            #pi_k[i]
            #Actualizo matriz de covarianza
            #cov_k[i]
    #4 Paso: actualizo parametros del modelo a retorna
        gmm.means_=mu_k
        
        
    
    #5 Paso: calculo las nuevas probabilidades del modelo
        log_likelihood = gmm.score(data)
        
        new_likelihood = log_likelihood
        print(log_likelihood)
    return gmm

In [None]:
gmm_juan = map_adaptation(gmm_UBM,audios_juan)

-73.55280576004986
-72.44488278708398
-71.92857706297127
-71.60192010145624
-71.37847142281844
-71.21692461901064
-71.0964062575396
-70.99846087500933
-70.91064568290705
-70.82866852987313
-70.75335461590913
-70.68337728067726
-70.62180359879905
-70.5696269956613
-70.52446031513627
-70.48116032019128
-70.43674297987017
-70.39201082029527
-70.34624096111297
-70.30301723532467
-70.26638757969755
-70.2330951753308
-70.20394275829453
-70.17869369028348
-70.15706578103749
-70.13747530204708
-70.11740300327565
-70.09784346589608
-70.08150345871405
-70.06636957578314
-70.05221985439985
-70.039099214843
-70.02731351490613
-70.01684180020342
-70.00761187310616
-69.99936514641112
-69.9916250683277
-69.9839995452954
-69.9762203424977
-69.96838043317017
-69.9608871434453
-69.95407725133802
-69.94790252760073
-69.94209805985129
-69.93660203358888
-69.93166183229152
-69.92730474077229
-69.92332967652993
-69.91945342472646
-69.9154314363701
-69.9110762834452
-69.90637184841293
-69.90145086348433
-69.

In [None]:
gmm_juan.predict_proba(audios_juan)

In [None]:
gmm_juan.means_

In [6]:
def distance_probabilities(gmm1,gmm2,sound1,sound2):
    nv1 = sound1.shape[0]
    nv2 = sound2.shape[0]
    
    score_gmm1_with_sound1 =gmm1.score(sound1)
    score_gmm2_with_sound1 =gmm2.score(sound1)
    
    score_gmm1_with_sound2 = gmm1.score(sound2)
    score_gmm2_with_sound2 = gmm2.score(sound2)
    
    D12= (1/nv2)*(score_gmm2_with_sound2 - score_gmm1_with_sound2)
    D21 = (1/nv1)*(score_gmm2_with_sound1 - score_gmm1_with_sound1)
    
    D=(np.absolute(D12 - D21))/2
    return D