# Speaker Identification

This notebook looks at speaker identification. First using the KNN classifier we have used in other tasks, then using a Gaussian Mixture Model which is commonly used for speaker ID tasks.  

See [this page](https://appliedmachinelearning.blog/2017/11/14/spoken-speaker-identification-based-on-gaussian-mixture-models-python-implementation/) another example of using GMMs (but with an older version of scikit-learn). 

In [13]:
import os
import librosa
import librosa.display
import pysptk
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import utils  # our own utilities

# KNN Classifier

In [35]:
data_dirname = 'data'
iteminfo, speakers = utils.extract_metadata(data_dirname)

#all_data = utils.get_data_for(iteminfo, 'language', 'en')
all_data = np.array(list(iteminfo.keys()))

# get the labels for this list of filenames
target = utils.get_data_labels(iteminfo, all_data, 'speaker')
target.shape

(440,)

In [58]:
def features(datafile):
    """Generate a mean feature vector for a single data file
    return a 1 dimensional numpy array"""
    y, sr = librosa.load(datafile)
    mfcc = librosa.feature.mfcc(y, sr=sr, n_mfcc=20)
    # normalise the feature vector
    mfcc = sklearn.preprocessing.scale(mfcc)
    return mfcc.mean(axis=1)

In [43]:
def delta_features(datafile):
    """Generate a mean feature vector including delta 
    features for a single data file
    return a 1 dimensional numpy array"""
    y, sr = librosa.load(datafile)
    mfcc = librosa.feature.mfcc(y, sr=sr, n_mfcc=20)
    # normalise the feature vector
    mfcc = sklearn.preprocessing.scale(mfcc)
    delta = librosa.feature.delta(mfcc)
    feat = np.concatenate([mfcc, delta])
    return feat.mean(axis=1)

In [59]:
data = np.array([features(d) for d in all_data])
data.shape

(440, 20)

In [60]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.50)
print(X_train.shape)
print(X_test.shape)

(220, 20)
(220, 20)


In [61]:
knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [62]:
predicted = knn.predict(X_test)
print(classification_report(y_test, predicted))

             precision    recall  f1-score   support

        10a       0.75      0.67      0.71         9
        10b       1.00      0.91      0.95        11
        11b       0.62      1.00      0.76         8
        11c       1.00      1.00      1.00         9
         1b       0.71      0.38      0.50        13
         1c       1.00      0.89      0.94         9
         2C       0.88      0.70      0.78        10
         2d       0.24      1.00      0.39         6
         3a       1.00      0.92      0.96        12
         3b       0.44      0.36      0.40        11
         4c       0.70      0.78      0.74         9
         4d       0.45      0.82      0.58        11
         5a       1.00      0.18      0.31        11
         5b       0.90      0.90      0.90        10
          6       0.67      0.18      0.29        11
        6gb       1.00      0.69      0.82        13
         7a       0.62      0.89      0.73         9
         7b       1.00      0.86      0.92   

# Speaker Identification with GMM

Try using a GMM to model each speaker.

In [26]:
from sklearn.mixture import GaussianMixture

# For speaker id we split the data so that we have four items for each 
# speaker (two english, two chinese) reserved for testing

train_files = utils.get_data_for(iteminfo, 'item', ['3', '4', '5', '6', '7', '8', '9', '10'])

# split training data by speaker
speakerdata = {}
for fname in train_files:
    spkr = iteminfo[fname]['speaker']
    if spkr in speakerdata:
        speakerdata[spkr].append(fname)
    else:
        speakerdata[spkr] = [fname]

test_files = utils.get_data_for(iteminfo, 'item', ['1', '2'])
train_files.shape, test_files.shape

((352,), (88,))

In [27]:
def frame_features(datafile, label):
    """Generate mfcc feature vectors for a single data file
    return a 2 dimensional numpy array with one row per frame
    and an array of labels for each frame"""
    y, sr = librosa.load(datafile)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
    mfcc = sklearn.preprocessing.scale(mfcc)
    delta = librosa.feature.delta(mfcc)
    labels = np.full((mfcc.shape[1],), label)
    return np.hstack((mfcc, delta)).T, labels

def extract_frame_features(datafiles, target):
    """Given a list of sound files and their target labels
    Compute a sequence of features for each file
    Concatenate these features into a single np.array
    Return these features and a corresponding array
    of target labels, one for every frame"""
    
    data = None
    frame_target = []
    for i in range(len(target)):
        frames, labels = frame_features(datafiles[i], target[i])
        frame_target.extend(labels)
        if data is None:
            # transpose the frames 
            data = frames
        else:
            # concatenate the tranposed frames
            data = np.concatenate((data, frames))

    frame_target = np.array(frame_target)
    
    return data, frame_target

#data, target = extract_frame_features(datafiles, target)
    

In [28]:
# train the UBM
target = utils.get_data_labels(iteminfo, train_files, 'speaker')
data, target = extract_frame_features(train_files, target)

print(data.shape)
ubm = GaussianMixture(n_components=1, warm_start=True)
ubm.fit(data)

(32496, 20)


GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100,
        means_init=None, n_components=1, n_init=1, precisions_init=None,
        random_state=None, reg_covar=1e-06, tol=0.001, verbose=0,
        verbose_interval=10, warm_start=True, weights_init=None)

In [63]:
import copy

def model_speaker(ubm, speakerdata):
    """use data from one
    speaker to train a GMM for that speaker,
    return the adapted model"""
    
    sp_model = GaussianMixture(n_components=5)
    target = np.full(len(speakerdata), 'x')
    data, target = extract_frame_features(speakerdata, target)
    
    sp_model.fit(data)
    
    return sp_model

test_target = utils.get_data_labels(iteminfo, test_files, 'speaker')
speakers = set(test_target)
print(speakers)
models = {}
for speaker in speakers:
    models[speaker] = model_speaker(ubm, speakerdata[speaker])


{'7b', '3b', '1b', '2d', '6', '5b', '2C', '9c', '4c', '6gb', '1c', '10a', '4d', '5a', '10b', '9a', '11c', '11b', '7a', '3a', '8d', '8b'}


In [64]:
def identify_speaker(models, datafile):

    data, target = frame_features(datafile, 'x')
    scores = []
    for spkr in models:
        score = models[spkr].score(data)
        scores.append((score, spkr))
    return scores

import os

allscores = {}
for fname in test_files:
    scores = identify_speaker(models, fname)
    allscores[os.path.basename(fname)] = scores

In [65]:
for fn in test_files:
    s = sorted(allscores[os.path.basename(fn)])
    print(os.path.basename(fn), s[:2])

1b_F_Ch_01.wav [(-283.33644667343754, '11b'), (-194.1239778655229, '5b')]
1b_F_Ch_02.wav [(-336.48171013744883, '5b'), (-243.15703369990405, '11b')]
1b_F_En_01.wav [(-466.28436155675121, '11b'), (-29.613564482770837, '3a')]
1b_F_En_02.wav [(-105.47306926260845, '11b'), (-6.2579101713362713, '3a')]
1c_M_Ch_01.wav [(-9367.1348931996636, '5b'), (-1487.9530709369119, '11b')]
1c_M_Ch_02.wav [(-13152.959855078836, '5b'), (-2104.9762360846389, '11b')]
1c_M_En_01.wav [(-7330.9516421427224, '5b'), (-1622.3547473973872, '11b')]
1c_M_En_02.wav [(-5558.2052874961355, '5b'), (-910.497037810493, '11b')]
2C-W-Ch-01.wav [(-9371.6887264968464, '5b'), (-2049.7145808099817, '11b')]
2C-W-Ch-02.wav [(-4914.6579828333306, '5b'), (-1271.2126526227034, '11b')]
2C-W-En--01.wav [(-10503.712566896693, '5b'), (-2034.3582624112025, '11b')]
2C-W-En--02.wav [(-6647.8014470710486, '5b'), (-1430.4150137133495, '11b')]
2d_M_Ch_01.wav [(-6976.2697786799281, '5b'), (-530.13553481249392, '11b')]
2d_M_Ch_02.wav [(-8496.855

In [None]:
test_files[10]