# Get speaker recognition model

In [1]:
import tensorflow as tf
import numpy as np

from Utils.config import (
    TEST_DATA_DIR, 
    SPEAKERS,
    DATA_DIR,
    UBU_DATA_DIR, 
    SELECTED_KEYWORD, 
    SELECTED_SPEAKER, 
    ALTERNATIVE_KEYWORD_1, 
    ALTERNATIVE_SPEAKER_1,
    ALTERNATIVE_KEYWORD_2,
    ALTERNATIVE_SPEAKER_2
)
from Utils.create_model import create_model
from Utils.load_test_wav import load_test_wav




In [2]:
# Build the classifier with the 'spectrogram' front end, restore the weights
# saved after 70 epochs, and print the layer summary.
feature_type = 'spectrogram'
weights_path = 'Models/70_epochs_spectrogram.h5'
model = create_model(feature_type)
model.load_weights(weights_path)
model.summary()







Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 static_stft (Spectrogram)   (None, 257, 63, 1)        263168    
                                                                 
 conv2d (Conv2D)             (None, 256, 62, 16)       80        
                                                                 
 max_pooling2d (MaxPooling2  (None, 85, 20, 16)        0         
 D)                                                              
                                                                 
 flatten (Flatten)           (None, 27200)             0         
                                                                 
 dense (Dense)               (None, 9)                 244809    
                                                                 
Total params: 508057 (1.94 MB)
Trainable params: 508057 (1.94 MB)
Non-trainable params: 0 (0.00 Byte)
____________________

## Sanity check

In [3]:
# Load one known test clip as a first end-to-end check of the classifier.
sanity_clip_path = f'{TEST_DATA_DIR}/Hillary_Clinton/dignity.wav'
signal = load_test_wav(sanity_clip_path)
print(signal)

tf.Tensor(
[[-0.05557251 -0.05883789 -0.0513916  ...  0.03161621  0.02597046
   0.03268433]], shape=(1, 12960), dtype=float32)


In [4]:
# Bring the clip to the 16000-sample length used for every clip in this
# notebook. The pad width is derived from the clip itself rather than from a
# hard-coded file length, so the cell works for any input recording.
num_samples = 16000
print(signal.shape)
signal = signal[:, :num_samples]  # no-op when the clip is not longer than the target
signal = tf.pad(signal, [[0, 0], [0, num_samples - signal.shape[1]]])
print("padded")
print(signal.shape)
# Add a leading axis: (1, 16000) -> (1, 1, 16000), the shape handed to model.predict.
signal = tf.expand_dims(signal, 0)
print('dim extended')
print(signal.shape)

(1, 12960)
padded
(1, 16000)
dim extended
(1, 1, 16000)


In [5]:
# Run the speaker classifier on the prepared (1, 1, 16000) signal.
y_pred = model.predict(signal)



In [6]:
# Display the raw prediction: one row holding one score per speaker class.
y_pred 

array([[6.6041788e-16, 2.3670447e-20, 1.0000000e+00, 2.9069542e-12,
        6.9307100e-16, 6.9170922e-13, 1.2936300e-26, 6.0045793e-15,
        4.6369544e-15]], dtype=float32)

In [7]:
# For each row of y_pred, take the index of the highest-scoring class.
y_pred_classes = np.argmax(y_pred, axis=1)

In [8]:
# Show the predicted class index (a one-element array for the single clip).
print(y_pred_classes)

[2]


In [9]:
# Look up the speaker name stored at the predicted class index in SPEAKERS.
speaker = SPEAKERS[y_pred_classes[0]]

In [10]:
# Show the predicted speaker name; the clip was loaded from the Hillary_Clinton folder.
print(speaker)

Hillary_Clinton


# Get keyword spotting model

In [1]:
import nemo.collections.asr as nemo_asr
from Utils.config import (
    UBU_DATA_DIR, 
    SELECTED_KEYWORD, 
    SELECTED_SPEAKER, 
    ALTERNATIVE_KEYWORD_1, 
    ALTERNATIVE_SPEAKER_1,
    ALTERNATIVE_KEYWORD_2,
    ALTERNATIVE_SPEAKER_2
)

In [2]:
# Load the pretrained CTC speech-recognition checkpoint used for keyword spotting.
pretrained_name = "QuartzNet15x5Base-En"
asr_model = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name=pretrained_name, strict=False)

[NeMo I 2024-01-21 16:21:06 cloud:58] Found existing object /home/lorca/.cache/torch/NeMo/NeMo_1.22.0/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo.
[NeMo I 2024-01-21 16:21:06 cloud:64] Re-using file from: /home/lorca/.cache/torch/NeMo/NeMo_1.22.0/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo
[NeMo I 2024-01-21 16:21:06 common:913] Instantiating model from pre-trained checkpoint
[NeMo I 2024-01-21 16:21:07 features:289] PADDING: 16
[NeMo I 2024-01-21 16:21:07 save_restore_connector:249] Model EncDecCTCModel was successfully restored from /home/lorca/.cache/torch/NeMo/NeMo_1.22.0/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo.


# Identification

In [12]:
# Print the (keyword, speaker) pair of each scenario, one value per line,
# with a blank line between consecutive pairs.
scenario_targets = (
    (SELECTED_KEYWORD, SELECTED_SPEAKER),
    (ALTERNATIVE_KEYWORD_1, ALTERNATIVE_SPEAKER_1),
    (ALTERNATIVE_KEYWORD_2, ALTERNATIVE_SPEAKER_2),
)
for position, (target_keyword, target_speaker) in enumerate(scenario_targets):
    if position:
        print()
    print(target_keyword)
    print(target_speaker)

experiences
Hillary_Clinton

challenges
Yamini_Ravindran

forces
Ronald_Reagan


## Speaker and keyword match

### Speaker ID

In [4]:
# Clip for this scenario: the selected speaker saying the selected keyword,
# addressed through TEST_DATA_DIR for the speaker classifier.
audio_filename = f"{TEST_DATA_DIR}/{SELECTED_SPEAKER}/{SELECTED_KEYWORD}.wav"
print(audio_filename)

./Data/keywords//Hillary_Clinton/experiences.wav


In [5]:
# Load the clip's waveform for the speaker classifier.
signal = load_test_wav(audio_filename)

In [7]:
# pre-process the signal: bring the clip to the 16000-sample length used for
# the other clips in this notebook (truncate if longer, zero-pad if shorter),
# then add the leading axis expected by model.predict
num_samples = 16000
signal = signal[:, :num_samples]
signal = tf.pad(signal, [[0, 0], [0, num_samples - signal.shape[1]]])
signal = tf.expand_dims(signal, 0)

# classify the clip and keep the index of the highest-scoring class
y_pred = model.predict(signal)
y_pred_classes = np.argmax(y_pred, axis=1)



In [8]:
# Translate the winning class index into a speaker name and report it.
predicted_index = y_pred_classes[0]
speaker = SPEAKERS[predicted_index]
print(f'Speaker: "{speaker}"')

Speaker: "Hillary_Clinton"


### Keyword ID

In [48]:
# Selected speaker / selected keyword clip under UBU_DATA_DIR/keywords; this is
# the path handed to the ASR model in the next cell.
audio_filename = f"{UBU_DATA_DIR}/keywords/{SELECTED_SPEAKER}/{SELECTED_KEYWORD}.wav"
print(audio_filename)

/mnt/d/__repos/wut_easar/project/Data/keywords/Hillary_Clinton/experiences.wav


In [5]:
# Transcribe the single clip with the ASR model and keep its only transcript.
files = [audio_filename]
transcriptions = asr_model.transcribe(paths2audio_files=files)
transcript = transcriptions[0]
print(f'Transcript: "{transcript}"')

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcript: "experiences"


### Final result

In [6]:
# Keyword verdict: success only when the transcript equals the selected keyword.
keyword_matched = transcript == f'{SELECTED_KEYWORD}'
print('Keyword ID: Success!' if keyword_matched else 'Keyword ID: Failure!')

Keyword ID: Success!


In [15]:
# Speaker verdict: success only when the predicted speaker is the selected one.
speaker_matched = speaker == f'{SELECTED_SPEAKER}'
print('Speaker ID: Success!' if speaker_matched else 'Speaker ID: Failure!')

Speaker ID: Success!


## Speaker match but keyword mismatch

### Speaker ID

In [9]:
# Clip for this scenario: the selected speaker saying the first alternative
# keyword, addressed through TEST_DATA_DIR for the speaker classifier.
audio_filename = f"{TEST_DATA_DIR}/{SELECTED_SPEAKER}/{ALTERNATIVE_KEYWORD_1}.wav"
print(audio_filename)

./Data/keywords//Hillary_Clinton/challenges.wav


In [10]:
# Load the clip's waveform for the speaker classifier.
signal = load_test_wav(audio_filename)

In [12]:
# pre-process the signal: bring the clip to 16000 samples (truncate if longer,
# zero-pad if shorter) using the clip's own length instead of a hard-coded
# per-file sample count, then add the leading axis expected by model.predict
num_samples = 16000
signal = signal[:, :num_samples]
signal = tf.pad(signal, [[0, 0], [0, num_samples - signal.shape[1]]])
signal = tf.expand_dims(signal, 0)

# classify the clip and keep the index of the highest-scoring class
y_pred = model.predict(signal)
y_pred_classes = np.argmax(y_pred, axis=1)



In [13]:
# Translate the winning class index into a speaker name and report it.
predicted_index = y_pred_classes[0]
speaker = SPEAKERS[predicted_index]
print(f'Speaker: "{speaker}"')

Speaker: "Hillary_Clinton"


### Keyword ID

In [46]:
# Selected speaker / first alternative keyword clip under UBU_DATA_DIR/keywords;
# this is the path handed to the ASR model in the next cell.
audio_filename = f"{UBU_DATA_DIR}/keywords/{SELECTED_SPEAKER}/{ALTERNATIVE_KEYWORD_1}.wav"
print(audio_filename)

/mnt/d/__repos/wut_easar/project/Data/keywords/Hillary_Clinton/challenges.wav


In [8]:
# Transcribe the single clip with the ASR model and keep its only transcript.
files = [audio_filename]
transcriptions = asr_model.transcribe(paths2audio_files=files)
transcript = transcriptions[0]
print(f'Transcript: "{transcript}"')

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcript: "challenges"


### Final result

In [9]:
# Keyword verdict: success only when the transcript equals the selected keyword.
keyword_matched = transcript == f'{SELECTED_KEYWORD}'
print('Keyword ID: Success!' if keyword_matched else 'Keyword ID: Failure!')

Keyword ID: Failure!


In [16]:
# Speaker verdict: success only when the predicted speaker is the selected one.
speaker_matched = speaker == f'{SELECTED_SPEAKER}'
print('Speaker ID: Success!' if speaker_matched else 'Speaker ID: Failure!')

Speaker ID: Success!


## Speaker mismatch but keyword match

### Speaker ID

In [14]:
# Clip for this scenario: the first alternative speaker saying the selected
# keyword, addressed through TEST_DATA_DIR for the speaker classifier.
audio_filename = f"{TEST_DATA_DIR}/{ALTERNATIVE_SPEAKER_1}/{SELECTED_KEYWORD}.wav"
print(audio_filename)

./Data/keywords//Yamini_Ravindran/experiences.wav


In [15]:
# Load the clip's waveform for the speaker classifier.
signal = load_test_wav(audio_filename)

In [17]:
# pre-process the signal: bring the clip to 16000 samples. Truncation handles
# clips that are too long; the zero-padding (a no-op after truncating a long
# clip) also covers clips that are too short. Then add the leading axis
# expected by model.predict
num_samples = 16000
signal = signal[:, :num_samples]
signal = tf.pad(signal, [[0, 0], [0, num_samples - signal.shape[1]]])
signal = tf.expand_dims(signal, 0)

# classify the clip and keep the index of the highest-scoring class
y_pred = model.predict(signal)
y_pred_classes = np.argmax(y_pred, axis=1)



In [18]:
# Translate the winning class index into a speaker name and report it.
predicted_index = y_pred_classes[0]
speaker = SPEAKERS[predicted_index]
print(f'Speaker: "{speaker}"')
# note that the out-of-distribution speaker is not recognized

Speaker: "Nelson_Mandela"


### Keyword ID

In [51]:
# First alternative speaker / selected keyword clip under UBU_DATA_DIR/keywords;
# this is the path handed to the ASR model in the next cell.
audio_filename = f"{UBU_DATA_DIR}/keywords/{ALTERNATIVE_SPEAKER_1}/{SELECTED_KEYWORD}.wav"
print(audio_filename)

/mnt/d/__repos/wut_easar/project/Data/keywords/Yamini_Ravindran/experiences.wav


In [11]:
# Transcribe the single clip with the ASR model and keep its only transcript.
files = [audio_filename]
transcriptions = asr_model.transcribe(paths2audio_files=files)
transcript = transcriptions[0]
print(f'Transcript: "{transcript}"')

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcript: "experiences"


### Final result

In [12]:
# Keyword verdict: success only when the transcript equals the selected keyword.
keyword_matched = transcript == f'{SELECTED_KEYWORD}'
print('Keyword ID: Success!' if keyword_matched else 'Keyword ID: Failure!')

Keyword ID: Success!


In [19]:
# Speaker verdict: success only when the predicted speaker is the selected one.
speaker_matched = speaker == f'{SELECTED_SPEAKER}'
print('Speaker ID: Success!' if speaker_matched else 'Speaker ID: Failure!')

Speaker ID: Failure!


## Speaker mismatch and keyword mismatch

### Speaker ID

In [20]:
# Clip for this scenario: the second alternative speaker saying the second
# alternative keyword, addressed through TEST_DATA_DIR for the speaker classifier.
audio_filename = f"{TEST_DATA_DIR}/{ALTERNATIVE_SPEAKER_2}/{ALTERNATIVE_KEYWORD_2}.wav"
print(audio_filename)

./Data/keywords//Ronald_Reagan/forces.wav


In [21]:
# Load the clip's waveform for the speaker classifier.
signal = load_test_wav(audio_filename)

In [23]:
# pre-process the signal: bring the clip to 16000 samples (truncate if longer,
# zero-pad if shorter) using the clip's own length instead of a hard-coded
# per-file sample count, then add the leading axis expected by model.predict
num_samples = 16000
signal = signal[:, :num_samples]
signal = tf.pad(signal, [[0, 0], [0, num_samples - signal.shape[1]]])
signal = tf.expand_dims(signal, 0)

# classify the clip and keep the index of the highest-scoring class
y_pred = model.predict(signal)
y_pred_classes = np.argmax(y_pred, axis=1)



In [24]:
# Translate the winning class index into a speaker name and report it.
predicted_index = y_pred_classes[0]
speaker = SPEAKERS[predicted_index]
print(f'Speaker: "{speaker}"')

Speaker: "Ronald_Reagan"


### Keyword ID

In [52]:
# Second alternative speaker / second alternative keyword clip under
# UBU_DATA_DIR/keywords; this is the path handed to the ASR model in the next cell.
audio_filename = f"{UBU_DATA_DIR}/keywords/{ALTERNATIVE_SPEAKER_2}/{ALTERNATIVE_KEYWORD_2}.wav"
print(audio_filename)

/mnt/d/__repos/wut_easar/project/Data/keywords/Ronald_Reagan/forces.wav


In [14]:
# Transcribe the single clip with the ASR model and keep its only transcript.
files = [audio_filename]
transcriptions = asr_model.transcribe(paths2audio_files=files)
transcript = transcriptions[0]
print(f'Transcript: "{transcript}"')

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcript: "forces"


### Final result

In [15]:
# Keyword verdict: success only when the transcript equals the selected keyword.
keyword_matched = transcript == f'{SELECTED_KEYWORD}'
print('Keyword ID: Success!' if keyword_matched else 'Keyword ID: Failure!')

Keyword ID: Failure!


In [25]:
# Speaker verdict: success only when the predicted speaker is the selected one.
speaker_matched = speaker == f'{SELECTED_SPEAKER}'
print('Speaker ID: Success!' if speaker_matched else 'Speaker ID: Failure!')

Speaker ID: Failure!
