In [2]:
from speech_encoder_v2 import SpeechEncoderV2
from data_preprocessing import *
import torch
import torchaudio
from Embed import Embed

## Setting the inputs and preparing the data

In [5]:
from pathlib import Path
# Root directory of the LibriSpeech "test-other" audio used by this notebook.
# Forward slashes are accepted by pathlib on Windows and avoid the invalid
# "\C", "\V", "\R" and "\l" escape sequences that the backslash-separated
# literal contained.
LIBRISPEECH_DIR = Path("D:/CODING/Voice Encoder/Resemblyzer/audio_data/librispeech_test-other")

# Recursively collect every .flac file below the root. Sorting makes the order
# independent of how the filesystem enumerates directory entries, so the
# utterances selected by index in later cells are the same on every run.
wav_fpaths = sorted(LIBRISPEECH_DIR.glob("**/*.flac"))
wav_fpaths

[WindowsPath('D:/CODING/Voice Encoder/Resemblyzer/audio_data/librispeech_test-other/1688/1688-142285-0000.flac'),
 WindowsPath('D:/CODING/Voice Encoder/Resemblyzer/audio_data/librispeech_test-other/1688/1688-142285-0001.flac'),
 WindowsPath('D:/CODING/Voice Encoder/Resemblyzer/audio_data/librispeech_test-other/1688/1688-142285-0002.flac'),
 WindowsPath('D:/CODING/Voice Encoder/Resemblyzer/audio_data/librispeech_test-other/1688/1688-142285-0003.flac'),
 WindowsPath('D:/CODING/Voice Encoder/Resemblyzer/audio_data/librispeech_test-other/1688/1688-142285-0004.flac'),
 WindowsPath('D:/CODING/Voice Encoder/Resemblyzer/audio_data/librispeech_test-other/1688/1688-142285-0005.flac'),
 WindowsPath('D:/CODING/Voice Encoder/Resemblyzer/audio_data/librispeech_test-other/1688/1688-142285-0006.flac'),
 WindowsPath('D:/CODING/Voice Encoder/Resemblyzer/audio_data/librispeech_test-other/1688/1688-142285-0007.flac'),
 WindowsPath('D:/CODING/Voice Encoder/Resemblyzer/audio_data/librispeech_test-other/1688

In [6]:
import sys
from itertools import groupby
from tqdm import tqdm

# Make the "temp" directory importable so that `audio` can be found. The guard
# keeps sys.path from growing each time this cell is re-run.
if "temp" not in sys.path:
    sys.path.append("temp")
from audio import *


def _speaker_of(wav_fpath):
    """Return the speaker id of an audio file: the stem of its parent directory."""
    return wav_fpath.parent.stem


# itertools.groupby only merges *consecutive* items that share a key. If the
# files of one speaker were not adjacent, the dict below would silently keep
# only the last run for that speaker. Ordering by speaker first prevents that;
# sorted() is stable, so the relative order of files within a speaker is kept.
wav_fpaths_by_speaker = sorted(wav_fpaths, key=_speaker_of)

# Maps speaker id -> list of preprocessed wavs (one entry per file, in order).
# NOTE(review): preprocess_wav is not defined in this notebook; presumably it
# comes from one of the star imports (`audio` or `data_preprocessing`) — confirm.
speaker_wavs = {
    speaker: [preprocess_wav(fpath) for fpath in speaker_fpaths]
    for speaker, speaker_fpaths in groupby(
        tqdm(wav_fpaths_by_speaker, desc="Preprocessing wavs",
             total=len(wav_fpaths_by_speaker), unit="wavs"),
        _speaker_of,
    )
}

Preprocessing wavs: 100%|██████████| 100/100 [00:04<00:00, 24.38wavs/s]


In [9]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): both constructor arguments receive the same device; what each
# one controls is defined in speech_encoder_v2 — confirm there.
encoder = SpeechEncoderV2(device, device)

# Forward slashes work on Windows as well and avoid the invalid "\s" and "\e"
# escape sequences present in the backslash-separated literal.
CHECKPOINT_PATH = "models/speech_encoder_transformer/encoder(0.096).pt"

# map_location remaps the stored tensors onto `device`, so a checkpoint that was
# saved on a GPU can still be loaded on a CPU-only machine.
# NOTE: torch.load unpickles the file; only load checkpoints from trusted sources.
checkpoints = torch.load(CHECKPOINT_PATH, map_location=device)
encoder.load_state_dict(checkpoints['model_state'])

# Switch to inference mode (e.g. disables the dropout layers). As the last
# expression of the cell this also displays the module structure.
encoder.eval()

SpeechEncoderV2(
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=40, out_features=40, bias=True)
        )
        (linear1): Linear(in_features=40, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=40, bias=True)
        (norm1): LayerNorm((40,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((40,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
    (norm): LayerNorm((40,), eps=1e-05, elementwise_affine=True)
  )
  (linear): Linear(in_features=40, out_features=256, bias=True)
  (relu): ReLU()
  (loss_fn): CrossEntropyLoss()
)

In [10]:
# Wrap the loaded encoder in the project's Embed helper; later cells call its
# embed_utterance and embed_speaker methods to obtain embeddings.
embedder = Embed(encoder)

In [13]:
# One utterance embedding per speaker, in the iteration order of speaker_wavs:
# embeds_a is built from each speaker's first wav, embeds_b from the second.
embeds_a, embeds_b = (
    np.array([embedder.embed_utterance(utterances[pick])
              for utterances in speaker_wavs.values()])
    for pick in (0, 1)
)

In [None]:
from visualisations import *

# Pairwise inner products between every row of embeds_a and every row of embeds_b.
utt_sim_matrix = np.inner(embeds_a, embeds_b)

# The same matrix spelled out entry by entry, kept as a sanity check:
# entry (row, col) is the dot product of embeds_a[row] and embeds_b[col]
# (the @ operator on two 1-D vectors is equivalent to np.dot).
utt_sim_matrix2 = np.zeros((len(embeds_a), len(embeds_b)))
for row, embed_a in enumerate(embeds_a):
    for col, embed_b in enumerate(embeds_b):
        utt_sim_matrix2[row, col] = embed_a @ embed_b
assert np.allclose(utt_sim_matrix, utt_sim_matrix2)


## Similarity between two speaker embeddings
# Split each speaker's utterances at the midpoint and embed each half as its own
# speaker embedding: group A is the first half, group B the second half (B holds
# one more utterance than A when the count is odd).
spk_embeds_a = np.array([
    embedder.embed_speaker(utterances[:len(utterances) // 2])
    for utterances in speaker_wavs.values()
])
spk_embeds_b = np.array([
    embedder.embed_speaker(utterances[len(utterances) // 2:])
    for utterances in speaker_wavs.values()
])
# Pairwise inner products between the group-A and group-B speaker embeddings.
spk_sim_matrix = np.inner(spk_embeds_a, spk_embeds_b)


## Draw the plots
# 2x2 grid: top row compares single utterances, bottom row compares speaker
# embeddings; left column shows the similarity matrix, right column the histograms.
# (The figure handle was previously misspelled `fix`.)
fig, axs = plt.subplots(2, 2, figsize=(8, 10))

# Row labels come from group A and column labels from group B of each speaker.
labels_a = ["%s-A" % speaker for speaker in speaker_wavs.keys()]
labels_b = ["%s-B" % speaker for speaker in speaker_wavs.keys()]

# The diagonal pairs group A and group B of the *same* speaker; every
# off-diagonal entry pairs two different speakers.
same_speaker = np.eye(len(utt_sim_matrix), dtype=bool)
diff_speaker = ~same_speaker
mask = same_speaker  # original name kept for any later cell that reads it

plot_similarity_matrix(utt_sim_matrix, labels_a, labels_b, axs[0, 0],
                       "Cross-similarity between utterances\n(speaker_id-utterance_group)")
plot_histograms((utt_sim_matrix[same_speaker], utt_sim_matrix[diff_speaker]), axs[0, 1],
                ["Same speaker", "Different speakers"],
                "Normalized histogram of similarity\nvalues between utterances")
plot_similarity_matrix(spk_sim_matrix, labels_a, labels_b, axs[1, 0],
                       "Cross-similarity between speakers\n(speaker_id-utterances_group)")
plot_histograms((spk_sim_matrix[same_speaker], spk_sim_matrix[diff_speaker]), axs[1, 1],
                ["Same speaker", "Different speakers"],
                "Normalized histogram of similarity\nvalues between speakers")
plt.show()