In [6]:
import json

import numpy as np
import pandas as pd
import torch
import torchaudio
import transformers
from IPython.display import display, Audio
from speechbrain.pretrained import SpeakerRecognition
from transformers import Wav2Vec2FeatureExtractor

# presumably a project-local helper module (provides `load_audio`) — TODO confirm
import audio

In [7]:
# Sampling rate handed to the feature extractor and to the inline audio players.
SR = 16_000

In [8]:
# Hub identifier of the checkpoint whose preprocessing configuration is loaded.
repo = "superb/wav2vec2-base-superb-sid"

# Only the feature extractor (waveform preprocessing) is created here; this cell
# does not instantiate any model. `Wav2Vec2FeatureExtractor` is imported in the
# notebook's top import cell.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(repo)

Downloading (…)rocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

In [9]:
# Load the two example recordings through the `audio` helper module.
wav1, wav2 = (
    audio.load_audio(clip_path)
    for clip_path in ('../example1.wav', '../example2.flac')
)

In [10]:
# Render an inline player for each clip so the two can be compared by ear.
display(Audio(wav1, rate=SR), Audio(wav2, rate=SR))

In [11]:
# Cosine similarity along the last dimension; eps keeps the denominator non-zero.
checker = torch.nn.CosineSimilarity(dim=-1, eps=1e-6)

# Run the feature extractor on the first 2 * SR entries of each clip
# (two seconds, assuming the clips are sampled at SR — TODO confirm).
ft1 = feature_extractor(wav1[:2 * SR], sampling_rate=SR)
ft2 = feature_extractor(wav2[:2 * SR], sampling_rate=SR)

In [12]:
# Peek at the extractor output for the second clip with size-1 dimensions dropped.
ft2['input_values'].squeeze()

array([-0.00088501, -0.00045776, -0.00082397, ..., -0.02682495,
       -0.0300293 , -0.03198242], dtype=float32)

In [13]:
# Convert the extractor outputs to torch tensors.
# NOTE(review): a cell only echoes its last expression, so the ft1 tensor built on
# the first line is discarded and only the ft2 tensor appears in the output.
torch.Tensor(ft1['input_values'])
torch.Tensor(ft2['input_values'])

tensor([[-0.0009, -0.0005, -0.0008,  ..., -0.0268, -0.0300, -0.0320]])

In [14]:
# Cosine similarity between the two clips' extractor outputs. These are the
# feature extractor's `input_values`, not embeddings produced by a model.
checker(
    torch.Tensor(ft1['input_values']),
    torch.Tensor(ft2['input_values'])
)

tensor([0.0871])

## Testing diarization using cosine similarity on our own data.

In [15]:
def load_json(path):
    """Read the JSON document stored at ``path`` and return the parsed object.

    Args:
        path: Filesystem path of the JSON file (anything ``open`` accepts).

    Returns:
        The decoded content: a dict, list, or scalar depending on the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # JSON text is UTF-8 by specification; pinning the encoding keeps the result
    # independent of the platform's locale default.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


In [16]:
def check_segments(wav1, wav2):
    """Score two waveforms against each other with the module-level ``checker``.

    Both inputs are cut to the length of the shorter one, passed through
    ``feature_extractor`` at sampling rate ``SR``, and the resulting
    ``'input_values'`` are wrapped in tensors and compared.

    NOTE(review): the comparison is made on the feature extractor's
    ``input_values`` directly; no model is run to produce embeddings here.

    Args:
        wav1: First waveform (must support ``len`` and slicing).
        wav2: Second waveform (must support ``len`` and slicing).

    Returns:
        The tensor returned by ``checker`` for the two feature tensors.
    """
    n_common = min(len(wav1), len(wav2))
    features = [
        torch.Tensor(feature_extractor(wav[:n_common], sampling_rate=SR)['input_values'])
        for wav in (wav1, wav2)
    ]
    return checker(*features)

In [17]:
# Sanity check on the two example clips, compared over their common length.
check_segments(wav1, wav2)

tensor([0.0884])

In [18]:
# Recording under analysis, plus the segment list stored beside it as
# `<path>_vad.json` (presumably voice-activity-detection output — TODO confirm).
path = '../../data/jotun 02-05-2023/gWJmbiLGkb_data/1-JadePixie_7138.ogg'
segments = load_json(f'{path}_vad.json')
game_audio = audio.load_audio(path)

In [19]:
# Tabulate the segments and add each one's length (end - start); start/end are
# later used as slice bounds into the waveform.
df = pd.DataFrame(segments).assign(len=lambda seg_df: seg_df['end'] - seg_df['start'])
df.head()

Unnamed: 0,start,end,len
0,268320,284128,15808
1,546336,552416,6080
2,3249696,3299296,49600
3,3354144,3369440,15296
4,3381280,3396576,15296


In [20]:
def get_segment_wav(segment, wav):
    """Cut the span described by ``segment`` out of ``wav``.

    Args:
        segment: Mapping with ``'start'`` and ``'end'`` entries used as slice bounds.
        wav: Sliceable sequence of audio samples.

    Returns:
        ``wav[start:end]``.
    """
    bounds = slice(segment['start'], segment['end'])
    return wav[bounds]

In [21]:
# Use segment index 2 as the reference and score every segment (including the
# reference itself) against it with check_segments.
master = get_segment_wav(segments[2], game_audio)
out = []
for seg in segments:
    target = get_segment_wav(seg, game_audio)
    # check_segments truncates both inputs to the shorter of the two.
    out.append(check_segments(master, target))

In [22]:
# Per-segment results of check_segments against the reference segment.
out

[tensor([-0.0720]),
 tensor([-0.0607]),
 tensor([1.]),
 tensor([0.0079]),
 tensor([-0.0060]),
 tensor([0.0051]),
 tensor([0.0011]),
 tensor([-0.0910]),
 tensor([-0.0125]),
 tensor([0.0126]),
 tensor([-0.0092]),
 tensor([-0.0049]),
 tensor([-0.0035]),
 tensor([0.0204]),
 tensor([0.0153]),
 tensor([-0.0118]),
 tensor([-0.0072]),
 tensor([0.0050]),
 tensor([-0.0054]),
 tensor([0.0056]),
 tensor([-9.5403e-05]),
 tensor([0.0012]),
 tensor([0.0377]),
 tensor([0.0385]),
 tensor([0.0680]),
 tensor([-0.0090]),
 tensor([-0.0306]),
 tensor([-0.0080]),
 tensor([0.0305]),
 tensor([-0.0026]),
 tensor([-0.0529]),
 tensor([0.0075]),
 tensor([0.0093]),
 tensor([0.0132]),
 tensor([0.0694]),
 tensor([-0.0607]),
 tensor([0.0032]),
 tensor([-0.0066]),
 tensor([0.0548]),
 tensor([0.0047]),
 tensor([0.0019]),
 tensor([0.0598]),
 tensor([0.0227]),
 tensor([0.0067]),
 tensor([-0.0021]),
 tensor([-0.0263]),
 tensor([0.0175]),
 tensor([-0.0321]),
 tensor([-0.0100]),
 tensor([-0.0400]),
 tensor([0.0192]),
 tensor

### Using SpeechBrain

In [29]:
# Use the GPU when one is present, otherwise fall back to the CPU so the cell
# does not fail on machines without CUDA. `SpeakerRecognition` is imported in the
# notebook's top import cell.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
verification = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    run_opts={'device': device},
)

In [31]:
# Compare the two example clips; verify_batch returns a (score, decision) pair.
verification.verify_batch(torch.Tensor(wav1), torch.Tensor(wav2))

(tensor([[0.1635]], device='cuda:0'), tensor([[False]], device='cuda:0'))

In [39]:
# Use segment index 2 as the reference and verify every segment (including the
# reference itself) against it.
master = get_segment_wav(segments[2], game_audio)
# The reference is identical for every iteration, so convert it only once.
master_tensor = torch.Tensor(master)
out = []
for seg in segments:
    target = get_segment_wav(seg, game_audio)
    score, pred = verification.verify_batch(master_tensor, torch.Tensor(target))
    # .item() unwraps the single-element tensors into plain Python values.
    out.append((score.item(), pred.item()))

In [40]:
# (score, decision) pairs for each segment versus the reference segment.
out

[(0.29959821701049805, True),
 (0.14473602175712585, False),
 (1.0, True),
 (0.36408036947250366, True),
 (0.3993750810623169, True),
 (0.32460013031959534, True),
 (0.26213064789772034, True),
 (-0.035422466695308685, False),
 (0.15714693069458008, False),
 (0.17623689770698547, False),
 (0.16144748032093048, False),
 (0.5945013761520386, True),
 (0.33932822942733765, True),
 (0.3879956901073456, True),
 (0.2563527822494507, True),
 (0.3606073260307312, True),
 (0.4947482943534851, True),
 (0.07062222808599472, False),
 (0.30478405952453613, True),
 (0.48510825634002686, True),
 (0.0669272318482399, False),
 (0.09220285713672638, False),
 (0.43830111622810364, True),
 (0.12496013939380646, False),
 (0.009687351062893867, False),
 (0.4020736813545227, True),
 (0.07309240847826004, False),
 (0.24375614523887634, False),
 (0.2781428098678589, True),
 (0.22914128005504608, False),
 (0.5472028255462646, True),
 (0.18113256990909576, False),
 (0.41488778591156006, True),
 (0.334244132041931

In [46]:
# Column 1 holds the boolean decisions, so the mean is the fraction of segments
# for which the decision was True.
pd.DataFrame(out)[1].mean()

0.5207956600361664