In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
from glob import glob
from sklearn.metrics.pairwise import cosine_similarity
from malaya.graph.pagerank import pagerank
import networkx as nx
import pickle
import numpy as np
from tqdm import tqdm

In [3]:
audios = sorted(glob('/home/husein/ssd2/processed-youtube-v2/*/*.mp3'))
len(audios)

110573

In [4]:
from datasets import Audio

audio = Audio(sampling_rate=22050)

In [5]:
audios = [f for f in audios if 'boboi' in f.lower()]
len(audios)

1194

In [6]:
audio_norm = audio.decode_example(audio.encode_example(audios[293]))['array']

In [7]:
import IPython.display as ipd
ipd.Audio(audio_norm, rate = 22050)

In [8]:
agg_function = np.mean
pkls = sorted(glob('/home/husein/ssd2/processed-youtube-v2/*.pkl'))
len(pkls)

5145

In [9]:
speakers = {}

for pkl in tqdm(pkls):
    with open(pkl, 'rb') as fopen:
        data = pickle.load(fopen)
        
    filename = os.path.split(pkl)[1].replace('.pkl', '')
    
    for result in data:
        if len(result['asr_model'][0]) < 2:
            continue
        speaker = result['classification_model'][1]
        vector = result['classification_model'][0]
        
        speaker_name = f'{filename}-{speaker}'
        
        if speaker_name not in speakers:
            speakers[speaker_name] = vector
        else:
            speakers[speaker_name] = agg_function([vector, speakers[speaker_name]], axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 5145/5145 [00:01<00:00, 3190.30it/s]


In [10]:
len(speakers)

10492

In [11]:
embeddings = list(speakers.values())
list_speakers = list(speakers.keys())
similar = (cosine_similarity(embeddings) + 1) / 2
similar[np.diag_indices(len(similar))] = 0.0

In [12]:
import scipy as sp
import scipy.sparse as sprs
import scipy.spatial
import scipy.sparse.linalg
from scipy import sparse

G = sparse.csr_matrix(similar)
A = G
n, _ = A.shape
r = sp.asarray(A.sum(axis=1)).reshape(-1)
k = r.nonzero()[0]
D_1 = sprs.csr_matrix((1 / r[k], (k, k)), shape=(n, n))

In [13]:
personalize = sp.ones(n)
personalize = personalize.reshape(n, 1)
s = (personalize / personalize.sum()) * n
I = sprs.eye(n)

In [14]:
p = 0.85
x = sprs.linalg.gmres((I - p * A.T @ D_1), s)

In [15]:
scores = (x[0] / x[0].sum())
ranked = sorted(
    [
        (scores[i], s, i)
        for i, s in enumerate(list_speakers)
    ],
    reverse=False,
)
sorted_speakers = [r[1] for r in ranked]

In [16]:
similarity_threshold = 0.9

In [17]:
G = nx.DiGraph()
G.add_nodes_from(list_speakers)

for speaker in tqdm(sorted_speakers):
    embeddings = list(speakers.values())
    list_speakers = list(speakers.keys())
    similar = (cosine_similarity(embeddings) + 1) / 2
    similar[np.diag_indices(len(similar))] = 0.0
    s = similar[list_speakers.index(speaker)]

    where = np.where(s >= similarity_threshold)[0]
    if len(where):
        argsort = (np.argsort(s)[::-1]).tolist()
        argsort = [a for a in argsort if a in where]
        speakers[list_speakers[argsort[0]]] = np.mean(
            [speakers[speaker], speakers[list_speakers[argsort[0]]]], axis=0)
        speakers.pop(speaker, None)

        G.add_edge(speaker, list_speakers[argsort[0]])

 37%|███████████████████████████████████                                                            | 3879/10492 [27:33<43:41,  2.52it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [18]:
mapping = {}
for speaker in tqdm(sorted_speakers):
    traversed = list(nx.dfs_edges(G, source=speaker))
    if len(traversed):
        new_label = traversed[-1][-1]
    else:
        new_label = speaker
    
    mapping[speaker] = new_label

100%|█████████████████████████████████| 10492/10492 [00:00<00:00, 284260.41it/s]


In [19]:
mapping

{'Malam_Jumaat_Bersama_Johan_&_Ozlynn_(Jozlynnhanania)-WSHL2nyN-wA-speaker 3': 'Malam_Jumaat_Bersama_Johan_&_Ozlynn_(Jozlynnhanania)-WSHL2nyN-wA-speaker 3',
 "Mamak_Sessions_-_Here's_What_You_Should_Know_About_Buying_Properties-ZcVRJSDCof0-speaker 8": "Mamak_Sessions_-_Here's_What_You_Should_Know_About_Buying_Properties-ZcVRJSDCof0-speaker 8",
 'Kami_Main_Bawah_Selimut_KL_-_OKLETSGO_EP30-iE9EmQVTwJE-speaker 13': 'Kami_Main_Bawah_Selimut_KL_-_OKLETSGO_EP30-iE9EmQVTwJE-speaker 13',
 'Bing_sedang_bermain_Sembunyi_dan_Cari!___Bing_Bahasa_Melayu-2RU_HU3TjyA-speaker 22': 'Bing_sedang_bermain_Sembunyi_dan_Cari!___Bing_Bahasa_Melayu-2RU_HU3TjyA-speaker 22',
 'Main_Game_Lejen_Zaman_Sekolah___SEISMIK_Plays-2EaYMKQjY7c-speaker 6': 'Main_Game_Lejen_Zaman_Sekolah___SEISMIK_Plays-2EaYMKQjY7c-speaker 6',
 'Episod_Penuh_Bing___70+_Minit___Eps_1-10___Bing_Bahasa_Malysia-jjoBv0X5fQA-speaker 52': 'Episod_Penuh_Bing___70+_Minit___Eps_1-10___Bing_Bahasa_Malysia-jjoBv0X5fQA-speaker 52',
 'Bing_Bahasa_Melayu

In [20]:
len(set(mapping.values()))

6085

In [21]:
len(set(mapping.keys()))

10492

In [22]:
import json

with open('mapping-youtube-speakers-90.json', 'w') as fopen:
    json.dump(mapping, fopen)