In [None]:
"""
You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.

Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
"""
# If you're using Google Colab and not running locally, run this cell.

## Install dependencies
!pip install wget
!apt-get install sox libsndfile1 ffmpeg
!pip install unidecode

# ## Install NeMo
BRANCH = 'sd_model'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]

## Install TorchAudio
!pip install torchaudio>=0.6.0 -f https://download.pytorch.org/whl/torch_stable.html


# SPEAKER DIARIZATION 

Speaker Diarization is the task of segmenting audio recordings by speaker labels. A typical diarization system consists of a **Voice Activity Detection model**, which produces the time stamps of the audio where speech is being spoken (ignoring the background), and a **Speaker Embeddings model**, which extracts speaker embeddings for the segments that were previously time stamped. These speaker embeddings are then grouped into clusters based on the number of speakers present in the audio recording.

In this tutorial, we follow the above steps with oracle VAD time stamps and a pretrained speaker verification model, which are already part of our training collections, as can be found in the [tutorials](https://github.com/NVIDIA/NeMo/#tutorials) section.

For demonstration purposes we would be using simulated audio from [an4 dataset](http://www.speech.cs.cmu.edu/databases/an4/) 

In [None]:
import os

# Use the current working directory as the root for all data paths.
ROOT = os.getcwd()
print(ROOT)

# Audio recording and its RTTM file, both under <ROOT>/data.
an4_audio = os.path.join(ROOT, 'data/an4_diarize_test.wav')
an4_rttm = os.path.join(ROOT, 'data/an4_diarize_test.rttm')

Let's plot and listen to the audio

In [None]:
import IPython
import matplotlib.pyplot as plt
import numpy as np
import librosa

# Load the recording at a 16 kHz sampling rate.
sr = 16000
signal, sr = librosa.load(an4_audio, sr=sr)

# Draw the waveform on a wide, short figure using the explicit Axes interface.
fig, ax = plt.subplots(1, 1)
fig.set_size_inches(20, 2)
ax.plot(np.arange(len(signal)), signal)
fig.suptitle('Reference merged an4 audio', fontsize=16)
ax.set_xlabel('time (secs)', fontsize=18)
ax.margins(x=0)
ax.set_ylabel('signal strength', fontsize=16)

# Tick positions are sample indices; relabel them in seconds.
a, _ = plt.xticks()
plt.xticks(a, a / sr);

In [None]:
# Embed an audio player for the an4 recording so it can be listened to in the notebook.
IPython.display.Audio(an4_audio)

We will use [pyannote.metrics](https://github.com/pyannote/pyannote-metrics) for visualization and score calculation. Hence all the labels in RTTM format are eventually converted to pyannote objects. For this purpose we created two helper functions: *rttm_to_labels* (for NeMo intermediate processing) and *labels_to_pyannote_object* (for the scoring and visualization format).

In [None]:
from nemo.collections.asr.parts.speaker_utils import rttm_to_labels,labels_to_pyannote_object

In [None]:
# Read the labels from the RTTM file, then convert them into a pyannote object.
labels = rttm_to_labels(an4_rttm)
reference = labels_to_pyannote_object(labels)

In [None]:
# Display the reference annotation.
reference

The first step is to convert the reference audio RTTM time stamps into an oracle manifest file. This manifest file will be the input to our speaker diarizer for extracting embeddings.

For that, let's use the `rttm_to_manifest_mod.py` script:

In [None]:
!python /disk2/rttm_to_manifest_mod.py --vad_directory {ROOT}/data/ --audio_directory {ROOT}/data/ --manifest_file {ROOT}/data/an4_oracle_manifest.json

In [None]:
# Print the oracle manifest (presumably the file written by the conversion command;
# the relative path resolves to it as long as the working directory is still ROOT).
!cat data/an4_oracle_manifest.json

In [None]:
# Speaker model checkpoint under <ROOT>/data; used below as the speaker embeddings model path.
pretrained_speaker = os.path.join(ROOT, 'data/spkr.nemo')

In [None]:
import nemo
# NeMo's ASR collection - this collections contains complete ASR models and
# building blocks (modules) for ASR
import nemo.collections.asr as nemo_asr
from omegaconf import OmegaConf

In [None]:
# Load the diarization config from <ROOT>/data and print it as YAML.
MODEL_CONFIG = os.path.join(ROOT, 'data/speaker_diarization.yaml')
config = OmegaConf.load(MODEL_CONFIG)
print(OmegaConf.to_yaml(config))

In [None]:
# Point the config at the oracle manifest, the pretrained speaker model and the
# directory holding the ground-truth RTTM files.
config.manifest_filepath = 'data/an4_oracle_manifest.json'
# NOTE(review): the VAD "model_path" receives the oracle manifest rather than a model
# file -- presumably this is how oracle VAD is selected; confirm against ClusteringDiarizer.
config.diarizer.vad.model_path = 'data/an4_oracle_manifest.json'
config.diarizer.speaker_embeddings.model_path = pretrained_speaker
config.diarizer.groundtruth_RTTM_dir = 'data/'

In [None]:
from nemo.collections.asr.models import ClusteringDiarizer
# Build the clustering diarizer from the config object.
sd_model = ClusteringDiarizer(cfg=config)

In [None]:
# Run diarization; the following cells read the predicted RTTM from outputs/diarization/pred_rttms/.
sd_model.diarize()

In [None]:
# Print the predicted RTTM file for the test recording.
!cat 'outputs/diarization/pred_rttms/an4_diarize_test.rttm'

In [None]:
# Convert the predicted RTTM to labels, then to a pyannote object, and display it.
pred_labels = rttm_to_labels('outputs/diarization/pred_rttms/an4_diarize_test.rttm')
pred_annotation = labels_to_pyannote_object(pred_labels)
pred_annotation

In [None]:
# Display the reference annotation (built from the reference RTTM) so it can be
# compared with the predicted annotation. The original cell referenced `annotation`,
# a name that is never defined in this notebook and raises NameError on a fresh kernel.
reference

In [None]:
# As you can see, it matched A to speaker_0 and B to speaker_1.

In [None]:
# Embed an audio player for the same an4 recording again.
IPython.display.Audio(an4_audio)

In [None]:
# Load the pretrained "QuartzNet15x5Base-En" CTC model; it is used below for transcription.
quartznet = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En")

In [None]:
# Transcribe the an4 recording and print the recognized text next to its file path.
files = [an4_audio]
transcriptions = quartznet.transcribe(paths2audio_files=files)
for audio_path, text in zip(files, transcriptions):
    print(f"Audio in {audio_path} was recognized as: {text}")