<a href="https://colab.research.google.com/github/KevinWahle/ASSD-TPF/blob/master/ASSD_Diarization%2BTranscriber.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Diarization + Transcriber

El objetivo de este notebook es detectar y reconocer en un audio los distintos oradores que aparecen y determinar qué dice cada uno. 

Para ello se utilizaron dos redes neuronales que reciben como entrada el espectrograma Mel (MelSpect) del audio: una realiza la función de diarización, o sea determinar quién habla en cada momento, y la otra realiza la función de transcripción.


In [5]:
# Master switch: when True, the `if debugging:` demo calls that follow the
# function definitions in this notebook are executed as well.
debugging = False

# Hide every Python warning for the rest of the session.
import warnings
warnings.filterwarnings(action='ignore')

## Descarga e importación de librerías


In [6]:
import os
import subprocess
import sys

# Install the third-party models with the same interpreter that runs this
# kernel (`python -m pip`), so the packages land where the imports below look.
# `pathlib` is intentionally not installed: it ships with the Python standard
# library, and the PyPI package of that name is an obsolete backport.
for package_url in (
    "git+https://github.com/openai/whisper.git",
    "git+https://github.com/resemble-ai/Resemblyzer.git",
):
  exit_code = subprocess.call([sys.executable, "-m", "pip", "install", package_url])
  if exit_code != 0:
    # Stay best-effort (the package may already be installed), but make the
    # failure visible instead of silently ignoring the exit status.
    print("pip install failed for " + package_url + " (exit code " + str(exit_code) + ")")

import whisper
from resemblyzer import preprocess_wav, VoiceEncoder
from pathlib import Path
from scipy.io import wavfile
import numpy as np

## Diarización

### Carga de Oradores
Ejecutar solo si se quiere correr la lógica sin la aplicación.

In [7]:
if debugging:
  # Demo input: one audio file plus a hand-picked reference interval
  # (in seconds) for each speaker.
  sampling_rate = 16000
  wav_fpath = "/content/dibu.wav"
  wav = preprocess_wav(wav_fpath)

  speaker_names = ["Barney", "Marshall", "Lily"]
  segments = [[1, 3.5], [3.5, 5], [15, 17]]

  # Cut each reference interval out of the waveform (seconds -> sample index).
  speaker_wavs = [
      wav[int(seg_start * sampling_rate):int(seg_end * sampling_rate)]
      for seg_start, seg_end in segments
  ]

### Aplicación de la Red Neuronal

In [8]:
def speakerProbabilities(wav, speaker_wavs, speaker_names, verbose=0):
  """Compare every reference speaker against the partial embeddings of `wav`.

  Args:
    wav: waveform of the whole recording, as accepted by
      `VoiceEncoder.embed_utterance`.
    speaker_wavs: one reference waveform per speaker.
    speaker_names: names paired positionally with `speaker_wavs`.
    verbose: when truthy, print the resulting dictionary.

  Returns:
    dict mapping each speaker name to the product
    `cont_embeds @ speaker_embed`, i.e. one similarity value per partial
    embedding of the recording.
  """
  voice_encoder = VoiceEncoder("cpu")
  print("Running the continuous embedding on cpu, this might take a while...")

  # rate=16 asks for 16 partial embeddings per second (one every 0.0625 s).
  # Only the partial embeddings are used; the summary embedding and the
  # window slices returned alongside them are discarded.
  _, cont_embeds, _ = voice_encoder.embed_utterance(wav, return_partials=True, rate=16)

  # One summary embedding per reference sample.
  reference_embeds = [voice_encoder.embed_utterance(sample) for sample in speaker_wavs]

  # Similarity over time = dot product of every partial embedding with the
  # speaker's reference embedding.
  similarity_dict = {}
  for speaker, reference in zip(speaker_names, reference_embeds):
    similarity_dict[speaker] = cont_embeds @ reference

  if verbose:
    print(similarity_dict)

  return similarity_dict

if debugging:
  # Keep the result: the later debugging cells read `similarity_dict`.
  similarity_dict = speakerProbabilities(wav, speaker_wavs, speaker_names, verbose=debugging)

### Prefiltrado de la diarización
La idea de esta etapa es aplicar distintas lógicas para sacar el ruido que pueda entregar la red neuronal a su salida. 

Por ejemplo: si tenemos dos intervalos de más de 1 segundo donde habla el orador 1 y entre ellos un intervalo de 0.1 segundos donde la mayor probabilidad es que hable el orador 2, entonces consideramos que durante todo ese gran intervalo habló el orador 1.

In [9]:
def likelySpeaker(similarity_dict, speaker_names, MIN_PROB=0.5, verbose=False):
  """Pick the dominant speaker for every similarity sample.

  Args:
    similarity_dict: maps each speaker name to a sequence of similarity
      values; all sequences are indexed by the same sample position.
    speaker_names: names to consider, in priority order (on a tie the later
      name wins).
    MIN_PROB: samples whose best similarity is below this threshold are
      labelled None (treated as background noise / nobody speaking).
    verbose: when truthy, print the resulting list.

  Returns:
    list with one entry per sample: the winning speaker name or None.
  """
  if not speaker_names:
    return []

  n_samples = len(similarity_dict[speaker_names[0]])
  talk_names = []

  for sample in range(n_samples):
    # The running best starts at 0, so a speaker only wins with a
    # non-negative similarity; `>=` keeps "later name wins on ties".
    best_score = 0
    best_name = None
    for candidate in speaker_names:
      score = similarity_dict[candidate][sample]
      if score >= best_score:
        best_score = score
        best_name = candidate

    # Threshold used to recognise ambient noise.
    talk_names.append(best_name if best_score >= MIN_PROB else None)

  # Smoothing: if one label differs from its two neighbours on each side and
  # those four agree, overwrite it. This runs once over the finished list
  # (it used to run after every appended sample) and covers every index that
  # has a full 5-sample window, including the last one.
  for i in range(2, len(talk_names) - 2):
    if talk_names[i-1] == talk_names[i-2] == talk_names[i+1] == talk_names[i+2]:
      talk_names[i] = talk_names[i-1]

  if verbose:
    print(talk_names)

  return talk_names

def createConverStruct(talk_names, verbose=False):
  """Collapse per-sample speaker labels into a list of speech segments.

  Args:
    talk_names: one label per sample; None means "nobody speaking".
    verbose: when truthy, print the resulting list.

  Returns:
    list of `[start, end, speaker, ""]` entries, one per run of consecutive
    identical non-None labels. For a run covering sample indices s..e the
    entry is `[s + 2, e + 2, speaker, ""]` (the index convention the rest of
    the notebook consumes). The final run is closed at `len(talk_names)`,
    but never before its own start. The empty string is the slot later
    filled with the transcription.
  """
  speakers_time = []
  prev_name = None   # None doubles as "no open run", so no placeholder segment is emitted
  io = 0

  for i, name in enumerate(talk_names):
    if name != prev_name:
      if prev_name is not None:
        speakers_time.append([io+1, i+1, prev_name, ""])
      io = i+1
    prev_name = name

  # Close the run that is still open when the labels end; without this the
  # last speaker's final segment would be lost.
  if prev_name is not None:
    start = io+1
    speakers_time.append([start, max(start, len(talk_names)), prev_name, ""])

  if verbose:
    print(speakers_time)

  return speakers_time

def filteringConversation(speakers_time, MERGE_TIME = 8, DEL_TIME = 5, verbose=False):
  """Merge nearby segments of the same speaker and drop short ones.

  Args:
    speakers_time: list of `[start, end, speaker, text]` segments in order.
    MERGE_TIME: a segment is merged into the previously kept one when both
      have the same speaker and the gap between them is at most this value.
    DEL_TIME: segments whose length (end - start) is at most this value are
      considered short and are removed unless a later merge extends them.
    verbose: when truthy, print the resulting list.

  Returns:
    the filtered list. Segments that are not merged are the same list
    objects that were passed in; merged segments are new lists.
  """
  kept = []
  last_index = len(speakers_time) - 1

  for position, segment in enumerate(speakers_time):
    previous = kept[-1] if kept else None

    # Same speaker again and close enough: extend the previously kept segment.
    if (previous is not None
        and segment[2] == previous[2]
        and segment[0] - previous[1] <= MERGE_TIME):
      kept[-1] = [previous[0], segment[1], segment[2], ""]
      continue

    # Long segments are always kept. Short ones are kept provisionally (the
    # next iteration decides), except the very last one, which is discarded.
    is_long = segment[1] - segment[0] > DEL_TIME
    if is_long or position < last_index:
      # A provisional short segment that was not rescued by a merge goes away.
      if previous is not None and previous[1] - previous[0] <= DEL_TIME:
        kept.pop()
      kept.append(segment)

  if verbose:
    print(kept)

  return kept

if debugging:
  # The earlier demo call may not have stored its result; compute it here if needed.
  if "similarity_dict" not in globals():
    similarity_dict = speakerProbabilities(wav, speaker_wavs, speaker_names, verbose=debugging)

  talk_names=likelySpeaker(similarity_dict, speaker_names, verbose=debugging)
  conversation=createConverStruct(talk_names, verbose=debugging)
  conversation=filteringConversation(conversation, verbose=debugging)

#speaker_times=createConverStruct(talk_names, verbose=debugging)
#conversation=filteringConversation(speaker_times)

### Visualizacion (COMPLETAR)

In [10]:
## Run the interactive demo
#interactive_diarization(similarity_dict, wav, wav_splits)
 

## Speech to Text

### Creación de wavs por orador

In [11]:
# Creacion de los wavs ya separados en los distintos tramos por orador
def wavCreation(wav, slices, sampling_rate=16000, out_dir="/content/temp"):
  """Write one wav file per conversation segment.

  Args:
    wav: waveform of the whole recording (a sliceable array of samples).
    slices: list of segments whose first two items are the start and end
      indices of the segment.
    sampling_rate: sample rate written into every output file.
    out_dir: directory that receives `slice0.wav`, `slice1.wav`, ...; it is
      created when missing. The default is the location `speech2Text`
      reads from.
  """
  binSize=960        # samples per segment index unit
  start_offset=9     # index shift applied to the segment start before slicing
  end_offset=11      # index shift applied to the segment end before slicing

  # exist_ok avoids the check-then-create race and works on repeated runs.
  os.makedirs(out_dir, exist_ok=True)

  for slice_index, segment in enumerate(slices):
    sliceStart=(segment[0]+start_offset)*binSize
    sliceEnd=(segment[1]+end_offset)*binSize
    slice_path=os.path.join(out_dir, "slice"+str(slice_index)+".wav")
    wavfile.write(slice_path, sampling_rate, wav[sliceStart:sliceEnd])

if debugging:
  # wavCreation needs the waveform as well as the segments.
  wavCreation(wav, conversation)

### Transcripción

In [12]:
def speech2Text(slices, verbose=False, model_size="base"):
    """Transcribe every segment's wav file and store the text in the segment.

    Args:
        slices: list of `[start, end, speaker, text]` segments; the file for
            segment k is read from `/content/temp/slice<k>.wav`. The list is
            modified in place: item 3 of each segment receives the text.
        verbose: when truthy, print the speaker and the text of each segment.
        model_size: name of the Whisper model to load.

    Each wav file is deleted once it has been transcribed.
    """
    transcriber = whisper.load_model(model_size)

    for position, segment in enumerate(slices):
        slice_path = "/content/temp/slice"+str(position)+".wav"
        outcome = transcriber.transcribe(slice_path)
        segment[3] = outcome['text']

        if verbose:
          print(segment[2]+":")
          print(outcome['text'])

        # The temporary slice is no longer needed.
        os.remove(slice_path)

    print("Transcription ready!!!")

if debugging:
  # Transcribe the demo segments, printing each speaker's text as it is produced.
  speech2Text(slices=conversation, verbose=debugging)

## Resultado final


In [13]:
def showConversation(transcription):
  """Render the transcribed segments as readable text.

  Args:
    transcription: list of segments where item 2 is the speaker name and
      item 3 is the transcribed text (both strings).

  Returns:
    a string with, for every segment, the speaker followed by a colon on one
    line, the text indented by three spaces on the next, and a blank line.
  """
  turns = [
      speech[2] + ":\n" + "   " + speech[3] + "\n\n"
      for speech in transcription
  ]
  return "".join(turns)

if debugging:
  # `conversation` is the module-level result of the filtering step
  # (`time_processed` only exists inside filteringConversation).
  print(showConversation(conversation))

### Descarga de conversacion
Correr la celda o ejecutar ```downloadTranscription(dialog, audioPath)``` para descargar la conversacion.

In [14]:
from google.colab import files

def downloadTranscription(dialog, audioPath):
  """Save the conversation next to the audio file and trigger its download.

  Args:
    dialog: list of transcribed segments, as accepted by `showConversation`.
    audioPath: path of the audio file; the text file uses the same path with
      the extension replaced by `.txt`.
  """
  # splitext only strips the final extension, so dots elsewhere in the path
  # (e.g. "./rec.v2.wav") are preserved.
  convPath = os.path.splitext(audioPath)[0] + ".txt"

  # The context manager closes the file even if rendering or writing fails.
  with open(convPath, "w") as file:
    file.write(showConversation(dialog))

  files.download(convPath)

if debugging:
  # `conversation` holds the transcribed segments at module level
  # (`time_processed` only exists inside filteringConversation).
  downloadTranscription(conversation, wav_fpath)

# APLICACION

In [15]:
def processAudio(wav_fpath, segments, speaker_names, text_area):
  """Run the whole pipeline (diarization + transcription) on one audio file.

  NOTE(review): the GUI cell further down defines another `processAudio(*args)`;
  once that cell runs, it replaces this function.

  Args:
    wav_fpath: path of the audio file to process.
    segments: one `[start, end]` interval in seconds per speaker, used as the
      reference sample of that speaker.
    speaker_names: names paired positionally with `segments`.
    text_area: not used by this function.

  Side effects: stores the results in the module-level `similarity_dict` and
  `conversation`, and prints the similarity dictionary.
  """
  global similarity_dict, conversation
  sampling_rate = 16000
  wav = preprocess_wav(wav_fpath)

  # Reference sample of every speaker (seconds -> sample index).
  speaker_wavs = []
  for s in segments:
    first_sample = int(s[0] * sampling_rate)
    last_sample = int(s[1] * sampling_rate)
    speaker_wavs.append(wav[first_sample:last_sample])

  similarity_dict = speakerProbabilities(wav, speaker_wavs, speaker_names)

  print(similarity_dict)

  # Diarization
  conversation = createConverStruct(likelySpeaker(similarity_dict, speaker_names))
  conversation = filteringConversation(conversation)

  # Transcription
  wavCreation(wav, conversation)
  speech2Text(conversation)

  if debugging:
    print(showConversation(conversation))

if debugging:
  # End-to-end demo: two speakers, each identified by a reference interval in seconds.
  wav_fpath, segments, speaker_names = "/content/dibu.wav", [[1,4],[6,9]], ["Interviewer", "Martinez"]
  processAudio(wav_fpath, segments, speaker_names, None)

In [20]:
def showDiarization(similarity_dict, conversation, plot_output):
  """Plot every speaker's similarity curve and shade the detected segments.

  Args:
    similarity_dict: maps each speaker name to its similarity values.
    conversation: list of `[start, end, speaker, text]` segments; start and
      end index into the similarity curves.
    plot_output: context manager (the Output widget) that captures the figure.
  """
  # Imported here because this function is defined before the cell that
  # imports matplotlib at module level.
  import matplotlib.pyplot as plt

  plt.rcParams['figure.figsize'] = (15, 6)

  with plot_output:
    # Speakers are plotted in order, so speaker k gets cycle colour "Ck".
    color_dict={}
    for counter, (speaker, value) in enumerate(similarity_dict.items()):
      plt.plot(value, label=speaker)
      color_dict[speaker]=counter

    changeTime=["0:00"]; changeSample=[0]
    for speech in conversation:
      X0=speech[0] ;Xf=speech[1]; speaker=speech[2]

      # Build x from the slice actually obtained: a segment may end past the
      # last available value, and fill_between needs x and y of equal length.
      shaded = similarity_dict[speaker][X0:Xf+1]
      if len(shaded):
        plt.fill_between(range(X0, X0+len(shaded)), shaded, color="C"+str(color_dict[speaker]), alpha=0.2)

      # Index -> seconds (16000/960 index units per second), plus one second.
      # Adding the second before splitting into minutes and seconds avoids
      # labels such as "0:60".
      tf=int(Xf/(16000/960))+1
      changeTime.append('{:01d}:{:02d}'.format(tf//60,tf%60))
      changeSample.append(Xf)

    plt.xticks(changeSample,changeTime, rotation=45)
    plt.xlim(0); plt.ylim(0.3, 1)
    plt.ylabel("Probabilidad"); plt.xlabel("Tiempo")
    plt.legend()

    plt.show()

if debugging:
  # On a fresh kernel neither `plt` nor the GUI's `plot_output` widget exists
  # yet (both are created by the next cell), so set up what the demo needs.
  import ipywidgets as widgets
  import matplotlib.pyplot as plt

  debug_plot_output = widgets.Output()
  display(debug_plot_output)
  showDiarization(similarity_dict, conversation, plot_output=debug_plot_output)

In [21]:
import os
import ipywidgets as widgets
import fnmatch
import IPython.display as ipd
import matplotlib.pyplot as plt
from scipy.io import wavfile
from google.colab import files

# .wav files found in the current working directory; the leading "" entry is
# the "nothing selected" choice that on_file_select ignores.
file_list = [""] + fnmatch.filter(os.listdir(), '*.wav')

# Create widgets
dropdown = widgets.Dropdown(options=file_list, description='File:')
player_out = widgets.Output()

# State shared with the callbacks below: the selected file, and the speakers
# added so far (names and their reference intervals, kept in parallel lists).
filename = ""
speakerName = []
speakerTime = []

# Speaker name input; stays disabled until a file is selected.
txtName = widgets.Text(
    placeholder='Name',
    disabled=True
)

# Placeholder labels, one per second for 10 minutes; on_file_select replaces
# the slider options with labels matching the selected file's duration.
timesStr = ['{:2d}:{:02d}'.format(m,s) for m in range(0,10) for s in range(0, 60)]

# Interval of the recording used as the speaker's reference sample.
timeRange = widgets.SelectionRangeSlider(
    options=timesStr,
    index=(0,len(timesStr)-1),
    description='time interval',
    disabled=True
)

# Speakers added so far, selectable for deletion.
listNames = widgets.SelectMultiple(
    description='Speakers',
    disabled=True
)

btnDel  = widgets.Button(description='Delete', disabled=True)

btnAdd  = widgets.Button(description='Add', disabled=True)

btnProc = widgets.Button(description='Process', layout=widgets.Layout(width='99%'), disabled=True)

# Receives the diarization figure drawn by showDiarization.
plot_output = widgets.Output()

# Display handle for the audio player; on_file_select updates it in place.
# (It is created directly in the cell output, not inside `player_out`.)
# with player_out:
playerView = display(display_id=True)

# Define callback functions
def on_file_select(change):
    """Dropdown callback: load the chosen wav file and enable the speaker inputs.

    Args:
        change: change notification from the dropdown; `change['new']` is the
            selected file name. An empty selection is ignored.
    """
    global playerView, filename

    filename = change['new']

    if filename:
        # Read the WAV file and derive its duration in seconds.
        rate, data = wavfile.read(filename)
        duration = len(data)/rate

        # One "m:ss" label per whole second, from 0:00 up to the duration.
        whole_minutes = int(duration//60)
        leftover_seconds = int(duration%60)
        labels = [
            '{:2d}:{:02d}'.format(*divmod(second, 60))
            for second in range(whole_minutes*60 + leftover_seconds + 1)
        ]

        timeRange.options = labels
        timeRange.disabled = False

        txtName.disabled = False
        btnAdd.disabled = False

        # Update player widget
        with player_out:
            ipd.clear_output(wait=True)
            playerView.update(ipd.Audio(data, rate=rate, autoplay=False))

def downloadText(*args):
  """Button callback: save the text area content to a file and download it.

  The file is named after the selected audio file, with its extension
  replaced by `.txt`.
  """
  global filename
  # splitext strips only the final extension, so a name such as
  # "interview.part1.wav" becomes "interview.part1.txt".
  convPath = os.path.splitext(filename)[0] + ".txt"

  with open(convPath, 'w') as f:
    f.write(text_area.value)

  files.download(convPath)

def addSpeaker(*args):
  """Button callback: register the typed speaker with the selected interval.

  Does nothing while the name field is empty. Otherwise it appends the name
  and the slider's selected index pair to the shared lists, adds a label to
  the speakers list and enables the delete/process controls.
  """
  speaker = txtName.value
  if not speaker:
    return

  speakerName.append(speaker)
  speakerTime.append(timeRange.index)

  # Label shown in the list: name followed by the interval's two time labels.
  interval = timeRange.value
  listNames.options += (speaker + interval[0] + ' -' + interval[1], )

  for control in (btnDel, listNames, btnProc):
    control.disabled = False

def delSpeakers(*args):
  """Button callback: remove the speakers selected in the list.

  The labels in `listNames.options` and the entries of `speakerName` /
  `speakerTime` are added together by `addSpeaker`, so they are filtered
  together here; otherwise a deleted speaker would still be processed.
  """
  selected = set(listNames.value)
  remaining = [
      (label, name, interval)
      for label, name, interval in zip(listNames.options, speakerName, speakerTime)
      if label not in selected
  ]

  # Slice assignment keeps the same list objects that the other callbacks use.
  speakerName[:] = [name for _, name, _ in remaining]
  speakerTime[:] = [interval for _, _, interval in remaining]
  listNames.options = [label for label, _, _ in remaining]

  # Nothing left to delete or process once the last speaker is removed.
  if not remaining:
    btnDel.disabled = True
    btnProc.disabled = True

def processAudio(*args):
  """Button callback: diarize and transcribe the selected file, then show the results.

  Reads the selected file and the registered speakers from the module-level
  `filename`, `speakerName` and `speakerTime`; writes the conversation into
  the text area and draws the diarization plot.
  """
  global filename, speakerName, speakerTime
  sampling_rate = 16000
  wav = preprocess_wav(filename)

  # Reference sample of every registered speaker (seconds -> sample index).
  speaker_wavs = []
  for s in speakerTime:
    speaker_wavs.append(wav[int(s[0] * sampling_rate):int(s[1] * sampling_rate)])

  similarity_dict = speakerProbabilities(wav, speaker_wavs, speakerName)

  # Diarization
  talk_names = likelySpeaker(similarity_dict, speakerName)
  conversation = filteringConversation(createConverStruct(talk_names))

  # Transcription
  wavCreation(wav, conversation)
  speech2Text(conversation)

  # Load conversation in the gui
  text_area.value = showConversation(conversation)
  showDiarization(similarity_dict, conversation, plot_output)

# Lay out the "Audio" tab: file picker + player, speaker inputs, speaker list
# and the full-width Process button, stacked vertically.
hboxAudio = widgets.HBox([dropdown, player_out], layout=widgets.Layout(align_items='center', justify_content='space-between', margin='10px', width='97%'))
hboxTimes = widgets.HBox([txtName, timeRange, btnAdd], layout=widgets.Layout(grid_gap='10px', margin='10px'))
hboxNames = widgets.HBox([listNames, btnDel], layout=widgets.Layout(align_items='flex-end', grid_gap='10px', margin='10px'))
main_tab = widgets.VBox([hboxAudio, hboxTimes, hboxNames, btnProc], layout=widgets.Layout(align_items='flex-start'))
vbox = widgets.VBox([main_tab], layout=widgets.Layout(align_items='center', width='100%'))

# "Text" tab: read-only text area that processAudio fills with the
# conversation, plus the button that downloads its content.
text_area = widgets.Textarea(value='This is a sample text area.\nIt can be scrolled.', disabled=True, rows=10, layout=widgets.Layout(width='99%'))
btnDownload = widgets.Button(description='Download', disabled=False)
vboxText = widgets.VBox([text_area, btnDownload], layout=widgets.Layout(align_items='flex-end'))
   
# Connect every widget to its callback.
dropdown.observe(on_file_select, names='value')
btnDownload.on_click(downloadText)
btnAdd.on_click(addSpeaker)
btnDel.on_click(delSpeakers)
btnProc.on_click(processAudio)

# Assemble the three tabs (inputs, transcription, diarization plot) and show the GUI.
tab = widgets.Tab([vbox, vboxText, plot_output])
tab.set_title(0, 'Audio')
tab.set_title(1, 'Text')
tab.set_title(2, 'Graph')
display(tab)


Tab(children=(VBox(children=(VBox(children=(HBox(children=(Dropdown(description='File:', options=('', 'dibu.wa…

Loaded the voice encoder model on cpu in 0.01 seconds.
Running the continuous embedding on cpu, this might take a while...
Transcription ready!!!


# TODO
* Integrar UI
* Arreglar el gráfico de diarización