# Speaker Diarization and Audio Transcription in Python
## Introduction
This notebook provides a Python solution for downloading the audio of a YouTube video, transcribing it, and performing speaker diarization. The transcription and diarization results are then combined into a consolidated Pandas DataFrame. Speaker diarization is the process of splitting an audio stream into segments, each attributed to a particular speaker. Audio transcription is the process of converting speech to text.



In [None]:
#@title ##Installation of libraries
# Notebook shell escapes: each line runs pip in the notebook's environment.
# pydub provides AudioSegment, used to load the download and export the MP3.
!pip install pydub
# torch / torchaudio are not imported directly in this notebook; presumably
# they are required by the model libraries installed below.
!pip install torch
!pip install torchaudio
# -U upgrades the package if an older version is already installed.
!pip install -U demucs
# stable-ts is the distribution that provides the `stable_whisper` module.
!pip install -U stable-ts
# geopandas is used to overlay word intervals with speaker intervals.
!pip install geopandas
# yt-dlp is invoked as a command-line tool to download the audio.
!pip install yt-dlp
# pyannote-audio is installed from the archive of its `develop` branch;
# -qq keeps pip's output quiet.
!pip install -qq https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip

In [None]:
#@title ## Load of the models
import os
import json
import pandas as pd
import numpy as np
import geopandas as gpd
from pydub import AudioSegment
import subprocess
from shapely.geometry import LineString
from stable_whisper import load_model
from pyannote.audio import Pipeline

# Hugging Face access token handed to Pipeline.from_pretrained below.
# It is empty by default and must be filled in before running this cell
# (the trailing `#@param` marker exposes it as a notebook form field).
hugging_face_token = "" #@param {type:"string"}
# Load the Stable Whisper model and Pyannote Pipeline.
# Both are module-level globals: process_audio() uses `model` for the
# transcription and `pipeline` for the speaker diarization.
model = load_model('large-v2')
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1",
                                    use_auth_token=hugging_face_token)

In [3]:
#@title ##Define functions to use
def download_audio(url, output_filename):
    """
    Download a YouTube video's audio track and save it as a mono MP3 file.

    The audio is fetched with the ``yt-dlp`` command-line tool into a private
    temporary directory, loaded with pydub, down-mixed to one channel and
    exported as MP3. The temporary directory (and the intermediate download)
    is removed afterwards, even if the conversion fails.

    Args:
    url (str): YouTube video URL
    output_filename (str): Output MP3 file name with .mp3 extension

    Raises:
    subprocess.CalledProcessError: If yt-dlp exits with a non-zero status
    FileNotFoundError: If yt-dlp is not installed, or if it finished without
        leaving a downloaded file behind
    """
    import tempfile  # local import: only this function needs it

    temp_file = "temp_audio"

    # Work in a fresh directory so a stale or partial "temp_audio.*" file left
    # over from an earlier run can never be mistaken for this download.
    with tempfile.TemporaryDirectory() as work_dir:
        # A literal '%' in the directory name must be doubled, otherwise
        # yt-dlp would read it as part of the output template.
        output_template = os.path.join(
            work_dir.replace("%", "%%"), f"{temp_file}.%(ext)s"
        )

        # Pass the arguments as a list (no shell): URLs routinely contain
        # characters such as '&' that a shell would interpret, and the URL is
        # caller-supplied input. "--" ends option parsing so a URL starting
        # with '-' is not taken for an option.
        command = [
            "yt-dlp",
            "-x",
            "--audio-format", "vorbis",
            "--output", output_template,
            "--",
            url,
        ]
        # check=True turns a failed download into an explicit error instead of
        # letting the code below trip over a missing file.
        subprocess.run(command, check=True)

        # The final extension is chosen by yt-dlp, so look the file up by prefix.
        candidates = sorted(
            name for name in os.listdir(work_dir) if name.startswith(temp_file)
        )
        if not candidates:
            raise FileNotFoundError(
                f"yt-dlp did not produce an audio file for {url}"
            )
        temp_file_with_ext = os.path.join(work_dir, candidates[0])

        # Load the audio file using pydub and convert to mono
        audio = AudioSegment.from_file(temp_file_with_ext)
        audio = audio.set_channels(1)  # Set to mono
        audio.export(output_filename, format='mp3')

    print(f"Audio downloaded and converted successfully: {output_filename}")


def process_audio(input_audio, output_json, output_lab, demucs=False, vad=False, language=None):
    """
    Transcribe an audio file and run speaker diarization on it.

    Both results are written to disk; nothing is returned.

    Args:
    input_audio (str): Input audio file name
    output_json (str): Output JSON file name for the transcription
    output_lab (str): Output LAB file name for the speaker diarization
    demucs: Forwarded unchanged to model.transcribe
    vad: Forwarded unchanged to model.transcribe
    language: Forwarded unchanged to model.transcribe
    """
    # Step 1: transcription with the global Whisper model (regrouping off).
    transcribe_options = {
        "language": language,
        "demucs": demucs,
        "vad": vad,
        "regroup": False,
    }
    transcription = model.transcribe(input_audio, **transcribe_options)
    transcription.save_as_json(output_json)

    # Step 2: diarization with the global pyannote pipeline, stored as LAB.
    speaker_turns = pipeline(input_audio)
    with open(output_lab, "w") as lab_file:
        speaker_turns.write_lab(lab_file)

def analyze_transcriptions_and_diarization(transcriptions_json, diarization_lab):
    """
    Assign a speaker to every transcribed word and summarise per segment.

    Word time spans and speaker time spans are both turned into line
    geometries on a time axis and overlaid, so that each word can be matched
    with the speaker interval(s) it overlaps.

    Args:
    transcriptions_json (str): JSON file with transcriptions
    diarization_lab (str): LAB file with diarization results

    Returns:
    tuple: ``(inter, tmp)`` where
        inter is the word-level table (columns include id, id2_1, speaker,
        word, start_1, end_1, probability, p_voice, ord), and
        tmp is the summary with one row per (segment id, speaker) holding
        sentence, start_1, end_1, prob and p_voice.
    """
    with open(transcriptions_json, 'r') as file:
        data = json.load(file)
    
    # One row per word: keep the segment id, explode the per-segment word list,
    # then expand each word entry into its own columns. The code below relies
    # on the columns 'start', 'end', 'word' and 'probability' being present.
    wrd = pd.DataFrame(data['segments'])[['id', 'words']].explode('words')
    wrd = pd.concat([wrd.drop(['words'], axis=1), wrd['words'].apply(pd.Series)], axis=1)
    # Sequential word index, used later to group the overlay pieces per word.
    wrd['id2'] = range(len(wrd))

    # The LAB file is read as space-delimited text without a header row:
    # start time, end time, speaker label.
    voice = pd.read_csv(diarization_lab, header=None, delimiter=r" ")
    voice.columns = ['start', 'end', 'speaker']
    voice['id2'] = range(len(voice))

    # Represent every time interval as a horizontal line from (start, 0) to
    # (end, 0) so that overlap in time becomes a geometric overlay.
    wrd['geometry'] = wrd.apply(lambda row: LineString([(row['start'], 0), (row['end'], 0)]), axis=1)
    voice['geometry'] = voice.apply(lambda row: LineString([(row['start'], 0), (row['end'], 0)]), axis=1)
    wrd, voice = gpd.GeoDataFrame(wrd, geometry='geometry'), gpd.GeoDataFrame(voice, geometry='geometry')

    # Length of the word's line, i.e. end - start of the word.
    wrd['len'] = wrd.geometry.length
    # Union overlay of words and speaker turns; only pieces that carry a word
    # (non-null id) are kept. Column names present in both inputs come back
    # with a suffix: *_1 from wrd, *_2 from voice.
    inter = gpd.overlay(wrd, voice, how='union').query('not id.isnull()')
    # NOTE(review): two chained sort_values calls; the second uses pandas'
    # default sort kind, which is not documented as stable, so the id2_2 order
    # may not survive. If ordering by (id2_1, id2_2) is intended, a single
    # sort_values(by=['id2_1', 'id2_2']) would state that explicitly.
    inter = inter.sort_values(by='id2_2').sort_values(by='id2_1').reset_index()
    # Fraction of the word's length covered by this overlay piece.
    inter['p_voice'] = inter.geometry.length / inter.len
    # Drop pieces whose fraction is undefined (presumably zero-length words).
    inter = inter.query('not p_voice.isnull()')
    inter = inter[['id', 'id2_1', 'speaker', 'word', 'start_1', 'end_1', 'probability', 'p_voice']]
    # Ranking score: +100 when a speaker is attached, then 10 * probability,
    # then the covered fraction. Assuming probability and p_voice are at most
    # 1, a piece with a speaker always outranks one without.
    inter['ord'] = (~inter.speaker.isnull()) * 100 + inter.probability * 10 + inter.p_voice
    # Per word (id2_1) keep the best-scoring piece; keep='all' retains ties.
    inter = inter.groupby('id2_1', group_keys=False).apply(lambda x: x.nlargest(1, 'ord', keep='all')).reset_index()

    # Summary per (segment id, speaker): words joined with a space, earliest
    # start, latest end, and the geometric mean (exp of the mean of logs) of
    # the word probabilities and of the covered fractions.
    # NOTE(review): groupby excludes NaN keys by default, so words left
    # without a speaker would not appear in this summary - confirm intended.
    tmp = inter.groupby(['id', 'speaker']).apply(lambda x: {"sentence": ' '.join(x['word']),
                                                            "start_1": min(x['start_1']),
                                                            "end_1": max(x['end_1']),
                                                            "prob": np.exp(np.average(np.log(x['probability']))),
                                                            "p_voice": np.exp(np.average(np.log(x['p_voice'])))})
    # Expand the dicts into columns, order chronologically and move the group
    # keys (id, speaker) from the index back into regular columns.
    tmp = tmp.apply(pd.Series).sort_values(by='start_1').reset_index(['id', 'speaker'])
    return inter,tmp



## Example

In [None]:
# Video to process (the trailing `#@param` marker exposes it as a form field).
url_youtube='https://www.youtube.com/watch?v=pj705DvCSxg' #@param {type:"string"}
# Download the audio track and store it as a mono MP3 file.
download_audio(url_youtube, 'out.mp3')

# Process the audio to obtain transcriptions and speaker diarization
process_audio('out.mp3', 'out.json', 'output.lab',language="en")
# Combine both outputs: results_df is the word-level table, result_simp the
# summary with one row per (segment, speaker).
results_df, result_simp = analyze_transcriptions_and_diarization('out.json', 'output.lab')
# Last expression of the cell, so the notebook displays the summary.
result_simp