In [3]:
import torch
import torchaudio
from speechbrain.inference.speaker import EncoderClassifier
from pyannote.audio import Pipeline
from scipy.spatial.distance import cdist
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
%pip install transformers

Collecting transformers
  Downloading transformers-4.54.1-py3-none-any.whl.metadata (41 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading transformers-4.54.1-py3-none-any.whl (11.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (471 kB)
Downloading tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m100.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: safetensors, tokenizers, transformers
Succ

In [10]:
from langchain_huggingface.llms import HuggingFacePipeline

# Build a LangChain wrapper around a text-generation pipeline for the "gpt2"
# model id; `pipeline_kwargs` caps each generation at 10 new tokens.
# NOTE(review): `hf` is not referenced by any later cell visible here —
# confirm it is still needed.
hf = HuggingFacePipeline.from_model_id(
    model_id="gpt2",
    task="text-generation",
    pipeline_kwargs={"max_new_tokens": 10},
)

HTTP Error 429 thrown while requesting HEAD https://huggingface.co/gpt2/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].


HTTP Error 429 thrown while requesting HEAD https://huggingface.co/gpt2/resolve/main/tokenizer_config.json
Retrying in 2s [Retry 2/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/gpt2/resolve/main/tokenizer_config.json
Retrying in 4s [Retry 3/5].


KeyboardInterrupt: 

In [3]:

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained speaker encoder from the "speechbrain/spkrec-ecapa-voxceleb" source.
classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", run_opts={"device": device})
# NOTE(review): the device is already handed over through `run_opts`; this
# extra `.to(device)` looks redundant — confirm before removing.
classifier = classifier.to(device)

HTTP Error 429 thrown while requesting HEAD https://huggingface.co/speechbrain/spkrec-ecapa-voxceleb/resolve/main/hyperparams.yaml
Retrying in 1s [Retry 1/5].


KeyboardInterrupt: 

In [3]:
# The Hugging Face access token is read from the HF_TOKEN environment variable
# so that it never ends up in the notebook source or its history.
# NOTE(review): the token that used to be hard-coded in this cell must be
# treated as leaked and revoked on the Hugging Face account.
import os

hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face access token "
        "before loading the diarization pipeline."
    )

diarization = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=hf_token,
)

In [4]:
def extract_speaker_embeddings(speaker_file):
    """Embed an audio file with the global ``classifier`` and rescale the result.

    The file is read with ``torchaudio.load``, moved to ``device`` and encoded
    with ``classifier.encode_batch``. Axis 1 of the encoder output is squeezed
    and the resulting NumPy array is divided by its overall norm
    (``np.linalg.norm`` with default arguments, i.e. one scalar for the whole
    array).

    NOTE(review): the audio is fed at its native sample rate and channel
    layout — confirm this matches what the encoder expects. Any change here
    must be mirrored wherever embeddings are compared against stored ones.

    Args:
        speaker_file: Path of the audio file to embed.

    Returns:
        NumPy array holding the rescaled embedding(s).
    """
    audio, _ = torchaudio.load(speaker_file)
    encoded = classifier.encode_batch(audio.to(device))
    vectors = encoded.squeeze(1).cpu().numpy()
    scale = np.linalg.norm(vectors)
    return vectors / scale

In [1]:
import sqlite3
# Path of the SQLite file used by the speaker-store cells of this notebook.
# NOTE(review): the file name reads "spearker_voice.db" — presumably a typo for
# "speaker"; renaming it would orphan the existing file, so it is left as is.
# NOTE(review): absolute, user-specific path — consider making it configurable.
DB_FILE = "/home/stagiaire/verbalens/app/api/speakers_store/spearker_voice.db"

In [2]:
def init_db():
    """Create the ``speakers`` table in ``DB_FILE`` if it does not exist yet.

    Each row holds an auto-incremented ``id``, a ``speaker_name`` and the
    ``embedding`` stored as text.

    The connection is closed explicitly: used as a context manager, a sqlite3
    connection only commits or rolls back the transaction, it does not close
    itself.
    """
    conn = sqlite3.connect(DB_FILE)
    try:
        with conn:  # commits on success, rolls back on error
            conn.execute('''
            CREATE TABLE IF NOT EXISTS speakers (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                speaker_name TEXT NOT NULL,
                embedding TEXT NOT NULL
            )
        ''')
    finally:
        conn.close()


init_db()

In [3]:
# WARNING: destructive — removes every row from the `speakers` table.
# The connection is closed explicitly because the sqlite3 context manager only
# commits / rolls back the transaction.
conn = sqlite3.connect(DB_FILE)
try:
    with conn:
        conn.execute("DELETE FROM speakers")
finally:
    conn.close()

In [None]:

import os
import json

async def add_speaker(speaker_name,  file_path):
    """Enroll a speaker: copy the audio sample, embed it and store the embedding.

    Args:
        speaker_name: Label written to the ``speaker_name`` column; also used as
            the file name of the temporary copy under ``/tmp/voice``.
        file_path: Path of the audio sample to enroll.

    Returns:
        dict with a confirmation ``message`` and ``embedding_dim``, the size of
        the last axis of the stored embedding.

    Raises:
        OSError: if the sample cannot be read or the temporary copy cannot be
            written.
    """
    # Keep a temporary copy of the audio file. The source is read completely
    # before the target is opened, so both handles are always closed and the
    # copy stays intact even when both paths point to the same file.
    with open(file_path, "rb") as source:
        audio_bytes = source.read()

    tmp_path = f"/tmp/voice/{speaker_name}.wav"
    os.makedirs(os.path.dirname(tmp_path), exist_ok=True)
    with open(tmp_path, "wb") as target:
        target.write(audio_bytes)

    # Extract the embedding and serialise it as JSON text for storage.
    embedding = extract_speaker_embeddings(tmp_path)
    embedding_json = json.dumps(embedding.tolist())

    # The sqlite3 context manager only commits; close the connection ourselves.
    conn = sqlite3.connect(DB_FILE)
    try:
        with conn:
            conn.execute(
                "INSERT INTO speakers (speaker_name, embedding) VALUES (?, ?)",
                (speaker_name, embedding_json)
            )
    finally:
        conn.close()

    # `len(embedding)` would count rows of the array; the dimension of the
    # embedding vector is the size of its last axis.
    return {
        "message": f"Speaker '{speaker_name}' ajouté avec succès.",
        "embedding_dim": int(embedding.shape[-1]),
    }



In [11]:
for speaker_id, speaker_file in enumerate(["/home/stagiaire/verbalens/app/api/temp/voice/micode.wav", "/home/stagiaire/verbalens/app/api/temp/voice/thimothe.wav", "/home/stagiaire/verbalens/app/api/temp/voice/guillaume.wav"]):
    if speaker_id == 0:
        await add_speaker("micode",speaker_file)
    elif speaker_id == 1:
        await add_speaker("thimothe",speaker_file)
    elif speaker_id == 2:
        await add_speaker("guillaume",speaker_file)

In [None]:
def find_nearest_speaker(new_embedding):
    """Return the enrolled speaker whose embedding is closest to ``new_embedding``.

    Closeness is the cosine distance computed with ``scipy``'s ``cdist``; when
    either side holds several rows, the smallest pairwise distance is used.

    Args:
        new_embedding: Embedding to identify, as a 1-D or 2-D array-like.

    Returns:
        ``{"speaker": name, "similarity": 1 - distance}`` for the best match, or
        ``{"speaker": None, "similarity": 0.0}`` when no speaker is enrolled.
    """
    # The sqlite3 context manager does not close the connection, so close it
    # explicitly once the rows have been fetched.
    conn = sqlite3.connect(DB_FILE)
    try:
        rows = conn.execute("SELECT speaker_name, embedding FROM speakers").fetchall()
    finally:
        conn.close()

    # cdist needs 2-D inputs; atleast_2d also accepts a single 1-D vector.
    query = np.atleast_2d(np.asarray(new_embedding, dtype=float))

    min_dist = float('inf')
    best_match = None

    for speaker_name, emb_json in rows:
        speaker_emb = np.atleast_2d(np.array(json.loads(emb_json), dtype=float))

        # Cosine distance between every query row and every stored row.
        dist = cdist(query, speaker_emb, metric="cosine").min()

        if dist < min_dist:
            min_dist = dist
            best_match = {
                "speaker": speaker_name,
                "similarity": 1.0 - dist
            }

    if best_match is None:
        return {"speaker": None, "similarity": 0.0}
    return best_match


In [13]:
async def identify_speaker_sqlite(file_path):
    """Identify the speaker of an audio file against the enrolled speakers.

    Embeds the file with ``extract_speaker_embeddings`` and looks the result up
    with ``find_nearest_speaker``.

    Args:
        file_path: Path of the audio file to identify.

    Returns:
        ``{"speaker": ..., "similarity": ...}`` taken from the lookup result, or
        ``{"speaker": None, "similarity": 0.0}`` when the lookup result is falsy.
    """
    match = find_nearest_speaker(extract_speaker_embeddings(file_path))

    if not match:
        return {"speaker": None, "similarity": 0.0}
    return {"speaker": match["speaker"], "similarity": match["similarity"]}


In [62]:
import whisper

# Load the Whisper "base" checkpoint; `asr` uses it to transcribe audio files.
whisper_model = whisper.load_model("base")

In [24]:
# Inspect the native sample rate of one of the enrollment recordings.
waveform, sample_rate = torchaudio.load("/home/stagiaire/verbalens/app/api/temp/voice/micode.wav")
sample_rate

44100

In [29]:
import time
import datetime
def format_time(seconds):
    """Format a duration in seconds as ``MM:SS``, or ``H:MM:SS`` from one hour on.

    The previous implementation sliced the first two characters off
    ``str(timedelta)``, which silently dropped the hour digit for durations of
    one hour or more (and produced ``":MM:SS"`` from ten hours on).

    Args:
        seconds: Duration in seconds; fractional parts are truncated and
            negative values are clamped to zero.

    Returns:
        The formatted timestamp string.
    """
    total = max(int(seconds), 0)
    hours, remainder = divmod(total, 3600)
    minutes, secs = divmod(remainder, 60)
    if hours:
        return f"{hours}:{minutes:02d}:{secs:02d}"
    return f"{minutes:02d}:{secs:02d}"


In [49]:
import math

In [63]:
def _load_known_speakers():
    """Read every enrolled speaker from ``DB_FILE`` as ``(names, embeddings)`` lists."""
    conn = sqlite3.connect(DB_FILE)
    try:
        rows = conn.execute("SELECT speaker_name, embedding FROM speakers").fetchall()
    finally:
        # The sqlite3 context manager would not close the connection.
        conn.close()
    names = [name for name, _ in rows]
    embeddings = [np.atleast_2d(np.array(json.loads(emb_json))) for _, emb_json in rows]
    return names, embeddings


def _closest_known_speaker(embedding, names, known_embeddings):
    """Return ``(name, cosine_distance)`` of the enrolled speaker nearest to ``embedding``."""
    best_name, best_distance = None, float('inf')
    for name, known in zip(names, known_embeddings):
        distance = cdist(embedding, known, metric="cosine").min()
        if distance < best_distance:
            best_name, best_distance = name, distance
    return best_name, best_distance


def _speaker_by_overlap(start, end, speaker_segments):
    """Return the speaker whose turn overlaps ``[start, end]`` the most, else ``"unknown"``."""
    if end <= start:
        # Degenerate (zero-length) interval: widen it slightly so that a turn
        # containing the instant still produces a positive overlap.
        end = start + 1e-3
    best_speaker, best_overlap = "unknown", 0.0
    for turn in speaker_segments:
        overlap = min(end, turn["end_time"]) - max(start, turn["start_time"])
        if overlap > best_overlap:
            best_speaker, best_overlap = turn["speaker"], overlap
    return best_speaker


def asr(file_path, threshold=0.7, min_duration=1.0):
    """Transcribe an audio file and label each transcript segment with a speaker.

    Steps:
      1. transcribe with ``whisper_model`` and diarize with ``diarization``;
      2. embed every diarization turn longer than ``min_duration`` with
         ``classifier`` and match it to the closest enrolled speaker
         (cosine distance, accepted when below ``threshold``);
      3. give each transcript segment the speaker of the turn it overlaps most.

    Changes compared with the previous version:
      * the audio is decoded once and sliced per turn, using the file's real
        sample rate instead of a hard-coded 44100 for the first turn;
      * enrolled speakers are read once, and the connection is closed;
      * each turn is encoded once instead of twice;
      * turn boundaries are kept as raw floats for known and unknown speakers
        alike (flooring the end used to cut off the tail of a turn).

    NOTE(review): audio is fed to the encoder at its native sample rate and
    channel layout, like in the enrollment path — confirm this matches what the
    encoder expects, and change both places together.

    Args:
        file_path: Path of the audio file to process.
        threshold: Maximum cosine distance for a turn to count as a match.
        min_duration: Turns not longer than this many seconds are ignored.

    Returns:
        List of dicts with keys ``speaker``, ``timestamp``, ``start``, ``end``
        and ``text``, one per transcript segment.
    """
    transcript = whisper_model.transcribe(file_path)
    diarization_result = diarization(file_path)
    known_names, known_embeddings = _load_known_speakers()

    # Decode the recording once; each turn is a slice of this tensor.
    full_waveform, sample_rate = torchaudio.load(file_path)

    speaker_segments = []
    # With yield_label=True each item is (segment, track, label); only the
    # time span of the turn is needed here.
    for turn, _track, _label in diarization_result.itertracks(yield_label=True):
        start_time, end_time = float(turn.start), float(turn.end)
        if end_time - start_time <= min_duration:
            continue  # too short to be kept, so skip the encoder entirely

        first_frame = int(start_time * sample_rate)
        frame_count = int((end_time - start_time) * sample_rate)
        waveform = full_waveform[:, first_frame:first_frame + frame_count].to(device)

        # Encode once, then rescale by the overall norm.
        raw_embedding = classifier.encode_batch(waveform).squeeze(1).cpu().numpy()
        embedding = raw_embedding / np.linalg.norm(raw_embedding)

        name, distance = _closest_known_speaker(embedding, known_names, known_embeddings)
        speaker_segments.append({
            "speaker": name if distance < threshold else "unknown",
            "start_time": start_time,
            "end_time": end_time,
        })

    aligned_output = []
    for segment in transcript['segments']:
        start = math.floor(float(segment['start']))
        aligned_output.append({
            "speaker": _speaker_by_overlap(
                float(segment['start']), float(segment['end']), speaker_segments
            ),
            "timestamp": format_time(start),
            "start": start,
            "end": segment['end'],
            "text": segment['text'].strip(),
        })

    return aligned_output


In [7]:
import json
import re

def parse_transcript(file_path):
    """Parse a speaker-labelled transcript file into a list of speaker blocks.

    A block starts with a header line of the form ``SPEAKER <n> <h:mm:ss>`` and
    contains every following non-empty line up to the next header; those lines
    are joined with single spaces. Headers without any text produce no block.

    Text found before the first header belongs to no speaker and is ignored
    (it used to be glued onto the first speaker's text).

    Args:
        file_path: Path of the UTF-8 transcript file.

    Returns:
        ``{"transcript": [{"speaker", "start_time", "text"}, ...]}``.
    """
    # Matches header lines such as: SPEAKER 1 0:00:00
    header_pattern = re.compile(r"^(SPEAKER \d+)\s+(\d+:\d+:\d+)$")

    transcript = []
    current_speaker = None
    current_start_time = None
    text_parts = []

    with open(file_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue  # skip blank lines

            header = header_pattern.match(line)
            if header is None:
                # Plain text: only meaningful once a speaker header was seen.
                if current_speaker is not None:
                    text_parts.append(line)
                continue

            # New header: store the block collected so far, if any.
            if current_speaker and text_parts:
                transcript.append({
                    "speaker": current_speaker,
                    "start_time": current_start_time,
                    "text": " ".join(text_parts)
                })

            current_speaker = header.group(1)
            current_start_time = header.group(2)
            text_parts = []

    # Store the last block.
    if current_speaker and text_parts:
        transcript.append({
            "speaker": current_speaker,
            "start_time": current_start_time,
            "text": " ".join(text_parts)
        })

    return {"transcript": transcript}

In [9]:
import subprocess
from sklearn.cluster import AgglomerativeClustering
from pyannote.audio import Audio
from pyannote.core import Segment
import wave
import contextlib
import numpy as np
import datetime



def extract_speakers(model, embedding_model, path, num_speakers=2):
    """Transcribe an audio file and tag every segment with a clustered speaker label.

    The input is first converted to a mono WAV file (``temp/mono.wav``) with
    ffmpeg, transcribed with ``model.transcribe``, then each transcript segment
    is embedded with ``embedding_model`` and the embeddings are grouped with
    agglomerative clustering.

    Args:
        model: Object exposing ``transcribe(path)`` that returns a dict with a
            ``"segments"`` list (each with ``"start"`` and ``"end"``).
        embedding_model: Callable turning a batch of waveforms into embeddings.
        path: Path of the audio file to process.
        num_speakers: Number of clusters to form; capped at the number of
            segments.

    Returns:
        The transcript segments, each with an added ``"speaker"`` entry of the
        form ``"SPEAKER <k>"`` (``k`` starts at 1).

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    import os

    mono = 'temp/mono.wav'
    # ffmpeg cannot create the output directory itself.
    os.makedirs(os.path.dirname(mono), exist_ok=True)
    # Argument list, no shell: paths containing spaces or shell
    # metacharacters are passed through untouched.
    subprocess.check_output(['ffmpeg', '-i', str(path), '-y', '-ac', '1', mono])

    result = model.transcribe(mono)
    segments = result["segments"]
    if not segments:
        return segments  # nothing to embed or cluster

    with contextlib.closing(wave.open(mono, 'r')) as f:
        duration = f.getnframes() / float(f.getframerate())

    audio = Audio()

    def segment_embedding(segment):
        start = segment["start"]
        # Whisper overshoots the end timestamp in the last segment
        end = min(duration, segment["end"])
        waveform, _ = audio.crop(mono, Segment(start, end))
        # Flatten to one vector so that rows can be stacked whatever the
        # embedding size of the model is.
        return np.asarray(embedding_model(waveform[None])).reshape(-1)

    embeddings = np.vstack([segment_embedding(segment) for segment in segments])
    embeddings = np.nan_to_num(embeddings)

    if len(segments) < 2:
        # Clustering needs at least two samples; a lone segment is speaker 1.
        labels = np.zeros(len(segments), dtype=int)
    else:
        n_clusters = min(num_speakers, len(segments))
        labels = AgglomerativeClustering(n_clusters).fit(embeddings).labels_

    for segment, label in zip(segments, labels):
        segment["speaker"] = 'SPEAKER ' + str(label + 1)
    return segments

def write_segments(segments, outfile):
    """Write speaker-labelled segments to ``outfile`` and return the parsed result.

    A header line ``<speaker> <h:mm:ss>`` is written whenever the speaker
    changes, followed by the text of the consecutive segments of that speaker.
    The file is written as UTF-8, the encoding ``parse_transcript`` reads it
    back with.

    Args:
        segments: Sequence of dicts with ``"speaker"``, ``"start"`` (seconds)
            and ``"text"`` entries.
        outfile: Path of the text file to (over)write.

    Returns:
        The value of ``parse_transcript(outfile)``.
    """

    def format_offset(secs):
        return datetime.timedelta(seconds=round(secs))

    previous_speaker = None
    with open(outfile, "w", encoding="utf-8") as f:
        for segment in segments:
            if segment["speaker"] != previous_speaker:
                f.write("\n" + segment["speaker"] + ' ' + str(format_offset(segment["start"])) + '\n')
                previous_speaker = segment["speaker"]
            # Drop leading whitespace only; slicing off the first character
            # would eat a real letter when the text has no leading space.
            f.write(segment["text"].lstrip() + ' ')

    return parse_transcript(outfile)

In [10]:
import torch
import whisper
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): this rebinds the global `classifier`, which an earlier cell
# assigned to a SpeechBrain `EncoderClassifier`; functions that call
# `classifier.encode_batch` depend on which of the two cells ran last —
# consider a distinct name.
classifier = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device(DEVICE))
# NOTE(review): despite its name, `llm` holds the Whisper speech-recognition model.
llm = whisper.load_model("base")  # or "small", "medium", "large"

  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)


In [11]:

from fastapi import HTTPException
import os


async def diarisation(audio_id: str, num_speakers: int):
    try:
        audio_data = "/home/stagiaire/verbalens/app/api/uploads/3bf25dec-1dc4-4f92-8049-f0041442a02a_podcast.mp3" #os.path.join("uploads", audio_id)
        if not os.path.exists(audio_data):
            raise HTTPException(status_code=404, detail="File not found")
        else:
            print("audio found")

            # Diarisation locale avec Whisper
            seg = extract_speakers(llm, classifier, audio_data,num_speakers)
            print(seg)
            # result = write_segments(seg, 'temp/transcript.txt')
            # return JSONResponse(content=result) 
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

await diarisation("3bf25dec-1dc4-4f92-8049-f0041442a02a_podcast.mp3", 2)

audio found


ffmpeg version 7.0.2-3ubuntu1.1 Copyright (c) 2000-2024 the FFmpeg developers
  built with gcc 14 (Ubuntu 14.2.0-4ubuntu2)
  configuration: --prefix=/usr --extra-version=3ubuntu1.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --disable-libmfx --disable-omx --enable-gnutls --enable-libaom --enable-libass --enable-libbs2b --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libharfbuzz --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --

HTTPException: 500: Command 'ffmpeg -i /home/stagiaire/verbalens/app/api/uploads/3bf25dec-1dc4-4f92-8049-f0041442a02a_podcast.mp3 -y -ac 1 temp/mono.wav' returned non-zero exit status 254.