In [None]:
pip install transformers librosa webrtcvad

Collecting webrtcvad
  Downloading webrtcvad-2.0.10.tar.gz (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: webrtcvad
  Building wheel for webrtcvad (setup.py) ... [?25l[?25hdone
  Created wheel for webrtcvad: filename=webrtcvad-2.0.10-cp310-cp310-linux_x86_64.whl size=73461 sha256=ff1454d39f21b0c2d5bc1efe460e564e0838d55dd8c09c389950128435825ce8
  Stored in directory: /root/.cache/pip/wheels/2a/2b/84/ac7bacfe8c68a87c1ee3dd3c66818a54c71599abf308e8eb35
Successfully built webrtcvad
Installing collected packages: webrtcvad
Successfully installed webrtcvad-2.0.10


In [None]:
import webrtcvad
import librosa
import numpy as np

def vad_webrtc(audio, sr, frame_duration=30):
    """Classify fixed-size frames of a waveform as speech / non-speech with WebRTC VAD.

    Args:
        audio: 1-D sequence of float samples. The samples are scaled by 32768 to
            obtain 16-bit PCM, so they are expected to lie in roughly [-1.0, 1.0].
        sr: Sample rate in Hz. Must be 8000, 16000, 32000 or 48000, the only
            rates accepted by webrtcvad.
        frame_duration: Frame length in milliseconds; must be 10, 20 or 30.

    Returns:
        A list of ``(start_time, end_time)`` tuples in seconds, one per frame that
        was classified as speech. Adjacent speech frames are not merged, and a
        trailing partial frame is ignored.

    Raises:
        AssertionError: If ``frame_duration`` or ``sr`` is unsupported, or if the
            audio is shorter than one frame.
    """
    # Validate everything up front so bad arguments fail with a clear message
    # before any conversion work or VAD call happens.
    assert frame_duration in (10, 20, 30), "Frame duration must be 10, 20, or 30 ms"
    assert sr in (8000, 16000, 32000, 48000), "Sample rate must be 8000, 16000, 32000, or 48000 Hz"

    # Number of samples in one frame.
    frame_size = int(sr * frame_duration / 1000)
    assert len(audio) >= frame_size, "Audio too short for the given frame size"

    # Convert to 16-bit PCM. Clip first: a sample of exactly +1.0 scales to 32768,
    # which does not fit in int16 and would wrap around to -32768.
    pcm = np.clip(np.asarray(audio) * 32768, -32768, 32767).astype(np.int16)

    vad = webrtcvad.Vad(3)

    # Split into back-to-back, non-overlapping frames; leftover samples that do
    # not fill a whole frame are dropped.
    n_frames = len(pcm) // frame_size
    frames = pcm[: n_frames * frame_size].reshape(n_frames, frame_size)

    speech_segments = []
    for i, frame in enumerate(frames):
        # `is_speech` expects the buffer as bytes
        if vad.is_speech(frame.tobytes(), sr):
            start_time = i * frame_duration / 1000
            end_time = (i + 1) * frame_duration / 1000
            speech_segments.append((start_time, end_time))

    return speech_segments

# Example usage: load the recording (sr=None is passed so librosa does not resample
# to its default rate) and run frame-level VAD on it.
# `y`, `sr` and `vad_segments` are module-level and are reused by later cells.
audio_path = 'Hindi1_01.wav'
y, sr = librosa.load(audio_path, sr=None)
vad_segments = vad_webrtc(y, sr)
print(f'VAD segments: {vad_segments}')

VAD segments: [(0.21, 0.24), (0.24, 0.27), (0.27, 0.3), (0.3, 0.33), (0.33, 0.36), (0.36, 0.39), (0.39, 0.42), (0.42, 0.45), (0.48, 0.51), (0.51, 0.54), (0.54, 0.57), (0.57, 0.6), (0.6, 0.63), (0.63, 0.66), (0.66, 0.69), (0.69, 0.72), (0.72, 0.75), (0.75, 0.78), (0.78, 0.81), (0.81, 0.84), (0.84, 0.87), (0.87, 0.9), (0.9, 0.93), (0.93, 0.96), (0.96, 0.99), (0.99, 1.02), (1.02, 1.05), (1.05, 1.08), (1.08, 1.11), (1.11, 1.14), (1.14, 1.17), (1.17, 1.2), (1.2, 1.23), (1.23, 1.26), (1.26, 1.29), (1.29, 1.32), (1.32, 1.35), (1.35, 1.38), (1.38, 1.41), (1.41, 1.44), (1.44, 1.47), (1.47, 1.5), (1.5, 1.53), (1.53, 1.56), (1.56, 1.59), (1.59, 1.62), (1.62, 1.65), (1.65, 1.68), (1.68, 1.71), (1.71, 1.74), (1.74, 1.77), (1.77, 1.8), (1.8, 1.83), (1.83, 1.86), (1.86, 1.89), (1.89, 1.92), (1.92, 1.95), (1.95, 1.98), (1.98, 2.01), (2.01, 2.04), (2.04, 2.07), (2.07, 2.1), (2.1, 2.13), (2.13, 2.16), (2.16, 2.19), (2.19, 2.22), (2.22, 2.25), (2.25, 2.28), (2.28, 2.31), (2.31, 2.34), (2.34, 2.37), (2.37

In [None]:
def speaker_change_detection(audio, sr, window_size=1024, hop_size=512, threshold=0.5):
    """Flag frames where the RMS energy jumps sharply relative to the previous frame.

    Despite the name, the only cue used is the frame-to-frame change in RMS
    energy; no speaker-specific features are involved.

    Args:
        audio: Waveform passed to ``librosa.feature.rms`` as ``y``.
        sr: Sample rate in Hz, used to convert frame indices to seconds.
        window_size: RMS analysis window length in samples.
        hop_size: Hop between consecutive RMS frames in samples.
        threshold: Minimum absolute difference between consecutive RMS values
            for a frame to be reported.

    Returns:
        A list of ``(start, end)`` tuples in seconds, one per flagged frame ``i``,
        spanning ``i * hop_size / sr`` to ``(i + 1) * hop_size / sr``.
    """
    rms_track = librosa.feature.rms(y=audio, frame_length=window_size, hop_length=hop_size)[0]

    # Walk consecutive (previous, current) pairs; `idx` is the index of `current`.
    return [
        (idx * hop_size / sr, (idx + 1) * hop_size / sr)
        for idx, (previous, current) in enumerate(zip(rms_track[:-1], rms_track[1:]), start=1)
        if abs(current - previous) > threshold
    ]

# Example usage: run the energy-based change detector on the recording loaded earlier.
# With the default threshold (0.5) the recorded run of this cell printed an empty list.
scd_segments = speaker_change_detection(y, sr)
print(f'Speaker change detection segments: {scd_segments}')

Speaker change detection segments: []


In [None]:
def extract_spectrogram(audio, sr, segment, n_mels=128, n_fft=1024, hop_length=512):
    """Compute a log-scaled mel spectrogram for one time segment of a waveform.

    Args:
        audio: 1-D waveform that supports slicing.
        sr: Sample rate in Hz.
        segment: ``(start, end)`` pair in seconds.
        n_mels: Number of mel bands.
        n_fft: FFT window size in samples.
        hop_length: Hop between analysis frames in samples.

    Returns:
        The mel power spectrogram converted to decibels relative to its own
        maximum (``ref=np.max``).

    Raises:
        ValueError: If the segment is negative, reversed or empty, or lies
            entirely outside the audio.
    """
    # Round to the nearest sample instead of truncating: a product such as
    # `seconds * sr` can come out a hair below the intended integer because of
    # floating-point error, and truncation would then drop a sample.
    start = int(round(float(segment[0]) * sr))
    end = int(round(float(segment[1]) * sr))
    if start < 0 or end <= start:
        raise ValueError(f"Invalid segment {segment!r}: expected 0 <= start < end")

    y_segment = audio[start:end]
    if len(y_segment) == 0:
        raise ValueError(f"Segment {segment!r} lies outside the audio")

    spectrogram = librosa.feature.melspectrogram(
        y=y_segment, sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length
    )
    log_spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
    return log_spectrogram

# Example usage: compute one log-mel spectrogram per VAD segment.
spectrograms = [extract_spectrogram(y, sr, seg) for seg in vad_segments]

In [None]:
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
import torch
import librosa

# Load the pre-trained Wav2Vec2 model and its feature extractor from the same
# checkpoint. Both objects are module-level so that later cells can reuse them.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

def extract_embeddings(model, spectrogram, feature_extractor, sr):
    """Run the model on one input and mean-pool its last hidden state.

    Args:
        model: Torch module whose output exposes ``last_hidden_state``.
        spectrogram: Input handed unchanged to ``feature_extractor``.
            NOTE(review): Wav2Vec2 feature extractors are normally fed raw
            waveforms, not mel spectrograms — confirm this input is intended.
        feature_extractor: Callable that turns the input into the keyword
            tensors the model expects.
        sr: Sampling rate forwarded to the feature extractor.

    Returns:
        A NumPy array holding the last hidden state averaged over dimension 1.
    """
    inputs = feature_extractor(spectrogram, sampling_rate=sr, return_tensors="pt", padding="max_length", max_length=16000)

    # Put the inputs on the same device as the model so the call also works
    # when the model has been moved to a GPU.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = {
            key: value.to(first_param.device) if hasattr(value, "to") else value
            for key, value in inputs.items()
        }

    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)

    # `.numpy()` only works on CPU tensors; bring the result back to host first.
    return embeddings.detach().cpu().numpy()

# Example usage: compute one embedding per spectrogram.
# NOTE(review): the sampling rate is hard-coded to 16000 here instead of using the
# `sr` returned by librosa.load — confirm the recording really is 16 kHz.
embeddings = [extract_embeddings(model, spec, feature_extractor, 16000) for spec in spectrograms]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.cluster import AgglomerativeClustering

def _estimate_num_speakers(features, max_speakers=8):
    """Pick the cluster count in [2, max_speakers] with the highest silhouette score.

    Returns 1 when there are too few samples for a silhouette score to be defined.
    """
    from sklearn.metrics import silhouette_score

    # silhouette_score requires between 2 and n_samples - 1 distinct labels.
    upper = min(max_speakers, len(features) - 1)
    best_k, best_score = 1, -np.inf
    for k in range(2, upper + 1):
        candidate = AgglomerativeClustering(n_clusters=k).fit_predict(features)
        score = silhouette_score(features, candidate)
        if score > best_score:
            best_k, best_score = k, score
    return best_k


def cluster_speakers(embeddings, num_speakers=None):
    """Assign a speaker label to each embedding with agglomerative clustering.

    Args:
        embeddings: Sequence of equally-shaped embedding arrays, one per segment.
            Each entry is flattened to a single row, so both ``(D,)`` and
            ``(1, D)`` arrays are accepted.
        num_speakers: Number of clusters. When ``None`` it is estimated by
            silhouette analysis over candidate cluster counts.

    Returns:
        A NumPy integer array with one cluster label per embedding. Empty input
        gives an empty array; a single embedding is always labelled 0.
    """
    # Build an (n_samples, n_features) matrix; scikit-learn rejects the 3-D
    # array that a list of (1, D) embeddings would otherwise become.
    features = np.asarray([np.ravel(item) for item in embeddings])

    # Clustering needs at least two samples; handle the trivial cases directly.
    if len(features) == 0:
        return np.empty(0, dtype=int)
    if len(features) == 1:
        return np.zeros(1, dtype=int)

    if num_speakers is None:
        num_speakers = _estimate_num_speakers(features)

    clustering = AgglomerativeClustering(n_clusters=num_speakers)
    labels = clustering.fit_predict(features)
    return labels

# Example usage: num_speakers is omitted, so the speaker count is estimated
# inside cluster_speakers.
speaker_labels = cluster_speakers(embeddings)
print(f'Speaker labels: {speaker_labels}')

In [None]:
def combine_segments(vad_segments, scd_segments, speaker_labels):
    """Pair every VAD segment with the speaker label at the same position.

    Args:
        vad_segments: Sequence of ``(start, end)`` pairs.
        scd_segments: Accepted for interface compatibility; not used here.
        speaker_labels: Indexable collection of labels, looked up by the
            position of each VAD segment.

    Returns:
        A list of dicts with keys ``'start_time'``, ``'end_time'`` and
        ``'speaker'``, in the same order as ``vad_segments``.
    """
    def _row(position, span):
        # Look the label up by position, as the segments and labels are parallel.
        who = speaker_labels[position]
        return {'start_time': span[0], 'end_time': span[1], 'speaker': who}

    return [_row(position, span) for position, span in enumerate(vad_segments)]

# Example usage: build the per-segment diarization records and print one line
# per segment.
diarization_result = combine_segments(vad_segments, scd_segments, speaker_labels)
for segment in diarization_result:
    print(f"Speaker {segment['speaker']} from {segment['start_time']}s to {segment['end_time']}s")