In [1]:
!pip install librosa numpy scikit-learn pandas soundfile gdown

import os
import librosa
import numpy as np
import soundfile as sf  # To handle .flac files
import gdown  # For downloading from Google Drive
import tarfile  # For extracting .tar.gz files
from sklearn.metrics.pairwise import cosine_similarity  # For similarity comparison

# Step 1: Download the dataset from Google Drive
google_drive_link = "https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd"
output = "LibriSpeech_dataset.tar.gz"
gdown.download(google_drive_link, output, quiet=False)

# Step 2: Extract the dataset
dataset_path = "/content/LibriSpeech"
with tarfile.open(output, "r:gz") as tar_ref:
    tar_ref.extractall(dataset_path)

audio_dir = '/content/LibriSpeech/LibriSpeech/dev-clean/'

# Step 3: Function to extract MFCC features from an audio file
def extract_mfcc_features(file_path, n_mfcc=13):
    signal, sr = sf.read(file_path)  # Load the .flac file
    mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)  # Extract MFCC features
    return np.mean(mfccs.T, axis=0)  # Return the mean MFCC features

# Step 4: Generate embedding for one-shot reference audio
def one_shot_embedding(file_path):
    return extract_mfcc_features(file_path)

# Step 5: Generate embedding for new audio and compare with the one-shot embedding
def identify_speaker(one_shot_embedding, new_audio_path, threshold=0.85):
    new_embedding = extract_mfcc_features(new_audio_path)

    # Calculate cosine similarity between embeddings
    similarity = cosine_similarity([one_shot_embedding], [new_embedding])
    if similarity[0][0] > threshold:
        return f"Same speaker with similarity: {similarity[0][0]:.2f}"
    else:
        return f"Different speaker with similarity: {similarity[0][0]:.2f}"

# Example usage
one_shot_audio = '/content/LibriSpeech/LibriSpeech/dev-clean/1272/128104/1272-128104-0000.flac'  # Path to one-shot audio
new_audio = '/content/LibriSpeech/LibriSpeech/dev-clean/1272/128104/1272-128104-0001.flac'  # Path to new audio

# Step 6: Generate one-shot embedding from reference audio
one_shot_emb = one_shot_embedding(one_shot_audio)

# Step 7: Compare new audio to the one-shot reference
result = identify_speaker(one_shot_emb, new_audio)
print(result)




Downloading...
From (original): https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd
From (redirected): https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd&confirm=t&uuid=abe1976f-ab55-45df-ac4b-48a6a6092f2e
To: /content/LibriSpeech_dataset.tar.gz
100%|██████████| 338M/338M [00:05<00:00, 59.3MB/s]


Same speaker with similarity: 1.00


In [2]:
# Example usage
one_shot_audio = '/content/LibriSpeech/LibriSpeech/dev-clean/1272/128104/1272-128104-0000.flac'  # Path to one-shot audio
new_audio = '/content/LibriSpeech/LibriSpeech/dev-clean/1272/128104/1272-128104-0002.flac'  # Path to new audio

# Step 6: Generate one-shot embedding from reference audio
one_shot_emb = one_shot_embedding(one_shot_audio)

# Step 7: Compare new audio to the one-shot reference
result = identify_speaker(one_shot_emb, new_audio)
print(result)


Same speaker with similarity: 1.00


In [3]:
# Assuming the previous code block has been run and embeddings have been generated

# Path to new audio sample (this is the news sample we are testing)
news_audio = '/content/LibriSpeech/LibriSpeech/dev-clean/84/121123/84-121123-0000.flac'

# Compare the new audio sample to the one-shot reference
result = identify_speaker(one_shot_emb, news_audio)
print(result)  # This should print whether the speaker is the same or different


Same speaker with similarity: 0.98


In [8]:
!pip install librosa numpy scikit-learn pandas soundfile gdown requests

import os
import librosa
import numpy as np
import soundfile as sf
import gdown
import requests
from sklearn.metrics.pairwise import cosine_similarity
import tarfile

# Download the LibriSpeech dataset from Google Drive
google_drive_link = "https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd"
output = "LibriSpeech_dataset.tar.gz"
gdown.download(google_drive_link, output, quiet=False)

# Extract the dataset
dataset_path = "/content/LibriSpeech"
with tarfile.open(output, "r:gz") as tar_ref:
    tar_ref.extractall(dataset_path)

# Download the sample audio from Google Drive
audio_sample_url = "https://drive.google.com/uc?id=13qwuTkp0YpEHK6hLzxSVcIcaBOpdDF5u"
audio_sample_path = "sample_audio.mp3"

# Download the audio file
gdown.download(audio_sample_url, audio_sample_path, quiet=False)

audio_dir = '/content/LibriSpeech/LibriSpeech/dev-clean/'

# Function to extract MFCC features from an audio file
def extract_mfcc_features(file_path, n_mfcc=13):
    # Load the audio file
    signal, sr = librosa.load(file_path, sr=None)  # Automatically handles different audio formats
    mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)  # Extract MFCC features
    return np.mean(mfccs.T, axis=0)  # Return the mean MFCC features

# Step 5: Generate embedding for new audio and compare with the one-shot embedding
def identify_speaker(one_shot_embedding, new_audio_path, threshold=0.85):
    new_embedding = extract_mfcc_features(new_audio_path)

    # Calculate cosine similarity between embeddings
    similarity = cosine_similarity([one_shot_embedding], [new_embedding])
    if similarity[0][0] > threshold:
        return f"Same speaker with similarity: {similarity[0][0]:.2f}"
    else:
        return f"Different speaker with similarity: {similarity[0][0]:.2f}"

# Example usage
one_shot_audio = '/content/LibriSpeech/LibriSpeech/dev-clean/1272/128104/1272-128104-0000.flac'  # Path to one-shot audio
new_audio = audio_sample_path  # Path to the downloaded audio

try:
    # Generate one-shot embedding from reference audio
    one_shot_emb = extract_mfcc_features(one_shot_audio)

    # Compare new audio to the one-shot reference
    result = identify_speaker(one_shot_emb, new_audio)
    print(result)  # This will print whether the speaker is the same or different
except Exception as e:
    print(e)




Downloading...
From (original): https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd
From (redirected): https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd&confirm=t&uuid=eb8a1bca-c947-4ac3-adfb-7598f44ebd24
To: /content/LibriSpeech_dataset.tar.gz
100%|██████████| 338M/338M [00:01<00:00, 191MB/s]
Downloading...
From: https://drive.google.com/uc?id=13qwuTkp0YpEHK6hLzxSVcIcaBOpdDF5u
To: /content/sample_audio.mp3
100%|██████████| 340k/340k [00:00<00:00, 10.7MB/s]

Same speaker with similarity: 0.98





In [18]:
!pip install librosa numpy scikit-learn pandas soundfile gdown requests

import os
import librosa
import numpy as np
import gdown
from sklearn.metrics.pairwise import cosine_similarity
import tarfile

# Download the LibriSpeech dataset from Google Drive
google_drive_link = "https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd"
output = "LibriSpeech_dataset.tar.gz"
gdown.download(google_drive_link, output, quiet=False)

# Extract the dataset
dataset_path = "/content/LibriSpeech"
with tarfile.open(output, "r:gz") as tar_ref:
    tar_ref.extractall(dataset_path)

# Download the sample audio from Google Drive
audio_sample_url = "https://drive.google.com/uc?id=13qwuTkp0YpEHK6hLzxSVcIcaBOpdDF5u"
audio_sample_path = "sample_audio.mp3"

# Download the audio file
gdown.download(audio_sample_url, audio_sample_path, quiet=False)

audio_dir = '/content/LibriSpeech/LibriSpeech/dev-clean/'

# Function to load and prepare audio (pad or trim to 30 seconds)
def load_and_prepare_audio(file_path, target_length=30):
    # Load audio file using librosa
    signal, sr = librosa.load(file_path, sr=None)  # Load audio file
    if signal.size == 0:
        raise ValueError(f"Audio file {file_path} is empty or could not be loaded.")

    current_length = signal.shape[0] / sr  # Current length in seconds
    if current_length < target_length:  # Check if audio is shorter than target length
        # Pad the audio signal with zeros
        padding_length = int((target_length - current_length) * sr)
        signal = np.pad(signal, (0, padding_length), mode='constant')
    elif current_length > target_length:  # Trim the audio signal if it's longer
        signal = signal[:int(target_length * sr)]
    return signal, sr

# Function to extract MFCC features from an audio file
def extract_mfcc_features(file_path, n_mfcc=13, n_fft=2048, hop_length=512):
    signal, sr = load_and_prepare_audio(file_path)  # Load and prepare audio
    mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
    mfccs_delta = librosa.feature.delta(mfccs)  # Delta features
    mfccs_delta2 = librosa.feature.delta(mfccs, order=2)  # Delta-Delta features
    return np.mean(mfccs.T, axis=0), np.mean(mfccs_delta.T, axis=0), np.mean(mfccs_delta2.T, axis=0)

# Function to create a combined feature vector
def create_feature_vector(file_path):
    mfccs, delta, delta2 = extract_mfcc_features(file_path)
    return np.hstack((mfccs, delta, delta2))  # Combine MFCC, Delta, and Delta-Delta

# Step 5: Generate embedding for new audio and compare with the one-shot embedding
def identify_speaker(one_shot_embedding, new_audio_path, threshold=0.85):
    new_embedding = create_feature_vector(new_audio_path)

    # Calculate cosine similarity between embeddings
    similarity = cosine_similarity([one_shot_embedding], [new_embedding])
    if similarity[0][0] > threshold:
        return f"Same speaker with similarity: {similarity[0][0]:.2f}"
    else:
        return f"Different speaker with similarity: {similarity[0][0]:.2f}"

# Example usage
one_shot_audio = '/content/LibriSpeech/LibriSpeech/dev-clean/1272/128104/1272-128104-0000.flac'  # Path to one-shot audio
new_audio = audio_sample_path  # Path to the downloaded audio

try:
    # Generate one-shot embedding from reference audio
    one_shot_emb = create_feature_vector(one_shot_audio)

    # Compare new audio to the one-shot reference
    result = identify_speaker(one_shot_emb, new_audio)
    print(result)  # This will print whether the speaker is the same or different
except Exception as e:
    print(e)




Downloading...
From (original): https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd
From (redirected): https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd&confirm=t&uuid=7534cff3-48d2-405b-9d5b-fe2c23ddce01
To: /content/LibriSpeech_dataset.tar.gz
100%|██████████| 338M/338M [00:01<00:00, 185MB/s]
Downloading...
From: https://drive.google.com/uc?id=13qwuTkp0YpEHK6hLzxSVcIcaBOpdDF5u
To: /content/sample_audio.mp3
100%|██████████| 340k/340k [00:00<00:00, 15.9MB/s]


Same speaker with similarity: 1.00


In [21]:
!pip install librosa numpy scikit-learn pandas soundfile gdown requests webrtcvad

import os
import librosa
import numpy as np
import gdown
from sklearn.metrics.pairwise import cosine_similarity
import tarfile
import soundfile as sf
import webrtcvad

# Download the LibriSpeech dataset from Google Drive
google_drive_link = "https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd"
output = "LibriSpeech_dataset.tar.gz"
gdown.download(google_drive_link, output, quiet=False)

# Extract the dataset
dataset_path = "/content/LibriSpeech"
with tarfile.open(output, "r:gz") as tar_ref:
    tar_ref.extractall(dataset_path)

# Download the sample audio from Google Drive
audio_sample_url = "https://drive.google.com/file/d/1FhyFfgBDpz41kegtYIjnZLAC2hQnCEJ3"
audio_sample_path = "sample_audio.mp3"

# Download the audio file
gdown.download(audio_sample_url, audio_sample_path, quiet=False)

audio_dir = '/content/LibriSpeech/LibriSpeech/dev-clean/'

# Voice Activity Detection (VAD) to detect speech segments
def detect_speech_segments(signal, sample_rate, frame_duration=30):
    vad = webrtcvad.Vad()
    frame_length = int(sample_rate * frame_duration / 1000)  # Convert duration to frames
    num_frames = len(signal) // frame_length

    segments = []
    for i in range(num_frames):
        start = i * frame_length
        stop = start + frame_length
        frame = signal[start:stop]

        if len(frame) < frame_length:
            continue  # Skip frames that are shorter than expected

        try:
            if vad.is_speech(frame.tobytes(), sample_rate):
                segments.append(frame)
        except Exception as e:
            print(f"Error while processing frame: {e}")

    return np.concatenate(segments) if segments else None

# Function to load audio and perform VAD
def load_audio_with_vad(file_path):
    try:
        signal, sr = librosa.load(file_path, sr=None)  # Load audio file
        speech_signal = detect_speech_segments(signal, sr)  # Apply VAD to get speech segments
        return speech_signal if speech_signal is not None else np.array([]), sr
    except Exception as e:
        print(f"Error loading audio file {file_path}: {e}")
        return np.array([]), 0

# Function to extract MFCC features from an audio file
def extract_mfcc_features(file_path, n_mfcc=13):
    signal, sr = load_audio_with_vad(file_path)  # Load and apply VAD
    if signal.size == 0:
        raise ValueError(f"No speech detected in {file_path}.")
    mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)  # Extract MFCC features
    return np.mean(mfccs.T, axis=0)  # Return the mean MFCC features

# Function to create a feature vector
def create_feature_vector(file_path):
    return extract_mfcc_features(file_path)

# Step 5: Generate embedding for new audio and compare with the one-shot embedding
def identify_speaker(one_shot_embedding, new_audio_path, threshold=0.85):
    new_embedding = create_feature_vector(new_audio_path)

    # Calculate cosine similarity between embeddings
    similarity = cosine_similarity([one_shot_embedding], [new_embedding])
    if similarity[0][0] > threshold:
        return f"Same speaker with similarity: {similarity[0][0]:.2f}"
    else:
        return f"Different speaker with similarity: {similarity[0][0]:.2f}"

# Example usage
one_shot_audio = '/content/LibriSpeech/LibriSpeech/dev-clean/1272/128104/1272-128104-0000.flac'  # Path to one-shot audio
new_audio = audio_sample_path  # Path to the downloaded audio

try:
    # Generate one-shot embedding from reference audio
    one_shot_emb = create_feature_vector(one_shot_audio)

    # Compare new audio to the one-shot reference
    result = identify_speaker(one_shot_emb, new_audio)
    print(result)  # This will print whether the speaker is the same or different
except Exception as e:
    print(e)




Downloading...
From (original): https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd
From (redirected): https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd&confirm=t&uuid=ec52b1ee-5b27-4b16-a569-b1f9870d1007
To: /content/LibriSpeech_dataset.tar.gz
100%|██████████| 338M/338M [00:01<00:00, 189MB/s]
Downloading...
From: https://drive.google.com/file/d/1FhyFfgBDpz41kegtYIjnZLAC2hQnCEJ3
To: /content/sample_audio.mp3
89.5kB [00:00, 23.5MB/s]

Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing fra




In [23]:
!pip install librosa numpy scikit-learn pandas soundfile gdown requests webrtcvad

import os
import librosa
import numpy as np
import gdown
from sklearn.metrics.pairwise import cosine_similarity
import tarfile
import soundfile as sf
import webrtcvad

# Download the LibriSpeech dataset from Google Drive
google_drive_link = "https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd"
output = "LibriSpeech_dataset.tar.gz"
gdown.download(google_drive_link, output, quiet=False)

# Extract the dataset
dataset_path = "/content/LibriSpeech"
with tarfile.open(output, "r:gz") as tar_ref:
    tar_ref.extractall(dataset_path)

# Download the sample audio from Google Drive
audio_sample_url = "https://drive.google.com/uc?id=13qwuTkp0YpEHK6hLzxSVcIcaBOpdDF5u"
audio_sample_path = "sample_audio.mp3"

# Download the audio file
gdown.download(audio_sample_url, audio_sample_path, quiet=False)

audio_dir = '/content/LibriSpeech/LibriSpeech/dev-clean/'

# Voice Activity Detection (VAD) to detect speech segments
def detect_speech_segments(signal, sample_rate, frame_duration=30):
    vad = webrtcvad.Vad()
    frame_length = int(sample_rate * frame_duration / 1000)  # Convert duration to frames
    num_frames = len(signal) // frame_length

    segments = []
    for i in range(num_frames):
        start = i * frame_length
        stop = start + frame_length
        frame = signal[start:stop]

        if len(frame) < frame_length:
            continue  # Skip frames that are shorter than expected

        try:
            if vad.is_speech(frame.tobytes(), sample_rate):
                segments.append(frame)
        except Exception as e:
            print(f"Error while processing frame: {e}")

    return np.concatenate(segments) if segments else None

# Function to load audio and perform VAD
def load_audio_with_vad(file_path):
    try:
        signal, sr = librosa.load(file_path, sr=None)  # Load audio file
        speech_signal = detect_speech_segments(signal, sr)  # Apply VAD to get speech segments
        return speech_signal if speech_signal is not None else np.array([]), sr
    except Exception as e:
        print(f"Error loading audio file {file_path}: {e}")
        return np.array([]), 0

# Function to extract MFCC features from an audio file
def extract_mfcc_features(file_path, n_mfcc=13):
    signal, sr = load_audio_with_vad(file_path)  # Load and apply VAD
    if signal.size == 0:
        raise ValueError(f"No speech detected in {file_path}.")
    mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)  # Extract MFCC features
    return np.mean(mfccs.T, axis=0)  # Return the mean MFCC features

# Function to create a feature vector
def create_feature_vector(file_path):
    return extract_mfcc_features(file_path)

# Step 5: Generate embedding for new audio and compare with the one-shot embedding
def identify_speaker(one_shot_embedding, new_audio_path, threshold=0.85):
    new_embedding = create_feature_vector(new_audio_path)

    # Calculate cosine similarity between embeddings
    similarity = cosine_similarity([one_shot_embedding], [new_embedding])
    if similarity[0][0] > threshold:
        return f"Same speaker with similarity: {similarity[0][0]:.2f}"
    else:
        return f"Different speaker with similarity: {similarity[0][0]:.2f}"

# Example usage
one_shot_audio = '/content/LibriSpeech/LibriSpeech/dev-clean/1272/128104/1272-128104-0000.flac'  # Path to one-shot audio
new_audio = audio_sample_path  # Path to the downloaded audio

try:
    # Generate one-shot embedding from reference audio
    one_shot_emb = create_feature_vector(one_shot_audio)

    # Compare new audio to the one-shot reference
    result = identify_speaker(one_shot_emb, new_audio)
    print(result)  # This will print whether the speaker is the same or different
except Exception as e:
    print(e)




Downloading...
From (original): https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd
From (redirected): https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd&confirm=t&uuid=ad53c181-7d41-433c-b8d2-7fadc5729823
To: /content/LibriSpeech_dataset.tar.gz
100%|██████████| 338M/338M [00:01<00:00, 218MB/s]
Downloading...
From: https://drive.google.com/uc?id=13qwuTkp0YpEHK6hLzxSVcIcaBOpdDF5u
To: /content/sample_audio.mp3
100%|██████████| 340k/340k [00:00<00:00, 25.6MB/s]

Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing frame
Error while processing frame: Error while processing fra




In [24]:
!pip install librosa numpy scikit-learn pandas gdown requests soundfile

import os
import librosa
import numpy as np
import gdown
from sklearn.metrics.pairwise import cosine_similarity
import tarfile

# Download the LibriSpeech dataset from Google Drive
google_drive_link = "https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd"
output = "LibriSpeech_dataset.tar.gz"
gdown.download(google_drive_link, output, quiet=False)

# Extract the dataset
dataset_path = "/content/LibriSpeech"
with tarfile.open(output, "r:gz") as tar_ref:
    tar_ref.extractall(dataset_path)

# Download the sample audio from Google Drive
audio_sample_url = "https://drive.google.com/uc?id=13qwuTkp0YpEHK6hLzxSVcIcaBOpdDF5u"
audio_sample_path = "sample_audio.mp3"

# Download the audio file
gdown.download(audio_sample_url, audio_sample_path, quiet=False)

audio_dir = '/content/LibriSpeech/LibriSpeech/dev-clean/'

# Function to extract MFCC features from an audio file
def extract_mfcc_features(file_path, n_mfcc=13):
    try:
        signal, sr = librosa.load(file_path, sr=None)  # Load audio file
        if len(signal) == 0:
            raise ValueError(f"No audio data in {file_path}.")

        mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)  # Extract MFCC features
        return np.mean(mfccs.T, axis=0)  # Return the mean MFCC features
    except Exception as e:
        print(f"Error loading audio file {file_path}: {e}")
        return np.array([])

# Function to create a feature vector
def create_feature_vector(file_path):
    return extract_mfcc_features(file_path)

# Step 5: Generate embedding for new audio and compare with the one-shot embedding
def identify_speaker(one_shot_embedding, new_audio_path, threshold=0.85):
    new_embedding = create_feature_vector(new_audio_path)

    if new_embedding.size == 0:
        return "Error: No features extracted from new audio."

    # Calculate cosine similarity between embeddings
    similarity = cosine_similarity([one_shot_embedding], [new_embedding])
    if similarity[0][0] > threshold:
        return f"Same speaker with similarity: {similarity[0][0]:.2f}"
    else:
        return f"Different speaker with similarity: {similarity[0][0]:.2f}"

# Example usage
one_shot_audio = '/content/LibriSpeech/LibriSpeech/dev-clean/1272/128104/1272-128104-0000.flac'  # Path to one-shot audio
new_audio = audio_sample_path  # Path to the downloaded audio

try:
    # Generate one-shot embedding from reference audio
    one_shot_emb = create_feature_vector(one_shot_audio)

    if one_shot_emb.size == 0:
        raise ValueError("Error: No features extracted from one-shot audio.")

    # Compare new audio to the one-shot reference
    result = identify_speaker(one_shot_emb, new_audio)
    print(result)  # This will print whether the speaker is the same or different
except Exception as e:
    print(e)




Downloading...
From (original): https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd
From (redirected): https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd&confirm=t&uuid=27af0574-3b7b-4baa-aa47-89647a14fff3
To: /content/LibriSpeech_dataset.tar.gz
100%|██████████| 338M/338M [00:01<00:00, 227MB/s]
Downloading...
From: https://drive.google.com/uc?id=13qwuTkp0YpEHK6hLzxSVcIcaBOpdDF5u
To: /content/sample_audio.mp3
100%|██████████| 340k/340k [00:00<00:00, 16.9MB/s]

Same speaker with similarity: 0.98





In [26]:
!pip install librosa numpy scikit-learn pandas soundfile gdown transformers torch torchaudio

import os
import librosa
import numpy as np
import soundfile as sf  # To handle .flac files
import gdown  # For downloading from Google Drive
import tarfile  # For extracting .tar.gz files
from sklearn.metrics import accuracy_score
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Download the dataset
google_drive_link = "https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd"
output = "LibriSpeech_dataset.tar.gz"
gdown.download(google_drive_link, output, quiet=False)

dataset_path = "/content/LibriSpeech"
with tarfile.open(output, "r:gz") as tar_ref:
    tar_ref.extractall(dataset_path)

audio_dir = '/content/LibriSpeech/LibriSpeech/dev-clean/'

# Load pretrained Wav2Vec2 model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# Function to extract embeddings using Wav2Vec2
def extract_embeddings(file_path):
    audio_input, _ = sf.read(file_path)  # Load the audio file
    inputs = processor(audio_input, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values).logits
    embeddings = torch.mean(logits, dim=1)  # Average pooling to get a single embedding
    return embeddings.numpy().flatten()  # Return as a 1D array

def load_audio_data(audio_dir):
    features = []
    labels = []
    for root, dirs, files in os.walk(audio_dir):
        for file in files:
            if file.endswith('.flac'):  # Process .flac files
                label = root.split('/')[-2]  # Speaker ID is the folder name before the file level
                file_path = os.path.join(root, file)
                embedding = extract_embeddings(file_path)
                features.append(embedding)
                labels.append(label)
    return np.array(features), np.array(labels)

# Load audio data and extract embeddings
X, y = load_audio_data(audio_dir)

# Download the sample audio for the new speaker
sample_audio_link = "https://drive.google.com/uc?id=1eWp85xyn08a7qP1A3aKFXlR767IALea_"
sample_audio_output = "sample_audio.flac"
gdown.download(sample_audio_link, sample_audio_output, quiet=False)

# Extract embedding for the new speaker's audio
new_speaker_audio_path = sample_audio_output  # Path to the new audio for the new speaker
new_speaker_embedding = extract_embeddings(new_speaker_audio_path)

# Compute similarity (using cosine similarity)
from sklearn.metrics.pairwise import cosine_similarity

def is_similar(new_embedding, existing_embeddings, threshold=0.7):
    similarities = cosine_similarity([new_embedding], existing_embeddings)
    return np.any(similarities >= threshold)

# Check similarity with the existing speakers
similarity_result = is_similar(new_speaker_embedding, X)

if similarity_result:
    print("The new input is similar to existing data.")
else:
    print("The new input is not similar to existing data.")




Downloading...
From (original): https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd
From (redirected): https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd&confirm=t&uuid=e4f013c9-eb4b-483f-85d0-5a97096f1eba
To: /content/LibriSpeech_dataset.tar.gz
100%|██████████| 338M/338M [00:02<00:00, 121MB/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

KeyboardInterrupt: 

In [1]:
# Install necessary libraries
!pip install git+https://github.com/openai/whisper.git
!pip install gdown

import torch
import whisper
import gdown
import numpy as np
import os

# Download the sample audio for the new speaker
sample_audio_link = "https://drive.google.com/uc?id=1eWp85xyn08a7qP1A3aKFXlR767IALea_"
sample_audio_output = "sample_audio.wav"
gdown.download(sample_audio_link, sample_audio_output, quiet=False)

# Load the Whisper model
model = whisper.load_model("base")  # You can also use 'small', 'medium', or 'large'

# Function to transcribe audio and get embeddings
def transcribe_and_extract_embedding(file_path):
    # Transcribe audio
    result = model.transcribe(file_path)
    return result['text'], result['mel']

# Load the dataset
google_drive_link = "https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd"
output = "LibriSpeech_dataset.tar.gz"
gdown.download(google_drive_link, output, quiet=False)

# Extract the dataset
dataset_path = "/content/LibriSpeech"
os.makedirs(dataset_path, exist_ok=True)
!tar -xzf {output} -C {dataset_path}

audio_dir = '/content/LibriSpeech/LibriSpeech/dev-clean/'

# Function to load and extract features from the dataset
def load_and_extract_features(audio_dir):
    features = []
    labels = []
    for root, dirs, files in os.walk(audio_dir):
        for file in files:
            if file.endswith('.flac'):  # Process .flac files
                label = root.split('/')[-2]  # Speaker ID is the folder name before the file level
                file_path = os.path.join(root, file)
                text, mel = transcribe_and_extract_embedding(file_path)
                features.append(mel)  # Use the mel spectrogram as the feature
                labels.append(label)
    return np.array(features), np.array(labels)

# Extract features from the dataset
X, y = load_and_extract_features(audio_dir)

# Extract features for the new speaker's audio
new_text, new_mel = transcribe_and_extract_embedding(sample_audio_output)

# Compare new speaker's features with the dataset features
def compare_embeddings(new_embedding, X, y):
    similarities = np.linalg.norm(X - new_embedding, axis=1)  # Euclidean distance
    most_similar_index = np.argmin(similarities)
    return y[most_similar_index], similarities[most_similar_index]

# Get the prediction
predicted_speaker, similarity_score = compare_embeddings(new_mel, X, y)
print(f'The new input is classified as: {predicted_speaker} with similarity score: {similarity_score:.4f}')


Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-xiqausr8
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-xiqausr8
  Resolved https://github.com/openai/whisper.git to commit 25639fc17ddc013d56c594bfbf7644f2185fad84
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper==20240930)
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting triton>=2.0.0 (from openai-whisper==20240930)
  Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

Downloading...
From: https://drive.google.com/uc?id=1eWp85xyn08a7qP1A3aKFXlR767IALea_
To: /content/sample_audio.wav
100%|██████████| 3.25M/3.25M [00:00<00:00, 93.7MB/s]
100%|████████████████████████████████████████| 139M/139M [00:01<00:00, 142MiB/s]
  checkpoint = torch.load(fp, map_location=device)
Downloading...
From (original): https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd
From (redirected): https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd&confirm=t&uuid=ad48b985-156d-43d9-9802-e32457f460fc
To: /content/LibriSpeech_dataset.tar.gz
100%|██████████| 338M/338M [00:11<00:00, 28.4MB/s]


KeyError: 'mel'

In [4]:
# Install necessary libraries
!pip install git+https://github.com/openai/whisper.git
!pip install gdown
!pip install torch  # Ensure PyTorch is installed

import torch
import whisper
import gdown
import numpy as np
import os

# Download the sample audio for the new speaker
sample_audio_link = "https://drive.google.com/uc?id=1eWp85xyn08a7qP1A3aKFXlR767IALea_"
sample_audio_output = "sample_audio.wav"
gdown.download(sample_audio_link, sample_audio_output, quiet=False)

# Load the Whisper model
model = whisper.load_model("base")  # You can also use 'small', 'medium', or 'large'

# Function to transcribe audio and get embeddings
def transcribe_and_extract_embedding(file_path):
    # Load audio
    audio = whisper.load_audio(file_path)
    audio = whisper.pad_or_trim(audio)

    # Get mel spectrogram
    mel = whisper.log_mel_spectrogram(audio).numpy()

    # Transcribe audio
    result = model.transcribe(file_path)
    text = result['text']

    return text, mel

# Load the dataset
google_drive_link = "https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd"
output = "LibriSpeech_dataset.tar.gz"
gdown.download(google_drive_link, output, quiet=False)

# Extract the dataset
dataset_path = "/content/LibriSpeech"
os.makedirs(dataset_path, exist_ok=True)
!tar -xzf {output} -C {dataset_path}

audio_dir = '/content/LibriSpeech/LibriSpeech/dev-clean/'

# Function to load and extract features from the dataset
def load_and_extract_features(audio_dir):
    features = []
    labels = []
    for root, dirs, files in os.walk(audio_dir):
        for file in files:
            if file.endswith('.flac'):  # Process .flac files
                label = root.split('/')[-2]  # Speaker ID is the folder name before the file level
                file_path = os.path.join(root, file)
                text, mel = transcribe_and_extract_embedding(file_path)
                features.append(mel)  # Use the mel spectrogram as the feature
                labels.append(label)
    return np.array(features), np.array(labels)

# Extract features from the dataset
X, y = load_and_extract_features(audio_dir)

# Extract features for the new speaker's audio
new_text, new_mel = transcribe_and_extract_embedding(sample_audio_output)

# Compare new speaker's features with the dataset features
def compare_embeddings(new_embedding, X, y):
    similarities = np.linalg.norm(X - new_embedding, axis=1)  # Euclidean distance
    most_similar_index = np.argmin(similarities)
    return y[most_similar_index], similarities[most_similar_index]

# Get the prediction
predicted_speaker, similarity_score = compare_embeddings(new_mel, X, y)
print(f'The new input is classified as: {predicted_speaker} with similarity score: {similarity_score:.4f}')


Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-1okqx_xh
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-1okqx_xh
  Resolved https://github.com/openai/whisper.git to commit 25639fc17ddc013d56c594bfbf7644f2185fad84
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


Downloading...
From: https://drive.google.com/uc?id=1eWp85xyn08a7qP1A3aKFXlR767IALea_
To: /content/sample_audio.wav
100%|██████████| 3.25M/3.25M [00:00<00:00, 122MB/s]
  checkpoint = torch.load(fp, map_location=device)
Downloading...
From (original): https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd
From (redirected): https://drive.google.com/uc?id=1xkDfnwkBoeEPxRQ91hwAkwbICVhVJOQd&confirm=t&uuid=a945a496-f498-4b53-82b3-b319bf461a2f
To: /content/LibriSpeech_dataset.tar.gz
100%|██████████| 338M/338M [00:01<00:00, 174MB/s]


KeyboardInterrupt: 

In [1]:
# Install necessary libraries
!pip install transformers torch torchaudio soundfile gdown numpy

import torch
import torch.nn as nn
import torchaudio
import soundfile as sf
import gdown
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os

# Download the sample audio file from the provided Google Drive link
google_drive_link = "https://drive.google.com/uc?id=1eWp85xyn08a7qP1A3aKFXlR767IALea_"
output = "sample_audio_file.flac"
gdown.download(google_drive_link, output, quiet=False)

# Load Wav2Vec 2.0 model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

# Function to extract embeddings using Wav2Vec 2.0
def extract_wav2vec_embeddings(file_path):
    # Load the audio file
    speech, sample_rate = sf.read(file_path)

    # Resample to 16kHz if necessary
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        speech = resampler(torch.tensor(speech).unsqueeze(0)).squeeze(0).numpy()

    # Preprocess the audio for Wav2Vec 2.0
    inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)

    # Extract the embeddings using Wav2Vec 2.0
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1).squeeze().numpy()

    return embeddings

# Define the Siamese Network
class SiameseNetwork(nn.Module):
    def __init__(self):
        super(SiameseNetwork, self).__init__()
        # Define the neural network layers
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Define the contrastive loss function
class ContrastiveLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        euclidean_distance = nn.functional.pairwise_distance(output1, output2)
        loss_contrastive = torch.mean((1 - label) * torch.pow(euclidean_distance, 2) +
                                      (label) * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2))
        return loss_contrastive

# Example dataset class for training pairs
class VoiceDataset(Dataset):
    def __init__(self, audio_files, labels):
        self.audio_files = audio_files
        self.labels = labels

    def __len__(self):
        return len(self.audio_files)

    def __getitem__(self, idx):
        # Here we simulate a pair of inputs (you should prepare pairs of similar/dissimilar audio samples)
        file1 = self.audio_files[idx]
        label1 = self.labels[idx]
        file2 = np.random.choice(self.audio_files)  # Randomly pick another sample for comparison
        label2 = np.random.choice(self.labels)

        # Label 1 means similar, 0 means dissimilar
        label = 1 if label1 == label2 else 0

        emb1 = torch.tensor(extract_wav2vec_embeddings(file1), dtype=torch.float32)
        emb2 = torch.tensor(extract_wav2vec_embeddings(file2), dtype=torch.float32)

        return emb1, emb2, torch.tensor(label, dtype=torch.float32)

# Prepare a dummy dataset for demonstration
audio_files = [output] * 5  # Simulating a small dataset with the same file
labels = ['Speaker1'] * 5  # Simulating that all belong to the same speaker

# Create the dataset and dataloader
dataset = VoiceDataset(audio_files, labels)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Instantiate the Siamese Network and Loss
siamese_network = SiameseNetwork()
criterion = ContrastiveLoss()

# Optimizer
optimizer = torch.optim.Adam(siamese_network.parameters(), lr=0.001)

# Training loop
for epoch in range(10):  # Training for 10 epochs
    total_loss = 0
    for data in dataloader:
        emb1, emb2, label = data
        optimizer.zero_grad()

        # Forward pass
        output1 = siamese_network(emb1)
        output2 = siamese_network(emb2)

        # Compute the loss
        loss = criterion(output1, output2, label)
        total_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader)}")

# After training, we can now use the model to predict similarity between new inputs
# Function to compare new embeddings using the trained Siamese network
def compare_embeddings(emb1, emb2):
    emb1 = torch.tensor(emb1, dtype=torch.float32)
    emb2 = torch.tensor(emb2, dtype=torch.float32)
    output1 = siamese_network(emb1)
    output2 = siamese_network(emb2)
    similarity = nn.functional.pairwise_distance(output1, output2)
    return similarity.item()

# Inference: Compare a new input with stored embeddings
new_input_embedding = extract_wav2vec_embeddings(output)
stored_embedding = extract_wav2vec_embeddings(output)  # Example, should be from stored data
similarity = compare_embeddings(new_input_embedding, stored_embedding)

print(f'Similarity score: {similarity:.2f}')




Downloading...
From: https://drive.google.com/uc?id=1eWp85xyn08a7qP1A3aKFXlR767IALea_
To: /content/sample_audio_file.flac
100%|██████████| 3.25M/3.25M [00:00<00:00, 78.5MB/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and

RuntimeError: expected scalar type Double but found Float