**Wav2Vec**

In [None]:
!pip install torch torchaudio transformers



Wav2vec

In [None]:
import torch
import torchaudio
import sqlite3
import numpy as np
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import files

# Load the pre-trained Wav2Vec2 model for feature extraction
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name)

# Initialize SQLite database connection
conn = sqlite3.connect('audio_embeddings.db')
cursor = conn.cursor()

# Create a table for storing embeddings if it doesn't exist
cursor.execute('''
    CREATE TABLE IF NOT EXISTS audio_embeddings (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        file_name TEXT,
        embedding BLOB
    )
''')
conn.commit()

# Function to process audio and extract embeddings with a fixed size
def extract_embeddings(file_path, fixed_embedding_size=768):
    # Load the audio file
    waveform, sample_rate = torchaudio.load(file_path)

    # Resample the audio to 16 kHz if necessary
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    # Normalize and preprocess the audio
    input_values = processor(waveform.squeeze().numpy(), return_tensors="pt", sampling_rate=16000).input_values

    # Extract embeddings using Wav2Vec2Model
    with torch.no_grad():
        embeddings = model(input_values).last_hidden_state

    # Average the embeddings across the sequence length (dimension 1) to get a single vector
    mean_embeddings = embeddings.mean(dim=1).squeeze().numpy()

    # Ensure the embedding has a fixed size by truncating or padding with zeros
    if mean_embeddings.shape[0] > fixed_embedding_size:
        mean_embeddings = mean_embeddings[:fixed_embedding_size]
    elif mean_embeddings.shape[0] < fixed_embedding_size:
        mean_embeddings = np.pad(mean_embeddings, (0, fixed_embedding_size - mean_embeddings.shape[0]), 'constant')

    return mean_embeddings

# Function to store embeddings in the SQLite database
def store_embedding(file_name, embedding):
    # Convert the numpy array to binary data to store in the database
    embedding_blob = embedding.tobytes()
    cursor.execute('INSERT INTO audio_embeddings (file_name, embedding) VALUES (?, ?)', (file_name, embedding_blob))
    conn.commit()

# Function to find the most similar audio from the database for a given embedding
def find_most_similar(embedding, similarity_threshold=0.75):
    cursor.execute('SELECT file_name, embedding FROM audio_embeddings')
    rows = cursor.fetchall()

    max_similarity = -1
    most_similar_file = None

    for row in rows:
        stored_file_name = row[0]
        stored_embedding = np.frombuffer(row[1], dtype=np.float32)

        # Ensure both embeddings have the same size
        if stored_embedding.shape[0] != embedding.shape[0]:
            continue  # Skip comparison if the sizes don't match

        # Compute the cosine similarity between the new and stored embeddings
        similarity = cosine_similarity([embedding], [stored_embedding])[0][0]

        if similarity > max_similarity:
            max_similarity = similarity
            most_similar_file = stored_file_name

    # Determine if the new file is considered similar to any stored file
    is_new_voice = max_similarity < similarity_threshold

    return most_similar_file, max_similarity, is_new_voice

# Upload and test new audio files
print("Please upload a new audio file for similarity testing:")
new_file = files.upload()
new_file_name = list(new_file.keys())[0]

# Extract embeddings for the uploaded file
new_embedding = extract_embeddings(new_file_name)

# Find the most similar audio file and check if it matches any existing ones
most_similar_file, similarity_score, is_new_voice = find_most_similar(new_embedding)

# Display the results of the similarity test
if is_new_voice:
    print(f"The uploaded file '{new_file_name}' does not match any existing files closely enough. Adding it to the database as a new voice.")
    store_embedding(new_file_name, new_embedding)
else:
    print(f"The uploaded file '{new_file_name}' is most similar to '{most_similar_file}' with a similarity score of {similarity_score:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Please upload a new audio file for similarity testing:


Saving a1.wav to a1.wav
Saving a2.wav to a2.wav
Saving a3.wav to a3.wav
Saving a4.wav to a4.wav
Saving a5.wav to a5.wav
The uploaded file 'a1.wav' does not match any existing files closely enough. Adding it to the database as a new voice.


# **Wav2vec Similarity using Cosine**

In [None]:
import torch
import torchaudio
import sqlite3
import numpy as np
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import files

# Load the pre-trained Wav2Vec2 model for feature extraction
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name)

# Initialize SQLite database connection
conn = sqlite3.connect('audio_embeddings.db')
cursor = conn.cursor()

# Create a table for storing embeddings if it doesn't exist
cursor.execute('''
    CREATE TABLE IF NOT EXISTS audio_embeddings (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        file_name TEXT,
        embedding BLOB
    )
''')
conn.commit()

# Function to process audio and extract embeddings with a fixed size
def extract_embeddings(file_path, fixed_embedding_size=768):
    # Load the audio file
    waveform, sample_rate = torchaudio.load(file_path)

    # Resample the audio to 16 kHz if necessary
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    # Normalize and preprocess the audio
    input_values = processor(waveform.squeeze().numpy(), return_tensors="pt", sampling_rate=16000).input_values

    # Extract embeddings using Wav2Vec2Model
    with torch.no_grad():
        embeddings = model(input_values).last_hidden_state

    # Average the embeddings across the sequence length (dimension 1) to get a single vector
    mean_embeddings = embeddings.mean(dim=1).squeeze().numpy()

    # Ensure the embedding has a fixed size by truncating or padding with zeros
    if mean_embeddings.shape[0] > fixed_embedding_size:
        mean_embeddings = mean_embeddings[:fixed_embedding_size]
    elif mean_embeddings.shape[0] < fixed_embedding_size:
        mean_embeddings = np.pad(mean_embeddings, (0, fixed_embedding_size - mean_embeddings.shape[0]), 'constant')

    return mean_embeddings

# Function to store embeddings in the SQLite database
def store_embedding(file_name, embedding):
    # Convert the numpy array to binary data to store in the database
    embedding_blob = embedding.tobytes()
    cursor.execute('INSERT INTO audio_embeddings (file_name, embedding) VALUES (?, ?)', (file_name, embedding_blob))
    conn.commit()

# Function to find the most similar audio from the database for a given embedding
def find_most_similar(embedding, similarity_threshold=0.75):
    cursor.execute('SELECT file_name, embedding FROM audio_embeddings')
    rows = cursor.fetchall()

    max_similarity = -1
    most_similar_file = None

    for row in rows:
        stored_file_name = row[0]
        stored_embedding = np.frombuffer(row[1], dtype=np.float32)

        # Ensure both embeddings have the same size
        if stored_embedding.shape[0] != embedding.shape[0]:
            continue  # Skip comparison if the sizes don't match

        # Compute the cosine similarity between the new and stored embeddings
        similarity = cosine_similarity([embedding], [stored_embedding])[0][0]

        if similarity > max_similarity:
            max_similarity = similarity
            most_similar_file = stored_file_name

    # Determine if the new file is considered similar to any stored file
    is_new_voice = max_similarity < similarity_threshold

    return most_similar_file, max_similarity, is_new_voice

# Step 1: Initial Upload of Files to Create Database
print("Please upload initial audio files to create the database:")
initial_files = files.upload()

# Process and store the uploaded initial files in the database
for file_name in initial_files.keys():
    embedding = extract_embeddings(file_name)
    store_embedding(file_name, embedding)
    print(f"Stored embedding for {file_name} in the database.")

# Step 2: Testing Phase - Upload a new file for similarity testing
print("Please upload a new audio file for similarity testing:")
new_file = files.upload()
new_file_name = list(new_file.keys())[0]

# Extract embeddings for the uploaded file
new_embedding = extract_embeddings(new_file_name)

# Step 3: Find the most similar audio file and update the database if necessary
most_similar_file, similarity_score, is_new_voice = find_most_similar(new_embedding)

# Display the results of the similarity test and update the database if needed
if is_new_voice:
    print(f"The uploaded file '{new_file_name}' does not match any existing files closely enough. Adding it to the database as a new voice.")
    store_embedding(new_file_name, new_embedding)
else:
    print(f"The uploaded file '{new_file_name}' is most similar to '{most_similar_file}' with a similarity score of {similarity_score:.4f}")


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Please upload initial audio files to create the database:


Saving a1.wav to a1 (1).wav
Saving a2.wav to a2 (1).wav
Saving a3.wav to a3 (1).wav
Saving a4.wav to a4 (1).wav
Saving a5.wav to a5 (1).wav
Stored embedding for a1 (1).wav in the database.
Stored embedding for a2 (1).wav in the database.
Stored embedding for a3 (1).wav in the database.
Stored embedding for a4 (1).wav in the database.
Stored embedding for a5 (1).wav in the database.
Please upload a new audio file for similarity testing:


Saving r2.wav to r2.wav
The uploaded file 'r2.wav' is most similar to 'a2 (1).wav' with a similarity score of 0.9726


# **Flattening**

In [None]:
# Function to process audio and extract embeddings with fixed-size flattening
def extract_embeddings(file_path, fixed_embedding_size=768):
    # Load the audio file
    waveform, sample_rate = torchaudio.load(file_path)

    # Resample the audio to 16 kHz if necessary
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    # Normalize and preprocess the audio
    input_values = processor(waveform.squeeze().numpy(), return_tensors="pt", sampling_rate=16000).input_values

    # Extract embeddings using Wav2Vec2Model
    with torch.no_grad():
        embeddings = model(input_values).last_hidden_state

    # Flatten the embeddings to create a fixed-size vector
    flattened_embeddings = embeddings.flatten().numpy()

    # Ensure that the embedding has the fixed size by truncating or padding with zeros
    if flattened_embeddings.shape[0] > fixed_embedding_size:
        flattened_embeddings = flattened_embeddings[:fixed_embedding_size]
    elif flattened_embeddings.shape[0] < fixed_embedding_size:
        flattened_embeddings = np.pad(flattened_embeddings, (0, fixed_embedding_size - flattened_embeddings.shape[0]), 'constant')

    # Reshape to ensure we have a 2D array with shape (1, fixed_embedding_size)
    return flattened_embeddings.reshape(1, -1)

# Function to store embeddings in the SQLite database
def store_embedding(file_name, embedding):
    # Convert the numpy array to binary data to store in the database
    embedding_blob = embedding.tobytes()
    cursor.execute('INSERT INTO audio_embeddings (file_name, embedding) VALUES (?, ?)', (file_name, embedding_blob))
    conn.commit()

# Function to find the most similar audio from the database for a given embedding
def find_most_similar(embedding, similarity_threshold=0.75):
    cursor.execute('SELECT file_name, embedding FROM audio_embeddings')
    rows = cursor.fetchall()

    max_similarity = -1
    most_similar_file = None

    for row in rows:
        stored_file_name = row[0]
        stored_embedding = np.frombuffer(row[1], dtype=np.float32)

        # Ensure that the stored embedding is also reshaped to the correct size
        stored_embedding = stored_embedding.reshape(1, -1)

        # Compute the cosine similarity between the new and stored embeddings
        similarity = cosine_similarity(embedding, stored_embedding)[0][0]

        if similarity > max_similarity:
            max_similarity = similarity
            most_similar_file = stored_file_name

    # Determine if the new file is considered similar to any stored file
    is_new_voice = max_similarity < similarity_threshold

    return most_similar_file, max_similarity, is_new_voice


# Step 2: Testing Phase - Upload a new file for similarity testing
print("Please upload a new audio file for similarity testing:")
new_file = files.upload()
new_file_name = list(new_file.keys())[0]

# Extract embeddings for the uploaded file
new_embedding = extract_embeddings(new_file_name)

# Step 3: Find the most similar audio file and update the database if necessary
most_similar_file, similarity_score, is_new_voice = find_most_similar(new_embedding)

# Display the results of the similarity test and update the database if needed
if is_new_voice:
    print(f"The uploaded file '{new_file_name}' does not match any existing files closely enough. Adding it to the database as a new voice.")
    store_embedding(new_file_name, new_embedding)
else:
    print(f"The uploaded file '{new_file_name}' is most similar to '{most_similar_file}' with a similarity score of {similarity_score:.4f}")


Please upload a new audio file for similarity testing:


Saving harvard.wav to harvard (3).wav
The uploaded file 'harvard (3).wav' is most similar to 'a2 (1).wav' with a similarity score of 0.8964


In [None]:

# Step 2: Testing Phase - Upload a new file for similarity testing
print("Please upload a new audio file for similarity testing:")
new_file = files.upload()
new_file_name = list(new_file.keys())[0]

# Extract embeddings for the uploaded file
new_embedding = extract_embeddings(new_file_name)

# Step 3: Find the most similar audio file and update the database if necessary
most_similar_file, similarity_score, is_new_voice = find_most_similar(new_embedding)

# Display the results of the similarity test and update the database if needed
if is_new_voice:
    print(f"The uploaded file '{new_file_name}' does not match any existing files closely enough. Adding it to the database as a new voice.")
    store_embedding(new_file_name, new_embedding)
else:
    print(f"The uploaded file '{new_file_name}' is most similar to '{most_similar_file}' with a similarity score of {similarity_score:.4f}")


Please upload a new audio file for similarity testing:


Saving jackhammer.wav to jackhammer (1).wav
The uploaded file 'jackhammer (1).wav' is most similar to 'a2 (1).wav' with a similarity score of 0.9191


# **Normalize**

In [None]:
from sklearn.preprocessing import normalize

# Function to process audio and extract embeddings using the flatten approach with fixed size
def extract_embeddings(file_path, fixed_embedding_size=3072):
    # Load the audio file
    waveform, sample_rate = torchaudio.load(file_path)

    # Resample the audio to 16 kHz if necessary
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    # Normalize and preprocess the audio
    input_values = processor(waveform.squeeze().numpy(), return_tensors="pt", sampling_rate=16000).input_values

    # Extract embeddings using Wav2Vec2Model
    with torch.no_grad():
        embeddings = model(input_values).last_hidden_state

    # Flatten the embeddings to create a fixed-size vector
    flattened_embeddings = embeddings.flatten().numpy()

    # Ensure that the embedding has the fixed size by truncating or padding with zeros
    if flattened_embeddings.shape[0] > fixed_embedding_size:
        flattened_embeddings = flattened_embeddings[:fixed_embedding_size]
    elif flattened_embeddings.shape[0] < fixed_embedding_size:
        flattened_embeddings = np.pad(flattened_embeddings, (0, fixed_embedding_size - flattened_embeddings.shape[0]), 'constant')

    # Normalize the flattened embedding for better similarity comparison
    normalized_embedding = normalize(flattened_embeddings.reshape(1, -1))

    return normalized_embedding

# Function to store embeddings in the SQLite database
def store_embedding(file_name, embedding):
    # Convert the numpy array to binary data to store in the database
    embedding_blob = embedding.tobytes()
    cursor.execute('INSERT INTO audio_embeddings (file_name, embedding) VALUES (?, ?)', (file_name, embedding_blob))
    conn.commit()

# Function to find the most similar audio from the database for a given embedding
def find_most_similar(embedding, fixed_embedding_size=3072, similarity_threshold=0.75):
    cursor.execute('SELECT file_name, embedding FROM audio_embeddings')
    rows = cursor.fetchall()

    max_similarity = -1
    most_similar_file = None

    for row in rows:
        stored_file_name = row[0]
        stored_embedding = np.frombuffer(row[1], dtype=np.float32)

        # Ensure the stored embedding has the same fixed size as the new embedding
        if stored_embedding.shape[0] != fixed_embedding_size:
            if stored_embedding.shape[0] > fixed_embedding_size:
                stored_embedding = stored_embedding[:fixed_embedding_size]
            else:
                stored_embedding = np.pad(stored_embedding, (0, fixed_embedding_size - stored_embedding.shape[0]), 'constant')

        # Normalize the stored embedding before comparison
        normalized_stored_embedding = normalize(stored_embedding.reshape(1, -1))

        # Compute the cosine similarity between the new and stored embeddings
        similarity = cosine_similarity(embedding, normalized_stored_embedding)[0][0]

        if similarity > max_similarity:
            max_similarity = similarity
            most_similar_file = stored_file_name

    # Determine if the new file is considered similar to any stored file
    is_new_voice = max_similarity < similarity_threshold

    return most_similar_file, max_similarity, is_new_voice

# Process and store the uploaded initial files in the database
for file_name in initial_files.keys():
    embedding = extract_embeddings(file_name)
    store_embedding(file_name, embedding)
    print(f"Stored embedding for {file_name} in the database.")

# Step 2: Testing Phase - Upload a new file for similarity testing
print("Please upload a new audio file for similarity testing:")
new_file = files.upload()
new_file_name = list(new_file.keys())[0]

# Extract embeddings for the uploaded file
new_embedding = extract_embeddings(new_file_name)

# Step 3: Find the most similar audio file and update the database if necessary
most_similar_file, similarity_score, is_new_voice = find_most_similar(new_embedding)

# Display the results of the similarity test and update the database if needed
if is_new_voice:
    print(f"The uploaded file '{new_file_name}' does not match any existing files closely enough. Adding it to the database as a new voice.")
    store_embedding(new_file_name, new_embedding)
else:
    print(f"The uploaded file '{new_file_name}' is most similar to '{most_similar_file}' with a similarity score of {similarity_score:.4f}")


Stored embedding for a1 (1).wav in the database.
Stored embedding for a2 (1).wav in the database.
Stored embedding for a3 (1).wav in the database.
Stored embedding for a4 (1).wav in the database.
Stored embedding for a5 (1).wav in the database.
Please upload a new audio file for similarity testing:


Saving harvard.wav to harvard (6).wav
The uploaded file 'harvard (6).wav' is most similar to 'a2 (1).wav' with a similarity score of 0.8927


In [None]:
# Step 2: Testing Phase - Upload a new file for similarity testing
print("Please upload a new audio file for similarity testing:")
new_file = files.upload()
new_file_name = list(new_file.keys())[0]

# Extract embeddings for the uploaded file
new_embedding = extract_embeddings(new_file_name)

# Step 3: Find the most similar audio file and update the database if necessary
most_similar_file, similarity_score, is_new_voice = find_most_similar(new_embedding)

# Display the results of the similarity test and update the database if needed
if is_new_voice:
    print(f"The uploaded file '{new_file_name}' does not match any existing files closely enough. Adding it to the database as a new voice.")
    store_embedding(new_file_name, new_embedding)
else:
    print(f"The uploaded file '{new_file_name}' is most similar to '{most_similar_file}' with a similarity score of {similarity_score:.4f}")


Please upload a new audio file for similarity testing:


Saving r2.wav to r2 (1).wav
The uploaded file 'r2 (1).wav' is most similar to 'a4 (1).wav' with a similarity score of 0.7904


In [None]:

# Function to process audio and extract embeddings using the flatten approach with fixed size
def extract_embeddings(file_path, fixed_embedding_size=6144):
    # Load the audio file
    waveform, sample_rate = torchaudio.load(file_path)

    # Resample the audio to 16 kHz if necessary
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    # Normalize and preprocess the audio
    input_values = processor(waveform.squeeze().numpy(), return_tensors="pt", sampling_rate=16000).input_values

    # Extract embeddings using Wav2Vec2Model
    with torch.no_grad():
        embeddings = model(input_values).last_hidden_state

    # Flatten the embeddings to create a fixed-size vector
    flattened_embeddings = embeddings.flatten().numpy()

    # Ensure that the embedding has the fixed size by truncating or padding with zeros
    if flattened_embeddings.shape[0] > fixed_embedding_size:
        flattened_embeddings = flattened_embeddings[:fixed_embedding_size]
    elif flattened_embeddings.shape[0] < fixed_embedding_size:
        flattened_embeddings = np.pad(flattened_embeddings, (0, fixed_embedding_size - flattened_embeddings.shape[0]), 'constant')

    # Normalize the flattened embedding for better similarity comparison
    normalized_embedding = normalize(flattened_embeddings.reshape(1, -1))

    return normalized_embedding

# Function to store embeddings in the SQLite database
def store_embedding(file_name, embedding):
    # Convert the numpy array to binary data to store in the database
    embedding_blob = embedding.tobytes()
    cursor.execute('INSERT INTO audio_embeddings (file_name, embedding) VALUES (?, ?)', (file_name, embedding_blob))
    conn.commit()

# Function to find the most similar audio from the database for a given embedding
def find_most_similar(embedding, fixed_embedding_size=6144, similarity_threshold=0.5):
    cursor.execute('SELECT file_name, embedding FROM audio_embeddings')
    rows = cursor.fetchall()

    max_similarity = -1
    most_similar_file = None

    for row in rows:
        stored_file_name = row[0]
        stored_embedding = np.frombuffer(row[1], dtype=np.float32)

        # Ensure the stored embedding has the same fixed size as the new embedding
        if stored_embedding.shape[0] != fixed_embedding_size:
            if stored_embedding.shape[0] > fixed_embedding_size:
                stored_embedding = stored_embedding[:fixed_embedding_size]
            else:
                stored_embedding = np.pad(stored_embedding, (0, fixed_embedding_size - stored_embedding.shape[0]), 'constant')

        # Normalize the stored embedding before comparison
        normalized_stored_embedding = normalize(stored_embedding.reshape(1, -1))

        # Compute the cosine similarity between the new and stored embeddings
        similarity = cosine_similarity(embedding, normalized_stored_embedding)[0][0]

        if similarity > max_similarity:
            max_similarity = similarity
            most_similar_file = stored_file_name

    # Determine if the new file is considered similar to any stored file
    is_new_voice = max_similarity < similarity_threshold

    return most_similar_file, max_similarity, is_new_voice


# Process and store the uploaded initial files in the database
for file_name in initial_files.keys():
    embedding = extract_embeddings(file_name)
    store_embedding(file_name, embedding)
    print(f"Stored embedding for {file_name} in the database.")

# Step 2: Testing Phase - Upload a new file for similarity testing
print("Please upload a new audio file for similarity testing:")
new_file = files.upload()
new_file_name = list(new_file.keys())[0]

# Extract embeddings for the uploaded file
new_embedding = extract_embeddings(new_file_name)

# Step 3: Find the most similar audio file and update the database if necessary
most_similar_file, similarity_score, is_new_voice = find_most_similar(new_embedding)

# Display the results of the similarity test and update the database if needed
if is_new_voice:
    print(f"The uploaded file '{new_file_name}' does not match any existing files closely enough. Adding it to the database as a new voice.")
    store_embedding(new_file_name, new_embedding)
else:
    print(f"The uploaded file '{new_file_name}' is most similar to '{most_similar_file}' with a similarity score of {similarity_score:.4f}")


Stored embedding for a1 (1).wav in the database.
Stored embedding for a2 (1).wav in the database.
Stored embedding for a3 (1).wav in the database.
Stored embedding for a4 (1).wav in the database.
Stored embedding for a5 (1).wav in the database.
Please upload a new audio file for similarity testing:


Saving harvard.wav to harvard (7).wav
The uploaded file 'harvard (7).wav' is most similar to 'a4 (1).wav' with a similarity score of 0.7222


In [None]:
# Step 2: Testing Phase - Upload a new file for similarity testing
print("Please upload a new audio file for similarity testing:")
new_file = files.upload()
new_file_name = list(new_file.keys())[0]

# Extract embeddings for the uploaded file
new_embedding = extract_embeddings(new_file_name)

# Step 3: Find the most similar audio file and update the database if necessary
most_similar_file, similarity_score, is_new_voice = find_most_similar(new_embedding)

# Display the results of the similarity test and update the database if needed
if is_new_voice:
    print(f"The uploaded file '{new_file_name}' does not match any existing files closely enough. Adding it to the database as a new voice.")
    store_embedding(new_file_name, new_embedding)
else:
    print(f"The uploaded file '{new_file_name}' is most similar to '{most_similar_file}' with a similarity score of {similarity_score:.4f}")


Please upload a new audio file for similarity testing:


Saving c1.wav to c1.wav
The uploaded file 'c1.wav' is most similar to 'a1 (1).wav' with a similarity score of 0.7032


# **# Threshold in Cosine**

In [None]:


# Function to process audio and extract embeddings using the flatten approach with fixed size
def extract_embeddings(file_path, fixed_embedding_size=6144):
    # Load the audio file
    waveform, sample_rate = torchaudio.load(file_path)

    # Resample the audio to 16 kHz if necessary
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    # Normalize and preprocess the audio
    input_values = processor(waveform.squeeze().numpy(), return_tensors="pt", sampling_rate=16000).input_values

    # Extract embeddings using Wav2Vec2Model
    with torch.no_grad():
        embeddings = model(input_values).last_hidden_state

    # Flatten the embeddings to create a fixed-size vector
    flattened_embeddings = embeddings.flatten().numpy()

    # Ensure that the embedding has the fixed size by truncating or padding with zeros
    if flattened_embeddings.shape[0] > fixed_embedding_size:
        flattened_embeddings = flattened_embeddings[:fixed_embedding_size]
    elif flattened_embeddings.shape[0] < fixed_embedding_size:
        flattened_embeddings = np.pad(flattened_embeddings, (0, fixed_embedding_size - flattened_embeddings.shape[0]), 'constant')

    # Normalize the flattened embedding for better similarity comparison
    normalized_embedding = normalize(flattened_embeddings.reshape(1, -1))

    return normalized_embedding

# Function to store embeddings in the SQLite database
def store_embedding(file_name, embedding):
    # Convert the numpy array to binary data to store in the database
    embedding_blob = embedding.tobytes()
    cursor.execute('INSERT INTO audio_embeddings (file_name, embedding) VALUES (?, ?)', (file_name, embedding_blob))
    conn.commit()

# Function to find the most similar audio from the database for a given embedding
def find_most_similar(embedding, fixed_embedding_size=6144, similarity_threshold=0.8):
    cursor.execute('SELECT file_name, embedding FROM audio_embeddings')
    rows = cursor.fetchall()

    max_similarity = -1
    most_similar_file = None

    for row in rows:
        stored_file_name = row[0]
        stored_embedding = np.frombuffer(row[1], dtype=np.float32)

        # Ensure the stored embedding has the same fixed size as the new embedding
        if stored_embedding.shape[0] != fixed_embedding_size:
            if stored_embedding.shape[0] > fixed_embedding_size:
                stored_embedding = stored_embedding[:fixed_embedding_size]
            else:
                stored_embedding = np.pad(stored_embedding, (0, fixed_embedding_size - stored_embedding.shape[0]), 'constant')

        # Normalize the stored embedding before comparison
        normalized_stored_embedding = normalize(stored_embedding.reshape(1, -1))

        # Compute the cosine similarity between the new and stored embeddings
        similarity = cosine_similarity(embedding, normalized_stored_embedding)[0][0]

        if similarity > max_similarity:
            max_similarity = similarity
            most_similar_file = stored_file_name

    # Determine if the new file is considered similar to any stored file
    is_new_voice = max_similarity < similarity_threshold

    return most_similar_file, max_similarity, is_new_voice


# Process and store the uploaded initial files in the database
for file_name in initial_files.keys():
    embedding = extract_embeddings(file_name)
    store_embedding(file_name, embedding)
    print(f"Stored embedding for {file_name} in the database.")

# Step 2: Testing Phase - Upload a new file for similarity testing
print("Please upload a new audio file for similarity testing:")
new_file = files.upload()
new_file_name = list(new_file.keys())[0]

# Extract embeddings for the uploaded file
new_embedding = extract_embeddings(new_file_name)

# Step 3: Find the most similar audio file and update the database if necessary
most_similar_file, similarity_score, is_new_voice = find_most_similar(new_embedding)

# Display the results of the similarity test and update the database if the score is less than 0.8
if similarity_score < 0.8:
    print(f"The uploaded file '{new_file_name}' does not match any existing files closely enough (similarity score = {similarity_score:.4f}). Adding it to the database as a new voice.")
    store_embedding(new_file_name, new_embedding)
else:
    print(f"The uploaded file '{new_file_name}' is most similar to '{most_similar_file}' with a similarity score of {similarity_score:.4f}")


Stored embedding for a1 (1).wav in the database.
Stored embedding for a2 (1).wav in the database.
Stored embedding for a3 (1).wav in the database.
Stored embedding for a4 (1).wav in the database.
Stored embedding for a5 (1).wav in the database.
Please upload a new audio file for similarity testing:


Saving j.m4a to j (1).m4a
The uploaded file 'j (1).m4a' does not match any existing files closely enough (similarity score = 0.7474). Adding it to the database as a new voice.


In [None]:

# Step 2: Testing Phase - Upload a new file for similarity testing
print("Please upload a new audio file for similarity testing:")
new_file = files.upload()
new_file_name = list(new_file.keys())[0]

# Extract embeddings for the uploaded file
new_embedding = extract_embeddings(new_file_name)

# Step 3: Find the most similar audio file and update the database if necessary
most_similar_file, similarity_score, is_new_voice = find_most_similar(new_embedding)

# Display the results of the similarity test and update the database if the score is less than 0.8
if similarity_score < 0.8:
    print(f"The uploaded file '{new_file_name}' does not match any existing files closely enough (similarity score = {similarity_score:.4f}). Adding it to the database as a new voice.")
    store_embedding(new_file_name, new_embedding)
else:
    print(f"The uploaded file '{new_file_name}' is most similar to '{most_similar_file}' with a similarity score of {similarity_score:.4f}")


Please upload a new audio file for similarity testing:


Saving c1.wav to c1 (1).wav
The uploaded file 'c1 (1).wav' does not match any existing files closely enough (similarity score = 0.7853). Adding it to the database as a new voice.


noise reduction using Wiener filtering and trimming leading and trailing silence using Voice Activity Detection (VAD)

In [None]:
import torch
import torchaudio
import sqlite3
import numpy as np
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from scipy.signal import wiener
from google.colab import files

# Load the pre-trained Wav2Vec2 model for feature extraction
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name)

# Initialize SQLite database connection
conn = sqlite3.connect('audio_embeddings.db')
cursor = conn.cursor()

# Create a table for storing embeddings if it doesn't exist
cursor.execute('''
    CREATE TABLE IF NOT EXISTS audio_embeddings (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        file_name TEXT,
        embedding BLOB
    )
''')
conn.commit()

# Function for noise reduction using Wiener filtering
def reduce_noise(waveform):
    # Apply Wiener filter for noise reduction
    return torch.tensor(wiener(waveform.numpy()))

# Function for trimming leading and trailing silence using Voice Activity Detection (VAD)
def trim_silence(waveform, sample_rate=16000):
    # Ensure the waveform is in mono (single channel)
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Apply Voice Activity Detection (VAD) to trim silence
    vad = torchaudio.transforms.Vad(sample_rate=sample_rate)
    trimmed_waveform = vad(waveform)

    return trimmed_waveform

# Function to preprocess the audio file by reducing noise and trimming silence
def preprocess_audio(waveform, sample_rate=16000):
    # Reduce noise
    denoised_waveform = reduce_noise(waveform)

    # Trim leading and trailing silence
    trimmed_waveform = trim_silence(denoised_waveform, sample_rate=sample_rate)

    return trimmed_waveform

# Function to process audio and extract embeddings with preprocessing and flatten approach
def extract_embeddings(file_path, fixed_embedding_size=6144):
    # Load the audio file
    waveform, sample_rate = torchaudio.load(file_path)

    # Resample the audio to 16 kHz if necessary
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    # Preprocess the audio to reduce noise and trim silence
    preprocessed_waveform = preprocess_audio(waveform, sample_rate=16000)

    # Normalize and preprocess the audio
    input_values = processor(preprocessed_waveform.squeeze().numpy(), return_tensors="pt", sampling_rate=16000).input_values

    # Extract embeddings using Wav2Vec2Model
    with torch.no_grad():
        embeddings = model(input_values).last_hidden_state

    # Flatten the embeddings to create a fixed-size vector
    flattened_embeddings = embeddings.flatten().numpy()

    # Ensure that the embedding has the fixed size by truncating or padding with zeros
    if flattened_embeddings.shape[0] > fixed_embedding_size:
        flattened_embeddings = flattened_embeddings[:fixed_embedding_size]
    elif flattened_embeddings.shape[0] < fixed_embedding_size:
        flattened_embeddings = np.pad(flattened_embeddings, (0, fixed_embedding_size - flattened_embeddings.shape[0]), 'constant')

    # Normalize the flattened embedding for better similarity comparison
    normalized_embedding = normalize(flattened_embeddings.reshape(1, -1))

    return normalized_embedding

# Function to store embeddings in the SQLite database
def store_embedding(file_name, embedding):
    # Convert the numpy array to binary data to store in the database
    embedding_blob = embedding.tobytes()
    cursor.execute('INSERT INTO audio_embeddings (file_name, embedding) VALUES (?, ?)', (file_name, embedding_blob))
    conn.commit()

# Function to find the most similar audio from the database for a given embedding
def find_most_similar(embedding, fixed_embedding_size=6144, similarity_threshold=0.5):
    cursor.execute('SELECT file_name, embedding FROM audio_embeddings')
    rows = cursor.fetchall()

    max_similarity = -1
    most_similar_file = None

    for row in rows:
        stored_file_name = row[0]
        stored_embedding = np.frombuffer(row[1], dtype=np.float32)

        # Ensure the stored embedding has the same fixed size as the new embedding
        if stored_embedding.shape[0] != fixed_embedding_size:
            if stored_embedding.shape[0] > fixed_embedding_size:
                stored_embedding = stored_embedding[:fixed_embedding_size]
            else:
                stored_embedding = np.pad(stored_embedding, (0, fixed_embedding_size - stored_embedding.shape[0]), 'constant')

        # Normalize the stored embedding before comparison
        normalized_stored_embedding = normalize(stored_embedding.reshape(1, -1))

        # Compute the cosine similarity between the new and stored embeddings
        similarity = cosine_similarity(embedding, normalized_stored_embedding)[0][0]

        if similarity > max_similarity:
            max_similarity = similarity
            most_similar_file = stored_file_name

    # Determine if the new file is considered similar to any stored file
    is_new_voice = max_similarity < similarity_threshold

    return most_similar_file, max_similarity, is_new_voice

# Step 1: Initial Upload of Files to Create Database
print("Please upload initial audio files to create the database:")
initial_files = files.upload()

# Process and store the uploaded initial files in the database
for file_name in initial_files.keys():
    embedding = extract_embeddings(file_name)
    store_embedding(file_name, embedding)
    print(f"Stored embedding for {file_name} in the database.")

# Step 2: Testing Phase - Upload a new file for similarity testing
print("Please upload a new audio file for similarity testing:")
new_file = files.upload()
new_file_name = list(new_file.keys())[0]

# Extract embeddings for the uploaded file
new_embedding = extract_embeddings(new_file_name)

# Step 3: Find the most similar audio file and update the database if necessary
most_similar_file, similarity_score, is_new_voice = find_most_similar(new_embedding)

# Display the results of the similarity test and update the database if needed
if is_new_voice:
    print(f"The uploaded file '{new_file_name}' does not match any existing files closely enough. Adding it to the database as a new voice.")
    store_embedding(new_file_name, new_embedding)
else:
    print(f"The uploaded file '{new_file_name}' is most similar to '{most_similar_file}' with a similarity score of {similarity_score:.4f}")


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Please upload initial audio files to create the database:


Saving a1.wav to a1 (3).wav
Saving a2.wav to a2 (3).wav
Saving a3.wav to a3 (3).wav
Saving a4.wav to a4 (3).wav
Saving a5.wav to a5 (3).wav
Stored embedding for a1 (3).wav in the database.


  res *= (1 - noise / lVar)
  res *= (1 - noise / lVar)


Stored embedding for a2 (3).wav in the database.


  res *= (1 - noise / lVar)
  res *= (1 - noise / lVar)


Stored embedding for a3 (3).wav in the database.


  res *= (1 - noise / lVar)
  res *= (1 - noise / lVar)


Stored embedding for a4 (3).wav in the database.


  res *= (1 - noise / lVar)
  res *= (1 - noise / lVar)


Stored embedding for a5 (3).wav in the database.
Please upload a new audio file for similarity testing:


Saving harvard.wav to harvard (8).wav
The uploaded file 'harvard (8).wav' is most similar to 'a1 (3).wav' with a similarity score of 0.7403


In [None]:

# Step 2: Testing Phase - Upload a new file for similarity testing
print("Please upload a new audio file for similarity testing:")
new_file = files.upload()
new_file_name = list(new_file.keys())[0]

# Extract embeddings for the uploaded file
new_embedding = extract_embeddings(new_file_name)

# Step 3: Find the most similar audio file and update the database if necessary
most_similar_file, similarity_score, is_new_voice = find_most_similar(new_embedding)

# Display the results of the similarity test and update the database if needed
if is_new_voice:
    print(f"The uploaded file '{new_file_name}' does not match any existing files closely enough. Adding it to the database as a new voice.")
    store_embedding(new_file_name, new_embedding)
else:
    print(f"The uploaded file '{new_file_name}' is most similar to '{most_similar_file}' with a similarity score of {similarity_score:.4f}")


Please upload a new audio file for similarity testing:


Saving 2.wav to 2.wav
The uploaded file '2.wav' is most similar to 'a1 (3).wav' with a similarity score of 0.8963


## Proper **Embedding** normalize + flatten---Noise Reduction---Euclidean

In [None]:
import torch
import torchaudio
import sqlite3
import numpy as np
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from sklearn.preprocessing import normalize
from scipy.signal import wiener
from scipy.spatial.distance import euclidean
from google.colab import files

# Load the pre-trained Wav2Vec2 model for feature extraction
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name)

# Initialize SQLite database connection
conn = sqlite3.connect('audio_embeddings.db')
cursor = conn.cursor()

# Create a table for storing embeddings if it doesn't exist
cursor.execute('''
    CREATE TABLE IF NOT EXISTS audio_embeddings (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        file_name TEXT,
        embedding BLOB
    )
''')
conn.commit()

# Function for noise reduction using Wiener filtering
def reduce_noise(waveform):
    # Apply Wiener filter for noise reduction
    return torch.tensor(wiener(waveform.numpy()))

# Function for trimming leading and trailing silence using Voice Activity Detection (VAD)
def trim_silence(waveform, sample_rate=16000):
    # Ensure the waveform is in mono (single channel)
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Apply Voice Activity Detection (VAD) to trim silence
    vad = torchaudio.transforms.Vad(sample_rate=sample_rate)
    trimmed_waveform = vad(waveform)

    return trimmed_waveform

# Function to preprocess the audio file by reducing noise and trimming silence
def preprocess_audio(waveform, sample_rate=16000):
    # Reduce noise
    denoised_waveform = reduce_noise(waveform)

    # Trim leading and trailing silence
    trimmed_waveform = trim_silence(denoised_waveform, sample_rate=sample_rate)

    return trimmed_waveform

# Function to process audio and extract embeddings with preprocessing and flatten approach
def extract_embeddings(file_path, fixed_embedding_size=6144):
    # Load the audio file
    waveform, sample_rate = torchaudio.load(file_path)

    # Resample the audio to 16 kHz if necessary
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    # Preprocess the audio to reduce noise and trim silence
    preprocessed_waveform = preprocess_audio(waveform, sample_rate=16000)

    # Normalize and preprocess the audio
    input_values = processor(preprocessed_waveform.squeeze().numpy(), return_tensors="pt", sampling_rate=16000).input_values

    # Extract embeddings using Wav2Vec2Model
    with torch.no_grad():
        embeddings = model(input_values).last_hidden_state

    # Flatten the embeddings to create a fixed-size vector
    flattened_embeddings = embeddings.flatten().numpy()

    # Ensure that the embedding has the fixed size by truncating or padding with zeros
    if flattened_embeddings.shape[0] > fixed_embedding_size:
        flattened_embeddings = flattened_embeddings[:fixed_embedding_size]
    elif flattened_embeddings.shape[0] < fixed_embedding_size:
        flattened_embeddings = np.pad(flattened_embeddings, (0, fixed_embedding_size - flattened_embeddings.shape[0]), 'constant')

    # Normalize the flattened embedding for better similarity comparison
    normalized_embedding = normalize(flattened_embeddings.reshape(1, -1))

    return normalized_embedding

# Function to store embeddings in the SQLite database
def store_embedding(file_name, embedding):
    # Convert the numpy array to binary data to store in the database
    embedding_blob = embedding.tobytes()
    cursor.execute('INSERT INTO audio_embeddings (file_name, embedding) VALUES (?, ?)', (file_name, embedding_blob))
    conn.commit()

# Function to find the most similar audio using Euclidean distance
def find_most_similar(embedding, fixed_embedding_size=6144, similarity_threshold=0.5):
    cursor.execute('SELECT file_name, embedding FROM audio_embeddings')
    rows = cursor.fetchall()

    min_distance = float('inf')
    most_similar_file = None

    for row in rows:
        stored_file_name = row[0]
        stored_embedding = np.frombuffer(row[1], dtype=np.float32)

        # Ensure the stored embedding has the same fixed size as the new embedding
        if stored_embedding.shape[0] != fixed_embedding_size:
            if stored_embedding.shape[0] > fixed_embedding_size:
                stored_embedding = stored_embedding[:fixed_embedding_size]
            else:
                stored_embedding = np.pad(stored_embedding, (0, fixed_embedding_size - stored_embedding.shape[0]), 'constant')

        # Normalize the stored embedding before comparison
        normalized_stored_embedding = normalize(stored_embedding.reshape(1, -1))

        # Compute the Euclidean distance between the new and stored embeddings
        distance = euclidean(embedding.flatten(), normalized_stored_embedding.flatten())

        if distance < min_distance:
            min_distance = distance
            most_similar_file = stored_file_name

    # Determine if the new file is considered similar to any stored file based on the distance threshold
    is_new_voice = min_distance > similarity_threshold

    return most_similar_file, min_distance, is_new_voice

# Step 1: Initial Upload of Files to Create Database
print("Please upload initial audio files to create the database:")
initial_files = files.upload()

# Process and store the uploaded initial files in the database
for file_name in initial_files.keys():
    embedding = extract_embeddings(file_name)
    store_embedding(file_name, embedding)
    print(f"Stored embedding for {file_name} in the database.")

# Step 2: Testing Phase - Upload a new file for similarity testing
print("Please upload a new audio file for similarity testing:")
new_file = files.upload()
new_file_name = list(new_file.keys())[0]

# Extract embeddings for the uploaded file
new_embedding = extract_embeddings(new_file_name)

# Step 3: Find the most similar audio file and update the database if necessary
most_similar_file, distance_score, is_new_voice = find_most_similar(new_embedding)

# Display the results of the similarity test and update the database if needed
if is_new_voice:
    print(f"The uploaded file '{new_file_name}' does not match any existing files closely enough. Adding it to the database as a new voice.")
    store_embedding(new_file_name, new_embedding)
else:
    print(f"The uploaded file '{new_file_name}' is most similar to '{most_similar_file}' with a distance score of {distance_score:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Please upload initial audio files to create the database:


Saving a1.wav to a1.wav
Saving a2.wav to a2.wav
Saving a3.wav to a3.wav
Saving a4.wav to a4.wav
Saving a5.wav to a5.wav


  res *= (1 - noise / lVar)
  res *= (1 - noise / lVar)


Stored embedding for a1.wav in the database.


  res *= (1 - noise / lVar)
  res *= (1 - noise / lVar)


Stored embedding for a2.wav in the database.


  res *= (1 - noise / lVar)
  res *= (1 - noise / lVar)


Stored embedding for a3.wav in the database.


  res *= (1 - noise / lVar)
  res *= (1 - noise / lVar)


Stored embedding for a4.wav in the database.


  res *= (1 - noise / lVar)
  res *= (1 - noise / lVar)


Stored embedding for a5.wav in the database.
Please upload a new audio file for similarity testing:


Saving c1.wav to c1.wav


  res *= (1 - noise / lVar)
  res *= (1 - noise / lVar)


The uploaded file 'c1.wav' does not match any existing files closely enough. Adding it to the database as a new voice.


In [None]:
# Step 2: Testing Phase - Upload a new file for similarity testing
print("Please upload a new audio file for similarity testing:")
new_file = files.upload()
new_file_name = list(new_file.keys())[0]

# Extract embeddings for the uploaded file
new_embedding = extract_embeddings(new_file_name)

# Step 3: Find the most similar audio file and update the database if necessary
most_similar_file, distance_score, is_new_voice = find_most_similar(new_embedding)

# Display the results of the similarity test and update the database if needed
if is_new_voice:
    print(f"The uploaded file '{new_file_name}' does not match any existing files closely enough. Adding it to the database as a new voice.")
    store_embedding(new_file_name, new_embedding)
else:
    print(f"The uploaded file '{new_file_name}' is most similar to '{most_similar_file}' with a distance score of {distance_score:.4f}")


Please upload a new audio file for similarity testing:


Saving harvard.wav to harvard.wav
The uploaded file 'harvard.wav' does not match any existing files closely enough. Adding it to the database as a new voice.


# **ECAPA-TDNN model for speaker recognition**

In [None]:
import torch
import torchaudio
import sqlite3
import numpy as np
from speechbrain.pretrained import SpeakerRecognition
from google.colab import files

# Load the pre-trained ECAPA-TDNN model for speaker recognition
speaker_recognition = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="pretrained_models/spkrec-ecapa-voxceleb")

# Initialize SQLite database connection
conn = sqlite3.connect('audio_embeddings.db')
cursor = conn.cursor()

# Create a table for storing embeddings if it doesn't exist
cursor.execute('''
    CREATE TABLE IF NOT EXISTS audio_embeddings (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        file_name TEXT,
        embedding BLOB
    )
''')
conn.commit()

# Function to extract speaker embedding using ECAPA-TDNN
def extract_speaker_embedding(file_path):
    # Load the audio file
    signal, sample_rate = torchaudio.load(file_path)

    # Ensure the audio is in the correct format for the model
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        signal = resampler(signal)

    # Extract the speaker embedding using the SpeechBrain model
    embedding = speaker_recognition.encode_batch(signal).squeeze().numpy()

    # Normalize the embedding for better comparison
    normalized_embedding = embedding / np.linalg.norm(embedding)

    return normalized_embedding

# Function to store embeddings in the SQLite database
def store_embedding(file_name, embedding):
    # Convert the numpy array to binary data to store in the database
    embedding_blob = embedding.tobytes()
    cursor.execute('INSERT INTO audio_embeddings (file_name, embedding) VALUES (?, ?)', (file_name, embedding_blob))
    conn.commit()

# Function to find the most similar audio using cosine similarity of ECAPA-TDNN embeddings
def find_most_similar(embedding, similarity_threshold=0.6):
    cursor.execute('SELECT file_name, embedding FROM audio_embeddings')
    rows = cursor.fetchall()

    max_similarity = -1
    most_similar_file = None

    for row in rows:
        stored_file_name = row[0]
        stored_embedding = np.frombuffer(row[1], dtype=np.float32)

        # Check if the stored embedding has the correct size
        if stored_embedding.shape != embedding.shape:
            print(f"Warning: Skipping embedding with shape mismatch: {stored_embedding.shape} vs {embedding.shape}")
            continue

        # Compute the cosine similarity between the new and stored embeddings
        similarity = np.dot(embedding, stored_embedding) / (np.linalg.norm(embedding) * np.linalg.norm(stored_embedding))

        if similarity > max_similarity:
            max_similarity = similarity
            most_similar_file = stored_file_name

    # Determine if the new file is considered similar to any stored file based on the similarity threshold
    is_new_voice = max_similarity < similarity_threshold

    return most_similar_file, max_similarity, is_new_voice

# Step 1: Initial Upload of Files to Create Database
print("Please upload initial audio files to create the database:")
initial_files = files.upload()

# Process and store the uploaded initial files in the database
for file_name in initial_files.keys():
    embedding = extract_speaker_embedding(file_name)
    store_embedding(file_name, embedding)
    print(f"Stored embedding for {file_name} in the database.")

# Step 2: Testing Phase - Upload a new file for similarity testing
print("Please upload a new audio file for similarity testing:")
new_file = files.upload()
new_file_name = list(new_file.keys())[0]

# Extract embeddings for the uploaded file
new_embedding = extract_speaker_embedding(new_file_name)

# Step 3: Find the most similar audio file and update the database if necessary
most_similar_file, similarity_score, is_new_voice = find_most_similar(new_embedding)

# Display the results of the similarity test and update the database if needed
if is_new_voice:
    print(f"The uploaded file '{new_file_name}' does not match any existing files closely enough. Adding it to the database as a new voice.")
    store_embedding(new_file_name, new_embedding)
else:
    print(f"The uploaded file '{new_file_name}' is most similar to '{most_similar_file}' with a similarity score of {similarity_score:.4f}")


Please upload initial audio files to create the database:


Saving a1.wav to a1 (2).wav
Saving a2.wav to a2 (2).wav
Saving a3.wav to a3 (2).wav
Saving a4.wav to a4 (2).wav
Saving a5.wav to a5 (2).wav
Stored embedding for a1 (2).wav in the database.
Stored embedding for a2 (2).wav in the database.
Stored embedding for a3 (2).wav in the database.
Stored embedding for a4 (2).wav in the database.
Stored embedding for a5 (2).wav in the database.
Please upload a new audio file for similarity testing:


Saving harvard.wav to harvard (2).wav
The uploaded file 'harvard (2).wav' does not match any existing files closely enough. Adding it to the database as a new voice.


In [None]:

# Step 2: Testing Phase - Upload a new file for similarity testing
print("Please upload a new audio file for similarity testing:")
new_file = files.upload()
new_file_name = list(new_file.keys())[0]

# Extract embeddings for the uploaded file
new_embedding = extract_speaker_embedding(new_file_name)

# Step 3: Find the most similar audio file and update the database if necessary
most_similar_file, similarity_score, is_new_voice = find_most_similar(new_embedding)

# Display the results of the similarity test and update the database if needed
if is_new_voice:
    print(f"The uploaded file '{new_file_name}' does not match any existing files closely enough. Adding it to the database as a new voice.")
    store_embedding(new_file_name, new_embedding)
else:
    print(f"The uploaded file '{new_file_name}' is most similar to '{most_similar_file}' with a similarity score of {similarity_score:.4f}")


Please upload a new audio file for similarity testing:


Saving 2.wav to 2.wav
The uploaded file '2.wav' is most similar to 'a1 (1).wav' with a similarity score of 0.9848


In [None]:

# Step 2: Testing Phase - Upload a new file for similarity testing
print("Please upload a new audio file for similarity testing:")
new_file = files.upload()
new_file_name = list(new_file.keys())[0]

# Extract embeddings for the uploaded file
new_embedding = extract_speaker_embedding(new_file_name)

# Step 3: Find the most similar audio file and update the database if necessary
most_similar_file, similarity_score, is_new_voice = find_most_similar(new_embedding)

# Display the results of the similarity test and update the database if needed
if is_new_voice:
    print(f"The uploaded file '{new_file_name}' does not match any existing files closely enough. Adding it to the database as a new voice.")
    store_embedding(new_file_name, new_embedding)
else:
    print(f"The uploaded file '{new_file_name}' is most similar to '{most_similar_file}' with a similarity score of {similarity_score:.4f}")


Please upload a new audio file for similarity testing:


Saving c1.wav to c1 (1).wav
The uploaded file 'c1 (1).wav' does not match any existing files closely enough. Adding it to the database as a new voice.
