In [None]:
import torch
from torch import nn
import librosa
import numpy as np
from scipy.spatial.distance import cosine




In [112]:
class VGGVox(nn.Module):
    """Simplified VGGVox-style CNN that maps a spectrogram to a 1024-d embedding.

    Two conv / ReLU / max-pool stages are followed by two fully connected
    layers. The first fully connected layer is sized from ``input_shape``, so
    tensors passed to ``forward`` must yield the same number of flattened conv
    features as that shape does.

    Args:
        input_shape: ``(channels, height, width)`` of a single example, without
            the batch dimension. Defaults to ``(1, 40, 312)``, the shape this
            model was originally hard-coded for.
    """

    def __init__(self, input_shape=(1, 40, 312)):
        super().__init__()
        # Define the VGGVox architecture (simplified for brevity)
        self.conv_layers = nn.Sequential(
            nn.Conv2d(input_shape[0], 96, kernel_size=7, stride=2, padding=3),
            nn.ReLU(),
            nn.MaxPool2d(3, 2),
            nn.Conv2d(96, 256, kernel_size=5, stride=2, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(3, 2),
        )
        # The first Linear layer's input width depends on the spatial size left
        # after the conv stack, so it is measured rather than hand-computed.
        self.fc_layers = nn.Sequential(
            nn.Linear(self._get_conv_output_size(input_shape), 4096),
            nn.ReLU(),
            nn.Linear(4096, 1024),
        )

    def _get_conv_output_size(self, input_shape):
        """Return the flattened feature count the conv stack produces.

        Args:
            input_shape: ``(channels, height, width)`` of one example.

        Returns:
            int: number of features per example after ``conv_layers``.
        """
        # Shape probing needs no gradients, so skip building an autograd graph.
        with torch.no_grad():
            dummy_output = self.conv_layers(torch.zeros(1, *input_shape))

        # Count every element of the single dummy example (batch dim excluded).
        return dummy_output[0].numel()

    def forward(self, x):
        """Compute embeddings for a batch shaped ``(batch, channels, height, width)``.

        Raises:
            RuntimeError: if the flattened conv output does not match the width
                of the first fully connected layer, i.e. the input's spatial
                size is incompatible with the ``input_shape`` used at
                construction time.
        """
        x = self.conv_layers(x)
        x = x.flatten(1)  # keep the batch dim, flatten everything else

        expected = self.fc_layers[0].in_features
        if x.size(1) != expected:
            # Same exception type the Linear layer would raise on its own, but
            # with a message that points at the actual cause.
            raise RuntimeError(
                f"VGGVox expected {expected} flattened conv features but got "
                f"{x.size(1)}; the input's height/width do not match the "
                f"input_shape the model was built with."
            )

        return self.fc_layers(x)


# NOTE: no checkpoint is loaded in the visible part of this notebook, so the
# network runs with its random initial weights. Seeding the RNG first makes
# those weights -- and every embedding/similarity derived from them --
# identical across kernel restarts instead of changing on each run.
torch.manual_seed(0)
model = VGGVox()
# Switch to inference mode; eval() is kept as the last expression so the cell
# still displays the architecture summary.
model.eval()

VGGVox(
  (conv_layers): Sequential(
    (0): Conv2d(1, 96, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(96, 256, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc_layers): Sequential(
    (0): Linear(in_features=9728, out_features=4096, bias=True)
    (1): ReLU()
    (2): Linear(in_features=4096, out_features=1024, bias=True)
  )
)

In [None]:
# Recording of the stored (enrolled) user; replace with the path to that user's audio.
audio_file1 = "D:\\voices\\Mohammed\\M96.wav"
# Recording of the new input that is compared against the stored user.
audio_file2 = "D:\\voices\\Hussin\\H11.wav"

In [114]:
def preprocess_audio(file_path):
    """Load an audio file and return its log-scaled mel spectrogram.

    The file is read through ``librosa.load`` with ``sr=16000``, turned into a
    40-band mel spectrogram capped at ``fmax=8000``, and converted to dB with
    ``ref=np.max``.

    Args:
        file_path: path of the audio file to load.

    Returns:
        The dB-scaled mel spectrogram produced by ``librosa.power_to_db``.
    """
    waveform, sample_rate = librosa.load(file_path, sr=16000)
    mel_power = librosa.feature.melspectrogram(
        y=waveform, sr=sample_rate, n_mels=40, fmax=8000
    )
    return librosa.power_to_db(mel_power, ref=np.max)


# Cosine-similarity matching of two embeddings
def is_match_embedding(embedding1, embedding2, threshold=0.5):
    """Decide whether two embeddings are similar enough to count as a match.

    Args:
        embedding1: torch tensor with the first embedding (flattened internally).
        embedding2: torch tensor with the second embedding (flattened internally).
        threshold: minimum cosine similarity that counts as a match.

    Returns:
        Truthy value when the cosine similarity is at least ``threshold``.
        The similarity is also printed.
    """
    # detach() first: Tensor.numpy() raises on tensors that require grad, which
    # happens whenever the embeddings were computed outside torch.no_grad().
    vector1 = embedding1.detach().cpu().numpy().flatten()
    vector2 = embedding2.detach().cpu().numpy().flatten()
    # scipy's `cosine` is a distance, so similarity is its complement.
    similarity = 1 - cosine(vector1, vector2)
    print(f"Embedding Similarity: {similarity:.7f}")
    return similarity >= threshold


def get_spectral_centroid(audio_file):
    """Return the mean spectral centroid of an audio file.

    The file is loaded through ``librosa.load`` with ``sr=16000`` and the
    output of ``librosa.feature.spectral_centroid`` is averaged with ``np.mean``.

    Args:
        audio_file: path of the audio file to analyse.
    """
    waveform, sample_rate = librosa.load(audio_file, sr=16000)
    centroids = librosa.feature.spectral_centroid(y=waveform, sr=sample_rate)
    return np.mean(centroids)


def is_match_frequency(audio_file1, audio_file2, threshold=100):
    """Check whether two recordings have close mean spectral centroids.

    Args:
        audio_file1: path of the first recording.
        audio_file2: path of the second recording.
        threshold: largest absolute centroid difference still counted as a match.

    Returns:
        Truthy value when the absolute difference is at most ``threshold``.
        The difference is also printed.
    """
    # Evaluate the files in argument order: first recording, then second.
    first, second = (
        get_spectral_centroid(path) for path in (audio_file1, audio_file2)
    )
    gap = np.abs(first - second)
    print(f"Spectral Centroid Difference: {gap:.7f}")
    return gap <= threshold


def _embed_spectrogram(mel_spec):
    """Run ``model`` on one spectrogram; return ``(input_tensor, embedding)``."""
    # Prepend two singleton axes (batch and channel) so the 2-D spectrogram
    # becomes the 4-D input that the model's Conv2d layers take.
    input_tensor = torch.tensor(mel_spec[np.newaxis, np.newaxis, ...], dtype=torch.float32)
    # Inference only: no gradients are needed for embedding extraction.
    with torch.no_grad():
        embedding = model(input_tensor)
    return input_tensor, embedding


# NOTE(review): the model's first Linear layer is sized for one specific
# spectrogram shape, so a recording whose spectrogram width differs too much
# will fail inside model(...) -- consider padding/cropping; confirm intended
# clip length.

# Preprocess and extract embedding for stored user
mel_spec1 = preprocess_audio(audio_file1)
input_tensor1, stored_embedding = _embed_spectrogram(mel_spec1)

# Preprocess and extract embedding for new input
mel_spec2 = preprocess_audio(audio_file2)
input_tensor2, new_embedding = _embed_spectrogram(mel_spec2)

In [115]:
# Compare the stored and new recordings: first by embedding similarity, then
# by spectral centroid. Each helper prints its measurement as a side effect.
embedding_match = is_match_embedding(embedding1=stored_embedding, embedding2=new_embedding)
frequency_match = is_match_frequency(audio_file1=audio_file1, audio_file2=audio_file2)

Embedding Similarity: 0.9865106
Spectral Centroid Difference: 167.9639614


In [None]:
# embedding_match and frequency_match are already boolean verdicts (the
# thresholds were applied inside is_match_embedding / is_match_frequency), so
# combine them directly. Comparing them to numbers again is a bug: a boolean
# behaves as 0/1, so `frequency_match < 200.00` is True for both True and
# False and the frequency check could never deny access.
if embedding_match and frequency_match:
    print("Access Granted: Both embedding and frequency match.")
else:
    print("Access Denied: Mismatch in either embedding or frequency.")

Access Denied: Mismatch in either embedding or frequency.
