In [5]:
from datasets import load_dataset

cv_17 = load_dataset("mozilla-foundation/common_voice_17_0", "hi", split="train",token="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",trust_remote_code=True)

Reading metadata...: 4689it [00:00, 156306.36it/s]les/s]
Generating train split: 4689 examples [00:01, 3916.58 examples/s]
Reading metadata...: 2428it [00:00, 161873.25it/s]examples/s]
Generating validation split: 2428 examples [00:00, 4005.61 examples/s]
Reading metadata...: 3154it [00:00, 153752.15it/s]es/s]
Generating test split: 3154 examples [00:00, 3939.10 examples/s]
Reading metadata...: 4044it [00:00, 161790.24it/s]les/s]
Generating other split: 4044 examples [00:01, 3982.20 examples/s]
Reading metadata...: 775it [00:00, 149603.53it/s]? examples/s]
Generating invalidated split: 775 examples [00:00, 3829.90 examples/s]
Reading metadata...: 10329it [00:00, 166596.04it/s]amples/s]
Generating validated split: 10329 examples [00:02, 4394.12 examples/s]


In [38]:
import torch.nn as nn
import torch.nn.functional as F
import sounddevice as sd
import soundfile as sf
import numpy as np
from tqdm import tqdm
import librosa
import torch
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

In [64]:
DURATION  =10
SAMPLE_RATE = 44100
N_MFCC = 1
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
def extract_mfcc(audio_path, sr=SAMPLE_RATE, n_mfcc=N_MFCC):

    y, sr = librosa.load(audio_path, sr=sr)
    
    target_length = int(DURATION * sr)
    if len(y) < target_length:
        y = np.pad(y, (0, target_length - len(y)), 'constant')
    else:
        y = y[:target_length]
    
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    
    delta_mfccs = librosa.feature.delta(mfccs)
    delta2_mfccs = librosa.feature.delta(mfccs, order=2)
    
    features = np.concatenate([mfccs, delta_mfccs, delta2_mfccs])
    
    features = features.T
    
    return features


In [None]:
class SpeakerRecognitionModel(nn.Module):
    def __init__(self, input_dim=N_MFCC*3, hidden_dim=128, num_classes=2):
        super(SpeakerRecognitionModel, self).__init__()
        
        self.conv1 = nn.Conv1d(input_dim, 64, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(64)
        self.pool1 = nn.MaxPool1d(kernel_size=2)
        self.dropout1 = nn.Dropout(0.3)
        
        self.conv2 = nn.Conv1d(64, 128, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm1d(128)
        self.pool2 = nn.MaxPool1d(kernel_size=2)
        self.dropout2 = nn.Dropout(0.3)
        
        self.lstm = nn.LSTM(128, hidden_dim, batch_first=True, bidirectional=True)
        
        self.fc = nn.Linear(hidden_dim * 2, num_classes)
        
    def forward(self, x):
        batch_size = x.size(0)
        
        x = x.permute(0, 2, 1)
        
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        x = self.dropout1(x)
        
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        x = self.dropout2(x)
        
        x = x.permute(0, 2, 1)
        
        x, (hidden, _) = self.lstm(x)
        
        hidden = hidden.view(2, batch_size, -1) 
        hidden = hidden.permute(1, 0, 2)  
        hidden = hidden.contiguous().view(batch_size, -1)
        
        # Fully connected layer
        x = self.fc(hidden)
        
        return x


In [None]:
def record_audio(filename, duration=DURATION, sample_rate=SAMPLE_RATE):
    print(f"Recording {duration} .")
    print("saaay ")
    recording = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
    sd.wait()
    sf.write(filename, recording, sample_rate)
    print(f"Audio saved  {filename}")
    return filename


In [None]:
def concatenate_features(features_list):

    concatenated = np.concatenate(features_list, axis=0)
    return concatenated

def prepare_data(user_files, non_user_files, word_count):
    user_features_all = []
    non_user_features_all = []
    
    for file in user_files:
        features = extract_mfcc(file)
        user_features_all.append(features)
    
    for file in non_user_files:
        features = extract_mfcc(file)
        non_user_features_all.append(features)
    
    user_features = []
    non_user_features = []
    
    for i in range(0, len(user_features_all), word_count):
        if i + word_count <= len(user_features_all):
            group = user_features_all[i:i+word_count]
            combined = concatenate_features(group)
            user_features.append(combined)
    
    for i in range(0, len(non_user_features_all), word_count):
        if i + word_count <= len(non_user_features_all):
            group = non_user_features_all[i:i+word_count]
            combined = concatenate_features(group)
            non_user_features.append(combined)
    
    user_labels = np.ones(len(user_features))
    non_user_labels = np.zeros(len(non_user_features))
    
    features = user_features + non_user_features
    labels = np.concatenate([user_labels, non_user_labels])
    
    max_time_steps = max(f.shape[0] for f in features)
    
    aligned_features = []
    for f in features:
        if f.shape[0] < max_time_steps:
            # Добавляем паддинг
            pad = np.zeros((max_time_steps - f.shape[0], f.shape[1]))
            f = np.vstack([f, pad])
        else:
            f = f[:max_time_steps]
        aligned_features.append(f)
    
    return np.array(aligned_features), labels


In [None]:
def train_model(model, train_loader, val_loader, num_epochs=20,word_count = 1):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3, factor=0.5)
    
    best_val_acc = 0.0
    train_losses = []
    val_losses = []
    train_accs = []
    val_accs = []
    
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        print(train_loader)
        for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} (Train)"):
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            
            optimizer.zero_grad()

            outputs = model(inputs)
            print(outputs)
            print(labels)
            loss = criterion(outputs, labels)
            
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        
        train_loss = running_loss / total
        train_acc = correct / total
        train_losses.append(train_loss)
        train_accs.append(train_acc)
        
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            for inputs, labels in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} (Val)"):
                inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
                
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                
                val_loss += loss.item() * inputs.size(0)
                _, predicted = torch.max(outputs, 1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()
        
        val_loss = val_loss / val_total
        val_acc = val_correct / val_total
        val_losses.append(val_loss)
        val_accs.append(val_acc)
        
        scheduler.step(val_loss)
        
        print(f"Epoch {epoch+1}/{num_epochs} | "
              f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | "
              f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")
        
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), f"best_speaker_model_{word_count}.pth")
            print("Model saved!")
    
    return best_val_acc


In [None]:
import torch
from torch.utils.data import Dataset

class VoiceDataset(Dataset):
    def __init__(self, features, labels):

        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        feature = self.features[idx]  
        label = self.labels[idx]
        
        feature_tensor = torch.FloatTensor(feature)
        label_tensor = torch.LongTensor([label])
        
        return feature_tensor, label_tensor


In [None]:
def collate_fn(batch):
    features, labels = zip(*batch)
    
    max_len = max(f.shape[0] for f in features)
    
    padded_features = []
    for f in features:
        pad_size = max_len - f.shape[0]
        padded = torch.nn.functional.pad(f, (0, 0, 0, pad_size), "constant", 0)
        padded_features.append(padded)
    
    features_tensor = torch.stack(padded_features)
    labels_tensor = torch.stack(labels).squeeze()
    
    return features_tensor, labels_tensor


In [None]:
import torch
from sklearn.metrics import accuracy_score, classification_report

def evaluate_model(model, test_loader, word_count):
    model.eval()
    test_loss = 0
    correct = 0
    total = 0
    
    predictions = []
    labels = []
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
            
            outputs = model(inputs)
            
            
            print(outputs)
            print(targets)
            if len(targets.shape) > 1:
                targets = targets.squeeze()
            
            loss = torch.nn.CrossEntropyLoss()(outputs, targets)
            test_loss += loss.item() * inputs.size(0)
            
            _, predicted = torch.max(outputs, 1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()
            
            predictions.extend(predicted.cpu().numpy())
            labels.extend(targets.cpu().numpy())
    
    accuracy = correct / total
    print(f"Test Loss: {test_loss / total:.4f} | Test Accuracy: {accuracy:.4f}")
    
    print("\nClassification Report:")
    print(classification_report(labels, predictions))
    
    return accuracy


In [None]:

def train_and_evaluate_word_count(user_files, non_user_files, word_count):
    """Train and evaluate a model with a specific word count."""
    print(f"\n{'='*50}")
    print(f"Training with {word_count} word(s)")
    print(f"{'='*50}")
    
    features, labels = prepare_data(user_files, non_user_files, word_count)
    
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    
    train_dataset = VoiceDataset(X_train, y_train)
    val_dataset = VoiceDataset(X_val, y_val)
    test_dataset = VoiceDataset(X_test, y_test)
    
    batch_size = min(16, len(train_dataset))
    train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn 
)
    val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn  
)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    
    input_dim = features.shape[2]  
    model = SpeakerRecognitionModel(input_dim=input_dim).to(DEVICE)
    
    best_val_acc = train_model(model, train_loader, val_loader, num_epochs=15,word_count = word_count)
    
    model.load_state_dict(torch.load(f"best_speaker_model_{word_count}.pth"))
    
    test_acc = evaluate_model(model, test_loader, word_count)
    
    return best_val_acc, test_acc


In [None]:

import os
import time
user_dir = "user_voice_samples"
non_user_dir = "non_user_voice_samples"

os.makedirs(user_dir, exist_ok=True)
os.makedirs(non_user_dir, exist_ok=True)

user_files = [os.path.join(user_dir, f) for f in os.listdir(user_dir) if f.endswith('.wav')]
non_user_files = [os.path.join(non_user_dir, f) for f in os.listdir(non_user_dir) if f.endswith('.wav')]

max_word_count = 15  
min_user_samples = max_word_count * 2 
min_non_user_samples = max_word_count * 2

if len(user_files) < min_user_samples:
    print(f"Need to collect {min_user_samples - len(user_files)} more user voice samples.")
    print("Please say one word or short phrase for each recording.")
    for i in range(len(user_files), min_user_samples):
        filename = os.path.join(user_dir, f"user_sample_{i:03d}.wav")
        record_audio(filename)
        time.sleep(0.5)

if len(non_user_files) < min_non_user_samples:
    print(f"Need to collect {min_non_user_samples - len(non_user_files)} more non-user voice samples.")
    print("Please have someone else speak one word or short phrase for each recording.")
    for i in range(len(non_user_files), min_non_user_samples):
        filename = os.path.join(non_user_dir, f"non_user_sample_{i:03d}.wav")
        record_audio(filename)
        time.sleep(0.5)

user_files = [os.path.join(user_dir, f) for f in os.listdir(user_dir) if f.endswith('.wav')]
non_user_files = [os.path.join(non_user_dir, f) for f in os.listdir(non_user_dir) if f.endswith('.wav')]

word_counts = [5, 10, 30] 
results = {}

for count in word_counts:
    if len(user_files) >= count * 2 and len(non_user_files) >= count * 2:
        val_acc, test_acc = train_and_evaluate_word_count(user_files, non_user_files, count)
        results[count] = {"val_acc": val_acc, "test_acc": test_acc}
        print(results)
    else:
        print(f"Not enough samples for word count {count}. Skipping.")

print("\nResults Summary:")
print("="*50)
print(f"{'Word Count':<15} {'Validation Accuracy':<20} {'Test Accuracy':<15}")
print("-"*50)
for count, metrics in results.items():
    print(f"{count:<15} {metrics['val_acc']:<20.4f} {metrics['test_acc']:<15.4f}")

print("\nLive Demonstration:")
print("="*50)

best_count = max(results.items(), key=lambda x: x[1]['test_acc'])[0]
print(f"Using the best model (trained with {best_count} words)")

input_dim = extract_mfcc(user_files[0]).shape[1]
best_model = SpeakerRecognitionModel(input_dim=input_dim).to(DEVICE)
best_model.load_state_dict(torch.load(f"best_speaker_model_{best_count}.pth"))
best_model.eval()

print("\nPlease speak for a live test (say several words).")
live_samples = []
for i in range(best_count):
    test_file = f"live_test_{i}.wav"
    record_audio(test_file)
    live_samples.append(test_file)

live_features = []
for sample in live_samples:
    features = extract_mfcc(sample)
    live_features.append(features)

combined_features = concatenate_features(live_features)
features_tensor = torch.FloatTensor(combined_features).unsqueeze(0).to(DEVICE)

with torch.no_grad():
    outputs = best_model(features_tensor)
    _, prediction = torch.max(outputs, 1)

result = "Your voice" if prediction.item() == 1 else "Not your voice"
confidence = F.softmax(outputs, dim=1)[0][prediction.item()].item() * 100

print(f"\nPrediction: {result} (Confidence: {confidence:.2f}%)")

for sample in live_samples:
    os.remove(sample)



Speaker Recognition System

Training with 5 word(s)
<torch.utils.data.dataloader.DataLoader object at 0x00000122B6A39B10>


Epoch 1/15 (Train): 100%|██████████| 1/1 [00:00<00:00,  9.59it/s]


tensor([[-0.1339, -0.0777],
        [ 0.1419,  0.1314],
        [-0.1177,  0.0293],
        [-0.0042,  0.1488],
        [-0.0107,  0.1058],
        [-0.1627,  0.0462],
        [ 0.0650, -0.0089],
        [-0.0214, -0.0266],
        [ 0.0355, -0.1831]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([1, 0, 0, 0, 0, 0, 0, 0, 1], device='cuda:0')


Epoch 1/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 30.95it/s]


Epoch 1/15 | Train Loss: 0.7341 | Train Acc: 0.4444 | Val Loss: 0.7372 | Val Acc: 0.3333
Model saved!
<torch.utils.data.dataloader.DataLoader object at 0x00000122B6A39B10>


Epoch 2/15 (Train):   0%|          | 0/1 [00:00<?, ?it/s]

tensor([[ 0.2429, -0.2815],
        [-0.1789, -0.0967],
        [ 0.4277, -0.2899],
        [ 0.1717, -0.1673],
        [ 0.2446, -0.2580],
        [ 0.3621, -0.2523],
        [ 0.2691, -0.2673],
        [ 0.3741, -0.2157],
        [ 0.1575, -0.1894]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([0, 1, 0, 1, 0, 0, 0, 0, 0], device='cuda:0')


Epoch 2/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 12.08it/s]
Epoch 2/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 133.25it/s]


Epoch 2/15 | Train Loss: 0.5260 | Train Acc: 0.8889 | Val Loss: 0.6911 | Val Acc: 0.3333
<torch.utils.data.dataloader.DataLoader object at 0x00000122B6A39B10>


Epoch 3/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 48.72it/s]


tensor([[ 0.6425, -0.8969],
        [ 0.3413, -0.2368],
        [ 0.4802, -0.7207],
        [ 0.3216, -0.4453],
        [ 0.5331, -0.2536],
        [ 0.6787, -0.6875],
        [ 0.3670, -0.6333],
        [ 0.3762, -0.5251],
        [ 0.4046, -0.4642]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([0, 1, 0, 0, 1, 0, 0, 0, 0], device='cuda:0')


Epoch 3/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 128.52it/s]


Epoch 3/15 | Train Loss: 0.4729 | Train Acc: 0.7778 | Val Loss: 0.6756 | Val Acc: 0.6667
Model saved!
<torch.utils.data.dataloader.DataLoader object at 0x00000122B6A39B10>


Epoch 4/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 48.75it/s]


tensor([[ 0.8975, -0.9540],
        [ 0.4668, -0.4352],
        [ 0.7708, -0.4037],
        [ 0.5837, -0.6868],
        [ 0.5716, -0.7797],
        [ 0.6448, -0.9031],
        [ 0.0711, -0.3501],
        [ 0.7767, -0.9768],
        [ 0.7490, -0.7403]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([0, 1, 0, 0, 0, 0, 1, 0, 0], device='cuda:0')


Epoch 4/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 143.72it/s]


Epoch 4/15 | Train Loss: 0.4019 | Train Acc: 0.7778 | Val Loss: 0.6770 | Val Acc: 0.6667
<torch.utils.data.dataloader.DataLoader object at 0x00000122B6A39B10>


Epoch 5/15 (Train):   0%|          | 0/1 [00:00<?, ?it/s]

tensor([[ 1.0186, -0.9913],
        [ 0.9533, -1.0956],
        [ 1.0990, -1.1478],
        [ 0.7197, -0.5940],
        [ 0.9814, -1.1951],
        [ 0.5774, -0.8325],
        [ 0.3484, -0.1522],
        [ 0.1092, -0.3250],
        [ 0.8009, -1.1020]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([0, 0, 0, 0, 0, 0, 1, 1, 0], device='cuda:0')


Epoch 5/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 47.23it/s]
Epoch 5/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 156.17it/s]


Epoch 5/15 | Train Loss: 0.3287 | Train Acc: 0.7778 | Val Loss: 0.6808 | Val Acc: 0.6667
<torch.utils.data.dataloader.DataLoader object at 0x00000122B6A39B10>


Epoch 6/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 48.09it/s]


tensor([[ 1.0830, -0.8985],
        [ 0.8520, -1.2264],
        [ 0.8343, -1.0132],
        [ 0.2863, -0.2170],
        [ 1.0819, -1.2725],
        [ 1.3521, -1.4571],
        [ 1.1278, -1.0749],
        [ 0.0592, -0.2307],
        [ 1.1805, -1.3215]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([0, 0, 0, 1, 0, 0, 0, 1, 0], device='cuda:0')


Epoch 6/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 125.05it/s]


Epoch 6/15 | Train Loss: 0.2834 | Train Acc: 0.7778 | Val Loss: 0.6992 | Val Acc: 0.6667
<torch.utils.data.dataloader.DataLoader object at 0x00000122B6A39B10>


Epoch 7/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 48.66it/s]


tensor([[-0.0139, -0.1660],
        [ 1.1333, -1.4137],
        [ 0.9769, -1.0590],
        [ 1.2481, -1.2692],
        [ 0.9673, -1.2651],
        [ 1.4116, -1.4974],
        [ 0.1112, -0.2744],
        [ 1.4963, -1.5713],
        [ 0.9847, -0.9946]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([1, 0, 0, 0, 0, 0, 1, 0, 0], device='cuda:0')


Epoch 7/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 124.91it/s]


Epoch 7/15 | Train Loss: 0.2536 | Train Acc: 0.7778 | Val Loss: 0.7289 | Val Acc: 0.3333
<torch.utils.data.dataloader.DataLoader object at 0x00000122B6A39B10>


Epoch 8/15 (Train):   0%|          | 0/1 [00:00<?, ?it/s]

tensor([[ 1.1418, -1.4304],
        [ 1.3140, -1.3527],
        [ 1.0181, -1.3330],
        [ 1.5145, -1.6150],
        [ 0.8893, -1.2035],
        [-0.0796, -0.0079],
        [-0.1029, -0.2017],
        [ 1.1634, -1.0419],
        [ 1.5240, -1.5816]], device='cuda:0', grad_fn=<AddmmBackward0>)

Epoch 8/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 47.72it/s]



tensor([0, 0, 0, 0, 0, 1, 1, 0, 0], device='cuda:0')


Epoch 8/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 124.87it/s]


Epoch 8/15 | Train Loss: 0.2157 | Train Acc: 0.8889 | Val Loss: 0.7231 | Val Acc: 0.3333
<torch.utils.data.dataloader.DataLoader object at 0x00000122B6A39B10>


Epoch 9/15 (Train):   0%|          | 0/1 [00:00<?, ?it/s]

tensor([[ 1.1861, -1.0268],
        [ 0.9904, -1.3355],
        [ 0.0277,  0.2949],
        [ 0.8715, -0.9157],
        [ 1.5813, -1.4770],
        [ 1.6747, -1.4754],
        [-0.0031,  0.0421],
        [ 1.5784, -1.4034],
        [ 1.3665, -1.5408]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([0, 0, 1, 0, 0, 0, 1, 0, 0], device='cuda:0')


Epoch 9/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 46.29it/s]
Epoch 9/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 116.86it/s]


Epoch 9/15 | Train Loss: 0.1980 | Train Acc: 1.0000 | Val Loss: 0.7182 | Val Acc: 0.3333
<torch.utils.data.dataloader.DataLoader object at 0x00000122B6A39B10>


Epoch 10/15 (Train):   0%|          | 0/1 [00:00<?, ?it/s]

tensor([[ 1.1247, -0.9073],
        [ 1.2971, -1.5077],
        [ 1.7130, -1.9740],
        [ 1.4038, -1.5660],
        [-0.0170,  0.2669],
        [-0.1632,  0.2072],
        [ 1.2531, -1.1664],
        [ 1.3297, -1.5581],
        [ 1.1869, -1.4868]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([0, 0, 0, 0, 1, 1, 0, 0, 0], device='cuda:0')


Epoch 10/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 49.91it/s]
Epoch 10/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 142.03it/s]


Epoch 10/15 | Train Loss: 0.1721 | Train Acc: 1.0000 | Val Loss: 0.7109 | Val Acc: 0.3333
<torch.utils.data.dataloader.DataLoader object at 0x00000122B6A39B10>


Epoch 11/15 (Train):   0%|          | 0/1 [00:00<?, ?it/s]

tensor([[ 1.0968, -0.9346],
        [ 1.9317, -1.8098],
        [-0.3533,  0.5020],
        [ 1.6399, -1.6225],
        [ 1.1478, -1.5915],
        [ 0.8966, -0.8229],
        [-0.0724,  0.2243],
        [ 1.4311, -1.5701],
        [ 1.3769, -1.7612]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([0, 0, 1, 0, 0, 0, 1, 0, 0], device='cuda:0')


Epoch 11/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 52.54it/s]
Epoch 11/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 142.92it/s]


Epoch 11/15 | Train Loss: 0.1570 | Train Acc: 1.0000 | Val Loss: 0.6991 | Val Acc: 0.3333
<torch.utils.data.dataloader.DataLoader object at 0x00000122B6A39B10>


Epoch 12/15 (Train):   0%|          | 0/1 [00:00<?, ?it/s]

tensor([[ 1.9629, -2.1327],
        [ 1.2038, -0.9579],
        [ 0.8881, -0.7947],
        [ 1.5378, -1.6150],
        [ 1.3972, -1.5190],
        [ 1.7091, -1.7659],
        [-0.4598,  0.3832],
        [ 1.2566, -1.5826],
        [-0.2441,  0.3488]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([0, 0, 0, 0, 0, 0, 1, 0, 1], device='cuda:0')


Epoch 12/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 51.73it/s]
Epoch 12/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 133.03it/s]


Epoch 12/15 | Train Loss: 0.1418 | Train Acc: 1.0000 | Val Loss: 0.6567 | Val Acc: 0.3333
<torch.utils.data.dataloader.DataLoader object at 0x00000122B6A39B10>


Epoch 13/15 (Train):   0%|          | 0/1 [00:00<?, ?it/s]

tensor([[-0.4055,  0.2001],
        [ 1.1331, -1.4441],
        [ 1.7405, -1.5466],
        [ 0.9843, -0.8991],
        [ 1.5744, -1.8546],
        [ 1.1236, -0.9186],
        [ 1.8972, -1.9942],
        [ 1.4607, -1.5777],
        [-0.2538,  0.3307]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([1, 0, 0, 0, 0, 0, 0, 0, 1], device='cuda:0')


Epoch 13/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 49.49it/s]
Epoch 13/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 133.14it/s]


Epoch 13/15 | Train Loss: 0.1501 | Train Acc: 1.0000 | Val Loss: 0.6148 | Val Acc: 0.6667
<torch.utils.data.dataloader.DataLoader object at 0x00000122B6A39B10>


Epoch 14/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 48.74it/s]

tensor([[ 1.3460, -1.7271],
        [-0.5538,  0.7209],
        [ 1.0601, -0.8570],
        [ 1.6150, -1.7610],
        [ 1.8399, -1.7801],
        [ 1.8123, -1.9882],
        [-0.2466,  0.5267],
        [ 1.1126, -1.0473],
        [ 1.6729, -1.8047]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([0, 1, 0, 0, 0, 0, 1, 0, 0], device='cuda:0')



Epoch 14/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 132.48it/s]


Epoch 14/15 | Train Loss: 0.1145 | Train Acc: 1.0000 | Val Loss: 0.5718 | Val Acc: 0.6667
<torch.utils.data.dataloader.DataLoader object at 0x00000122B6A39B10>


Epoch 15/15 (Train):   0%|          | 0/1 [00:00<?, ?it/s]

tensor([[ 1.5128, -1.7788],
        [ 1.1626, -0.8559],
        [ 1.3716, -1.5367],
        [ 1.9772, -1.9954],
        [ 1.0926, -0.9580],
        [-0.4334,  0.4049],
        [ 1.4355, -1.4783],
        [-0.5019,  0.5364],
        [ 1.8384, -1.7847]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([0, 0, 0, 0, 0, 1, 0, 1, 0], device='cuda:0')


Epoch 15/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 51.21it/s]
Epoch 15/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 141.17it/s]

Epoch 15/15 | Train Loss: 0.1217 | Train Acc: 1.0000 | Val Loss: 0.5299 | Val Acc: 0.6667
tensor([[-0.2416,  0.0110],
        [-0.2046, -0.0173],
        [-0.1788, -0.0052],
        [-0.0488, -0.1226]], device='cuda:0')
tensor([[1],
        [1],
        [1],
        [0]], device='cuda:0')
Test Loss: 0.6114 | Test Accuracy: 1.0000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         3

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4




  model.load_state_dict(torch.load(f"best_speaker_model_{word_count}.pth"))


{5: {'val_acc': 0.6666666666666666, 'test_acc': 1.0}}

Training with 10 word(s)
<torch.utils.data.dataloader.DataLoader object at 0x0000012286063690>


Epoch 1/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 34.00it/s]


tensor([[0.1294, 0.1745],
        [0.2429, 0.0480],
        [0.1314, 0.2298],
        [0.1422, 0.2465]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([0, 1, 0, 0], device='cuda:0')


Epoch 1/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 85.67it/s]


Epoch 1/15 | Train Loss: 0.7504 | Train Acc: 0.0000 | Val Loss: 0.6785 | Val Acc: 0.5000
Model saved!
<torch.utils.data.dataloader.DataLoader object at 0x0000012286063690>


Epoch 2/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 33.20it/s]


tensor([[ 0.4007, -0.1577],
        [ 0.3780, -0.0181],
        [ 0.2932,  0.0278],
        [ 0.3190, -0.0160]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([0, 1, 0, 0], device='cuda:0')


Epoch 2/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 86.37it/s]


Epoch 2/15 | Train Loss: 0.6180 | Train Acc: 0.7500 | Val Loss: 0.7238 | Val Acc: 0.5000
<torch.utils.data.dataloader.DataLoader object at 0x0000012286063690>


Epoch 3/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 35.04it/s]


tensor([[ 0.6777, -0.1624],
        [ 0.4196, -0.2784],
        [ 0.6977, -0.1776],
        [ 0.4423, -0.0715]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([0, 0, 0, 1], device='cuda:0')


Epoch 3/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 83.15it/s]


Epoch 3/15 | Train Loss: 0.5234 | Train Acc: 0.7500 | Val Loss: 0.7462 | Val Acc: 0.5000
<torch.utils.data.dataloader.DataLoader object at 0x0000012286063690>


Epoch 4/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 34.30it/s]


tensor([[ 0.7330, -0.5142],
        [ 0.8587, -0.1449],
        [ 0.7278, -0.3413],
        [ 0.7071, -0.4007]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([0, 1, 0, 0], device='cuda:0')


Epoch 4/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 86.70it/s]


Epoch 4/15 | Train Loss: 0.5372 | Train Acc: 0.7500 | Val Loss: 0.7540 | Val Acc: 0.5000
<torch.utils.data.dataloader.DataLoader object at 0x0000012286063690>


Epoch 5/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 33.84it/s]

tensor([[ 1.1055, -0.4776],
        [ 0.7964, -0.3299],
        [ 0.5592, -0.3493],
        [ 0.8609, -0.4694]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([0, 0, 1, 0], device='cuda:0')



Epoch 5/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 83.17it/s]


Epoch 5/15 | Train Loss: 0.4874 | Train Acc: 0.7500 | Val Loss: 0.7580 | Val Acc: 0.5000
<torch.utils.data.dataloader.DataLoader object at 0x0000012286063690>


Epoch 6/15 (Train):   0%|          | 0/1 [00:00<?, ?it/s]

tensor([[ 0.8655, -0.3216],
        [ 0.7475, -0.3809],
        [ 1.4631, -0.8111],
        [ 0.8584, -0.7564]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([0, 1, 0, 0], device='cuda:0')


Epoch 6/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 33.25it/s]
Epoch 6/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 81.21it/s]


Epoch 6/15 | Train Loss: 0.4886 | Train Acc: 0.7500 | Val Loss: 0.7493 | Val Acc: 0.5000
<torch.utils.data.dataloader.DataLoader object at 0x0000012286063690>


Epoch 7/15 (Train):   0%|          | 0/1 [00:00<?, ?it/s]

tensor([[ 1.2446, -0.7876],
        [ 1.0835, -0.7955],
        [ 0.6626, -0.4254],
        [ 0.8501, -0.6985]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([0, 0, 1, 0], device='cuda:0')


Epoch 7/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 33.34it/s]
Epoch 7/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 90.91it/s]


Epoch 7/15 | Train Loss: 0.4591 | Train Acc: 0.7500 | Val Loss: 0.7255 | Val Acc: 0.5000
<torch.utils.data.dataloader.DataLoader object at 0x0000012286063690>


Epoch 8/15 (Train):   0%|          | 0/1 [00:00<?, ?it/s]

tensor([[ 0.6164, -0.5177],
        [ 0.8103, -0.5933],
        [ 1.4740, -0.6469],
        [ 1.2273, -0.8807]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([1, 0, 0, 0], device='cuda:0')


Epoch 8/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 33.39it/s]
Epoch 8/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 81.58it/s]


Epoch 8/15 | Train Loss: 0.4652 | Train Acc: 0.7500 | Val Loss: 0.7028 | Val Acc: 0.5000
<torch.utils.data.dataloader.DataLoader object at 0x0000012286063690>


Epoch 9/15 (Train):   0%|          | 0/1 [00:00<?, ?it/s]

tensor([[ 0.9898, -0.5516],
        [ 0.5766, -0.3967],
        [ 0.9520, -0.6193],
        [ 1.3665, -0.7396]], device='cuda:0', grad_fn=<AddmmBackward0>)

Epoch 9/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 31.79it/s]



tensor([0, 1, 0, 0], device='cuda:0')


Epoch 9/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 79.92it/s]


Epoch 9/15 | Train Loss: 0.4479 | Train Acc: 0.7500 | Val Loss: 0.6823 | Val Acc: 0.5000
<torch.utils.data.dataloader.DataLoader object at 0x0000012286063690>


Epoch 10/15 (Train):   0%|          | 0/1 [00:00<?, ?it/s]

tensor([[ 1.2010, -0.7473],
        [ 1.3632, -0.5206],
        [ 0.4911, -0.1460],
        [ 1.4492, -1.0059]], device='cuda:0', grad_fn=<AddmmBackward0>)

Epoch 10/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 33.13it/s]



tensor([0, 0, 1, 0], device='cuda:0')


Epoch 10/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 79.83it/s]


Epoch 10/15 | Train Loss: 0.3547 | Train Acc: 0.7500 | Val Loss: 0.6713 | Val Acc: 0.5000
<torch.utils.data.dataloader.DataLoader object at 0x0000012286063690>


Epoch 11/15 (Train):   0%|          | 0/1 [00:00<?, ?it/s]

tensor([[ 0.7500, -0.1128],
        [ 1.3880, -0.8320],
        [ 1.4816, -0.7158],
        [ 0.8289, -0.3992]], device='cuda:0', grad_fn=<AddmmBackward0>)

Epoch 11/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 33.18it/s]



tensor([1, 0, 0, 0], device='cuda:0')


Epoch 11/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 82.69it/s]


Epoch 11/15 | Train Loss: 0.4200 | Train Acc: 0.7500 | Val Loss: 0.6586 | Val Acc: 0.5000
<torch.utils.data.dataloader.DataLoader object at 0x0000012286063690>


Epoch 12/15 (Train):   0%|          | 0/1 [00:00<?, ?it/s]

tensor([[ 1.3496, -0.9662],
        [ 1.3509, -0.8330],
        [ 0.4181, -0.3614],
        [ 1.0559, -0.8517]], device='cuda:0', grad_fn=<AddmmBackward0>)


Epoch 12/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 33.01it/s]


tensor([0, 0, 1, 0], device='cuda:0')


Epoch 12/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 86.34it/s]


Epoch 12/15 | Train Loss: 0.3741 | Train Acc: 0.7500 | Val Loss: 0.6440 | Val Acc: 0.5000
<torch.utils.data.dataloader.DataLoader object at 0x0000012286063690>


Epoch 13/15 (Train):   0%|          | 0/1 [00:00<?, ?it/s]

tensor([[ 0.4869, -0.1706],
        [ 1.4129, -0.6900],
        [ 1.2368, -0.6497],
        [ 1.0458, -0.5364]], device='cuda:0', grad_fn=<AddmmBackward0>)


Epoch 13/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 32.51it/s]


tensor([1, 0, 0, 0], device='cuda:0')


Epoch 13/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 76.83it/s]


Epoch 13/15 | Train Loss: 0.3796 | Train Acc: 0.7500 | Val Loss: 0.6301 | Val Acc: 0.5000
<torch.utils.data.dataloader.DataLoader object at 0x0000012286063690>


Epoch 14/15 (Train):   0%|          | 0/1 [00:00<?, ?it/s]

tensor([[ 1.0271, -0.6163],
        [ 0.4148, -0.1626],
        [ 1.2538, -0.6899],
        [ 1.5449, -0.9142]], device='cuda:0', grad_fn=<AddmmBackward0>)


Epoch 14/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 33.49it/s]


tensor([0, 1, 0, 0], device='cuda:0')


Epoch 14/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 83.32it/s]


Epoch 14/15 | Train Loss: 0.3539 | Train Acc: 0.7500 | Val Loss: 0.6159 | Val Acc: 0.5000
<torch.utils.data.dataloader.DataLoader object at 0x0000012286063690>


Epoch 15/15 (Train):   0%|          | 0/1 [00:00<?, ?it/s]

tensor([[ 0.1877,  0.0968],
        [ 1.5508, -0.7943],
        [ 1.0283, -0.4844],
        [ 1.5432, -0.8994]], device='cuda:0', grad_fn=<AddmmBackward0>)


Epoch 15/15 (Train): 100%|██████████| 1/1 [00:00<00:00, 33.28it/s]


tensor([1, 0, 0, 0], device='cuda:0')


Epoch 15/15 (Val): 100%|██████████| 1/1 [00:00<00:00, 90.62it/s]

Epoch 15/15 | Train Loss: 0.2784 | Train Acc: 0.7500 | Val Loss: 0.6019 | Val Acc: 0.5000



  model.load_state_dict(torch.load(f"best_speaker_model_{word_count}.pth"))


tensor([[ 0.3039,  0.0267],
        [ 0.3013, -0.2178]], device='cuda:0')
tensor([[1],
        [0]], device='cuda:0')
Test Loss: 0.6541 | Test Accuracy: 0.5000

Classification Report:
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{5: {'val_acc': 0.6666666666666666, 'test_acc': 1.0}, 10: {'val_acc': 0.5, 'test_acc': 0.5}}
Not enough samples for word count 30. Skipping.

Results Summary:
Word Count      Validation Accuracy  Test Accuracy  
--------------------------------------------------
5               0.6667               1.0000         
10              0.5000               0.5000         

Live Demonstration:
Using the best model (trained with 5 words)


  best_model.load_state_dict(torch.load(f"best_speaker_model_{best_count}.pth"))



Please speak for a live test (say several words).
Recording 10 seconds of audio...
Say a word or phrase now...
Audio saved to live_test_0.wav
Recording 10 seconds of audio...
Say a word or phrase now...
Audio saved to live_test_1.wav
Recording 10 seconds of audio...
Say a word or phrase now...
Audio saved to live_test_2.wav
Recording 10 seconds of audio...
Say a word or phrase now...
Audio saved to live_test_3.wav
Recording 10 seconds of audio...
Say a word or phrase now...
Audio saved to live_test_4.wav

Prediction: Your voice (Confidence: 53.45%)


: 