In [52]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchaudio
from torch.utils.data import Dataset, DataLoader,Subset 
from sklearn.model_selection import KFold


# Audio sizing constants: sample rate in Hz and clip duration in seconds.
SAMPLE_RATE = 16000
DURATION = 1.0
BATCH_SIZE = 64

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Cihaz: {device}")


Cihaz: cuda


In [53]:
# The data was downloaded manually from Kaggle and placed under
# data\SpeechCommands\speech_commands_v0.02, so download=False is used.
# torchaudio performs the train/test split itself through the `subset` argument.
# No validation subset is loaded because cross-validation is used instead.
train_dataset, test_dataset = (
    torchaudio.datasets.SPEECHCOMMANDS(
        root="./data",
        url="speech_commands_v0.02",
        download=False,
        subset=subset_name,
    )
    for subset_name in ("training", "testing")
)


In [54]:
# Gather the distinct label strings (third field of each metadata tuple), sorted.
labels = sorted({
    train_dataset.get_metadata(sample_idx)[2]
    for sample_idx in range(len(train_dataset))
})

# Two-way lookup between label strings and integer class ids.
index_to_label = dict(enumerate(labels))
label_to_index = {name: idx for idx, name in index_to_label.items()}
NUM_CLASSES = len(labels)

print(f"Etiketler: {labels}")
print(f"Etiket sayısı: {NUM_CLASSES}")


Etiketler: ['backward', 'bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'follow', 'forward', 'four', 'go', 'happy', 'house', 'learn', 'left', 'marvin', 'nine', 'no', 'off', 'on', 'one', 'right', 'seven', 'sheila', 'six', 'stop', 'three', 'tree', 'two', 'up', 'visual', 'wow', 'yes', 'zero']
Etiket sayısı: 35


In [None]:
# CNN model

class Cnn(nn.Module):
    """Small 2-D CNN that classifies waveforms through a log-mel spectrogram front end.

    The forward pass converts the waveform to a mel spectrogram, converts it to
    decibels, applies three Conv-BatchNorm-ReLU-MaxPool stages (16, 32, 64
    channels), global-average-pools the result, applies dropout and projects
    to ``num_classes`` logits.

    Args:
        num_classes: Number of output classes (size of the final linear layer).
        sample_rate: Sample rate in Hz passed to the mel-spectrogram transform.
            Defaults to 16000.
    """

    def __init__(self, num_classes, sample_rate=16000):
        super().__init__()

        # The transforms are ordinary submodules, so they follow the model when
        # the caller does `model.to(device)`. The constructor therefore does not
        # read any module-level device variable.
        self.mel_spectrogram = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_mels=64,
            n_fft=1024,
            hop_length=512,
        )
        self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()

        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(16)
        self.pool1 = nn.MaxPool2d(2)

        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(32)
        self.pool2 = nn.MaxPool2d(2)

        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(64)
        self.pool3 = nn.MaxPool2d(2)

        # Global average pooling collapses each of the 64 feature maps to one value.
        self.gap = nn.AdaptiveAvgPool2d((1, 1))

        self.fc1 = nn.Linear(64, num_classes)

        # Original note: the dropout rate may be lowered.
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Return class logits for a batch of waveforms.

        Args:
            x: Waveform tensor. conv1 takes a single input channel, so this is
                presumably shaped (batch, 1, num_samples) -- TODO confirm
                against the caller.

        Returns:
            Tensor of unnormalised logits with ``num_classes`` values per sample.
        """
        # Waveform -> mel spectrogram -> decibel scale.
        x = self.mel_spectrogram(x)
        x = self.amplitude_to_db(x)

        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        x = self.pool3(F.relu(self.bn3(self.conv3(x))))

        x = self.gap(x)
        x = torch.flatten(x, 1)

        x = self.dropout(x)

        x = self.fc1(x)

        return x

In [56]:
def train_model(model, train_loader, criterion, optimizer, epoch):
    """Train ``model`` for one pass over ``train_loader`` and report the metrics.

    Args:
        model: Module to train; it is switched to training mode.
        train_loader: Iterable yielding ``(data, target)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, target)``.
        optimizer: Optimizer that updates the parameters of ``model``.
        epoch: Current epoch number. Accepted for call-site symmetry with
            ``validate_model``; not used inside the function.

    Returns:
        Training accuracy for this pass, as a percentage (float). Returned so
        the function mirrors ``validate_model``.

    Raises:
        ValueError: If ``train_loader`` yields no samples.
    """
    model.train()
    total_loss = 0.0
    correct = 0
    total_samples = 0
    num_batches = 0

    for data, target in train_loader:
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        predicted = torch.argmax(outputs, dim=1)
        correct += (predicted == target).sum().item()
        total_samples += len(data)
        num_batches += 1

    if total_samples == 0:
        # Without this guard the divisions below fail with an opaque ZeroDivisionError.
        raise ValueError("train_loader yielded no samples; cannot compute training metrics")

    # The loss is averaged over batches, accuracy over individual samples.
    avg_loss = total_loss / num_batches
    accuracy = 100. * correct / total_samples
    print(f"Train Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

    return accuracy

    
def validate_model(model, val_loader, criterion,epoch):
    """Evaluate ``model`` on ``val_loader`` without gradient tracking.

    Prints the mean per-batch loss and the accuracy, then returns the accuracy.

    Args:
        model: Module to evaluate; it is switched to evaluation mode.
        val_loader: Sized iterable yielding ``(data, target)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, target)``.
        epoch: Current epoch number; not used inside the function.

    Returns:
        Accuracy over all samples seen, as a percentage.
    """
    model.eval()
    loss_sum, hits, seen = 0.0, 0, 0

    with torch.no_grad():
        for inputs, targets in val_loader:
            logits = model(inputs)
            loss_sum += criterion(logits, targets).item()
            hits += logits.argmax(dim=1).eq(targets).sum().item()
            seen += len(inputs)

    # The loss is averaged over batches, accuracy over individual samples.
    mean_batch_loss = loss_sum / len(val_loader)
    accuracy = 100 * hits / seen
    print(f"Validation Loss: {mean_batch_loss:.4f}, Accuracy: {accuracy:.4f}")

    return accuracy
    


In [57]:
# Custom collate_fn for the PyTorch DataLoader.
# The DataLoader cannot combine waveforms of different lengths into one batch,
# so a custom collate_fn is defined here.
# Clips that are too long are truncated; clips that are too short are zero-padded.
# This way the model always receives fixed-size input.

def collate_fn(batch):
    """Collate dataset samples into fixed-length waveform and label tensors.

    Args:
        batch: Sequence of 5-tuples whose first element is the waveform tensor
            and whose third element is the label string; the other three
            fields are ignored.

    Returns:
        ``(waveforms, targets)``: the stacked waveforms, each exactly
        ``int(SAMPLE_RATE * DURATION)`` samples long, and a ``torch.long``
        tensor of class indices looked up in ``label_to_index``. Both are
        moved to ``device``.
    """
    # NOTE(review): tensors are moved to `device` inside the collate function.
    # With a CUDA device this may not work with DataLoader worker processes --
    # confirm before setting num_workers > 0.
    target_len = int(SAMPLE_RATE * DURATION)  # same for every sample, so computed once

    waveforms, targets = [], []
    for waveform, _, label, _, _ in batch:
        num_samples = waveform.shape[-1]
        if num_samples > target_len:
            waveform = waveform[..., :target_len]
        elif num_samples < target_len:
            # Right-pad the time axis with zeros; F.pad keeps the channel count and dtype.
            waveform = F.pad(waveform, (0, target_len - num_samples))
        waveforms.append(waveform)
        targets.append(label_to_index[label])

    waveforms = torch.stack(waveforms).to(device)
    targets = torch.tensor(targets, dtype=torch.long).to(device)
    return waveforms, targets

In [None]:
# Cross-validation settings, named once so that the split count, the epoch
# count and the progress messages cannot drift apart.
N_SPLITS = 5
N_EPOCHS = 5

kfold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Best validation accuracy reached in each fold.
fold_validation_scores = []

dataset_indices = list(range(len(train_dataset)))


for fold, (train_ids, val_ids) in enumerate(kfold.split(dataset_indices)):
    print(f"--- FOLD {fold + 1}/{N_SPLITS} ---")

    train_subset = Subset(train_dataset, train_ids)
    val_subset = Subset(train_dataset, val_ids)

    train_loader = DataLoader(
        train_subset,
        batch_size=BATCH_SIZE,
        shuffle=True,  # shuffle the training data
        collate_fn=collate_fn
    )
    val_loader = DataLoader(
        val_subset,
        batch_size=BATCH_SIZE,
        shuffle=False,  # do not shuffle the validation data
        collate_fn=collate_fn
    )

    # A fresh model and optimizer are created for every fold.
    model = Cnn(NUM_CLASSES).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    best_fold_accuracy = 0.0

    for epoch in range(1, N_EPOCHS + 1):

        print(f"--- EPOCH {epoch}/{N_EPOCHS} ---")

        train_model(model, train_loader, criterion, optimizer, epoch)
        fold_accuracy = validate_model(model, val_loader, criterion, epoch)

        if fold_accuracy > best_fold_accuracy:
            best_fold_accuracy = fold_accuracy
            # The best model could be saved here.
            # This could also be used for early stopping.

    print(f"Fold {fold + 1} tamamlandı. En iyi doğruluk: {best_fold_accuracy:.2f}%")
    fold_validation_scores.append(best_fold_accuracy)

--- FOLD 1/5 ---
--- EPOCH 1/5 ---
Train Loss: 2.9305, Accuracy: 19.0426
Validation Loss: 2.3648, Accuracy: 34.6396
--- EPOCH 2/5 ---
Train Loss: 2.1447, Accuracy: 38.4772
Validation Loss: 1.7883, Accuracy: 50.0501
--- EPOCH 3/5 ---
Train Loss: 1.7745, Accuracy: 48.3263
Validation Loss: 1.5054, Accuracy: 56.5089
--- EPOCH 4/5 ---
Train Loss: 1.5420, Accuracy: 55.1257
Validation Loss: 1.2450, Accuracy: 66.2797
--- EPOCH 5/5 ---
Train Loss: 1.3782, Accuracy: 59.8992
Validation Loss: 1.1373, Accuracy: 68.6900
Fold 1 tamamlandı. En iyi doğruluk: 68.69%
--- FOLD 2/5 ---
--- EPOCH 1/5 ---
