In [1]:
import os
import cv2
import pandas as pd
import shutil
import numpy as np
import torch
import torch.optim as optim
import glob
import librosa
import librosa.display
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from PIL import Image
from scipy.special import softmax
from torch.utils.data import Dataset, DataLoader, random_split
from librosa.util import normalize
from tqdm.auto import tqdm
from torchvision import datasets, transforms
from torchvision.models.mobilenet import mobilenet_v2
from torch.optim.lr_scheduler import StepLR
from torch.nn import CrossEntropyLoss


In [2]:
def train(model, device, train_loader, optimizer, epoch):
    """Run one optimisation pass of ``model`` over every batch in ``train_loader``.

    Args:
        model: network to optimise; switched to training mode here.
        device: torch device the batches are moved to.
        train_loader: iterable of ``(data, target)`` batches; its ``dataset``
            length is used in the progress line.
        optimizer: optimizer stepped once per batch.
        epoch: epoch number, used only in the progress line.
    """
    report_every = 10
    criterion = CrossEntropyLoss()
    template = 'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'

    model.train()
    for step, (inputs, labels) in enumerate(train_loader):
        # Tile dim 1 three times -- presumably single-channel spectrograms
        # expanded to the 3-channel input the network expects (TODO confirm).
        inputs = inputs.repeat(1, 3, 1, 1).to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), labels)
        batch_loss.backward()
        optimizer.step()

        # Only every `report_every`-th batch (starting with batch 0) is logged.
        if step % report_every != 0:
            continue
        seen = step * len(inputs)
        total = len(train_loader.dataset)
        percent = 100. * step / len(train_loader)
        print(template.format(epoch, seen, total, percent, batch_loss.item()))

In [3]:
def test_model(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    loss_func = CrossEntropyLoss()
    with torch.no_grad():
        for data, target in test_loader:
            data = data.repeat(1, 3, 1, 1)
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += loss_func(output, target)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [27]:
def form_out(model, device, out_loader):
    """Predict a class index for every sample yielded by ``out_loader``.

    Args:
        model: network used for inference; switched to eval mode here.
        device: torch device the batches are moved to.
        out_loader: iterable of unlabeled data batches.

    Returns:
        A flat list of predicted class indices, in the order the loader
        produced the samples.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in tqdm(out_loader):
            batch = batch.repeat(1, 3, 1, 1).to(device)
            logits = model(batch)
            top_class = logits.argmax(dim=1, keepdim=True)
            # Column 0 of the (batch, 1) index array holds the winning class.
            predictions.extend(top_class.cpu().numpy()[:, 0])
    return predictions

In [5]:
class DatasetSounds(Dataset):
    """Labelled audio clips served as rounded dB-scaled mel-spectrogram images.

    Clip ``index`` is read from ``root_path/sounds[index]/names[index]`` and is
    paired with ``labels[index]``.
    """

    def __init__(self, root_path, names, sounds, labels,  transform=None):
        self.root_path = root_path
        self.names = names
        self.sounds = sounds
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Number of clips, taken from the length of ``names``."""
        return len(self.names)

    def __getitem__(self, index):
        """Return ``(mel_spectrogram, label)`` for the clip at ``index``."""
        clip_path = '/'.join((self.root_path, self.sounds[index], self.names[index]))
        samples, sample_rate = librosa.load(clip_path)

        window = 1024
        hop = window // 4  # 75% overlap between consecutive frames
        power = librosa.feature.melspectrogram(y=samples,
                                               sr=sample_rate,
                                               n_fft=window,
                                               hop_length=hop)
        # dB relative to the clip's own peak, rounded to whole decibels.
        decibels = np.round(librosa.power_to_db(power, ref=np.max))

        image = Image.fromarray(decibels)
        if self.transform is not None:
            image = self.transform(image)
        return image, self.labels[index]

In [24]:
class DatasetTest(Dataset):
    """Unlabeled audio clips listed in a CSV, served as mel-spectrogram images.

    Args:
        root_path: directory that contains the audio files.
        csv: path to a CSV file with an ``id`` column of file names relative
            to ``root_path``. Other columns are ignored.
        transform: optional callable applied to the spectrogram image.
    """

    def __init__(self, root_path, csv,  transform=None):
        self.root_path = root_path
        self.data = pd.read_csv(csv)
        self.transform = transform

    def __len__(self):
        """Number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the (optionally transformed) mel-spectrogram for row ``index``."""
        # Only the file name is needed for inference; the CSV's 'answer' column
        # is deliberately not read, so files without it also work.
        clip_path = os.path.join(self.root_path, self.data.loc[index, 'id'])
        waveform, sr = librosa.load(clip_path)
        n_fft = 1024
        step = n_fft // 4
        mel_spectrogram = librosa.feature.melspectrogram(y=waveform,
                                                         sr=sr,
                                                         n_fft=n_fft,
                                                         hop_length=step)
        # dB relative to the clip's own peak, rounded to whole decibels.
        mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)
        mel_spectrogram = np.round(mel_spectrogram)
        mel_spectrogram = Image.fromarray(mel_spectrogram)
        if self.transform is not None:
            mel_spectrogram = self.transform(mel_spectrogram)

        return mel_spectrogram

In [25]:
# Single source of truth for the label encoding: class index -> spoken word.
dict_of_sounds_indexes = {
    0: 'stop',
    1: 'one',
    2: 'two',
    3: 'three',
    4: 'four',
    5: 'five',
    6: 'six',
    7: 'cat',
    8: 'dog',
    9: 'house'
}

# Inverse mapping (word -> class index), derived so the two dicts cannot drift apart.
dict_of_sounds = {word: idx for idx, word in dict_of_sounds_indexes.items()}

# Training clips are laid out as train/<word>/<file>.wav; sorting keeps the order stable.
sounds_path = sorted(glob.glob('train/*/*.wav'))

# Parallel lists, one entry per clip: parent folder (the word), file name, class index.
sounds = [os.path.basename(os.path.dirname(path)) for path in sounds_path]
names = [os.path.basename(path) for path in sounds_path]
# A folder name that is not in the mapping raises KeyError here, which is intended.
labels = [dict_of_sounds[sound] for sound in sounds]


# Hyper-parameters and run configuration.
batch_size = 128
learning_rate = 1.0
reduce_lr_gamma = 0.7
epochs = 4
split_seed = 42  # fixes the train/validation split across kernel restarts
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Device: {} Epochs: {} Batch size: {}'.format(device, epochs, batch_size))


# Keyword arguments shared by all three loaders.
kwargs = {'batch_size': batch_size}
if torch.cuda.is_available():
    kwargs.update({'num_workers': 1, 'pin_memory': True})

transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])


dataset = DatasetSounds(root_path='train',
                        names=names,
                        sounds=sounds,
                        labels=labels,
                        transform=transform)

# 90/10 train/validation split. A dedicated seeded generator makes the split
# reproducible, so a re-run never moves validation clips into the training set.
train_size = int(0.9 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(split_seed),
)

out_dataset = DatasetTest(root_path='test',
                          csv='sample_submission.csv',
                          transform=transform)

# Only the training loader shuffles; evaluation and inference keep dataset order
# so predictions line up with the rows of the submission CSV.
train_loader = DataLoader(train_dataset, shuffle=True, **kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **kwargs)
out_loader = DataLoader(out_dataset, shuffle=False, **kwargs)

Device: cpu Epochs: 4 Batch size: 128


In [7]:




# Load MobileNetV2 with pretrained weights and replace its last classifier layer
# with a fresh 10-output linear head.
model = mobilenet_v2(pretrained=True)
head_inputs = model.classifier[1].in_features
model.classifier[1] = torch.nn.Linear(head_inputs, 10)
model = model.to(device)

optimizer = optim.Adadelta(model.parameters(), lr=learning_rate)
# The learning rate is multiplied by `reduce_lr_gamma` after every epoch.
scheduler = StepLR(optimizer, step_size=1, gamma=reduce_lr_gamma)

# Per epoch: one training pass, one evaluation on the held-out loader, one LR decay.
for epoch in range(1, epochs + 1):
    train(model, device, train_loader, optimizer, epoch)
    test_model(model, device, test_loader)
    scheduler.step()

# NOTE(review): the checkpoint name mentions MNIST although the model is fitted on
# the sound clips above -- kept as-is; confirm nothing else reads this file name.
torch.save(model.state_dict(), "mnist_cnn.pt")



Device: cpu Epochs: 4 Batch size: 128





Test set: Average loss: 0.0020, Accuracy: 1210/1315 (92%)


Test set: Average loss: 0.0030, Accuracy: 1193/1315 (91%)


Test set: Average loss: 0.0007, Accuracy: 1283/1315 (98%)


Test set: Average loss: 0.0005, Accuracy: 1297/1315 (99%)



In [28]:
# Predicted class index for every clip served by out_loader, in loader order.
output = form_out(model=model, device=device, out_loader=out_loader)

  0%|          | 0/71 [00:00<?, ?it/s]

In [34]:
# Decode class indices to words. The list is rebuilt instead of patched in place,
# and entries that are already words are kept, so re-running this cell is a no-op
# rather than a KeyError on the already-decoded strings.
output = [
    pred if isinstance(pred, str) else dict_of_sounds_indexes[int(pred)]
    for pred in output
]

In [35]:
# Submission file: ids come from the template CSV, answers from `output`.
sample = pd.read_csv('sample_submission.csv')
result = sample[['id']].assign(answer=output)
result.to_csv('result.csv', index=False)


In [36]:
# Display the frame read from 'sample_submission.csv'.
sample

Unnamed: 0,id,answer
0,0.wav,cat
1,1.wav,cat
2,2.wav,cat
3,3.wav,cat
4,4.wav,cat
...,...,...
9065,9065.wav,cat
9066,9066.wav,cat
9067,9067.wav,cat
9068,9068.wav,cat


In [37]:
# Display the frame that was written to 'result.csv'.
result

Unnamed: 0,id,answer
0,0.wav,four
1,1.wav,one
2,2.wav,two
3,3.wav,dog
4,4.wav,six
...,...,...
9065,9065.wav,one
9066,9066.wav,dog
9067,9067.wav,house
9068,9068.wav,two
