In [1]:
from pathlib import Path

In [2]:
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
import torch
from torch import optim
import torch.nn as nn
from torchvision.transforms import transforms
from torch.utils.data import Dataset, DataLoader

### Parameters

In [4]:
# Keyword arguments forwarded verbatim to librosa's mel-spectrogram routine.
melspec_params = dict(
    sr=44100,        # sampling rate of the decoded audio, in Hz
    n_mels=128,      # number of mel bands (rows of each spectrogram)
    n_fft=512,       # FFT window length, in samples
    hop_length=345,  # 44100 / 345 is about 128 hops per second of audio
)

In [5]:
def makeLabels(dir):
    """Build fixed-width mel spectrograms and binary labels for a folder of clips.

    Every regular file in ``dir`` is decoded with librosa, converted to a mel
    spectrogram using the module-level ``melspec_params``, and cropped or
    zero-padded along the time axis to 128 frames so that all samples share
    one shape.

    Args:
        dir: Directory holding the audio clips (``str`` or ``Path``). The
            name shadows the ``dir`` builtin but is kept so that existing
            keyword callers keep working.

    Returns:
        A ``(sample_audio, sample_labels)`` tuple of equal-length lists.
        ``sample_audio[i]`` is an array of shape ``(1, n_mels, 128)`` and
        ``sample_labels[i]`` is ``0`` when the file name starts with ``'h'``
        and ``1`` otherwise.
        NOTE(review): the meaning of the ``'h'`` prefix is not visible here;
        confirm against the naming scheme of the data set.
    """
    folder = Path(dir)
    n_frames = 128  # time-axis length every spectrogram is forced to

    sample_audio = []
    sample_labels = []
    # One sorted pass produces both lists, so audio and labels cannot drift
    # out of order and the sample order is reproducible between runs.
    for filename in sorted(folder.iterdir()):
        if not filename.is_file():
            continue  # sub-directories cannot be decoded as audio

        waveform, _ = librosa.load(filename, sr=melspec_params["sr"])
        # `y` must be passed by keyword: newer librosa releases reject a
        # positional waveform.
        mel_spec = np.asarray(
            librosa.feature.melspectrogram(y=waveform, **melspec_params)
        )

        # Crop or right-pad the time axis. np.resize would instead flatten the
        # array and refill it, scrambling rows whenever the width is not 128.
        mel_spec = mel_spec[:, :n_frames]
        missing = n_frames - mel_spec.shape[1]
        if missing > 0:
            mel_spec = np.pad(mel_spec, ((0, 0), (0, missing)), mode="constant")

        # Leading singleton axis is the channel dimension expected by Conv2d.
        sample_audio.append(np.ascontiguousarray(mel_spec[np.newaxis, :, :]))

        # Use the bare file name so the label does not depend on the length of
        # the directory prefix or of the extension.
        sample_labels.append(0 if filename.name[0] == 'h' else 1)

    return sample_audio, sample_labels
    

In [6]:
class ChickenDataset(Dataset):
    """Chicken vocalization dataset pairing each audio sample with its label.

    Args:
        audio_labels: Sequence of labels, one per sample.
        audio_samples: Sequence of samples aligned index-for-index with
            ``audio_labels``.
        transform: Optional callable applied to a sample when it is fetched.
        target_transform: Optional callable applied to a label when it is
            fetched.

    Raises:
        ValueError: If the two sequences differ in length.
    """

    def __init__(self, audio_labels, audio_samples, transform=None, target_transform=None):
        if len(audio_labels) != len(audio_samples):
            raise ValueError(
                "audio_labels and audio_samples must have the same length "
                f"(got {len(audio_labels)} and {len(audio_samples)})"
            )
        self.audio_labels = audio_labels
        self.audio_samples = audio_samples
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the number of (sample, label) pairs."""
        return len(self.audio_labels)

    def __getitem__(self, idx):
        """Return the ``(audio, label)`` pair at ``idx`` after optional transforms."""
        audio = self.audio_samples[idx]
        label = self.audio_labels[idx]
        # Compare against None: a callable may legitimately be falsy
        # (for example an empty nn.Sequential).
        if self.transform is not None:
            audio = self.transform(audio)
        if self.target_transform is not None:
            # The target transform acts on the label, not on the audio.
            label = self.target_transform(label)
        return audio, label

In [7]:
# Despite its name, this is the directory handed to makeLabels.
filename = "./data/"

# Decode every clip into a spectrogram and its class label.
specs, labels = makeLabels(filename)

# Wrap the two parallel lists in an indexable dataset.
chicken_dataset = ChickenDataset(audio_samples=specs, audio_labels=labels)



In [8]:
# Fixed seed so the train/test partition, and every metric computed from it,
# is the same after Restart Kernel -> Run All.
SPLIT_SEED = 0

split_size = 0.8  # fraction of samples used for training
train_size = int(split_size * len(chicken_dataset))
# The remainder goes to the test set so the two sizes always sum to the total.
test_size = len(chicken_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    chicken_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [9]:
# Number of held-out samples (the share left over after the split_size = 0.8 split).
len(test_dataset)

1875

### Dataloader

In [10]:
# Mini-batch size shared by both loaders.
BATCH_SIZE = 64

# Both loaders reshuffle their samples on every pass.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=True)

## Defining the model

In [11]:
class CNN(nn.Module):
    """Two-stage convolutional classifier with a 2-way linear head.

    Each stage is a 5x5 convolution (stride 1, padding 2), a ReLU and a 2x2
    max-pool, so the spatial size halves per stage. The head expects
    32 * 32 * 32 flattened features, i.e. 32 channels of 32x32, which
    corresponds to a 1-channel 128x128 input.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = self._conv_block(1, 16)
        self.conv2 = self._conv_block(16, 32)
        # fully connected layer, output 2 classes
        self.out = nn.Linear(32 * 32 * 32, 2)

    @staticmethod
    def _conv_block(in_channels, out_channels):
        """Return one conv -> ReLU -> max-pool stage."""
        return nn.Sequential(
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=5,
                stride=1,
                padding=2,
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )

    def forward(self, x):
        """Return ``(logits, flattened_features)`` for a batch ``x``."""
        feature_maps = self.conv2(self.conv1(x))
        # Collapse everything but the batch axis: (batch_size, 32 * 32 * 32).
        flat = feature_maps.view(feature_maps.size(0), -1)
        logits = self.out(flat)
        return logits, flat

In [12]:
# Step size used by the Adam optimizer below.
LEARNING_RATE = 0.01

cnn = CNN()
# Cross-entropy over the two output logits.
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=cnn.parameters(), lr=LEARNING_RATE)

### Loss function and optimizer (defined in the cell above)

## TRAINING

In [17]:
# NOTE(review): `Variable` is a deprecated wrapper in current PyTorch; plain
# tensors can be passed to the model and loss directly.
from torch.autograd import Variable

# Number of full passes over the training loader.
num_epochs = 10
def train(num_epochs, cnn, loaders, criterion=None, opt=None, log_every=59):
    """Train ``cnn`` for ``num_epochs`` passes over a batch iterable.

    Args:
        num_epochs: Number of passes over ``loaders``.
        cnn: Model to optimise. Its forward pass must return a tuple whose
            first element holds the logits.
        loaders: A single sized iterable of ``(inputs, labels)`` batches
            (despite the plural name), e.g. a ``DataLoader``.
        criterion: Loss callable. Defaults to the notebook-level ``loss_func``.
        opt: Optimizer stepping ``cnn``'s parameters. Defaults to the
            notebook-level ``optimizer``.
        log_every: Print the current loss every this many steps; ``0`` or
            ``None`` disables logging.

    Returns:
        None. Progress is reported on stdout.
    """
    # Fall back to the notebook globals only when nothing was passed in, so the
    # function can also be used without relying on hidden notebook state.
    criterion = loss_func if criterion is None else criterion
    opt = optimizer if opt is None else opt

    cnn.train()
    total_step = len(loaders)
    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(loaders):
            # Tensors are passed straight in; no Variable wrapper is needed.
            output = cnn(images)[0]
            loss = criterion(output, labels)

            # Clear stale gradients, backpropagate, then apply the update.
            opt.zero_grad()
            loss.backward()
            opt.step()

            if log_every and (i + 1) % log_every == 0:
                print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{total_step}], Loss: {loss.item():.4f}')
# Run the training loop for `num_epochs` passes over the training batches.
train(num_epochs, cnn, train_dataloader)

Epoch [1/10], Step [59/118], Loss: 0.0000
Epoch [1/10], Step [118/118], Loss: 0.0000
Epoch [2/10], Step [59/118], Loss: 0.0000
Epoch [2/10], Step [118/118], Loss: 0.0000
Epoch [3/10], Step [59/118], Loss: 0.0000
Epoch [3/10], Step [118/118], Loss: 0.0000
Epoch [4/10], Step [59/118], Loss: 0.0000
Epoch [4/10], Step [118/118], Loss: 0.0000
Epoch [5/10], Step [59/118], Loss: 0.0000
Epoch [5/10], Step [118/118], Loss: 0.0000
Epoch [6/10], Step [59/118], Loss: 0.0000
Epoch [6/10], Step [118/118], Loss: 0.0000
Epoch [7/10], Step [59/118], Loss: 0.0000
Epoch [7/10], Step [118/118], Loss: 0.0000
Epoch [8/10], Step [59/118], Loss: 0.0000
Epoch [8/10], Step [118/118], Loss: 0.0000
Epoch [9/10], Step [59/118], Loss: 0.0000
Epoch [9/10], Step [118/118], Loss: 0.0000
Epoch [10/10], Step [59/118], Loss: 0.0000
Epoch [10/10], Step [118/118], Loss: 0.0000


In [18]:
def test():
    # Test the model
    cnn.eval()
    
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_dataloader:
            test_output, last_layer = cnn(images)
            pred_y = torch.max(test_output, 1)[1].data.squeeze()
            accuracy = (pred_y == labels).sum().item()/ float(labels.size(0))
            pass
    print('Test Accuracy of the model on the 7552 test images: %.2f' % accuracy)
    pass
# Report the accuracy of `cnn` on `test_dataloader`.
test()

Test Accuracy of the model on the 7552 test images: 1.00


In [19]:
# Spot-check: compare predictions and true labels for the first ten samples of
# one test batch.
sample = next(iter(test_dataloader))
imgs, lbls = sample
actual_number = lbls[:10].numpy()

# Inference only: disable autograd up front instead of detaching the result
# afterwards through the legacy `.data` attribute.
with torch.no_grad():
    test_output, last_layer = cnn(imgs[:10])

# Index of the largest logit per row is the predicted class.
pred_y = test_output.argmax(dim=1).numpy().squeeze()
print(f'Prediction number: {pred_y}')
print(f'Actual number: {actual_number}')

Prediction number: [0 1 0 1 0 1 0 0 0 0]
Actual number: [0 1 0 1 0 1 0 0 0 0]
