In [1]:
import pickle
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader

In [2]:
# Loading data from disk

def _load_spectograms(folder):
    """Load every pickled spectrogram stored directly inside ``folder``.

    Files are read in sorted name order so the returned list is identical on
    every run (``os.listdir`` itself returns entries in arbitrary order).

    NOTE: ``pickle.load`` can execute arbitrary code while loading. Only point
    this at spectrogram files you created yourself.

    Args:
        folder: path of the directory that holds the pickle files.

    Returns:
        list with one unpickled object per file.
    """
    spectograms = []
    for file_name in sorted(os.listdir(folder)):
        path = os.path.join(folder, file_name)
        if not os.path.isfile(path):
            # sub-directories cannot be opened as pickle files
            continue
        with open(path, "rb") as f:
            spectograms.append(pickle.load(f))
    return spectograms

folder_major = "./spectogram_data/Major/"
spectograms_major = _load_spectograms(folder_major)

folder_minor = "./spectogram_data/Minor/"
spectograms_minor = _load_spectograms(folder_minor)

In [3]:
# Number of loaded spectrograms per class, displayed as (major, minor)
len(spectograms_major), len(spectograms_minor)

(1431, 370)

In [4]:
### Creating dataset class
##########################

# The available data has far more major-mode audio samples than minor-mode ones.
# To compensate for this imbalance, this dataset class feeds the model
# alternating major-mode and minor-mode audio samples (one major, one minor, one major, one minor, ...).
# An epoch ends after twice the number of samples in the smaller class (here, the minor-mode one).
# Therefore, each epoch always uses all available minor-mode samples, while the major-mode ones change from epoch to epoch.

class SpectogramDataForModeRecognition(Dataset):
    """Class-balanced dataset that alternates major and minor spectrograms.

    Samples are served strictly in the order major, minor, major, minor, ...
    Each class has its own read position that wraps around at the end of its
    collection and is NOT reset between epochs, so when one class is larger,
    successive epochs see different samples of that class.

    The read positions live on the instance, which means:
      * ``__getitem__`` ignores the value of ``index`` (shuffling in a
        ``DataLoader`` therefore has no effect on which sample is returned);
      * the dataset should be used with ``num_workers=0``; worker processes
        would each advance their own copy of the counters.

    Args:
        spectograms_major: indexable collection of major-mode spectrograms.
        spectograms_minor: indexable collection of minor-mode spectrograms.
    """

    def __init__(self, spectograms_major, spectograms_minor) -> None:
        super().__init__()
        self.spectograms_major = spectograms_major
        self.spectograms_minor = spectograms_minor
        self.n_major = len(spectograms_major)
        self.n_minor = len(spectograms_minor)
        # one epoch = as many samples of each class as the smaller class holds
        self.n = 2 * min(self.n_major, self.n_minor)
        self.next = True  # False = minor, True = major
        self.index_minor = 0
        self.index_major = 0

    @staticmethod
    def _as_sample(spec):
        """Return a copy of ``spec`` as a tensor with a leading axis of size 1."""
        if isinstance(spec, torch.Tensor):
            # Stay in torch: a round-trip through NumPy is needless work and
            # fails for tensors that live on a GPU or require grad.
            return spec.detach().clone().unsqueeze(0)
        return torch.tensor(np.array([spec]))

    def __getitem__(self, index) -> tuple:
        """Return the next ``(spectrogram, label)`` pair of the alternating sequence.

        Args:
            index: only used for bounds checking; the sample that is returned
                depends on the internal read positions, not on this value.

        Returns:
            Tuple ``(spec, label)``: ``spec`` is the stored spectrogram with an
            extra leading axis, ``label`` is a float32 tensor of shape ``(1,)``
            holding 1.0 for major and 0.0 for minor.

        Raises:
            IndexError: if ``index`` is an int that is >= ``len(self)``. This
                lets plain iteration (``for x in dataset``) terminate.
        """
        if isinstance(index, int) and index >= self.n:
            raise IndexError(f"index {index} is out of range for a dataset of length {self.n}")
        if self.next:
            spec = self.spectograms_major[self.index_major]
            self.index_major = (self.index_major + 1) % self.n_major
            label = 1.0  # 1 is Major
        else:
            spec = self.spectograms_minor[self.index_minor]
            self.index_minor = (self.index_minor + 1) % self.n_minor
            label = 0.0  # 0 is Minor
        self.next = not self.next
        return self._as_sample(spec), torch.tensor([label], dtype=torch.float32)

    def __len__(self):
        """Number of samples served per epoch (twice the size of the smaller class)."""
        return self.n

In [5]:
### Splitting data into training and testing sets. Making loader objects to easily feed the model
#################################################################################################

# Fixed seed so that "Restart Kernel -> Run All" always yields the same train/test split
SPLIT_SEED = 0
# fraction of each class used for training; the remainder becomes the test set
TRAIN_FRACTION = 0.8
BATCH_SIZE = 128

def _to_float_tensor(spectograms):
    """Stack a list of spectrograms into one float32 tensor (assumes they all share one shape)."""
    return torch.tensor(np.array(spectograms), dtype=torch.float32)

# shuffling the data (in place, each class separately)
# NOTE: re-running only this cell shuffles the already shuffled lists again and
# produces a different split; re-run the loading cell first to reproduce it.
split_rng = np.random.default_rng(SPLIT_SEED)
split_rng.shuffle(spectograms_major)
split_rng.shuffle(spectograms_minor)

# making the training and testing dataset from all available data
train_spectogram_major_end_index = int(TRAIN_FRACTION*len(spectograms_major))
train_spectogram_minor_end_index = int(TRAIN_FRACTION*len(spectograms_minor))
train_data = SpectogramDataForModeRecognition(
    _to_float_tensor(spectograms_major[:train_spectogram_major_end_index]),
    _to_float_tensor(spectograms_minor[:train_spectogram_minor_end_index]))
test_data = SpectogramDataForModeRecognition(
    _to_float_tensor(spectograms_major[train_spectogram_major_end_index:]),
    _to_float_tensor(spectograms_minor[train_spectogram_minor_end_index:]))

# making data loader objects
# (no shuffle: the dataset decides the sample order itself and ignores the requested index)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)

In [6]:
### Model definition
####################

# The model is fed a logarithmically filtered spectrogram of an audio sample.
# Combination of 5x5 convolutional and 2x2 MaxPool layers with two final linear layers.
# ReLU activation function after each layer, with the exception of a final sigmoid (0 -> minor, 1 -> major)

class modeDetectionModel(nn.Module):
    """CNN that maps a one-channel spectrogram to a score in [0, 1] (0 -> minor, 1 -> major).

    Architecture: two 5x5 convolutions, a 2x2 max-pool, two more 5x5
    convolutions, then two linear layers. ReLU follows every convolution and
    the first linear layer; the final output goes through a sigmoid.

    The first linear layer is sized for a 64 x 109 x 40 feature map. With the
    unpadded convolutions and the pooling above, that corresponds to inputs of
    shape (batch, 1, H, W) with H in {242, 243} and W in {104, 105}; any other
    size raises a shape error in ``forward``.
    """

    # number of features entering the first linear layer (channels * height * width)
    _FLATTENED_FEATURES = 64*109*40

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Conv2d(1,8,5)
        self.layer2 = nn.Conv2d(8,16,5)
        self.layer3 = nn.MaxPool2d(2)
        self.layer4 = nn.Conv2d(16,32,5)
        self.layer5 = nn.Conv2d(32,64,5)
        self.layer6 = nn.Linear(self._FLATTENED_FEATURES, 64)
        self.layer7 = nn.Linear(64,1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return one sigmoid score per sample, shape (batch, 1).

        Args:
            x: float tensor of shape (batch, 1, H, W); see the class docstring
               for the supported H and W.
        """
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        # no ReLU needed here: max-pooling non-negative values stays non-negative
        x = self.layer3(x)
        x = self.relu(self.layer4(x))
        x = self.relu(self.layer5(x))
        # Flatten per sample. Keeping the batch axis explicit makes a wrongly
        # sized input fail in layer6 instead of being silently re-batched.
        x = x.reshape(x.size(0), -1)
        x = self.relu(self.layer6(x))
        # torch.sigmoid replaces the deprecated F.sigmoid
        return torch.sigmoid(self.layer7(x))

# Global training state shared by the training and evaluation cells.
net = modeDetectionModel()                            # network being trained
loss_fn = nn.MSELoss()                                # mean squared error between score and 0/1 label
opt = optim.Adam(params=net.parameters(), lr=0.001)   # Adam optimizer over all network weights
# per-batch loss history and per-epoch accuracy history
losses, accuracies = [], []
# number of epochs trained so far / best epoch accuracy seen so far
epoch = best_accuracy = 0

In [7]:
### Training function
#####################

def training(numberOfEpochs):
    """Train the global ``net`` for ``numberOfEpochs`` additional epochs.

    Uses the module-level ``train_loader``, ``opt`` and ``loss_fn`` and updates
    the module-level histories:
      * ``losses``        - one detached loss tensor per batch,
      * ``accuracies``    - one float per epoch,
      * ``epoch``         - running epoch counter (continues across calls),
      * ``best_accuracy`` - highest epoch accuracy seen so far.
    Whenever an epoch beats ``best_accuracy`` the whole model is saved to
    ``./models/best.pkl``.

    NOTE(review): the accuracy is measured on the training batches while the
    weights are still changing, so "best" means best training accuracy, not
    best accuracy on held-out data.

    Args:
        numberOfEpochs: how many passes over ``train_loader`` to run.
    """
    global epoch
    global best_accuracy
    # torch.save does not create missing directories
    os.makedirs('./models', exist_ok=True)
    net.train()
    total = len(train_data)
    for _ in range(numberOfEpochs):
        epoch += 1
        total_correct = 0
        for specs, labels in train_loader:
            # feeding the model
            predictions = net(specs)
            # training (loss arguments in the documented (input, target) order)
            loss = loss_fn(predictions, labels)
            opt.zero_grad()
            loss.backward()
            opt.step()
            # keep only the value: a detached tensor does not hold on to the autograd graph
            losses.append(loss.detach())
            # a prediction is correct when it is within 0.5 of its 0/1 label;
            # summing along the batch axis keeps a tensor of shape (1,)
            total_correct += (torch.abs(predictions - labels) <= 0.5).sum(dim=0)
        # saving and printing accuracy of this epoch
        accuracy = total_correct / total
        accuracies.append(accuracy.item())
        print(epoch, accuracy)
        # saving the model if it is the best one so far
        if accuracy > best_accuracy:
            torch.save(net, './models/best.pkl')
            best_accuracy = accuracy

In [26]:
# This cell has been run several times to train the model.
# Each run trains for 2 more epochs; the global epoch counter keeps counting
# across runs, so the epoch numbers printed below continue from earlier runs.

training(2)

37 tensor([0.9932])
38 tensor([0.9882])


In [39]:
# Highest per-epoch accuracy recorded by training() so far (measured on the training batches)
best_accuracy

tensor([0.9949])

In [38]:
### Testing models with test data
#################################

# NOTE: torch.load unpickles a complete model object, which can run arbitrary
# code; only load checkpoints you created yourself.
# NOTE(review): newer PyTorch releases may need weights_only=False to load a
# fully pickled model - confirm against the installed version.
net_loaded = torch.load('./models/best.pkl')
net_loaded.eval()
total_correct = 0
total = len(test_data)
# NOTE: test_data keeps its read positions between passes, so running this cell
# again evaluates a different subset of the larger class.
# no gradients are needed for evaluation; this saves memory and time
with torch.no_grad():
    for specs, labels in test_loader:
        # feeding the model
        predictions = net_loaded(specs)
        # computing accuracy: correct when the score is within 0.5 of the 0/1 label
        total_correct += (torch.abs(predictions - labels) <= 0.5).sum(dim=0)
# printing accuracy
print(total_correct/total)

tensor([0.7230])
