In [1]:
import os
import time
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelBinarizer
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F  
from torchsummary import summary
from torch.utils.data import Dataset, DataLoader
from utils import load, get_audio_path, stereo_to_mono, frequency_mask, time_mask, plot_spectrogram
# define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())

True


In [2]:
DATA_DIR = './fma/data/fma_small'

tracks = load('fma/data/fma_metadata/tracks.csv')
#features = utils.load('fma/data/fma_metadata/features.csv')
#echonest = utils.load('fma/data/fma_metadata/echonest.csv')

subset = tracks.index[tracks['set', 'subset'] <= 'small']

#features_all = features.join(echonest, how='inner').sort_index(axis=1)

tracks = tracks.loc[subset]
#features_all = features.loc[subset]

train = tracks.index[tracks['set', 'split'] == 'training']
val = tracks.index[tracks['set', 'split'] == 'validation']
test = tracks.index[tracks['set', 'split'] == 'test']

labels_onehot = LabelBinarizer().fit_transform(tracks['track', 'genre_top'])
labels_onehot = pd.DataFrame(labels_onehot, index=tracks.index)

In [3]:
class FMA2D(Dataset):   
    
    
    def __init__(self, track_ids, use_subsamples=True, transforms=None, augment_prob=0.5):
        self.track_ids = track_ids
        self.subsamples = use_subsamples
        self.transforms = transforms
        self.augment_prob = augment_prob
        self.data_path = './fma/data/spectrograms_2/' if use_subsamples else './fma/data/spectrograms_1/'
        self.data = []
        if use_subsamples:
            for tid in track_ids:
                for i in range(7):
                    self.data.append([self.data_path +"{:06d}".format(tid)+f'_{i}.pt', tid])
        else:
            for tid in track_ids:
                self.data.append([self.data_path +"{:06d}".format(tid)+'.pt', tid])
    
    def __getitem__(self, index): 
        
        spec_path = self.data[index][0]
        tid = self.data[index][1]
        
        # load the spectrogram data
        try:
            spec = torch.load(spec_path)
        except Exception as e:
            return self.__getitem__(index + 1)
        
        if self.transforms and torch.rand(1) < self.augment_prob:
            spec = self.transforms[0](spec)
            spec = self.transforms[1](spec)
            
        
        label = torch.from_numpy(labels_onehot.loc[tid].values).float()
        return spec, label
        
        
    def __len__(self):
        return len(self.data)


In [4]:
from torch.nn import init
class AudioClassifier (nn.Module):
    def __init__(self):
        super().__init__()
        conv_layers = []

        # First Convolution Block with Relu and Batch Norm. Use Kaiming Initialization
        self.conv1 = nn.Conv2d(1, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
        self.relu1 = nn.ReLU()
        self.bn1 = nn.BatchNorm2d(8)
        init.kaiming_normal_(self.conv1.weight, a=0.1)
        self.conv1.bias.data.zero_()
        conv_layers += [self.conv1, self.relu1, self.bn1]

        # Second Convolution Block
        self.conv2 = nn.Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu2 = nn.ReLU()
        self.bn2 = nn.BatchNorm2d(16)
        init.kaiming_normal_(self.conv2.weight, a=0.1)
        self.conv2.bias.data.zero_()
        conv_layers += [self.conv2, self.relu2, self.bn2]

        # Second Convolution Block
        self.conv3 = nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu3 = nn.ReLU()
        self.bn3 = nn.BatchNorm2d(32)
        init.kaiming_normal_(self.conv3.weight, a=0.1)
        self.conv3.bias.data.zero_()
        conv_layers += [self.conv3, self.relu3, self.bn3]

        # Second Convolution Block
        self.conv4 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu4 = nn.ReLU()
        self.bn4 = nn.BatchNorm2d(64)
        init.kaiming_normal_(self.conv4.weight, a=0.1)
        self.conv4.bias.data.zero_()
        conv_layers += [self.conv4, self.relu4, self.bn4]

        # Linear Classifier
        self.ap = nn.AdaptiveAvgPool2d(output_size=1)
        self.lin = nn.Linear(in_features=64, out_features=8)

        # Wrap the Convolutional Blocks
        self.conv = nn.Sequential(*conv_layers)
        
        self.dropout = nn.Dropout(p=0.3)
 
    def forward(self, x):
        # Run the convolutional blocks
        x = self.conv(x)

        # Adaptive pool and flatten for input to linear layer
        x = self.ap(x)
        x = x.view(x.shape[0], -1)

        # Linear layer
        x = self.lin(x)
        x = self.dropout(x)

        # Final output
        return x


In [5]:
class nnet1(nn.Module):
    def __init__(self, input_size, num_classes):
        super().__init__()

        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=128, kernel_size=(4, input_size), stride=1, padding=2, bias=True),
            nn.ReLU(),   
            nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1))
        )
        
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=(4, 1), stride=1, padding=2, bias=True),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1))
        )
        self.dropout2 = nn.Dropout(p=0.5)
        
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=(4, 1), stride=1, padding=2, bias=True),
            nn.ReLU()
        )
        
        self.maxpool = nn.MaxPool2d(kernel_size=(26, 1))
        self.avgpool = nn.AvgPool2d(kernel_size=(26, 1))
        
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(6656, 300)
        self.relu1 = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        self.linear2 = nn.Linear(300, 150 )
        self.relu2 = nn.ReLU()
        self.linear3 = nn.Linear(150, num_classes)
        
        
     #in this method we tell pytorch how to pass data from layer to another layer   
    def forward(self, input_data):
        x = self.conv1(input_data)
        x = self.conv2(x)
        #x = self.dropout2(x)
        x = self.conv3(x)
        y = self.maxpool(x)
        z = self.avgpool(x)
        x = torch.cat((y, z), dim=1)
        x = self.flatten(x)
        x = self.linear1(x)
        x = self.relu1(x)
        x = self.dropout(x)
        x = self.linear2(x)
        x = self.relu2(x)
        x = self.linear3(x)
        x = self.dropout(x)
        return x
        

In [6]:


class ResidualBlock(nn.Module):
    def __init__(self, input_size, F=256):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Sequential(
                        nn.Conv2d(in_channels = 1, out_channels = F, kernel_size = (4, input_size), bias = False),
                        nn.BatchNorm2d(F),
                        nn.ReLU())
        self.conv2 = nn.Sequential(
                        nn.Conv2d(in_channels = F, out_channels = F, kernel_size = (4, 1), bias = False),
                        nn.BatchNorm2d(F),
                        nn.ReLU())
        self.conv3 = nn.Sequential(
                        nn.Conv2d(in_channels = F, out_channels = F, kernel_size = (4, 1),  padding=(3,0), bias = False),
                        nn.BatchNorm2d(F))
        self.relu = nn.ReLU()
        

        
    def forward(self, x):
        residual = self.conv1(x)
        out = self.conv2(residual)
        out = self.conv3(out)
        out = self.relu(out + residual)
        return out       


class nnet2(nn.Module):
    def __init__(self, input_size, num_classes=8, F=256):
        super(nnet2, self).__init__()
        self.block = ResidualBlock(input_size, F=F)
        self.Max = nn.MaxPool2d(kernel_size = (125, 1))
        self.Avg = nn.AvgPool2d(kernel_size = (125, 1))
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(F*2, F)
        self.linear2 = nn.Linear(F,  F//2)
        self.linear3 = nn.Linear(F//2, num_classes)
        self.dropout = nn.Dropout(p=0.3)
        
    def forward(self, x):
        x = self.block(x)
        #print(x.size())
        y = self.Max(x)
        z = self.Avg(x)
        #print(y.size(), z.size())
        x = torch.cat((y, z), dim=1)
        #print(x.size())
        x = self.flatten(x)
        #print(x.size())
        x = self.linear1(x)
        x = F.relu(x)
        x = self.linear2(x)
        x = F.relu(x)
        x = self.linear3(x)
        x = self.dropout(x)
        return x
    


In [11]:
#ResNet34

class BasicBlock(nn.Module):
    expansion = 1 #we don't use the block.expansion here

    def __init__(self, inplanes, planes, stride=1,padding = 1):
        super().__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size = 3, stride=stride,
                     padding=padding, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size = 3, stride=1,
                     padding=padding, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = nn.Sequential(
                nn.Conv2d(inplanes, planes, 1, stride, bias=False),
                nn.BatchNorm2d(planes))
        self.stride = stride

    def forward(self, x):
        identity = x
        out = self.conv1(x)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        identity = self.downsample(x)
        out += identity
        out = self.relu(out)
        return out
    
    
class ResNet(nn.Module):

    def __init__(self, block, layers, num_classes=8):
        super().__init__()
        
        self.inplanes = 128

        self.c1 = nn.Conv2d(1, self.inplanes, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(128)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        
        self.layer1 = self._make_layer(block, 128, 128, layers[0])
        self.layer2 = self._make_layer(block, 128, 256, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 256, 512, layers[3], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d(7)
        self.fc = nn.Linear(25088 , num_classes)


    def _make_layer(self, block, inplanes, planes, blocks, stride=1):
 

        layers = []
        layers.append(block(inplanes, planes, stride))
        
        self.inplanes = planes
        
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)
    
    
    def forward(self, x):
        x = self.c1(x)           
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)         

        x = self.layer1(x)          
        x = self.layer2(x)          
        x = self.layer3(x)          
        x = self.layer4(x)          

        x = self.avgpool(x)         
        x = torch.flatten(x, 1)     
        x = self.fc(x)

        return x
    
    
def resnet34():
    layers=[3, 4, 6, 3]
    model = ResNet(BasicBlock, layers)
    return model

In [24]:
class ResNetM(nn.Module):

    def __init__(self, block, layers, p_dropout=0.5, num_classes=8):
        super().__init__()
        
        self.name = 'ResNet2D-M'
        self.inplanes = 16

        self.c1 = nn.Conv2d(1, self.inplanes, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(16)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        
        self.layer1 = self._make_layer(block, 16, 32, layers[0])
        self.layer2 = self._make_layer(block, 32, 32, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 32, 64, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 64, 128, layers[3], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d(7)
        self.fc = nn.Linear(6272 , num_classes)
        self.dropout = nn.Dropout(p=p_dropout)


    def _make_layer(self, block, inplanes, planes, blocks, stride=1):
 

        layers = []
        layers.append(block(inplanes, planes, stride))
        
        self.inplanes = planes
        
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)
    
    
    def forward(self, x):
        x = self.c1(x)           
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)         

        x = self.layer1(x)          
        x = self.layer2(x)          
        x = self.layer3(x)          
        x = self.layer4(x)          

        x = self.avgpool(x)         
        x = torch.flatten(x, 1)     
        x = self.fc(x)
        x = self.dropout(x)

        return x
    
    
def resnet34M(p_dropout=0.5):
    layers=[3, 4, 6, 3]
    model = ResNetM(BasicBlock, layers, p_dropout=p_dropout)
    return model

In [29]:
BATCH = 32
EPOCHS = 40
augment_prob = 0.8

# create a training dataset and dataloader
dataset = FMA2D(train, use_subsamples=False, transforms=[frequency_mask, time_mask], augment_prob=augment_prob)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH, shuffle=True)

# create a validation dataset and dataloader
val_dataset = FMA2D(val, use_subsamples=False)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH, shuffle=True)

# define the loss function and the optimizer
loss_fn = nn.CrossEntropyLoss()

# Adam optimizer01
lr = 0.0001
optimizer = torch.optim.Adam(model.parameters())
optimizer_name = 'Adam'

# Lee 2017
# SGD optimizer
#optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=True)
#scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.2, patience=5)


for spec, label in val_dataloader:
    print(spec.size())
    #plot_spectrogram(spec[0])
    input_size = spec.size()[2]
    break

p_dropout = 0.3
model = resnet34M(p_dropout=p_dropout)
model.to(device)
summary(model, (1, 128, 1290))


timestamp = time.strftime("feb%d_t%H%M", time.gmtime())
model_name = f"{model.name}_B{BATCH}_E{EPOCHS}_O{optimizer_name}_LR{lr}_pD{p_dropout}_A{augment_prob}_{timestamp}"

torch.Size([32, 128, 1290])
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 16, 64, 645]             784
       BatchNorm2d-2          [-1, 16, 64, 645]              32
              ReLU-3          [-1, 16, 64, 645]               0
         MaxPool2d-4          [-1, 16, 32, 323]               0
            Conv2d-5          [-1, 32, 32, 323]           4,608
              ReLU-6          [-1, 32, 32, 323]               0
            Conv2d-7          [-1, 32, 32, 323]           9,216
       BatchNorm2d-8          [-1, 32, 32, 323]              64
            Conv2d-9          [-1, 32, 32, 323]             512
      BatchNorm2d-10          [-1, 32, 32, 323]              64
             ReLU-11          [-1, 32, 32, 323]               0
       BasicBlock-12          [-1, 32, 32, 323]               0
           Conv2d-13          [-1, 32, 32, 323]           9,216
           

In [30]:


i = 0
running_loss = 0.0
best_val_loss = float('inf') # initialize the best validation loss


# train the model
acc_tr = []
acc_val = []
loss_tr = []
loss_val = []

for epoch in range(EPOCHS):
    # evaluate the model on the training dataset
    train_correct = 0
    train_total = 0
    for spectrogram, label in dataloader:
            model.train()
            label = label.to(device)
            train_label = torch.argmax(label, dim=1)



            # forward pass
            spectrogram = spectrogram.squeeze(0)  
            spectrogram = spectrogram.unsqueeze(1)
            
            spectrogram = spectrogram.to(device)
            output = model(spectrogram)
            
            loss = loss_fn(output, label)

            # backward pass
            optimizer.zero_grad()
            model.zero_grad()
            loss.backward()
            optimizer.step()  
            
            # Update the learning rate
            # scheduler.step(loss)
            
            _, train_predicted = torch.max(output.data, 1)
            train_total += train_label.size(0)
            train_correct += (train_predicted == train_label).sum().item()
            # print statistics
            i += 1
            running_loss += loss.item()
            
            
    loss = running_loss / len(dataloader)
    loss_tr.append(loss)
    print('[%d, %5d subsamples] Training loss: %.3f' % (epoch + 1, i*BATCH, loss))
    running_loss = 0            
    # evaluate the model on the validation dataset
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    model.eval()
    with torch.no_grad():
        for val_spectrogram, val_label in val_dataloader:
                val_label = val_label.to(device)
                val_label = torch.argmax(val_label, dim=1)
            
                val_spectrogram = val_spectrogram.squeeze(0)
                val_spectrogram = val_spectrogram.unsqueeze(1)
                val_spectrogram = val_spectrogram.to(device)
                val_output = model(val_spectrogram)
                val_loss += loss_fn(val_output, val_label).item()
                _, val_predicted = torch.max(val_output.data, 1)
                val_total += val_label.size(0)
                val_correct += (val_predicted == val_label).sum().item()

    loss = val_loss / len(val_dataloader)
    loss_val.append(loss)
    val_acc = val_correct / val_total
    tr_acc = train_correct / train_total 
    acc_tr.append(tr_acc)
    acc_val.append(val_acc)
    # Save the model if the validation loss is the best seen so far
    if loss < best_val_loss:
        best_val_loss = loss
        best_val_acc = val_acc
        best_tr_acc = tr_acc
        best_state_dict = model.state_dict()
    print('Validation Loss: {:.4f} | Validation Accuracy: {:.4f} | Training Accuracy: {:.4f}'.format(loss, val_acc, tr_acc))

plt.plot(loss_val, label='Validation loss')
plt.plot(loss_tr, label='Training loss')
plt.show()

plt.plot(acc_val, label='Validation accuracy')
plt.plot(acc_tr, label='Training accuracy')
plt.show()

torch.save(best_state_dict, model_name + f'_VAL{best_val_acc}_TRAIN{best_tr_acc}.pt')
print('Finished Training')

[1,  6400 subsamples] Training loss: 2.146
Validation Loss: 2.1389 | Validation Accuracy: 0.1288 | Training Accuracy: 0.1339


KeyboardInterrupt: 

In [17]:
# in case we keyboard interrupt the training process we can save the best model manually:
torch.save(best_state_dict, model_name + f'_VAL{best_val_acc}_TRAIN{best_tr_acc}.pt')
# model_name = "nnet1_2022-01-30_batchsize_32_epochs_100_opt_Adam_LR_0.0001_dropout1_acc0.373_subsampling"
# torch.save(model, model_name + '.pt')

In [13]:
# model_name = "ResNet_2022-01-23_batchsize_40_epochs_5_opt_Adam_LR_0.001"
# model_name = "nnet1_2022-01-25_batchsize_32_epochs_50_opt_Adam_LR_0.0001_acc_overfitted" #epoch15 overfit
# model_name = "nnet1_2022-01-30_batchsize_32_epochs_100_opt_Adam_LR_0.0001_dropout1_acc0.373_subsampling" # no overfit
# model_name = "ResNet_2022-01-23_batchsize_40_epochs_10_opt_Adam_LR_0.001"
torch.load(model_name+'.pt')

In [None]:

# create a Mp3Dataset from a directory of MP3 files
dataset = FMADataset(DATA_DIR, train)

# create a DataLoader from the FMADataset
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)

    
# create the CNN model
model = ResNet1D(input_size=750000, num_classes=8)

# define the loss function and the optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

num_epochs = 10
i = 0
running_loss = 0.0
# train the model
for epoch in range(num_epochs):
    for waveform, label in dataloader:
        # clear the gradients
        optimizer.zero_grad()

        # forward pass
#         waveform = waveform.unsqueeze(0)  # add a batch dimension
#         print(waveform.shape)

        # extract the first channel
        first_channel = waveform[:, :, 0]

        # reshape the first channel to add an additional dimension for the channel dimension
        first_channel = first_channel.unsqueeze(-1)
        
        output = model(first_channel)

        loss = loss_fn(output, label)

        # backward pass
        loss.backward()
        optimizer.step()
        # print statistics
        i += 1
        running_loss += loss.item()
        if i % 10 == 9:    # print every 100 samples
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0


print('Finished Training')

## Testing - subsampling

In [None]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# create a dataset object for testing
test_dataset = FMADataset(DATA_DIR, test)
batch_size = 8
# create a data loader to load the dataset
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)



# test the model
model.eval()
model.to(device)
correct = 0
total = 0
with torch.no_grad(): # don't need to track, calculate or save the gradients in the model
    for subsamples, labels in test_loader:
        labels = labels.to(device)
        labels = torch.argmax(labels, dim=1)
        batch_size = labels.size(0) # we reupdate the batch size because the last batch can be incomplete.
        subsample_outputs = {i: [] for i in range(batch_size)}
        for waveform in subsamples:
            waveform = waveform.squeeze(0)  
            waveform = waveform.unsqueeze(-1)
            waveform = waveform.to(device)
            outputs = model(waveform)
            predicted = torch.argmax(outputs.data, dim=1).cpu()
            
            for j in range(batch_size):
                subsample_outputs[j].append(predicted[j]) 
        for j in range(batch_size):
            # count the occurrences of each class
            counts = np.bincount(subsample_outputs[j])
            # Find the class with the highest count
            aggregate_prediction = np.argmax(counts)
            correct += (aggregate_prediction == labels[j])
        total += labels.size(0)
        
    
        print(f"CORRECT #  {correct}")

print('Accuracy of the network on the test samples: %d %%' % (100 * correct / total))

## Testing - full sample

In [None]:
# create a dataset object for testing
test_dataset = FMADataset(DATA_DIR, test)

# create a data loader to load the dataset
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)

# test the model
model.eval()
model.to(device)
correct = 0
total = 0
with torch.no_grad(): # don't need to track, calculate or save the gradients in the model
    for data in test_loader:
        # get the inputs
        audio, labels = data
        # wrap them in a torch Variable
        audio, labels = audio.to(device), labels.to(device)

        # forward + backward + optimize
        outputs = model(audio)
        _, predicted = torch.max(outputs.data, 1)
        labels = torch.argmax(labels, dim=1)
        predicted = torch.argmax(outputs.data, dim=1)
        print(labels)
        print(predicted)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        print(f"CORRECT #  {correct}")

print('Accuracy of the network on the test samples: %d %%' % (100 * correct / total))

In [None]:
import os
import torch
import torchaudio

data_dir = './fma/data/fma_small'
output_dir = './fma/data/waveforms'
sampling_rate = 22_050

# create the output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    
    
resample = torchaudio.transforms.Resample(44100, sampling_rate)
    
# loop through all MP3 files in the data directory
for root, dirs, files in os.walk(data_dir):
    for filename in files:
        if filename.endswith('.mp3'):
            filepath = os.path.join(root, filename)
            try: 
                waveform, sample_rate = torchaudio.load(filepath)
                waveform = stereo_to_mono(waveform) 

                # resample the waveform to the desired sample rate using the Resample transform
                waveform = resample(waveform)

                # save the spectrogram to the output directory
                output_file = os.path.join(output_dir, filename[:-4] + '.pt')
                torch.save(waveform, output_file)
            except:
                continue

  from .autonotebook import tqdm as notebook_tqdm


## Remarks on implementations - draft

### There are two types of samples: mono and stereo - we need to convert mono to stereo when feeding the CNN

An audio channel refers to a single track of audio. The number of channels in an audio file determines the number of separate audio tracks that are mixed together to form the final audio.

A mono audio file has a single channel, which means that all the audio is mixed together into one single track. This means that if you play a mono audio file, the same audio will come out of both the left and right speakers (or headphones) and it will sound the same regardless of the stereo or mono setup.

A stereo audio file, on the other hand, has two channels - a left channel and a right channel. These two channels carry separate audio tracks that are mixed together to create the final audio. When played back on a stereo setup, each channel will be played through its corresponding speaker or headphone and this way, the stereo audio creates a sense of space and directionality.

So, for example, a stereo audio recording of a live concert will have different audio captured by different microphone positioned in different positions in the concert hall, and when it is played back, it creates the sense of being there in the concert hall.

It is worth noting that there are also audio file format with more than 2 channels, such as 5.1 or 7.1 surround sound audio.


### Downsampling

 we downsample the audio signals to a lower sample rate to reduce the data size or to simplify the processing of the signal. Downsampling can be useful for tasks such as speech recognition or audio classification, where the lower frequencies of the signal are more important than the higher frequencies.

## One batch check

In [None]:
BATCH = 1

# create a training dataset and dataloader
dataset = FMA2D(train)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH, shuffle=True)



# create a validation dataset and dataloader
val_dataset = FMA2D(val)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH, shuffle=True)

    
# create the CNN model
model = nnet1(num_classes=8) # HERE YOU PUT UR NETWORK
model.to(device)

# define the loss function and the optimizer
loss_fn = nn.CrossEntropyLoss()

# Adam optimizer
#optimizer = torch.optim.Adam(model.parameters())


# Lee 2017
# Define the optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=True)

# Define the scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.2, patience=5)



num_epochs = 10
i = 0
running_loss = 0.0

import time
    
# train the model
acc_tr = []
acc_val = []
loss_tr = []
loss_val = []

spectrogram, label = next(iter(dataloader))
spectrogram = spectrogram.squeeze(-1)  
spectrogram = spectrogram.unsqueeze(0)

for epoch in range(num_epochs):
            # evaluate the model on the training dataset
            train_correct = 0
            train_total = 0
    #for spectrogram, label in dataloader:
            
            label = label.to(device)
            train_label = torch.argmax(label, dim=1)


            # forward pass
#             spectrogram = spectrogram.squeeze(-1)  
#             spectrogram = spectrogram.unsqueeze(0)
            
            spectrogram = spectrogram.to(device)
            output = model(spectrogram)
            print(spectrogram.size())
            loss = loss_fn(output, label)

            # backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()  
            
            # Update the learning rate
            scheduler.step(loss)
            
            _, train_predicted = torch.max(output.data, 1)
            train_total += train_label.size(0)
            train_correct += (train_predicted == train_label).sum().item()
            # print statistics
            i += 1
            running_loss += loss.item()
            print(train_correct)
            
            
            
#     loss = running_loss / len(dataloader)
#     loss_tr.append(loss)
#     print('[%d, %5d subsamples] Training loss: %.3f' % (epoch + 1, i*BATCH, loss))
#     running_loss = 0            
#     # evaluate the model on the validation dataset
#     val_loss = 0.0
#     val_correct = 0
#     val_total = 0
#     with torch.no_grad():
#         for val_spectrogram, val_label in val_dataloader:
#                 val_label = val_label.to(device)
#                 val_label = torch.argmax(val_label, dim=1)
            
#                 val_spectrogram = val_spectrogram.squeeze(-1)
#                 val_spectrogram = val_spectrogram.unsqueeze(0)
#                 val_spectrogram = val_spectrogram.to(device)
#                 val_output = model(val_spectrogram)
#                 val_loss += loss_fn(val_output, val_label).item()
#                 _, val_predicted = torch.max(val_output.data, 1)
#                 val_total += val_label.size(0)
#                 val_correct += (val_predicted == val_label).sum().item()

#     loss = val_loss / len(val_dataloader)
#     loss_val.append(loss)
#     val_acc = val_correct / val_total
#     tr_acc = train_correct / train_total 
#     acc_tr.append(tr_acc)
#     acc_val.append(val_acc)
#     print('Validation Loss: {:.4f} | Validation Accuracy: {:.4f} | Training Accuracy: {:.4f}'.format(loss, val_acc, tr_acc))

plt.plot(loss_val, label='Validation loss')
plt.plot(loss_tr, label='Training loss')
plt.show()

plt.plot(acc_val, label='Validation accuracy')
plt.plot(acc_tr, label='Training accuracy')
plt.show()
    
print('Finished Training')