## Dataloader

In [1]:
import librosa
import torch
from torch.utils.data import Dataset, DataLoader
import re
import os
from pathlib import Path
import numpy as np
import random
# Subclassing torch's Dataset lets instances be fed straight to a DataLoader.
class MusicDataset(Dataset):
    """Map-style dataset that turns audio clips into stacked per-segment MFCCs.

    Each clip is cut into ``n_segments`` equal pieces. Every piece is converted
    to an MFCC matrix of shape ``(segment_frames, n_mfcc)`` and the pieces are
    concatenated along the time axis.

    Args:
        audio_files: Sequence of paths readable by ``librosa.load``.
        labels: Sequence of integer class ids, parallel to ``audio_files``.
        sample_rate: Rate the audio is resampled to on load.
        n_fft: FFT window size used for the MFCC computation.
        hop_length: Hop between successive analysis frames.
        n_mfcc: Number of MFCC coefficients per frame.
        n_segments: Number of equal segments each clip is split into.
        clip_duration: Nominal clip length in seconds; audio past it is unused.
        segment_frames: Frames kept per segment. ``None`` derives it as
            ``1 + samples_per_segment // hop_length``, which is 130 with the
            defaults (the value that used to be hard-coded).
    """

    def __init__(self, audio_files, labels, sample_rate=22050, n_fft=2048, hop_length=512,
                 n_mfcc=40, n_segments=10, clip_duration=30, segment_frames=None):
        self.audio_files = audio_files
        self.labels = labels
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mfcc = n_mfcc
        self.n_segments = n_segments
        self.clip_duration = clip_duration
        self.samples_per_segment = int(sample_rate * clip_duration / n_segments)
        if segment_frames is None:
            segment_frames = 1 + self.samples_per_segment // hop_length
        self.segment_frames = segment_frames

    def __len__(self):
        """Return the number of clips in the dataset."""
        return len(self.audio_files)

    @staticmethod
    def _fit_frames(mfcc, length):
        """Zero-pad or truncate ``mfcc`` along axis 0 to exactly ``length`` rows.

        ``mfcc`` is laid out as ``(frames, coefficients)``, so the time axis is
        axis 0. Truncating axis 1 instead would leave over-long segments
        untouched and produce tensors of differing shapes.
        """
        n_frames = mfcc.shape[0]
        if n_frames < length:
            padding = mfcc.new_zeros((length - n_frames,) + tuple(mfcc.shape[1:]))
            return torch.cat([mfcc, padding], dim=0)
        return mfcc[:length]

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(features, label)``.

        Returns:
            features: Float tensor of shape
                ``(n_segments * segment_frames, n_mfcc)``.
            label: ``LongTensor`` of shape ``(1,)`` holding the class id.
        """
        audio, _ = librosa.load(self.audio_files[idx], sr=self.sample_rate)
        step = self.samples_per_segment

        segments = []
        for n in range(self.n_segments):
            chunk = audio[step * n:step * (n + 1)]
            if chunk.size == 0:
                # A clip shorter than the nominal duration can leave trailing
                # segments empty; emit an all-zero block instead of passing an
                # empty signal to librosa.
                segments.append(torch.zeros(self.segment_frames, self.n_mfcc))
                continue
            mfcc = librosa.feature.mfcc(
                y=chunk,
                sr=self.sample_rate,
                n_mfcc=self.n_mfcc,
                n_fft=self.n_fft,
                hop_length=self.hop_length,
            )
            # librosa returns (n_mfcc, frames); transpose to (frames, n_mfcc).
            frames = torch.FloatTensor(mfcc.T)
            segments.append(self._fit_frames(frames, self.segment_frames))

        label = torch.LongTensor([self.labels[idx]])
        return torch.cat(segments, dim=0), label
    
        # Original STFT magnitude/phase version, kept for reference
        # stft = librosa.stft(audio, n_fft=self.n_fft, hop_length=self.hop_length)
        # magnitude, phase = librosa.magphase(stft)
        # # print(stft.shape)
        # magnitude = torch.FloatTensor(magnitude)
        # phase = torch.FloatTensor(phase)
        # if magnitude.shape[1] < length:
        #     pad_width = length - magnitude.shape[1]
        #     magnitude =  torch.from_numpy(np.pad(magnitude, ((0, 0), (0, pad_width)), 'constant'))
        #     phase = torch.from_numpy(np.pad(phase, ((0, 0), (0, pad_width)), 'constant'))
        # elif magnitude.shape[1] > length:
        #     magnitude = magnitude[:, :length]
        #     phase = phase[:, :length]
        # label = torch.LongTensor([label])
        # # print("call get item",audio_file)
        # # print(magnitude.shape)
        # # print(phase.shape)
        # return magnitude, phase, label
        
        

### Test load data

In [None]:
# data_dir = Path('/home/fanal/disk2/luo/genre_classification/genre34/country')
# file_list = os.listdir(data_dir)

# index = 3
# file_path = data_dir / file_list[index]
# print(file_path)
# print(file_list[index])
# if (file_list[index]=='.DS_Store'):
#     print("error")
# else:
#     label = re.findall('^[a-z]+', file_path.name)[0]  # extract the label
#     number = re.findall('\d+', file_path.name)[0]  # extract the number
#     print(label)
#     print(number)

## CNN model

In [2]:
import torch.nn as nn
import torch.optim as optim

class MusicCNN(nn.Module):
    """Three-stage 2-D CNN classifier with global average pooling.

    The network takes a batch of 2-D feature maps without a channel axis
    (``forward`` inserts the single channel itself) and produces one logit per
    class.

    Args:
        n_classes: Number of output logits.
    """

    def __init__(self, n_classes):
        super().__init__()
        # Stage 1: 1 -> 32 channels.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=0)
        self.relu1 = nn.ReLU()
        self.maxpool1 = nn.MaxPool2d(2, stride=2)
        # Stage 2: 32 -> 128 channels, followed by dropout.
        self.conv2 = nn.Conv2d(32, 128, kernel_size=3, padding=0)
        self.relu2 = nn.ReLU()
        self.maxpool2 = nn.MaxPool2d(2, stride=2)
        self.dropout1 = nn.Dropout(0.3)
        # Stage 3: 128 -> 128 channels, followed by dropout.
        self.conv3 = nn.Conv2d(128, 128, kernel_size=3, padding=0)
        self.relu3 = nn.ReLU()
        self.maxpool3 = nn.MaxPool2d(2, stride=2)
        self.dropout2 = nn.Dropout(0.3)
        # Collapse each of the 128 feature maps to a single value.
        self.globalavgpool = nn.AdaptiveAvgPool2d((1, 1))
        # Classifier head.
        self.fc1 = nn.Linear(128, 512)
        self.relu4 = nn.ReLU()
        self.fc2 = nn.Linear(512, n_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, n_classes)`` for ``x``."""
        # Insert the channel axis expected by Conv2d: (B, H, W) -> (B, 1, H, W).
        features = x.unsqueeze(1)
        features = self.maxpool1(self.relu1(self.conv1(features)))
        features = self.dropout1(self.maxpool2(self.relu2(self.conv2(features))))
        features = self.dropout2(self.maxpool3(self.relu3(self.conv3(features))))
        pooled = self.globalavgpool(features).view(-1, 128)
        hidden = self.relu4(self.fc1(pooled))
        return self.fc2(hidden)
        
# class MusicCNN(nn.Module):
#     def __init__(self, n_classes):
#         super(MusicCNN, self).__init__()
#         self.conv1 = nn.Conv2d(10, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
#         # self.conv1 = nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
#         self.bn1 = nn.BatchNorm2d(32)
#         self.relu1 = nn.ReLU()
#         self.maxpool1 = nn.MaxPool2d(kernel_size=(2, 2))
#         self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
#         self.bn2 = nn.BatchNorm2d(64)
#         self.relu2 = nn.ReLU()
#         self.maxpool2 = nn.MaxPool2d(kernel_size=(2, 2))
#         self.fc1 = nn.Linear(64 * 32 * 10, 128)
#         # self.fc1 = nn.Linear(64 * 256 * 323, 128)

#         self.relu3 = nn.ReLU()
#         self.fc2 = nn.Linear(128, n_classes)

#     def forward(self, x):
#         x = x.view(x.size(0), -1, x.size(2), x.size(3)) # [batch_size, 10, height, width] -> [batch_size, 10*1, height, width]
#         # x = x.unsqueeze(1)
#         x = self.conv1(x)
#         x = self.bn1(x)
#         x = self.relu1(x)
#         x = self.maxpool1(x)
#         x = self.conv2(x)
#         x = self.bn2(x)
#         x = self.relu2(x)
#         x = self.maxpool2(x)
#         # print("before view",x.shape)
#         x = x.view(x.size(0), -1)
#         x = self.fc1(x)
#         x = self.relu3(x)
#         x = self.fc2(x)
#         return x


### load data

In [3]:
# Genre prefix of each file name -> integer class id.
GENRE_TO_LABEL = {
    "blues": 0,
    "classical": 1,
    "country": 2,
    "disco": 3,
    "hiphop": 4,
    "jazz": 5,
    "metal": 6,
    "pop": 7,
    "reggae": 8,
    "rock": 9,
}

# Collect every .wav file under <root>/genre<suffix>/<genre>/ together with
# the class id derived from the leading lowercase letters of its file name.
audio_files = []
labels = []
postfix = ['12', '34', '56', '78', '910']
for suffix in postfix:
    music_folder = '/home/fanal/disk2/luo/genre_classification/genre' + suffix
    for genre in os.listdir(music_folder):
        genre_folder = os.path.join(music_folder, genre)
        # Skip macOS metadata and any other stray non-directory entry.
        if genre == '.DS_Store' or not os.path.isdir(genre_folder):
            continue
        for file in os.listdir(genre_folder):
            if not file.endswith('.wav'):
                continue
            prefix = re.match('[a-z]+', file)
            label = GENRE_TO_LABEL.get(prefix.group(0)) if prefix else None
            if label is None:
                # A string label would only fail later, inside the Dataset,
                # so drop the file here and say so.
                print(f"Skipping {file!r}: unrecognised genre prefix")
                continue
            audio_files.append(os.path.join(genre_folder, file))
            labels.append(label)

dataset = MusicDataset(audio_files, labels)

# Cross validation: split into 5 folds. random_split requires the sizes to
# sum to len(dataset), so the remainder is spread over the first folds
# instead of being dropped.
n_folds = 5
base_size, remainder = divmod(len(dataset), n_folds)
fold_sizes = [base_size + 1 if k < remainder else base_size for k in range(n_folds)]
dataset_splits = torch.utils.data.random_split(dataset, fold_sizes)

# Plain 80/20 hold-out split (no cross validation). The later "Train" and
# final evaluation cells read `dataloader` and `test_set`, so they must exist.
train_size = int(len(dataset) * 0.8)
train_set, test_set = torch.utils.data.random_split(dataset, [train_size, len(dataset) - train_size])
dataloader = DataLoader(train_set, batch_size=32, shuffle=True)


### evaluation

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def evaluate(model, dataloader):
    """Return the top-1 accuracy of ``model`` over ``dataloader`` in percent.

    Args:
        model: Module mapping a float batch to ``(batch, n_classes)`` scores.
            It is switched to eval mode and must already live on ``device``.
        dataloader: Iterable of ``(data, labels)`` batches. Labels may be
            shaped ``(batch,)`` or ``(batch, 1)``.

    Returns:
        Accuracy in the range 0-100 as a float, or ``0.0`` when the loader
        yields no samples.
    """
    model.eval()
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for data, labels in dataloader:
            # Tensor.to returns a new tensor, so the result must be rebound.
            # Moving to `device` (instead of casting to torch.cuda.* types)
            # keeps this working on CPU-only machines.
            data = data.to(device=device, dtype=torch.float32)
            # reshape(-1) keeps a batch axis even for a batch of one sample,
            # where squeeze() would collapse the labels to a 0-dim tensor.
            labels = labels.to(device=device, dtype=torch.long).reshape(-1)

            predicted = model(data).argmax(dim=1)
            total_samples += labels.size(0)
            total_correct += (predicted == labels).sum().item()
    if total_samples == 0:
        return 0.0
    return 100.0 * total_correct / total_samples

### torch summary


In [5]:

from torchsummary import summary

# Build the model, optimizer and loss shared by the cells below, then print a
# layer-by-layer summary for one batch of (1300, 40) feature maps.
n_classes = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("device", device)
model = MusicCNN(n_classes)
model.to(device)
optimizer = optim.Adam(model.parameters())
loss_function = nn.CrossEntropyLoss()

# summary(model, input_size=(1025,1293), batch_size=32)
# torchsummary builds its dummy input on "cuda" unless told otherwise, so pass
# the resolved device type to keep the CPU fallback working.
summary(model, input_size=(1300, 40), batch_size=32, device=device.type)




device cuda
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [32, 32, 1298, 38]             320
              ReLU-2         [32, 32, 1298, 38]               0
         MaxPool2d-3          [32, 32, 649, 19]               0
            Conv2d-4         [32, 128, 647, 17]          36,992
              ReLU-5         [32, 128, 647, 17]               0
         MaxPool2d-6          [32, 128, 323, 8]               0
           Dropout-7          [32, 128, 323, 8]               0
            Conv2d-8          [32, 128, 321, 6]         147,584
              ReLU-9          [32, 128, 321, 6]               0
        MaxPool2d-10          [32, 128, 160, 3]               0
          Dropout-11          [32, 128, 160, 3]               0
AdaptiveAvgPool2d-12            [32, 128, 1, 1]               0
           Linear-13                  [32, 512]          66,048
             ReLU-14       

## cross validation

In [None]:
# k-fold cross validation: each fold is held out once for testing while a
# freshly initialised model is trained on the remaining folds.
fold_accuracies = []
for i, fold in enumerate(dataset_splits):
    # Merge every fold except the i-th into one training set.
    train_dataset = torch.utils.data.ConcatDataset(
        [split for j, split in enumerate(dataset_splits) if j != i]
    )
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(fold, batch_size=32, shuffle=False)
    n_epochs = 100
    # New model and optimizer per fold so nothing carries over between folds.
    model = MusicCNN(n_classes)
    model.to(device)
    optimizer = optim.Adam(model.parameters())
    model.train()
    for epoch in range(n_epochs):
        for batch, (features, label) in enumerate(train_loader):
            optimizer.zero_grad()
            # Tensor.to returns a new tensor, so rebind it. Using `device`
            # instead of torch.cuda.* casts also works without a GPU.
            features = features.to(device=device, dtype=torch.float32)
            # reshape(-1) keeps a 1-D target even when the batch holds a
            # single sample, which squeeze() would turn into a scalar.
            label = label.to(device=device, dtype=torch.long).reshape(-1)
            output = model(features)
            loss = loss_function(output, label)
            loss.backward()
            optimizer.step()
            if batch % 10 == 0:
                print("Epoch {} Batch {}: Loss = {}".format(epoch, batch, loss.item()))
    torch.save(model.state_dict(), 'model_new'+str(i)+'.pt')
    accuracy = evaluate(model, test_loader)
    print(f"Test accuracy: {accuracy:.2f}%")
    fold_accuracies.append(accuracy)

if fold_accuracies:
    mean_accuracy = sum(fold_accuracies) / len(fold_accuracies)
    print(f"Mean test accuracy over {len(fold_accuracies)} folds: {mean_accuracy:.2f}%")
        

Epoch 0 Batch 0: Loss = 2.9075701236724854
Epoch 0 Batch 10: Loss = 2.227437973022461
Epoch 1 Batch 0: Loss = 2.138370990753174
Epoch 1 Batch 10: Loss = 1.748353362083435
Epoch 2 Batch 0: Loss = 1.8806076049804688
Epoch 2 Batch 10: Loss = 1.5488409996032715
Epoch 3 Batch 0: Loss = 1.3475744724273682
Epoch 3 Batch 10: Loss = 1.3900036811828613
Epoch 4 Batch 0: Loss = 1.204941987991333
Epoch 4 Batch 10: Loss = 1.1224218606948853
Epoch 5 Batch 0: Loss = 1.029836893081665
Epoch 5 Batch 10: Loss = 1.2332992553710938
Epoch 6 Batch 0: Loss = 1.3724204301834106
Epoch 6 Batch 10: Loss = 1.2578963041305542
Epoch 7 Batch 0: Loss = 1.0088543891906738
Epoch 7 Batch 10: Loss = 1.0038859844207764
Epoch 8 Batch 0: Loss = 0.9929409027099609
Epoch 8 Batch 10: Loss = 0.8980150818824768
Epoch 9 Batch 0: Loss = 0.9893862009048462
Epoch 9 Batch 10: Loss = 1.2443101406097412
Epoch 10 Batch 0: Loss = 0.964398205280304
Epoch 10 Batch 10: Loss = 0.8416042327880859
Epoch 11 Batch 0: Loss = 1.0424917936325073
Epo

Epoch 91 Batch 0: Loss = 0.010563721880316734
Epoch 91 Batch 10: Loss = 0.002294724341481924
Epoch 92 Batch 0: Loss = 0.0028725003357976675
Epoch 92 Batch 10: Loss = 0.003230625530704856
Epoch 93 Batch 0: Loss = 0.003975998144596815
Epoch 93 Batch 10: Loss = 0.005756074097007513
Epoch 94 Batch 0: Loss = 0.0034853892866522074
Epoch 94 Batch 10: Loss = 0.004953098017722368
Epoch 95 Batch 0: Loss = 0.0046235849149525166
Epoch 95 Batch 10: Loss = 0.0018712474266067147
Epoch 96 Batch 0: Loss = 0.00278697581961751
Epoch 96 Batch 10: Loss = 0.003389835124835372
Epoch 97 Batch 0: Loss = 0.0024576536379754543
Epoch 97 Batch 10: Loss = 0.0020840021315962076
Epoch 98 Batch 0: Loss = 0.0011818730272352695
Epoch 98 Batch 10: Loss = 0.005840074270963669
Epoch 99 Batch 0: Loss = 0.00195406679995358
Epoch 99 Batch 10: Loss = 0.0027874140068888664
Test accuracy: 77.00%
Epoch 0 Batch 0: Loss = 2.7189083099365234
Epoch 0 Batch 10: Loss = 2.1872684955596924
Epoch 1 Batch 0: Loss = 2.1185836791992188
Epoch

Epoch 82 Batch 0: Loss = 0.022771138697862625
Epoch 82 Batch 10: Loss = 0.04075034707784653
Epoch 83 Batch 0: Loss = 0.04437469318509102
Epoch 83 Batch 10: Loss = 0.03556814417243004
Epoch 84 Batch 0: Loss = 0.015073629096150398
Epoch 84 Batch 10: Loss = 0.006881672888994217
Epoch 85 Batch 0: Loss = 0.00779313687235117
Epoch 85 Batch 10: Loss = 0.007822347804903984
Epoch 86 Batch 0: Loss = 0.01173210609704256
Epoch 86 Batch 10: Loss = 0.008207172155380249
Epoch 87 Batch 0: Loss = 0.0019578346982598305
Epoch 87 Batch 10: Loss = 0.011884495615959167
Epoch 88 Batch 0: Loss = 0.0036836520303040743
Epoch 88 Batch 10: Loss = 0.005854309070855379
Epoch 89 Batch 0: Loss = 0.0035491790622472763
Epoch 89 Batch 10: Loss = 0.005509711802005768
Epoch 90 Batch 0: Loss = 0.008466463536024094
Epoch 90 Batch 10: Loss = 0.006174588110297918
Epoch 91 Batch 0: Loss = 0.002106346655637026
Epoch 91 Batch 10: Loss = 0.002433375921100378
Epoch 92 Batch 0: Loss = 0.003614828223362565
Epoch 92 Batch 10: Loss = 

Epoch 73 Batch 0: Loss = 0.026760492473840714
Epoch 73 Batch 10: Loss = 0.02108573354780674
Epoch 74 Batch 0: Loss = 0.012625772505998611
Epoch 74 Batch 10: Loss = 0.011247503571212292
Epoch 75 Batch 0: Loss = 0.011805822141468525
Epoch 75 Batch 10: Loss = 0.017143767327070236
Epoch 76 Batch 0: Loss = 0.0031214184127748013
Epoch 76 Batch 10: Loss = 0.014142121188342571
Epoch 77 Batch 0: Loss = 0.08711720257997513
Epoch 77 Batch 10: Loss = 0.05169191211462021
Epoch 78 Batch 0: Loss = 0.02043822593986988
Epoch 78 Batch 10: Loss = 0.02219279855489731
Epoch 79 Batch 0: Loss = 0.01483422052115202
Epoch 79 Batch 10: Loss = 0.3234531581401825
Epoch 80 Batch 0: Loss = 0.10448519140481949
Epoch 80 Batch 10: Loss = 0.10633489489555359
Epoch 81 Batch 0: Loss = 0.06279421597719193
Epoch 81 Batch 10: Loss = 0.07415515929460526
Epoch 82 Batch 0: Loss = 0.20270031690597534
Epoch 82 Batch 10: Loss = 0.12216739356517792
Epoch 83 Batch 0: Loss = 0.2266218662261963
Epoch 83 Batch 10: Loss = 0.39048337936

Epoch 64 Batch 0: Loss = 0.047366656363010406
Epoch 64 Batch 10: Loss = 0.28797343373298645
Epoch 65 Batch 0: Loss = 0.13192814588546753
Epoch 65 Batch 10: Loss = 0.14113639295101166
Epoch 66 Batch 0: Loss = 0.09916669130325317
Epoch 66 Batch 10: Loss = 0.06429456174373627
Epoch 67 Batch 0: Loss = 0.06994511187076569
Epoch 67 Batch 10: Loss = 0.036414023488759995
Epoch 68 Batch 0: Loss = 0.03177967295050621
Epoch 68 Batch 10: Loss = 0.03588885813951492
Epoch 69 Batch 0: Loss = 0.04464111849665642
Epoch 69 Batch 10: Loss = 0.11529811471700668
Epoch 70 Batch 0: Loss = 0.10322172939777374
Epoch 70 Batch 10: Loss = 0.021638324484229088
Epoch 71 Batch 0: Loss = 0.026202989742159843
Epoch 71 Batch 10: Loss = 0.10359757393598557
Epoch 72 Batch 0: Loss = 0.013467120006680489
Epoch 72 Batch 10: Loss = 0.013534977100789547
Epoch 73 Batch 0: Loss = 0.014670020900666714
Epoch 73 Batch 10: Loss = 0.013568558730185032
Epoch 74 Batch 0: Loss = 0.008852289989590645
Epoch 74 Batch 10: Loss = 0.00915600

## Train

In [None]:

# Single-split training: optimise the current global `model` on `dataloader`
# using the global `optimizer` and `loss_function`.
n_epochs = 100
# A preceding evaluate() call leaves the model in eval mode, which would
# silently disable dropout during training.
model.train()
for epoch in range(n_epochs):
    for i, (features, label) in enumerate(dataloader):
        optimizer.zero_grad()
        # Tensor.to returns a new tensor, so rebind it. Using `device`
        # instead of torch.cuda.* casts also works without a GPU.
        features = features.to(device=device, dtype=torch.float32)
        # reshape(-1) keeps a 1-D target even for a batch of one sample.
        label = label.to(device=device, dtype=torch.long).reshape(-1)
        output = model(features)
        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()
        if i % 10 == 0:
            print("Epoch {} Batch {}: Loss = {}".format(epoch, i, loss.item()))

## save model

In [None]:
# Write the current model's state_dict (not the module object) to 'model_new.pt'.
torch.save(model.state_dict(), 'model_new.pt')

In [None]:


# Score the current model on the hold-out split. Accuracy does not depend on
# sample order, so the loader is left unshuffled like the cross-validation one.
test_loader = DataLoader(test_set, batch_size=32, shuffle=False)

accuracy = evaluate(model, test_loader)
print(f"Test accuracy: {accuracy:.2f}%")
