In [47]:
import torch
import math
# this ensures that the current MacOS version is at least 12.3+
#print(torch.backends.mps.is_available())
# this ensures that the current current PyTorch installation was built with MPS activated.
#print(torch.backends.mps.is_built())

In [48]:
import os
# from tqdm import tqdm
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F  
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import copy
from torchsummary import summary

In [49]:
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, LabelBinarizer, StandardScaler

In [50]:
import utils

In [51]:
# define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

In [52]:
DATA_DIR = '/Users/melikakeshavarz/desktop/fma/data/fma_small'

tracks = utils.load('/Users/melikakeshavarz/desktop/fma/data/fma_metadata/tracks.csv')
features = utils.load('/Users/melikakeshavarz/desktop/fma/data/fma_metadata/features.csv')#annotation files
echonest = utils.load('/Users/melikakeshavarz/desktop/fma/data/fma_metadata/echonest.csv')

subset = tracks.index[tracks['set', 'subset'] <= 'small']

assert subset.isin(tracks.index).all()
assert subset.isin(features.index).all()

features_all = features.join(echonest, how='inner').sort_index(axis=1)
print('Not enough Echonest features: {}'.format(features_all.shape))

tracks = tracks.loc[subset]
features_all = features.loc[subset]

tracks.shape, features_all.shape

train = tracks.index[tracks['set', 'split'] == 'training'] #bunch of indexes (not ids) for training val and test
val = tracks.index[tracks['set', 'split'] == 'validation']
test = tracks.index[tracks['set', 'split'] == 'test']

Not enough Echonest features: (13129, 767)


In [53]:
#pause
labels_onehot = LabelBinarizer().fit_transform(tracks['track', 'genre_top'])
labels_onehot = pd.DataFrame(labels_onehot, index=tracks.index)


In [54]:
#adding the ch_three attribute/option to create 3 channel spectrogram.

#for manual spectogram we used one channel, but for the prepared one from pytorch we used 3 channel spectogram

#three channels are simply just a 3 replicas of 1 channel spectrogram.





from torch.utils.data import Dataset, DataLoader


#custome dataset class
class FMA(Dataset):
    def __init__(self, data_dir, track_ids,
                 target_sample_rate, transformation, num_samples, device , twoD = False, paper_cut = False):
        self.data_dir = data_dir
        self.track_ids = track_ids
        self.filenames = os.listdir(data_dir)
        self.target_sample_rate = target_sample_rate
        self.device = device
        self.transformation = transformation.to(self.device)
        self.twoD = twoD
        self.num_samples = num_samples
        self.paper_cut = paper_cut

        
        

    def __getitem__(self, index):
        tid = self.track_ids[index]
        filepath = utils.get_audio_path(self.data_dir, tid)
        label = torch.from_numpy(labels_onehot.loc[tid].values).float()
        waveform, sr = torchaudio.load(filepath)#be careful all of the sample rates aren't the same(resample)
        #waveform --> (2, 10000) #(number of channels, number of samples)
        waveform = waveform.to(self.device)
        waveform = self._resample_if_necessary(waveform, sr)
        waveform = self._mix_down_if_necessary(waveform)
        #we have to adjust the length of the audio waveforms before the transformation
        waveform = self._cut_if_necessary(waveform)
        waveform = self._right_pad_if_necessary(waveform)
        if self.twoD == True:
            waveform = self.transformation(waveform)
        else:
            pass
        
        
        if self.paper_cut == True:
            waveform = waveform[:, :128, :513]
        else:
            pass
        
        

        return waveform, label
    
    
    def _cut_if_necessary(self, waveform):
        #this method happens before the transformation
        if waveform.shape[1] > self.num_samples:
            waveform = waveform[:, :self.num_samples]
            return waveform
        
        
    def _right_pad_if_necessary(self, waveform):
        if waveform.shape[1] < self.num_samples:
            num_missing_samples = self.num_samples - waveform.shape[1]
            last_dim_padding = (0,num_missing_samples) # (1, 2) -> (left, right)   
            #(1, 2, 0, 1) -> (left, right, padnumleft, padnumright)
            # what happens is : [1, 1, 1] --> [0, 1, 1, 1, 0, 0]
            waveform = torch.nn.functional.pad(waveform, last_dim_padding)
            waveform = waveform.T
        return waveform
    
    
        
    def _resample_if_necessary(self, waveform , sr):
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
            waveform = resampler(waveform)
        return waveform
    
    
    #from (2, 10000) to (1, 0000) taking the average between two waveforms
    def _mix_down_if_necessary(self, waveform):
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform , dim = 0, keepdim = True)
        return waveform
        

        
        return waveform.T, label 
    
    def __len__(self):#just gives us the number of samples in our datasets.
        return len(self.filenames) 

        

        

In [55]:
#trying the class:

if __name__ == "__main__":
    

    SAMPLE_RATE=44100
    
    #maxlength
    NUM_SAMPLES = 1320000
    #working on GPU
    if torch.cuda.is_available():
        Device = "cuda"
    else:
        Device = "cpu"
        
    print(f"we are using {Device}.")  
    
    #50% hop_length is the best for accuracy
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(sample_rate = SAMPLE_RATE, n_fft = 1024, hop_length = 256,
                                                        n_mels = 64) 
    
    n_fft = 2048   # FFT window size
    hop_length = 256    # number of samples between successive frames
    win_length = n_fft
    
    spectrogram = torchaudio.transforms.Spectrogram(n_fft=n_fft, hop_length = hop_length, win_length = win_length )
    
    
    
    FL = FMA(DATA_DIR, train, SAMPLE_RATE, spectrogram, NUM_SAMPLES, Device, twoD =True)
    #print(f"there are {len(FL)} samples in the dataset" )
    waveform, label = FL[0] #track number 2
    a = 1
    
    


we are using cpu.


In [56]:
waveform.size()

torch.Size([1, 128, 513])

In [57]:
#loading the data
BATCH = 16

FL = FMA(DATA_DIR, train, SAMPLE_RATE,spectrogram, NUM_SAMPLES, Device, twoD =True)

dataloader = torch.utils.data.DataLoader(FL, batch_size=BATCH, shuffle=True)


for spects, labels in dataloader:
    print(spects.size())

torch.Size([16, 1, 128, 513])
torch.Size([16, 1, 128, 513])
torch.Size([16, 1, 128, 513])
torch.Size([16, 1, 128, 513])
torch.Size([16, 1, 128, 513])
torch.Size([16, 1, 128, 513])
torch.Size([16, 1, 128, 513])
torch.Size([16, 1, 128, 513])
torch.Size([16, 1, 128, 513])
torch.Size([14, 1, 128, 513])


In [58]:
#ResNet34

class BasicBlock(nn.Module):
    expansion = 1 #we don't use the block.expansion here

    def __init__(self, inplanes, planes, stride=1,padding = 1):
        super().__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size = 3, stride=stride,
                     padding=padding, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size = 3, stride=1,
                     padding=padding, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = nn.Sequential(
                nn.Conv2d(inplanes, planes, 1, stride, bias=False),
                nn.BatchNorm2d(planes))
        self.stride = stride


    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)


        identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out

In [59]:

class ResNet(nn.Module):

    def __init__(self, block, layers, num_classes=8):
        super().__init__()
        
        self.inplanes = 128

        self.c1 = nn.Conv2d(1, self.inplanes, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(128)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        
        self.layer1 = self._make_layer(block, 128, 128, layers[0])
        self.layer2 = self._make_layer(block, 128, 256, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 256, 512, layers[3], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d(7)
        self.fc = nn.Linear(25088 , num_classes)


    def _make_layer(self, block, inplanes, planes, blocks, stride=1):
 

        layers = []
        layers.append(block(inplanes, planes, stride))
        
        self.inplanes = planes
        
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)
    
    
    def forward(self, x):
        x = self.c1(x)           
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)         

        x = self.layer1(x)          
        x = self.layer2(x)          
        x = self.layer3(x)          
        x = self.layer4(x)          

        x = self.avgpool(x)         
        x = torch.flatten(x, 1)     
        x = self.fc(x)

        return x

In [60]:


def resnet34_manual():
    layers=[3, 4, 6, 3]
    model = ResNet(BasicBlock, layers)
    return model

In [61]:
model=resnet34_manual()



In [62]:
if __name__ == "__main__":
    if torch.cuda.is_available():
        Device = "cuda"
    else:
        Device = "cpu"
    print(f"Using {Device}")
    
    
   
    
    cnn =resnet34_manual()
    summary(cnn.to(Device), (1, 128, 513) ) #summary(model, size of the spectogram)
    

Using cpu
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 128, 64, 257]           6,272
       BatchNorm2d-2         [-1, 128, 64, 257]             256
              ReLU-3         [-1, 128, 64, 257]               0
         MaxPool2d-4         [-1, 128, 32, 129]               0
            Conv2d-5         [-1, 128, 32, 129]         147,456
              ReLU-6         [-1, 128, 32, 129]               0
            Conv2d-7         [-1, 128, 32, 129]         147,456
       BatchNorm2d-8         [-1, 128, 32, 129]             256
            Conv2d-9         [-1, 128, 32, 129]          16,384
      BatchNorm2d-10         [-1, 128, 32, 129]             256
             ReLU-11         [-1, 128, 32, 129]               0
       BasicBlock-12         [-1, 128, 32, 129]               0
           Conv2d-13         [-1, 128, 32, 129]         147,456
             ReLU-14         

In [64]:
BATCH = 128

# create a training dataset and dataloader
FL = FMA(DATA_DIR, train, SAMPLE_RATE,spectrogram, NUM_SAMPLES, Device, twoD =True)
val_dataset = FMA(DATA_DIR, val, SAMPLE_RATE, spectrogram, NUM_SAMPLES, Device, twoD =True)


# create a validation dataset and dataloader
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH, shuffle=True)
dataloader = torch.utils.data.DataLoader(FL, batch_size=BATCH, shuffle=True)

    
# create the CNN model
model = resnet34_manual().to(Device) # HERE YOU PUT UR NETWORK
model.to(device)

# define the loss function and the optimizer
loss_fn = nn.CrossEntropyLoss()

# Adam optimizer
# optimizer = torch.optim.Adam(model.parameters())


#
optimizer = torch.optim.Adam(model.parameters(), lr=1)

# Define the scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.2, patience=5)



num_epochs = 5
i = 0
running_loss = 0.0


    
# train the model

for epoch in range(num_epochs):
    # evaluate the model on the training dataset
    train_correct = 0
    train_total = 0
    for waveform, label in dataloader:
        label = label.to(device)
        train_label = torch.argmax(label, dim=1)

        # clear the gradients
        optimizer.zero_grad()

        # forward pass
        waveform = waveform.squeeze(0)

        
        waveform = waveform.to(device)
        output = model(waveform)
        
        print(output.size(), label.size())
        loss = loss_fn(output, label)

        # backward pass
        loss.backward()
        optimizer.step()  
        
        # Update the learning rate
        scheduler.step(loss)
            
        _, train_predicted = torch.max(output.data, 1)
        train_total += train_label.size(0)
        train_correct += (train_predicted == train_label).sum().item()
        # print statistics
        i += 1
        running_loss += loss.item()
            
           
    print('[%d, %5d subsamples] Training loss: %.3f' % (epoch + 1, i*BATCH, running_loss / len(dataloader)))
    running_loss = 0            
    # evaluate the model on the validation dataset
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for val_waveform, val_label in val_dataloader:
            val_label = val_label.to(device)
            val_label = torch.argmax(val_label, dim=1)
            val_waveform = val_waveform.squeeze(0)
            
            val_waveform = val_waveform.to(device)
            val_output = model(val_waveform)
            val_loss += loss_fn(val_output, val_label).item()
            _, val_predicted = torch.max(val_output.data, 1)
            val_total += val_label.size(0)
            val_correct += (val_predicted == val_label).sum().item()


    print('Validation Loss: {:.4f} | Validation Accuracy: {:.4f} | Training Accuracy: {:.4f}'.format(val_loss / len(val_dataloader), val_correct / val_total, train_correct / train_total))
print('Finished Training')

torch.Size([128, 8]) torch.Size([128, 8])
torch.Size([30, 8]) torch.Size([30, 8])
[1,   256 subsamples] Training loss: 3984.255
Validation Loss: 11792.5947 | Validation Accuracy: 0.0886 | Training Accuracy: 0.1076
torch.Size([128, 8]) torch.Size([128, 8])
torch.Size([30, 8]) torch.Size([30, 8])
[2,   512 subsamples] Training loss: 5405.951
Validation Loss: 1078.7103 | Validation Accuracy: 0.2215 | Training Accuracy: 0.1835
torch.Size([128, 8]) torch.Size([128, 8])
torch.Size([30, 8]) torch.Size([30, 8])
[3,   768 subsamples] Training loss: 630.505
Validation Loss: 47.9781 | Validation Accuracy: 0.0823 | Training Accuracy: 0.1329
torch.Size([128, 8]) torch.Size([128, 8])
torch.Size([30, 8]) torch.Size([30, 8])
[4,  1024 subsamples] Training loss: 9.725
Validation Loss: 971.9133 | Validation Accuracy: 0.0949 | Training Accuracy: 0.1519
torch.Size([128, 8]) torch.Size([128, 8])


KeyboardInterrupt: 

In [43]:
from torchvision.models import resnet34
model = resnet34()



model.fc = nn.Linear(512, 8, bias = True)
model.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [46]:
BATCH = 32

#Here we used all the data, and the Three channel option.
FL = FMA(DATA_DIR, train, SAMPLE_RATE,spectrogram, NUM_SAMPLES, Device, twoD =True, Paper_cut = False)
val_dataset = FMA(DATA_DIR, val, SAMPLE_RATE, spectrogram, NUM_SAMPLES, Device, twoD =True)



val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH, shuffle=True)
dataloader = torch.utils.data.DataLoader(FL, batch_size=BATCH, shuffle=True)


#the Original ResNet34
model.to(device)


loss_fn = nn.CrossEntropyLoss()

# Adam optimizer



#
optimizer = torch.optim.Adam(model.parameters(), lr=1)

# Define the scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.2, patience=5)



num_epochs = 10
i = 0
running_loss = 0.0


    
# train the model

for epoch in range(num_epochs):
    # evaluate the model on the training dataset
    model.train()
    train_correct = 0
    train_total = 0
    for waveform, label in dataloader:
        label = label.to(device)
        train_label = torch.argmax(label, dim=1)

        # clear the gradients
        optimizer.zero_grad()

        # forward pass
        waveform = waveform.squeeze(0)


        
        waveform = waveform.to(device)
        output = model(waveform)
        

        
        print(output.size(), label.size())
        loss = loss_fn(output, label)

        # backward pass
        loss.backward()
        optimizer.step()  
        
        # Update the learning rate
        scheduler.step(loss)
            
        _, train_predicted = torch.max(output.data, 1)
        train_total += train_label.size(0)
        train_correct += (train_predicted == train_label).sum().item()
        # print statistics
        i += 1
        running_loss += loss.item()
            
           
    print('[%d, %5d subsamples] Training loss: %.3f' % (epoch + 1, i*BATCH, running_loss / len(dataloader)))
    running_loss = 0            
    # evaluate the model on the validation dataset
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        model.eval()
        for val_waveform, val_label in val_dataloader:
            val_label = val_label.to(device)
            val_label = torch.argmax(val_label, dim=1)
            val_waveform = val_waveform.squeeze(0)
            
            val_waveform = val_waveform.to(device)
            val_output = model(val_waveform)
            val_loss += loss_fn(val_output, val_label).item()
            _, val_predicted = torch.max(val_output.data, 1)
            val_total += val_label.size(0)
            val_correct += (val_predicted == val_label).sum().item()


    print('Validation Loss: {:.4f} | Validation Accuracy: {:.4f} | Training Accuracy: {:.4f}'.format(val_loss / len(val_dataloader), val_correct / val_total, train_correct / train_total))
print('Finished Training')

torch.Size([32, 8]) torch.Size([32, 8])
torch.Size([32, 8]) torch.Size([32, 8])
torch.Size([32, 8]) torch.Size([32, 8])
torch.Size([32, 8]) torch.Size([32, 8])
torch.Size([30, 8]) torch.Size([30, 8])
[1,   160 subsamples] Training loss: 25.211
Validation Loss: 70971185.6000 | Validation Accuracy: 0.2089 | Training Accuracy: 0.3101
torch.Size([32, 8]) torch.Size([32, 8])
torch.Size([32, 8]) torch.Size([32, 8])
torch.Size([32, 8]) torch.Size([32, 8])
torch.Size([32, 8]) torch.Size([32, 8])
torch.Size([30, 8]) torch.Size([30, 8])
[2,   320 subsamples] Training loss: 17.604
Validation Loss: 43359.7453 | Validation Accuracy: 0.1076 | Training Accuracy: 0.2089
torch.Size([32, 8]) torch.Size([32, 8])
torch.Size([32, 8]) torch.Size([32, 8])
torch.Size([32, 8]) torch.Size([32, 8])
torch.Size([32, 8]) torch.Size([32, 8])
torch.Size([30, 8]) torch.Size([30, 8])
[3,   480 subsamples] Training loss: 4.849
Validation Loss: 10.1512 | Validation Accuracy: 0.0823 | Training Accuracy: 0.3671
torch.Size(