In [1]:
import os
# from tqdm import tqdm
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F  
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import copy
from torchsummary import summary

In [2]:
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, LabelBinarizer, StandardScaler

In [3]:
import utils

In [4]:
# define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [33]:
DATA_DIR = '/Users/melikakeshavarz/desktop/new/data/fma_small'

tracks = utils.load('/Users/melikakeshavarz/desktop/new/data/fma_metadata/tracks.csv')
features = utils.load('/Users/melikakeshavarz/desktop/new/data/fma_metadata/features.csv')#annotation files
echonest = utils.load('/Users/melikakeshavarz/desktop/new/data/fma_metadata/echonest.csv')

subset = tracks.index[tracks['set', 'subset'] <= 'small']

assert subset.isin(tracks.index).all()
assert subset.isin(features.index).all()

features_all = features.join(echonest, how='inner').sort_index(axis=1)
print('Not enough Echonest features: {}'.format(features_all.shape))

tracks = tracks.loc[subset]
features_all = features.loc[subset]

tracks.shape, features_all.shape

train = tracks.index[tracks['set', 'split'] == 'training'] #bunch of indexes (not ids) for training val and test
val = tracks.index[tracks['set', 'split'] == 'validation']
test = tracks.index[tracks['set', 'split'] == 'test']

Not enough Echonest features: (13129, 767)


In [38]:
#pause
labels_onehot = LabelBinarizer().fit_transform(tracks['track', 'genre_top'])
labels_onehot = pd.DataFrame(labels_onehot, index=tracks.index)


In [84]:
#Meeeeeeeeeee

#custome dataset class



from torch.utils.data import Dataset, DataLoader



class FMA(Dataset):
    def __init__(self, data_dir, track_ids, target_sample_rate, transformation, num_samples, device , twoD = False):
        self.data_dir = data_dir
        self.track_ids = track_ids
        self.filenames = os.listdir(data_dir)
        self.target_sample_rate = target_sample_rate
        self.device = device
        self.transformation = transformation.to(self.device)
        self.twoD = twoD
        self.num_samples = num_samples
        
        

    def __getitem__(self, index):
        tid = self.track_ids[index]
        filepath = utils.get_audio_path(self.data_dir, tid)
        label = torch.from_numpy(labels_onehot.loc[tid].values).float()
        waveform, sr = torchaudio.load(filepath)#be careful all of the sample rates aren't the same(resample)
        #waveform --> (2, 10000) #(number of channels, number of samples)
        waveform = waveform.to(self.device)
        waveform = self._resample_if_necessary(waveform, sr)
        waveform = self._mix_down_if_necessary(waveform)
        #we have to adjust the length of the audio waveforms before the transformation
        waveform = self._cut_if_necessary(waveform)
        waveform = self._right_pad_if_necessary(waveform)
        if self.twoD == True:
            waveform = self.transformation(waveform)
        else:
            pass
        return waveform, label
    
    
    def _cut_if_necessary(self, waveform):
        #this method happens before the transformation
        if waveform.shape[1] > self.num_samples:
            waveform = waveform[:, :self.num_samples]
            return waveform
        
        
    def _right_pad_if_necessary(self, waveform):
        if waveform.shape[1] < self.num_samples:
            num_missing_samples = self.num_samples - waveform.shape[1]
            last_dim_padding = (0,num_missing_samples) # (1, 2) -> (left, right)   
            #(1, 2, 0, 1) -> (left, right, padnumleft, padnumright)
            # what happens is : [1, 1, 1] --> [0, 1, 1, 1, 0, 0]
            waveform = torch.nn.functional.pad(waveform, last_dim_padding)
            waveform = waveform.T
        return waveform
    
    
        
    def _resample_if_necessary(self, waveform , sr):
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
            waveform = resampler(waveform)
        return waveform
    
    
    #from (2, 10000) to (1, 0000) taking the average between two waveforms
    def _mix_down_if_necessary(self, waveform):
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform , dim = 0, keepdim = True)
        return waveform
        
    
        return waveform.T, label 
    
    def __len__(self):#just gives us the number of samples in our datasets.
        return len(self.filenames) 

        

        

In [88]:
#trying the class:

if __name__ == "__main__":
    

    SAMPLE_RATE=22050
    NUM_SAMPLES = 22050
    #working on GPU
    if torch.cuda.is_available():
        Device = "cuda"
    else:
        Device = "cpu"
        
    print(f"we are using {device}.")  
    
    
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(sample_rate = SAMPLE_RATE, n_fft = 1024, hop_length = 512,
                                                        n_mels = 64) 
    
    FL = FMA(DATA_DIR, train, SAMPLE_RATE, mel_spectrogram, NUM_SAMPLES, Device, twoD =True)
    #print(f"there are {len(FL)} samples in the dataset" )
    waveform, label = FL[0] #track number 2
    a = 1
    
    


we are using cpu.


In [89]:
# If melspectrogram is applied

waveform.shape

torch.Size([1, 64, 44])

In [91]:
####### very bad 2D architecture :))))))) just a test
from torch import nn

class CNN2D(nn.Module):
    
    
    def __init__(self):
        #vgg
        super().__init__()
        #4 conv blocks / flatten / linear / softmax
        self.conv1 = nn.Sequential(nn.Conv2d(in_channels =1, out_channels =16, kernel_size =3, stride =1, padding =2),
                                  nn.ReLU(),
                                  nn.MaxPool2d(kernel_size = 2))
        
        
        
        
        self.conv2 = nn.Sequential(nn.Conv2d(in_channels =16, out_channels =32, kernel_size =3, stride =1, padding =2),
                                  nn.ReLU(),
                                  nn.MaxPool2d(kernel_size = 2))
        
        
        
        self.conv3 = nn.Sequential(nn.Conv2d(in_channels =32, out_channels =64, kernel_size =3, stride =1, padding =2),
                                  nn.ReLU(),
                                  nn.MaxPool2d(kernel_size = 2))
        
        
        
        self.conv4 = nn.Sequential(nn.Conv2d(in_channels =64, out_channels =128, kernel_size =3, stride =1, padding =2),
                                  nn.ReLU(),
                                  nn.MaxPool2d(kernel_size = 2))
        
        

        
        self.flatten = nn.Flatten()
        self.linear = nn.Linear(128*5*4, 8)
        self.softmax =  nn.Softmax(dim = 1)
        
        
     #in this method we tell pytorch how to pass data from layer to another layer   
    def forward(self, input_data):
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        logits = self.linear(x)
        predictions = self.softmax(logits)
        return predictions
        
        
            






In [92]:
if __name__ == "__main__":
    if torch.cuda.is_available():
        Device = "cuda"
    else:
        Device = "cpu"
    print(f"Using {Device}")
    cnn = CNN2D()
    summary(cnn.to(Device), (1, 64, 44))  #summary(model, size of the spectogram)
    
    #warning: the input is on gpu that's why we have to have the model on the smae device

Using cpu
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 66, 46]             160
              ReLU-2           [-1, 16, 66, 46]               0
         MaxPool2d-3           [-1, 16, 33, 23]               0
            Conv2d-4           [-1, 32, 35, 25]           4,640
              ReLU-5           [-1, 32, 35, 25]               0
         MaxPool2d-6           [-1, 32, 17, 12]               0
            Conv2d-7           [-1, 64, 19, 14]          18,496
              ReLU-8           [-1, 64, 19, 14]               0
         MaxPool2d-9             [-1, 64, 9, 7]               0
           Conv2d-10           [-1, 128, 11, 9]          73,856
             ReLU-11           [-1, 128, 11, 9]               0
        MaxPool2d-12            [-1, 128, 5, 4]               0
          Flatten-13                 [-1, 2560]               0
           Linear-14         

In [93]:
import torch
import torchaudio
from torch import nn
from torch.utils.data import DataLoader





BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 0.001


SAMPLE_RATE = 22050
NUM_SAMPLES = 22050




def create_data_loader(train_data, batch_size):
    train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
    return train_dataloader


def train_single_epoch(model, data_loader, loss_fn, optimiser, device):
    for in_put, target in data_loader:
        in_put, target = in_put.to(device), target.to(device)

        # calculate loss
        prediction = model(in_put)
        loss = loss_fn(prediction, target)

        # backpropagate error and update weights
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

    print(f"loss: {loss.item()}")


def train_model(model, data_loader, loss_fn, optimiser, device, epochs):
    for i in range(epochs):
        print(f"Epoch {i+1}")
        train_single_epoch(model, data_loader, loss_fn, optimiser, device)
        print("---------------------------")
    print("Finished training")

In [94]:
if __name__ == "__main__":
    if torch.cuda.is_available():
        Device = "cuda"
    else:
        Device = "cpu"
    print(f"Using {Device}")

    # instantiating our dataset object and create data loader
    
    #transform
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(sample_rate = SAMPLE_RATE, n_fft = 1024, hop_length = 512,
                                                        n_mels = 64) 
    # 
    FL = FMA(DATA_DIR, train, SAMPLE_RATE, mel_spectrogram, NUM_SAMPLES, Device, twoD =True)
    
    train_dataloader = create_data_loader(FL, BATCH_SIZE)
    


    # construct model and assign it to device
    cnn = CNN2D().to(Device)
    print(cnn)

    # initialise loss funtion + optimiser
    loss_fn = nn.CrossEntropyLoss()
    optimiser = torch.optim.Adam(cnn.parameters(),
                                 lr=LEARNING_RATE)


    # train model
    train_model(cnn, train_dataloader, loss_fn, optimiser, Device, EPOCHS)

    # save model
    torch.save(cnn.state_dict(), "feedforwardnet.pth")
    print("Trained feed forward net saved at feedforwardnet.pth")


Using cpu
CNN2D(
  (conv1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv4): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear): Linear(in_features=2560, out_features=8, bias=True)
  (softmax): Softmax(dim=1)
)
Epoch 1
loss: 1.9835455417633057
--

In [95]:
waveform.shape

torch.Size([1, 64, 44])