<a href="https://colab.research.google.com/github/Melikakmm/CNN-for-sound-classification/blob/main/CNN-for-sound-classification/Melika/VGG16/VGG16_GTZAN/VGG16_transfer_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transfer learning:


Transfer learning is a popular technique in deep learning that allows the use of pre-trained neural network models to solve new tasks. By leveraging the knowledge learned from one task, a pre-trained model can be fine-tuned on a different, but related task, thus saving time and computational resources. PyTorch is a popular deep learning framework that provides powerful tools and libraries for building and training neural networks, including pre-trained models that can be used for transfer learning. With PyTorch, transfer learning can be easily implemented, enabling developers to quickly build high-performing models for a wide range of applications.

In [20]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
import os
# from tqdm import tqdm
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F  
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import copy
from torchsummary import summary
#Confusion matrix:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import itertools


In [22]:
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, LabelBinarizer, StandardScaler

In [23]:


import sys
import os

py_file_location = "/content/drive/MyDrive/"
sys.path.append(os.path.abspath(py_file_location))

In [24]:

import utils

In [None]:
# define device
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

In [25]:
DATA_DIR = '/content/drive/MyDrive/fma/data/fma_small'

tracks = utils.load('/content/drive/MyDrive/fma/data/fma_metadata/tracks.csv')
features = utils.load('/content/drive/MyDrive/fma/data/fma_metadata/features.csv')#annotation files
echonest = utils.load('/content/drive/MyDrive/fma/data/fma_metadata/echonest.csv')

subset = tracks.index[tracks['set', 'subset'] <= 'small']

assert subset.isin(tracks.index).all()
assert subset.isin(features.index).all()

features_all = features.join(echonest, how='inner').sort_index(axis=1)
print('Not enough Echonest features: {}'.format(features_all.shape))

tracks = tracks.loc[subset]
features_all = features.loc[subset]

tracks.shape, features_all.shape

train = tracks.index[tracks['set', 'split'] == 'training'] #bunch of indexes (not ids) for training val and test
val = tracks.index[tracks['set', 'split'] == 'validation']
test = tracks.index[tracks['set', 'split'] == 'test']

Not enough Echonest features: (13129, 767)


In [26]:
len(train)

6400

In [27]:
tracks_index = tracks.index
tracks_index

Int64Index([     2,      5,     10,    140,    141,    148,    182,    190,
               193,    194,
            ...
            153956, 154303, 154305, 154306, 154307, 154308, 154309, 154413,
            154414, 155066],
           dtype='int64', name='track_id', length=8000)

In [28]:
#From data to one hot labels
labels_onehot = LabelBinarizer().fit_transform(tracks['track', 'genre_top'])
labels_onehot_Ten = torch.tensor(labels_onehot)
labels_onehot = pd.DataFrame(labels_onehot, index=tracks.index)




#from onehot labels to encoded targets.
targets = torch.argmax(labels_onehot_Ten, dim=1)





In [29]:
#creating an annotation dataframe.


df = pd.read_csv('/content/drive/MyDrive/fma/data/fma_small/checksums', sep='  |/', header = None,
                 names = ['id', 'fold', 'songs'], converters={'fold': str})
df.index = tracks_index
df.loc[5][1]

  return func(*args, **kwargs)


'000'

In [30]:
#adding the ch_three attribute/option to create 3 channel spectrogram.

#for manual spectogram we used one channel, but for the prepared one from pytorch we used 3 channel spectogram

#three channels are simply just a 3 replicas of 1 channel spectrogram.





from torch.utils.data import Dataset, DataLoader


#custome dataset class
class FMA(Dataset):
    def __init__(self, data_dir, track_ids, annotation,
                 target_sample_rate, transformation, num_samples, device = False , twoD = False, paper_cut = False):
        self.annotation = annotation
        self.data_dir = data_dir
        self.track_ids = track_ids
        self.filenames = os.listdir(data_dir)
        self.target_sample_rate = target_sample_rate
        self.device = device
        self.transformation = transformation
        if self.device == True :
          self.transformation = transformation.to(self.device)
        self.twoD = twoD
        self.num_samples = num_samples
        self.paper_cut = paper_cut


        
        

    def __getitem__(self, index):
        tid = self.track_ids[index]
        filepath = self._get_audio_sample_path(tid)
        label = torch.from_numpy(labels_onehot.loc[tid].values).float()
        
        try:
            waveform, sr = torchaudio.load(filepath)
        except:
            print(filepath)
        #be careful all of the sample rates aren't the same(resample)
        #waveform --> (2, 10000) #(number of channels, number of samples)
        if self.device == True :
          waveform = waveform.to(self.device)
        waveform = self._resample_if_necessary(waveform, sr)
        waveform = self._mix_down_if_necessary(waveform)
        #we have to adjust the length of the audio waveforms before the transformation
        waveform = self._cut_if_necessary(waveform)
        waveform = self._right_pad_if_necessary(waveform)
        if self.twoD == True:
            waveform = self.transformation(waveform)
        else:
            pass
        
        
        if self.paper_cut == True:
            waveform = waveform[:, :128, :513]
        else:
            pass
        
        

        return waveform, label
    
    
    def _get_audio_sample_path(self, dex):
        fold = self.annotation.loc[dex][1]
        path = os.path.join(self.data_dir, fold, self.annotation.loc[dex][2])
        return path
        

            
    
    
    
    def _cut_if_necessary(self, waveform):
        #this method happens before the transformation
        if waveform.shape[1] > self.num_samples:
            waveform = waveform[:, :self.num_samples]
            return waveform
        
        
    def _right_pad_if_necessary(self, waveform):
        if waveform.shape[1] < self.num_samples:
            num_missing_samples = self.num_samples - waveform.shape[1]
            last_dim_padding = (0,num_missing_samples) # (1, 2) -> (left, right)   
            #(1, 2, 0, 1) -> (left, right, padnumleft, padnumright)
            # what happens is : [1, 1, 1] --> [0, 1, 1, 1, 0, 0]
            waveform = torch.nn.functional.pad(waveform, last_dim_padding)
            waveform = waveform.T
        return waveform
    
    
        
    def _resample_if_necessary(self, waveform , sr):
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
            waveform = resampler(waveform)
        return waveform
    
    
    #from (2, 10000) to (1, 0000) taking the average between two waveforms
    def _mix_down_if_necessary(self, waveform):
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform , dim = 0, keepdim = True)
        return waveform
     
    

        
        return waveform, label 
    
    def __len__(self):#just gives us the number of samples in our datasets.
        return len(self.track_ids) 

        

        

In [31]:
#trying the class:

if __name__ == "__main__":
    

    SAMPLE_RATE=44100

    #maxlength
    NUM_SAMPLES = 44100
    #working on GPU
 #   if torch.cuda.is_available():
 #       Device = "cuda"
 #   else:
 #       Device = "cpu"
        
 #   print(f"we are using {Device}.")  
    
    #50% hop_length is the best for accuracy
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(sample_rate = SAMPLE_RATE, n_fft = 1024, hop_length = 256,
                                                        n_mels = 64) 
    
    n_fft = 1024    # FFT window size
    hop_length = 256    # number of samples between successive frames
    win_length = n_fft
    
    spectrogram = torchaudio.transforms.Spectrogram(n_fft=1024, hop_length = 256, win_length = win_length )
    
    
    
    FL = FMA(DATA_DIR, train, df, SAMPLE_RATE, mel_spectrogram, NUM_SAMPLES, twoD =True)
    print(f"there are {len(FL)} samples in the dataset" )
    waveform, label = FL[0] #track number 2
    a = 1
    
    


there are 6400 samples in the dataset


In [None]:
#Here are corruted songs!
#it's your choice how to deal with the :)

Dex = tracks_index

def _get_audio_sample_path(data_dir, dex): 
        fold = df.loc[dex][1]
        path = os.path.join(data_dir, fold, df.loc[dex][2])
        return path
    
for i in Dex:
    p = _get_audio_sample_path(DATA_DIR, i)
    try:
            w, sr = torchaudio.load(p)
    except:
            print(p)
    

/content/drive/MyDrive/fma/data/fma_small/018/018124.mp3
/content/drive/MyDrive/fma/data/fma_small/019/019073.mp3
/content/drive/MyDrive/fma/data/fma_small/019/019422.mp3
/content/drive/MyDrive/fma/data/fma_small/020/020366.mp3


# VGG16 without weights

In [28]:
model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg16', pretrained=False)
model.features[0] = nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
model.classifier[6] = nn.Linear(in_features=4096, out_features=8, bias=True)


Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


In [31]:
BATCH = 32


FL = FMA(DATA_DIR, train, df, SAMPLE_RATE, mel_spectrogram, NUM_SAMPLES, twoD =True, paper_cut = True)
val_dataset = FMA(DATA_DIR, val,df, SAMPLE_RATE, mel_spectrogram, NUM_SAMPLES, twoD =True, paper_cut = True)



val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH, shuffle=True)
dataloader = torch.utils.data.DataLoader(FL, batch_size=BATCH, shuffle=True)



#model.to(device)


loss_fn = nn.CrossEntropyLoss()

# Adam optimizer
optimizer = torch.optim.Adam(model.parameters(),lr = 0.0001)

# Define the scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.2, patience=5)



num_epochs = 10
i = 0
running_loss = 0.0



train_acV_no = []
val_acV_no = [] 


# train the model
for epoch in range(num_epochs):
    # evaluate the model on the training dataset
    model.train()
    train_correct = 0
    train_total = 0
    for waveform, label in dataloader:
        #label = label.to(device)
        train_label = torch.argmax(label, dim=1)

        # clear the gradients
        optimizer.zero_grad()

        # forward pass
        waveform = waveform.squeeze(0)


        
        #waveform = waveform.to(device)
        output = model(waveform)
        

        

        loss = loss_fn(output, label)

        # backward pass
        loss.backward()
        optimizer.step()  
        
        # Update the learning rate
        scheduler.step(loss)
            
        _, train_predicted = torch.max(output.data, 1)
        train_total += train_label.size(0)
        train_correct += (train_predicted == train_label).sum().item()
        # print statistics
        i += 1
        running_loss += loss.item()
        
        
    train_a = train_correct / train_total        
    train_acV_no.append(train_a)       
    print('[%d, %5d subsamples] Training loss: %.3f' % (epoch + 1, i*BATCH, running_loss / len(dataloader)))
    running_loss = 0            
    # evaluate the model on the validation dataset
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        model.eval()
        for val_waveform, val_label in val_dataloader:
            #val_label = val_label.to(device)
            val_label = torch.argmax(val_label, dim=1)
            val_waveform = val_waveform.squeeze(0)
            
            #val_waveform = val_waveform.to(device)
            val_output = model(val_waveform)
            val_loss += loss_fn(val_output, val_label).item()
            _, val_predicted = torch.max(val_output.data, 1)
            val_total += val_label.size(0)
            val_correct += (val_predicted == val_label).sum().item()
            
            
    val_a = val_correct/ val_total
    val_acV_no.append(val_a)
    print('Validation Loss: {:.4f} | Validation Accuracy: {:.4f} | Training Accuracy: {:.4f}'.format(val_loss / len(val_dataloader), val_correct / val_total, train_correct / train_total))
print('Finished Training')

[1,  6400 subsamples] Training loss: 2.051
Validation Loss: 2.0330 | Validation Accuracy: 0.1338 | Training Accuracy: 0.1603
[2, 12800 subsamples] Training loss: 2.026
Validation Loss: 2.0328 | Validation Accuracy: 0.1338 | Training Accuracy: 0.1709
[3, 19200 subsamples] Training loss: 2.025
Validation Loss: 2.0326 | Validation Accuracy: 0.1338 | Training Accuracy: 0.1713
[4, 25600 subsamples] Training loss: 2.026
Validation Loss: 2.0325 | Validation Accuracy: 0.1338 | Training Accuracy: 0.1750
[5, 32000 subsamples] Training loss: 2.025
Validation Loss: 2.0323 | Validation Accuracy: 0.1338 | Training Accuracy: 0.1781
[6, 38400 subsamples] Training loss: 2.028
Validation Loss: 2.0321 | Validation Accuracy: 0.1338 | Training Accuracy: 0.1689
[7, 44800 subsamples] Training loss: 2.025
Validation Loss: 2.0319 | Validation Accuracy: 0.1338 | Training Accuracy: 0.1728
[8, 51200 subsamples] Training loss: 2.026
Validation Loss: 2.0317 | Validation Accuracy: 0.1338 | Training Accuracy: 0.1716


In [33]:
import json


with open("/content/drive/MyDrive/Result/train_acV_no.json", 'w') as f:

    json.dump(train_acV_no, f, indent=2) 

with open("/content/drive/MyDrive/Result/val_acV_no.json", 'w') as f:

    json.dump(val_acV_no, f, indent=2) 

# VGG16 with pytorch pretrained weights:

when using a pretrained model, it's important that your custom data going into the model is prepared in the same way as the original training data that went into the model.

In [32]:
import torchvision

weights = torchvision.models.VGG16_Weights.DEFAULT
model = torchvision.models.vgg16(weights=weights)

In [33]:
model

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [17]:
#We have to change the first layer, and last layer of the networkbc our data has 1 channel and 8 outputs.
#but we are going to freeze other layers.
for i in range(1, 31):
    for param in model.features[i].parameters():
        param.requires_grad = False

for j in range(0, 6):
  for param in model.classifier[j].parameters():
    param.requires_grad = False

In [18]:
#changing the first cnn and last linear layer:


model.features[0] = nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
model.classifier[6] = nn.Linear(in_features=4096, out_features=8, bias=True)

In [19]:
BATCH = 32


FL = FMA(DATA_DIR, train, df, SAMPLE_RATE, mel_spectrogram, NUM_SAMPLES, twoD =True, paper_cut = True)
val_dataset = FMA(DATA_DIR, val,df, SAMPLE_RATE, mel_spectrogram, NUM_SAMPLES, twoD =True, paper_cut = True)



val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH, shuffle=True)
dataloader = torch.utils.data.DataLoader(FL, batch_size=BATCH, shuffle=True)



#model.to(device)


loss_fn = nn.CrossEntropyLoss()

# Adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# Define the scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.2, patience=5)



num_epochs = 10
i = 0
running_loss = 0.0



train_acV_pyt = []
val_acV_pyt = [] 


# train the model
for epoch in range(num_epochs):
    # evaluate the model on the training dataset
    model.train()
    train_correct = 0
    train_total = 0
    for waveform, label in dataloader:
        #label = label.to(device)
        train_label = torch.argmax(label, dim=1)

        # clear the gradients
        optimizer.zero_grad()

        # forward pass
        waveform = waveform.squeeze(0)


        
        #waveform = waveform.to(device)
        output = model(waveform)
        

        

        loss = loss_fn(output, label)

        # backward pass
        loss.backward()
        optimizer.step()  
        
        # Update the learning rate
        scheduler.step(loss)
            
        _, train_predicted = torch.max(output.data, 1)
        train_total += train_label.size(0)
        train_correct += (train_predicted == train_label).sum().item()
        # print statistics
        i += 1
        running_loss += loss.item()
        
        
    train_a = train_correct / train_total        
    train_acV_pyt.append(train_a)       
    print('[%d, %5d subsamples] Training loss: %.3f' % (epoch + 1, i*BATCH, running_loss / len(dataloader)))
    running_loss = 0            
    # evaluate the model on the validation dataset
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        model.eval()
        for val_waveform, val_label in val_dataloader:
            #val_label = val_label.to(device)
            val_label = torch.argmax(val_label, dim=1)
            val_waveform = val_waveform.squeeze(0)
            
            #val_waveform = val_waveform.to(device)
            val_output = model(val_waveform)
            val_loss += loss_fn(val_output, val_label).item()
            _, val_predicted = torch.max(val_output.data, 1)
            val_total += val_label.size(0)
            val_correct += (val_predicted == val_label).sum().item()
            
            
    val_a = val_correct/ val_total
    val_acV_pyt.append(val_a)
    print('Validation Loss: {:.4f} | Validation Accuracy: {:.4f} | Training Accuracy: {:.4f}'.format(val_loss / len(val_dataloader), val_correct / val_total, train_correct / train_total))
print('Finished Training')

[1,  6400 subsamples] Training loss: 56.298
Validation Loss: 23.5925 | Validation Accuracy: 0.1487 | Training Accuracy: 0.1370
[2, 12800 subsamples] Training loss: 54.269
Validation Loss: 23.5861 | Validation Accuracy: 0.1487 | Training Accuracy: 0.1375
[3, 19200 subsamples] Training loss: 55.582
Validation Loss: 23.5798 | Validation Accuracy: 0.1487 | Training Accuracy: 0.1430
[4, 25600 subsamples] Training loss: 56.922
Validation Loss: 23.5740 | Validation Accuracy: 0.1487 | Training Accuracy: 0.1284
[5, 32000 subsamples] Training loss: 55.101
Validation Loss: 23.5683 | Validation Accuracy: 0.1487 | Training Accuracy: 0.1442
[6, 38400 subsamples] Training loss: 55.610
Validation Loss: 23.5623 | Validation Accuracy: 0.1487 | Training Accuracy: 0.1350
[7, 44800 subsamples] Training loss: 56.161
Validation Loss: 23.5569 | Validation Accuracy: 0.1487 | Training Accuracy: 0.1342
[8, 51200 subsamples] Training loss: 54.968
Validation Loss: 23.5514 | Validation Accuracy: 0.1487 | Training A

In [34]:
import json

with open("/content/drive/MyDrive/Result/train_acV_pyt.json", 'w') as f:

    json.dump(train_acV_pyt, f, indent=2) 

with open("/content/drive/MyDrive/Result/val_acV_pyt.json", 'w') as f:

    json.dump(val_acV_pyt, f, indent=2) 

# VGG16 with GTZAN pretrained weights:

### GTZAN dataset:
The GTZAN dataset is a widely used benchmark dataset in the field of music information retrieval. It was created by George Tzanetakis and Perry Cook in 2002, and is named after Tzanetakis' initials. The dataset consists of 1,000 audio tracks of 30 seconds each, equally divided into 10 different music genres: blues, classical, country, disco, hip-hop, jazz, metal, pop, reggae, and rock. The audio tracks were collected from various online sources and were pre-processed to have a uniform format and sampling rate. The GTZAN dataset has been extensively used for various music classification tasks, such as genre classification, mood detection, and instrument recognition. Its popularity stems from its wide variety of genres and large sample size, which makes it a valuable resource for developing and evaluating music analysis algorithms.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import sklearn.metrics as skm
import sklearn.model_selection as skms
import sklearn.preprocessing as skp
import random
import librosa, IPython
import librosa.display as lplt
seed = 12
np.random.seed(seed)

In [None]:
df = pd.read_csv('~/downloads/Data/features_3_sec.csv')
df.head()

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.0.wav,66149,0.335406,0.091048,0.130405,0.003521,1773.065032,167541.630869,1972.744388,117335.771563,...,39.687145,-3.24128,36.488243,0.722209,38.099152,-5.050335,33.618073,-0.243027,43.771767,blues
1,blues.00000.1.wav,66149,0.343065,0.086147,0.112699,0.00145,1816.693777,90525.690866,2010.051501,65671.875673,...,64.748276,-6.055294,40.677654,0.159015,51.264091,-2.837699,97.03083,5.784063,59.943081,blues
2,blues.00000.2.wav,66149,0.346815,0.092243,0.132003,0.00462,1788.539719,111407.437613,2084.565132,75124.921716,...,67.336563,-1.76861,28.348579,2.378768,45.717648,-1.938424,53.050835,2.517375,33.105122,blues
3,blues.00000.3.wav,66149,0.363639,0.086856,0.132565,0.002448,1655.289045,111952.284517,1960.039988,82913.639269,...,47.739452,-3.841155,28.337118,1.218588,34.770935,-3.580352,50.836224,3.630866,32.023678,blues
4,blues.00000.4.wav,66149,0.335579,0.088129,0.143289,0.001701,1630.656199,79667.267654,1948.503884,60204.020268,...,30.336359,0.664582,45.880913,1.689446,51.363583,-3.392489,26.738789,0.536961,29.146694,blues


In [None]:
audio_fp = '/Users/melikakeshavarz/Downloads/Data 2/genres_original/blues/blues.00004.wav'
audio_data, sr = librosa.load(audio_fp)
audio_data, _ = librosa.effects.trim(audio_data)

In [None]:
IPython.display.Audio(audio_data, rate=sr)