<a href="https://colab.research.google.com/github/JohnDHitti/music_cnn/blob/main/Music_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Run this cell first to mount your Google Drive.
# The dataset and checkpoint paths used in the cells below all live under
# /content/drive, so they only resolve after this mount has succeeded.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# LOAD A TRAINING SET

# change to location of the folder containing all the training data subfolders
path="/content/drive/MyDrive/MusicAnalysis/Training Datasets/3 Intervals - 4kHz"

#----------------------------------------------------------------------
import torch
import torchvision
import torchvision.transforms as transforms

# prefer the GPU and fall back to the CPU when CUDA is not available
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Per-channel mean / standard deviation handed to Normalize
# (see https://pytorch.org/vision/stable/transforms.html).
channel_mean = [0.6803, 0.2475, 0.4323]
channel_std = [0.2560, 0.1522, 0.0951]

# every image is converted to a tensor first and normalised afterwards
image_pipeline = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])

# ImageFolder reads the images found in the subfolders of `path`
train = torchvision.datasets.ImageFolder(path, transform=image_pipeline)
print(train)

# Serve the images one at a time (batch_size=1), reshuffled on every pass.
# Original note: images are currently 432*228 px.
# NOTE(review): the layer sizes in Net suggest 432x288 - confirm.
trainset = torch.utils.data.DataLoader(
    train,
    shuffle=True,
    batch_size=1,
    num_workers=0,
)

print("Loading complete... ")
print("Size of trainset: ", len(trainset))


Dataset ImageFolder
    Number of datapoints: 12000
    Root location: /content/drive/MyDrive/MusicAnalysis/Training Datasets/3 Intervals - 4kHz
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=[0.6803, 0.2475, 0.4323], std=[0.256, 0.1522, 0.0951])
           )
Loading complete... 
Size of trainset:  12000


In [None]:
# DEFINE THE RCNN

# change featureamount to the number of possible network outputs for your dataset;
# it is used as the out_features of the network's last Linear layer
featureamount=3

#--------------------
import torch
import torchvision
import torchvision.transforms as transforms
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # use the gpu when available, otherwise the cpu
import torch.nn as nn

# fix the torch random seed before the network is built
torch.manual_seed(1)

# network structure inspired by custom "upchannel" network by https://github.com/Dohppak/Music_Genre_Classification_Pytorch
# network includes lstm layers

class Net(nn.Module):
    """Convolutional + recurrent classifier for 3-channel spectrogram images.

    Four Conv2d / BatchNorm2d / ReLU / MaxPool2d stages feed a single
    bidirectional LSTM. The LSTM outputs are flattened and passed through
    three fully connected layers that end in one output per class.

    Args:
        num_classes: size of the last Linear layer. When left as ``None`` the
            notebook-level ``featureamount`` is used, so ``Net()`` behaves as
            it always did.

    The sub-module names (``_convolution``, ``_RNN``, ``_connectedLayers``)
    and their layer order are kept unchanged so previously saved state dicts
    still load.
    """

    def __init__(self, num_classes=None):
        super().__init__()
        if num_classes is None:
            # fall back to the notebook-level setting
            num_classes = featureamount

        # 4 convolutional layers with batch normalization and maxpooling
        # (the pooling sizes 2, 2, 4, 4 shrink height and width by 64 overall)
        self._convolution = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4),

            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4)
        )
        # LSTM layer (kept inside a Sequential so the state-dict keys stay "_RNN.0.*")
        self._RNN = nn.Sequential(
                nn.LSTM(2048, 2048, bidirectional =True),
                )
        # connected layers with dropouts
        self._connectedLayers = nn.Sequential(
            nn.Linear(in_features=24576, out_features=1024),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(in_features=1024, out_features=256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(in_features=256, out_features=num_classes))
        self.apply(self._init_weights)

    def forward(self, x):
        """Return the raw class scores for a batch of images.

        Args:
            x: image batch for the first Conv2d, i.e. 3 input channels.
               The hard-coded sizes (LSTM input 2048, Linear in_features
               24576) require the convolution output to be 4 rows by
               6 columns, e.g. a 288 x 432 (height x width) image.
               TODO confirm against the dataset.

        Returns:
            Tensor with one row per image and one column per class
            (unnormalised scores; no softmax is applied here).
        """
        x = self._convolution(x)              # (batch, 512, rows, cols)
        x = x.permute(3,0,1,2)                # (cols, batch, 512, rows)
        x = x.view(x.size(0), x.size(1), -1)  # (cols, batch, 512*rows)
        # the LSTM is not batch_first, so the image columns act as the sequence axis
        x, _ = self._RNN(x)                   # (cols, batch, 2*2048)
        x = x.permute(1, 2, 0)                # (batch, 4096, cols)
        x = x.reshape(x.size(0), -1)          # (batch, 4096*cols)
        out = self._connectedLayers(x)
        return out

    # defines the weights for the layers using the kaiming uniform method
    # (convolutions) and the xavier uniform method (linear layers)
    def _init_weights(self, layer) -> None:
        """Initialise one sub-module; called for every module by ``self.apply``.

        The convolutions in this network are ``nn.Conv2d``. The previous check
        only matched ``nn.Conv1d``, so the Kaiming initialisation never ran.
        Both types are accepted now. Other layer types are left unchanged.
        """
        if isinstance(layer, (nn.Conv1d, nn.Conv2d)):
            nn.init.kaiming_uniform_(layer.weight)
        elif isinstance(layer, nn.Linear):
            nn.init.xavier_uniform_(layer.weight)

# build the network; the last line also makes the notebook display its layer summary
net = Net()
net.to(device) # move the network to the selected device (gpu when available, otherwise cpu)

In [None]:
# Declare the standard loss function and the optimizer used by the training cell
import torch.optim as optim
criterion = nn.CrossEntropyLoss()# combines logsoftmax and NLLLoss in one
# stochastic gradient descent with momentum over all parameters of `net`
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [None]:
# Declare Custom Loss Function (unfinished)
import torch.optim as optim
def criterion(output, target):
    """Experimental (unfinished) loss that weights the target class and its neighbours.

    Each sample gets one weight per class: 10 for the target class, 5 for the
    classes directly next to it and 0 everywhere else. The loss is the mean of
    ``(output - target) * weight`` over all elements.

    Changes from the earlier hand-written 10-row table:
      * the rows for classes 8 and 9 were copy-paste errors (class 8 reused
        the class-7 row, class 9 peaked at index 8); weights are now derived
        from the distance to the target, so every class peaks on itself;
      * tensors are created on ``output``'s device instead of calling
        ``.cuda()``, so this also runs on a CPU-only runtime;
      * any number of classes and any batch size are accepted.

    NOTE(review): the expression subtracts the integer class index from the
    raw network outputs, so it is not bounded below. The cell is marked as
    unfinished, so confirm the intended formula before training with it.

    Args:
        output: network outputs, classes along the last dimension.
        target: integer class label(s), one per sample.

    Returns:
        Scalar tensor holding the loss.

    Raises:
        ValueError: if ``target`` is empty or holds a label outside
            ``0 .. output.shape[-1] - 1``.
    """
    # pre defined weights
    w1 = 10  # the target class itself
    w2 = 5   # classes directly next to the target
    w3 = 0   # every other class

    num_classes = output.shape[-1]
    labels = target.to(output.device).reshape(-1)
    if labels.numel() == 0:
        raise ValueError("criterion() needs at least one target label")
    if labels.min().item() < 0 or labels.max().item() >= num_classes:
        raise ValueError(
            "target labels must lie in [0, %d), got %s" % (num_classes, labels.tolist())
        )

    # distance[s, c] = how many classes lie between class c and the label of sample s
    class_ids = torch.arange(num_classes, device=output.device)
    distance = (class_ids.unsqueeze(0) - labels.unsqueeze(1)).abs()

    weight = torch.full(distance.shape, w3, dtype=output.dtype, device=output.device)
    weight[distance == 1] = w2
    weight[distance == 0] = w1

    loss = torch.mean((output - labels.unsqueeze(1)) * weight)
    return loss
# SGD with momentum over the parameters of `net` (same settings as in the standard-loss cell)
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [None]:
# TRAIN NETWORK WITH TEST

#change to the location of the testing data folder
path2="/content/drive/MyDrive/MusicAnalysis/Testing Datasets/3 Intervals - 4kHz" #change to where you have the year subfolders

#-------------------------------------------------------
NUM_EPOCHS = 20          # loop over the training set this many times
TEST_PASSES = 10         # accuracy is measured this many times per epoch and averaged
SAVE_EACH_EPOCH = False  # set to True to write a checkpoint after every epoch

# The test images do not change between epochs, so the dataset and its loader
# are built once here instead of once per epoch. A wrong path2 now also fails
# before any training time has been spent.
test = torchvision.datasets.ImageFolder(path2, transform=transforms.Compose([transforms.ToTensor(),transforms.Normalize([0.6803, 0.2475, 0.4323], [0.2560, 0.1522, 0.0951])]))
testset = torch.utils.data.DataLoader(test,batch_size=1, shuffle=True, num_workers=0)

for epoch in range(NUM_EPOCHS):
    net.train()
    epochloss=0
    for inputs, labels in trainset:
        # move the batch to the device the network lives on
        # (.to(device) also works on a CPU-only runtime, unlike .cuda())
        inputs, labels = inputs.to(device), labels.to(device)

        # zero the parameter gradients left over from the previous step
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step() # performs optimization step
        epochloss+=loss.item()

    epochloss=epochloss/len(trainset)
    print("Epoch: ",epoch+1)
    print()
    print("Average Epoch Loss: ",epochloss)
    print()
    print("testing...")

    # NOTE(review): the network is still in train mode here, so Dropout stays
    # active and the passes below can give different accuracies (hence the
    # averaging). BatchNorm2d also keeps updating its running statistics from
    # the test images. This is kept because the load cell notes that
    # net.eval() "usually makes it not work" - confirm before changing it.
    testaverage=0
    for x in range(TEST_PASSES):
        correct = 0
        total = 0

        with torch.no_grad(): # Turns off the Gradient Calculations
            for X, y in testset:
                X, y = X.to(device), y.to(device)
                output = net(X)
                # argmax over the class dimension gives the predicted label per image
                correct += (output.argmax(dim=1) == y).sum().item()
                total += y.size(0)
        accuracy = round((correct/total)*100, 3)
        print("Accuracy: ", accuracy, "%")
        testaverage+=accuracy
    testaverage=testaverage/TEST_PASSES
    print("Average: ", testaverage,"%")
    print()
    # save trained network parameters when SAVE_EACH_EPOCH is true
    if SAVE_EACH_EPOCH:
        torch.save(net.state_dict(), '/content/drive/MyDrive/MusicAnalysis/Saved Networks for Epoch 20/savednetepoch10interval'+str(epoch)+'.pt')
    print("")

print('Finished Training')

In [None]:
# save trained network parameters
# (only the state dict is written; the load cell rebuilds Net() before restoring it)
torch.save(net.state_dict(), '/content/drive/MyDrive/MusicAnalysis/savednet.pt')

In [None]:
# load model parameters in
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = Net()
# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was saved from a GPU session can also be restored on a CPU-only runtime
net.load_state_dict(torch.load('/content/drive/MyDrive/MusicAnalysis/Saved Networks/savednetepoch10(goodOne).pt', map_location=device))
net.to(device)
# net.eval() is left disabled on purpose: the original author observed that it
# usually makes the predictions stop working.
# NOTE(review): presumably related to the BatchNorm running statistics after
# training with batch_size=1 - confirm before enabling it.

Net(
  (_convolution): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): MaxPool2d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
    (12): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True

In [None]:
#Perform a test on the network

#change this to the test folder location
path2="/content/drive/MyDrive/MusicAnalysis/Testing Datasets/10 Intervals/2000s Test"

#---------------------------------------------
TEST_PASSES = 5  # the accuracy is measured this many times

# TEST NETWORK
# the dataset and its loader do not change between passes, so build them once
test = torchvision.datasets.ImageFolder(path2, transform=transforms.Compose([transforms.ToTensor(),transforms.Normalize([0.6803, 0.2475, 0.4323], [0.2560, 0.1522, 0.0951])]))
testset = torch.utils.data.DataLoader(test,batch_size=1, shuffle=True, num_workers=0)

# NOTE(review): the network is deliberately put in train mode (the load cell
# notes that net.eval() "usually makes it not work"). Dropout therefore stays
# active, so the passes below can report different accuracies.
net.train()

for x in range(TEST_PASSES):
  correct = 0
  total = 0
  with torch.no_grad(): # Turns off the Gradient Calculations
      for X, y in testset:
          # .to(device) also works on a CPU-only runtime, unlike .cuda()
          X, y = X.to(device), y.to(device)
          output = net(X)
          # argmax over the class dimension gives the predicted label per image
          correct += (output.argmax(dim=1) == y).sum().item()
          total += y.size(0)
  print("Accuracy: ", round((correct/total)*100, 3), "%")

FileNotFoundError: ignored

In [None]:
# Load file from desktop
# (opens Colab's upload dialog; the next cell reads the uploaded file by its path)
from google.colab import files
uploaded = files.upload()

Saving Justin Bieber - Sorry.mp3 to Justin Bieber - Sorry (4).mp3


In [None]:
# convert to melspectrogram and test in the network

#place path to file here
path3='/content/Justin Bieber - Sorry.mp3'

#------------------------
import glob
import librosa
import pylab
import librosa.display
import numpy as np
import os

# Load 30 seconds of audio starting 30 seconds into the file, resampled to 16000 Hz.
# NOTE(review): the spectrogram-generation cell in this notebook loads with
# sr=12000 - confirm which rate the training images were made with.
signalData, samplingFrequency = librosa.load(path3,sr=16000,offset=30.0, duration=30.0)
# librosa.util.normalize returns the normalized array instead of changing its
# argument, so the result has to be assigned back to take effect
signalData = librosa.util.normalize(signalData)

pylab.axis('off') # no axis
pylab.axes([0., 0., 1., 1.], frameon=False, xticks=[], yticks=[]) # Remove the white edge
S = librosa.feature.melspectrogram(y=signalData, sr=samplingFrequency) # converts the waveform to a melspectrogram
librosa.display.specshow(librosa.power_to_db(S, ref=np.max))

# ImageFolder expects <root>/<class folder>/<image>, hence the nested dummy folders.
# os.makedirs(..., exist_ok=True) creates exactly the absolute path used below
# and can be re-run safely; the earlier shell `mkdir` calls were relative to
# the current working directory.
fp='/content/dummyfolder/dummyfolder2' # direct to save path
os.makedirs(fp, exist_ok=True)
pylab.savefig((fp+'/fileMEL'+'.png'), bbox_inches=None, pad_inches=0)
pylab.close()

print("Image Created")


path4="/content/dummyfolder" #change to where you have the year subfolders
test = torchvision.datasets.ImageFolder(path4, transform=transforms.Compose([transforms.ToTensor(),transforms.Normalize([0.6803, 0.2475, 0.4323], [0.2560, 0.1522, 0.0951])]))
# wrap the generated image(s) in a loader, one image per batch
testset = torch.utils.data.DataLoader(test,batch_size=1, shuffle=True, num_workers=0)
# NOTE(review): train mode is kept on purpose (the load cell notes that
# net.eval() "usually makes it not work"); Dropout is therefore active and
# repeated runs may print different answers.
net.train()
# key is assuming 3 interval network
ERA_MESSAGES = {
    0: "This song was released some time between 1920 and 1959",
    1: "This song was released some time between 2000 and 2021",
    2: "This song was released some time between 1960 and 1999",
}
print()

with torch.no_grad(): # Turns off the Gradient Calculations
    for X, _ in testset:
        # .to(device) also works on a CPU-only runtime, unlike .cuda()
        output = net(X.to(device))
        for scores in output:
            # use this image's own scores rather than an argmax over the whole batch
            message = ERA_MESSAGES.get(scores.argmax().item())
            if message is not None:
                print(message)



In [None]:
#module for generating spectrograms from audio files

import glob
import librosa
import eyed3
import pylab
import librosa.display
import numpy as np



# function used to create a directory if it does not exist
def mkdir_p(mypath):
    """Create the directory `mypath`, including any missing parent folders.

    Nothing happens when `mypath` already exists as a directory. Any other
    failure (for example, `mypath` exists but is a regular file) is raised
    to the caller as an OSError.
    """
    from os import makedirs

    # exist_ok=True only tolerates an already existing *directory*
    makedirs(mypath, exist_ok=True)

x=0# counter for save file names
for filepath in glob.iglob('C:/NN School Project/fma_small/**/*.mp3',recursive=True): # every mp3 file below the dataset folder
    print(filepath)

    # Read the release year from the file's tag first. eyed3.load() can return
    # None and a file may carry no tag at all; such files are skipped instead
    # of aborting the whole run with an AttributeError.
    audiofile = eyed3.load(filepath)
    tag = audiofile.tag if audiofile is not None else None
    year = str(tag.getBestDate() if tag is not None else None)
    year = year[0:4] # keep the first four characters (the year)

    # only files with a known year are rendered, because the year is the output folder
    if year != "None":
        signalData, samplingFrequency = librosa.load(filepath,sr=12000) # converts mp3 file to an array resampled to 12000Hz
        # librosa.util.normalize returns the normalized array instead of
        # changing its argument, so the result has to be assigned back
        signalData = librosa.util.normalize(signalData)

        pylab.axis('off') # no axis
        pylab.axes([0., 0., 1., 1.], frameon=False, xticks=[], yticks=[]) # Remove the white edge
        S = librosa.feature.melspectrogram(y=signalData, sr=samplingFrequency) # converts the waveform to a melspectrogram
        librosa.display.specshow(librosa.power_to_db(S, ref=np.max))

        fp='C:/NN School Project/'+year # direct to save path
        mkdir_p(fp) # checks to see if the directory exists and if not it makes it
        pylab.savefig((fp+'/fileMEL'+str(x)+'.png'), bbox_inches=None, pad_inches=0)
        pylab.close()

    # the counter advances for skipped files too, so the progress line keeps
    # counting processed files
    x=x+1
    print("Current progress: "+ str(x) + " out of ~8000")

print("Finished")