<a href="https://colab.research.google.com/github/Golem8/Music-Genre-Classifier/blob/main/livedemo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook takes an uploaded MP3 file, converts it to a series of spectrograms, and displays the model's resulting genre classification.

In [184]:
# Path of the mp3 file to classify; it is read when the audio is split into
# chunks and when the audio player is embedded.
pathToEvaluate = r'/content/The_Message.mp3'
# Path of the saved state dict that is loaded into the classifier via torch.load.
modelPath = r'/content/78.7valacc'

In [185]:
from torchvision import datasets, transforms
import torch
import torch.nn.functional as nnf
import torch.nn as nn
import os
import torch.optim as optim 

!pip install pydub
import pydub
import librosa, librosa.display
from pydub import AudioSegment 
from pydub.utils import make_chunks

from pydub import AudioSegment
import numpy as np
import shutil
import matplotlib.pyplot as plt
from PIL import Image
from IPython.display import Audio

from IPython.display import Audio
from IPython.display import clear_output
import time



In [186]:
# Get a pretrained DenseNet-161; its `features` sub-module is reused as the
# feature extractor of the classifier defined below (transfer learning).
# NOTE(review): pretrained=True downloads weights, but the checkpoint loaded
# into the model afterwards appears to cover the feature extractor as well
# (all keys match) -- confirm whether the download is actually needed.
import torchvision.models as models
densenet = models.densenet161(pretrained=True)

class transfer_music_classifer(nn.Module):
  """Genre classifier built on top of the DenseNet convolutional features.

  The module-level ``densenet.features`` stack is used as the feature
  extractor, followed by a small fully connected head that maps the 2208
  pooled features to 5 sigmoid-activated outputs (one per genre class).
  """

  def __init__(self):
    super().__init__()
    # Shares the feature stack of the module-level densenet instance.
    self.featureExtract = densenet.features

    # Layer order matters: the positions inside the Sequential are part of
    # the state-dict keys of a saved checkpoint.
    head_layers = [
      nn.Linear(2208, 780),
      nn.ReLU(),
      nn.Dropout(0.2),
      nn.Linear(780, 240),
      nn.ReLU(),
      nn.Dropout(0.2),
      nn.Linear(240, 5),
      nn.Sigmoid(),
    ]
    self.classifier = nn.Sequential(*head_layers)

  def forward(self, spectrogram):
    """Return the per-class scores for a batch of spectrogram images.

    Args:
      spectrogram: image batch accepted by the feature extractor
        (presumably shaped (N, 3, H, W) -- confirm against the caller).

    Returns:
      Tensor with one row of classifier outputs per input image.
    """
    feature_maps = self.featureExtract(spectrogram)

    # The three steps below are what torchvision's DenseNet.forward does
    # between its feature stack and its classifier; they are repeated here
    # because the two halves are split apart in this model. See
    # https://pytorch.org/vision/stable/_modules/torchvision/models/densenet.html

    # In-place ReLU overwrites the feature maps instead of allocating a new
    # output tensor, which saves memory.
    activated = nnf.relu(feature_maps, inplace=True)

    # Adaptive pooling picks kernel size and stride itself so the spatial
    # size collapses to 1x1 regardless of the input resolution.
    pooled = nnf.adaptive_avg_pool2d(activated, (1, 1))

    # Collapse everything after the batch dimension: (N, C, 1, 1) -> (N, C).
    flat = torch.flatten(pooled, 1)

    return self.classifier(flat)

In [187]:
# Build the classifier and restore the trained weights, mapped onto the CPU.
model = transfer_music_classifer()
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
state = torch.load(modelPath, map_location='cpu')
# Last expression of the cell: its result reports whether the checkpoint keys
# matched the model.
model.load_state_dict(state)

<All keys matched successfully>

In [188]:
# Create 2 second .wav files for the selected audio.
# The clips are written to /content/chunks as 0.wav, 1.wav, ... and
# num_samples ends up holding how many clips were written.
chunk_dir = os.path.join('/content', 'chunks')

# Start from an empty directory so clips of a previous run are never mixed in.
# Only a missing directory is tolerated; any other failure (e.g. permissions)
# is surfaced instead of being silently swallowed.
try:
  shutil.rmtree(chunk_dir)
except FileNotFoundError:
  pass
os.mkdir(chunk_dir)

num_samples = 0

audio_object = AudioSegment.from_mp3(pathToEvaluate)
chunk_length_ms = 2000   # 2000 ms clips each
chunks = make_chunks(audio_object, chunk_length_ms)
for chunk in chunks:
  # if the chunks are not chunk_length_ms long (the last one may be shorter)
  # discard them as they would
  # mess up the creation of the spectrograms
  if len(chunk) == chunk_length_ms:
    # Name the file after the running count of kept clips, so the files are
    # always numbered contiguously from 0 to num_samples - 1.
    chunk.export(os.path.join(chunk_dir, str(num_samples) + '.wav'), format="wav")
    num_samples += 1

In [189]:
# Embed an audio player for the selected file and start playback right away.
Audio(pathToEvaluate, autoplay=True)

In [190]:
# Classify every audio chunk and keep a running tally of the predicted genres,
# refreshing the printed ranking once per chunk.

# allows the audio to properly begin playing before evaluation spectrograms
time.sleep(4)

# Start from an empty spectrogram directory. Only a missing directory is
# tolerated; other failures are surfaced instead of being silently swallowed.
spectrogram_dir = os.path.join('/content', 'spectrograms')
try:
  shutil.rmtree(spectrogram_dir)
except FileNotFoundError:
  pass
os.mkdir(spectrogram_dir)

# Pace of the live display: one update every this many seconds.
seconds_per_update = 2
last_exec = time.time()

classDict = {0: 'Electronic', 1: 'Folk', 2: 'Hip-Hop', 3: 'Pop', 4: 'Rock'}
# Number of chunks predicted as each class, indexed like classDict.
genres = [0] * len(classDict)

preprocess = transforms.Compose([
  transforms.Resize([224,224]),
  transforms.ToTensor(),
  # densenet expects this normalization
  transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# disable dropout layers
model.eval()

for x in range(num_samples):
  fullPath = os.path.join('/content', 'chunks', str(x) + '.wav')
  audio, sr = librosa.load(fullPath, sr=44100)
  plt.figure(figsize=(8, 8.2), dpi=36)
  # Short-time Fourier transform of the clip.
  value = librosa.stft(audio)
  # Convert the magnitudes to decibels (a log scale), the usual way of
  # representing audio levels.
  data_in_db = librosa.amplitude_to_db(np.abs(value))
  # Draw the spectrogram on the current figure.
  librosa.display.specshow(data_in_db, sr=sr)

  specPath = os.path.join(spectrogram_dir, str(x) + '.png')
  plt.savefig(specPath, format="PNG", bbox_inches="tight", pad_inches=0, transparent=True)
  # Release the figure so figures do not pile up across iterations.
  plt.clf()
  plt.close()

  # Reload the saved picture as an RGB image tensor for the network.
  with Image.open(specPath) as spec_image:
    spec = preprocess(spec_image.convert('RGB'))

  # torch.no_grad() only takes effect as a context manager (or decorator);
  # calling it as a bare statement leaves gradient tracking switched on.
  with torch.no_grad():
    out = model(spec.unsqueeze(0))
  # The batch holds a single image, so this is one class index.
  pred = out.argmax(dim=1).item()
  genres[pred] += 1

  total = sum(genres)
  percents = [round(float(count) / total * 100, 2) for count in genres]

  clear_output()
  # Highest share first; the sort is stable, so equal shares keep class-index
  # order.
  ranking = sorted(range(len(percents)), key=lambda idx: percents[idx], reverse=True)
  for class_idx in ranking:
    print(str(percents[class_idx]) + '% ' + classDict[class_idx])

  # Wait out the rest of the update interval. The remaining time is clamped
  # at zero because time.sleep raises ValueError for a negative duration,
  # which would happen whenever one iteration takes longer than the interval.
  time.sleep(max(0.0, last_exec + seconds_per_update - time.time()))
  last_exec = time.time()

98.41% Hip-Hop
1.59% Pop
0.0% Electronic
0.0% Folk
0.0% Rock


KeyboardInterrupt: ignored