This notebook is associated to the report on *Automatic hip-hop dance playlist ordering*. It contains the details of the following steps :


*   Models training
*   Song information extraction
*   Song combination






Notes : 

*   the results can slightly change from the report because of certain sources of randomness
*   I have done print("..." + str(value)) during the whole Notebook which is not optimal so please forgive me.. 😅


**Installation of the depedencies and importation of the modules**

In [None]:
!pip install librosa
!pip install spleeter

In [2]:
import librosa
from IPython.display import Audio, display
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import random
import math
import sklearn

In [3]:
import spleeter
from scipy.fft import fft

from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import plot_confusion_matrix

from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import mean_absolute_error

import pickle
sound_d = 1

In [4]:
import warnings
warnings.filterwarnings('ignore')

#Models Training

##Key and Mode detection

In [None]:
from google.colab import files
uploaded = files.upload()

In [6]:
#load the data
X_pca_pitch = np.loadtxt('/content/X_pca_pitch.txt')
y_pca_pitch = np.loadtxt('/content/y_pca_pitch.txt')
X_average_pitch = np.loadtxt('/content/X_average_pitch.txt')
y_average_pitch = np.loadtxt('/content/y_average_pitch.txt')
X_pca_mode = np.loadtxt('/content/X_pca_mode.txt')
y_pca_mode = np.loadtxt('/content/y_pca_mode.txt')
X_average_mode = np.loadtxt('/content/X_average_mode.txt')
y_average_mode = np.loadtxt('/content/y_average_mode.txt')

###Per-beat chromagrams

Grid-search to find the best hyper-parameters for SVM and Random Forest. *These 2 functions are customized because there was a compatibility problem with the grid-search function from sklearn*

In [7]:
#grid search for SVM
def grid_search_svm(X, y, n_comp, C, gamma, kernel) :
  #pca
  pca = PCA(n_comp)
  pca.fit(X)
  X = pca.fit_transform(X)
  print("the % of the explained variance is : " + str(sum(pca.explained_variance_ratio_)))

  X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

  #find the model with the best cross-validation score
  max_score = 0
  best_clf = None
  for c in C :
    for g in gamma :
      for k in kernel :
        clf = SVC(kernel=k, C=c, gamma=g, random_state=42)
        scores = cross_val_score(clf, X, y, cv=5)
        score = scores.mean() 
        if(score > max_score) :
          max_score = score
          best_clf = clf

  #evaluation
  best_clf.fit(X_train,y_train)
  y_pred = best_clf.predict(X_test)

  print("Best SVM parameters : ")
  print(best_clf.kernel)
  print(best_clf.gamma)
  print(best_clf.C)

  accuracy = accuracy_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred, average='macro')
  precision = precision_score(y_test, y_pred, average='macro')

  print("accuracy : " + str(accuracy))
  print("recall : " + str(recall))
  print("precision : " + str(precision))
      
          
  return pca, best_clf

In [8]:
#grid-search for Random Forest
def grid_search_random_forest(X, y, n_comp, maximum_depth) :
  #pca
  pca = PCA(n_comp)
  pca.fit(X)
  X = pca.fit_transform(X)
  print("the % of the explained variance is : " + str(sum(pca.explained_variance_ratio_)))

  X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

  #find the model with the best cross-validation score
  max_score = 0
  best_clf = None
  for m in maximum_depth :
    clf = RandomForestClassifier(max_depth=m)
    scores = cross_val_score(clf, X_train, y_train, cv=5)
    score = scores.mean() 
    if(score > max_score) :
      max_score = score
      best_clf = clf

  #evaluation
  best_clf.fit(X_train,y_train)
  y_pred = best_clf.predict(X_test)

  print("Random Forest parameters : ")
  print(best_clf.max_depth)

  accuracy = accuracy_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred, average='macro')
  precision = precision_score(y_test, y_pred, average='macro')

  print("accuracy : " + str(accuracy))
  print("recall : " + str(recall))
  print("precision : " + str(precision))
      
  return pca, best_clf

####Pitch Detection

In [9]:
pca_pitch_svm, clf_pitch_svm = grid_search_svm(X_pca_pitch, y_pca_pitch, n_comp=500, C=[1,10,100,1000], gamma=[0.001,0.0001], kernel=['linear','rbf'])

the % of the explained variance is : 0.9050104497416427
Best SVM parameters : 
rbf
0.001
10
accuracy : 0.6391184573002755
recall : 0.6001554180590715
precision : 0.6378027956176909


In [10]:
pca_pitch_random_forest, clf_pitch_random_forest = grid_search_random_forest(X_pca_pitch, y_pca_pitch, n_comp=500, maximum_depth=[1, 5, 10, 20, 25, 50, 75, 100])

the % of the explained variance is : 0.9050205198571126
Random Forest parameters : 
10
accuracy : 0.5151515151515151
recall : 0.4001575034294176
precision : 0.49056925990776024


####Mode Detection

*we obtain the exact same results for the 2 models, this could be a problem... Fortunately, we will not use per-beat chromagrams for mode detection*

In [11]:
pca_mode_svm, clf_mode_svm = grid_search_svm(X_pca_mode, y_pca_mode, n_comp=400, C=[1,10,100,1000], gamma=[0.001,0.0001], kernel=['linear','rbf'])

the % of the explained variance is : 0.937477868139397
Best SVM parameters : 
rbf
0.001
10
accuracy : 0.8154761904761905
recall : 0.5
precision : 0.40773809523809523


In [12]:
pca_mode_random_forest, clf_mode_random_forest = grid_search_random_forest(X_pca_mode, y_pca_mode, n_comp=400, maximum_depth=[1, 5, 10, 20, 25, 50, 75, 100])

the % of the explained variance is : 0.9375247085552415
Random Forest parameters : 
1
accuracy : 0.8154761904761905
recall : 0.5
precision : 0.40773809523809523


###Averaged chromagrams

In [13]:
#grid search for SVM
def average_svm_model(X, y, confusion_matrix=False) :
  
  X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

  #grid-search
  parameters = [
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf', 'linear']},
  ]
  grid_search = GridSearchCV(SVC(),parameters)

  #evaluation
  best_clf = grid_search.fit(X_train, y_train)
  best_clf.fit(X_train,y_train)
  y_pred = best_clf.predict(X_test)

  print("Best SVM parameters : ")
  print(best_clf.best_estimator_)

  accuracy = accuracy_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred, average='macro')
  precision = precision_score(y_test, y_pred, average='macro')

  print("Performances : ")
  print("accuracy : " + str(accuracy))
  print("recall : " + str(recall))
  print("precision : " + str(precision))
  
  #to plot the confusion matrix
  if(confusion_matrix) :
    plot_confusion_matrix(best_clf, X_test, y_test)  
    plt.show()

  return best_clf

In [14]:
def average_random_forest_model(X, y, confusion_matrix=False) :
  
  X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

  #grid-search
  parameters = [
  {'max_depth': [1, 5, 10, 20, 25, 50, 75, 100]},
  ]
  grid_search = GridSearchCV(RandomForestClassifier(),parameters)

  #evaluation
  best_clf = grid_search.fit(X_train, y_train)
  best_clf.fit(X_train,y_train)
  y_pred = best_clf.predict(X_test)

  print("Best Random Forest parameters : ")
  print(best_clf.best_estimator_)

  accuracy = accuracy_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred, average='macro')
  precision = precision_score(y_test, y_pred, average='macro')

  print("Performances : ")
  print("accuracy : " + str(accuracy))
  print("recall : " + str(recall))
  print("precision : " + str(precision))
  
  #to plot the confusion matrix
  if(confusion_matrix) :
    plot_confusion_matrix(best_clf, X_test, y_test)  
    plt.show()

  return best_clf

####Pitch Detection

In [15]:
clf_average_pitch = average_svm_model(X_average_pitch, y_average_pitch)

Best SVM parameters : 
SVC(C=100, gamma=0.001, kernel='linear')
Performances : 
accuracy : 0.7683923705722071
recall : 0.7417563675959199
precision : 0.7561239596718689


In [16]:
clf_average_pitch_svm = average_random_forest_model(X_average_pitch, y_average_pitch)

Best Random Forest parameters : 
RandomForestClassifier(max_depth=75)
Performances : 
accuracy : 0.7002724795640327
recall : 0.6648617275482948
precision : 0.7090217754716509


####Mode Detection

In [17]:
clf_average_mode = average_svm_model(X_average_mode, y_average_mode)

Best SVM parameters : 
SVC(C=1, gamma=0.001)
Performances : 
accuracy : 0.7470588235294118
recall : 0.5
precision : 0.3735294117647059


In [18]:
clf_average_mode_forest = average_random_forest_model(X_average_mode, y_average_mode)

Best Random Forest parameters : 
RandomForestClassifier(max_depth=75)
Performances : 
accuracy : 0.7647058823529411
recall : 0.5348837209302325
precision : 0.8802395209580838


Saving the best models in the folder /content/models

In [19]:
os.mkdir('/content/models')
pickle.dump(clf_average_pitch_svm, open('/content/models/clf_pitch.sav', 'wb'))
pickle.dump(clf_average_mode_forest, open('/content/models/clf_mode.sav', 'wb'))

##Drum Detection

In [20]:
#normalize the DFT
def normalized_dft(x) :
  dft = abs(fft(x))
  m = max(dft)
  if(m!=0) :
    return dft/m
  else :
    return dft

In [21]:
#class that contains the sound that will be used in the dataset
class Sound :
  sound_duration = sound_d #duration of a sound in [sec], shared by every sound object
  def __init__(self, sound_type, values, sr=22050) :
    self.sound_type = sound_type #[e.g. "snare", "kick", "whitenoise", ...
    self.values = values #contains temporal signal of the sound
    self.sr=sr #sample rate [Hz]

    #make sure that all the sounds exctracts have the same length (sound_duration)
    if(len(self.values)/sr > self.sound_duration) :  
      self.values = self.values[0:int(self.sound_duration*sr)]
    elif (len(self.values)/sr < self.sound_duration):
      self.values = np.append(self.values, np.zeros(int(self.sound_duration*sr-len(self.values))))

In [22]:
#create an array of object of the same type
def create_object_array(sound_type, objects_directory) :
  A = []
  for filename in os.listdir(objects_directory): #browse in the directory containg the wav file of our given type
      if filename.endswith("wav"): 
          x, sr = librosa.load(objects_directory +'/'+ filename)
          A.append(Sound(sound_type, x))
  return A

In [23]:
#implementation detail
def get_right_format(x) :
  x_new = []
  x_new = np.append(x_new, x)
  return x_new

In [24]:
#generate the dataset for drum detection
def generate_dataset(drum, other_drums, ambiance_noise, percussion_noise, N_max) :
  ''' 
  generates random combinations between one specific target drum type (kick or snare)
  and noise sources :

      target drum type : different samples of this target drum type
      percussion noise : different drum type than the target (e.g. snare and hi-hats if the
                         target is a kick)
      ambiance noise : white noise, brown noise, vynile crackling noise, etc...
  '''
  X=[]
  Y=[]
  n_iter=-1

  for d in drum : 
    #add the target sound alone to the database
    s = d.values
    x = normalized_dft(s)
    x = get_right_format(x)
    X.append(x)
    Y.append([1])

    for i in range(0, N_max) :
      s_noise = np.zeros(len(s))
      other_drums_index = random.randint(-1,len(other_drums)-1)

      if(other_drums_index!=-1) :
        random_drum_index = random.randint(0, len(other_drums[other_drums_index])-1)
        random_drum = other_drums[other_drums_index][random_drum_index]
        s_noise+=random_drum.values
        
      #add ambiance noise
      ambiance_noise_index = random.randint(-1,len(ambiance_noise)-1)
      if(ambiance_noise_index!=-1) :
        amb_noise = ambiance_noise[ambiance_noise_index]
        s_noise+=amb_noise.values
       
      #add percussion noise
      percussion_noise_index = random.randint(-1, len(percussion_noise)-1)
      if(percussion_noise_index!=-1) :
        perc_noise = percussion_noise[percussion_noise_index]
        s_noise+=perc_noise.values

      #add the noise alone to the database
      if(all(s_noise==np.zeros(len(s)))==False) : #if n_noise is not a null array
        x_noise = normalized_dft(s_noise)
        x_noise = get_right_format(x_noise)
        X.append(x_noise)
        Y.append([0])
        
      #add the original sound + the noise in the database
      s+=s_noise
      x = normalized_dft(s)
      x = get_right_format(x)
      X.append(x)
      Y.append([1])
     
  return X,Y

In [25]:
#creating directories that will contain the wav files for the database
os.mkdir('/content/kick')
os.mkdir('/content/hat')
os.mkdir('/content/snare')
os.mkdir('/content/percussion_noise')
os.mkdir('/content/ambiance_noise')

In [30]:
#creates arrays of different drum types with their samples
kick = create_object_array('kick', '/content/kick') 
snare = create_object_array('snare', '/content/snare') 
hat = create_object_array('hat', '/content/hat')

percussion_noise = create_object_array('drums_noise', '/content/percussion_noise')
ambiance_noise = create_object_array('ambiance_noise', '/content/ambiance_noise')

In [31]:
#generates the data from drum detection
X_kick,y_kick = generate_dataset(kick, [snare,hat], ambiance_noise, percussion_noise, 50)
X_snare,y_snare = generate_dataset(snare, [kick,hat], ambiance_noise, percussion_noise, 50)

#for the format
y_kick = np.array(y_kick).ravel()
y_snare = np.array(y_snare).ravel()

In [32]:
#selects the best model for drum detection
def detection_model(X, y, n_comp) :
  '''
  - PCA on the normalized DFT
  - grid-search for hyperparameters for SVM and Random Forest
  - selects the best model between SVM and Random Forest
  '''

  #pca
  pca = PCA(n_comp)
  pca.fit(X)
  X = pca.fit_transform(X)
  print("the % of the explained variance is : " + str(sum(pca.explained_variance_ratio_)))

  X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

  #grid search for SVM
  print('\n' + "SVM Prediction : " + '\n')
  parameters = [
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf', 'linear']},
  ]
  grid_search = GridSearchCV(SVC(),parameters)

  #evaluation for SVM
  clf_svm = grid_search.fit(X_train, y_train)
  y_pred = clf_svm.predict(X_test)
  accuracy_svm = accuracy_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred, average='macro')
  precision = precision_score(y_test, y_pred, average='macro')
  print("Parameters : " + str(clf_svm.best_estimator_))
  print("accuracy : " + str(accuracy_svm))
  print("recall : " + str(recall))
  print("precision : " + str(precision))
  
  #grid search for Random Forest
  print('\n' + "Random Forest : " + '\n')
  parameters = {'max_depth': [1, 2, 5, 10, 20, 30, 40, 50, 100]}
  grid_search = GridSearchCV(RandomForestClassifier(),parameters)
  clf_random_forest = grid_search.fit(X_train, y_train)
  y_pred = clf_random_forest.predict(X_test)
  accuracy_random_forest = accuracy_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred, average='macro')
  precision = precision_score(y_test, y_pred, average='macro')
  print("Parameters : " + str(clf_random_forest.best_estimator_))
  print("accuracy : " + str(accuracy_random_forest))
  print("recall : " + str(recall))
  print("precision : " + str(precision))

  #selects the best model between Random Forest and SVM based on accuracy
  if(accuracy_svm >= accuracy_random_forest) :
    clf = clf_svm
  else :
    clf = clf_random_forest

  return pca, clf
  

Note : the performances can be different from the report because the dataset is randomely generated

In [33]:
#performances of drum detection
print('***** kick *****')
pca_kick, clf_kick = detection_model(X_kick, y_kick, 20)
print('***** snare *****')
pca_snare, clf_snare = detection_model(X_snare, y_snare, 20)

***** kick *****
the % of the explained variance is : 0.9492990998112847

SVM Prediction : 

Parameters : SVC(C=1000, gamma=0.001)
accuracy : 0.9783393501805054
recall : 0.9774436090225564
precision : 0.98

Random Forest : 

Parameters : RandomForestClassifier(max_depth=10)
accuracy : 0.9819494584837545
recall : 0.981203007518797
precision : 0.9832214765100671
***** snare *****
the % of the explained variance is : 0.9634751081751318

SVM Prediction : 

Parameters : SVC(C=1000, gamma=0.001, kernel='linear')
accuracy : 0.9235880398671097
recall : 0.9194451853827688
precision : 0.9288636363636363

Random Forest : 

Parameters : RandomForestClassifier(max_depth=10)
accuracy : 0.9667774086378738
recall : 0.9682137458877923
precision : 0.9658527954706299


Saving the best models in the folder /content/models

In [34]:
pickle.dump(clf_kick, open('/content/models/clf_kick.sav', 'wb'))
pickle.dump(clf_snare, open('/content/models/clf_snare.sav', 'wb'))

pickle.dump(pca_kick, open('/content/models/pca_kick.sav', 'wb'))
pickle.dump(pca_snare, open('/content/models/pca_snare.sav', 'wb'))

###Test of drum detection and onset correction

In [35]:
from google.colab import files
uploaded = files.upload()

Saving kick_snare_95.wav to kick_snare_95.wav
Saving regular_kick1.wav to regular_kick1.wav
Saving shook_ones_extract_I.wav to shook_ones_extract_I.wav


In [36]:
#import and display the songs
path1 ="/content/shook_ones_extract_I.wav"
path2 = "/content/kick_snare_95.wav" 
path3 = "/content/regular_kick1.wav"

x1,sr1 = librosa.load(path1)
display(Audio(data=x1, rate=sr1))
 
x2,sr2 = librosa.load(path2)
display(Audio(data=x2, rate=sr2))

x3,sr3 = librosa.load(path3)
display(Audio(data=x3, rate=sr3))

In [37]:
#import the models
clf_pitch = pickle.load(open('/content/models/clf_pitch.sav', 'rb'))
clf_mode = pickle.load(open('/content/models/clf_mode.sav', 'rb'))
clf_kick = pickle.load(open('/content/models/clf_kick.sav', 'rb'))
clf_snare = pickle.load(open('/content/models/clf_snare.sav', 'rb'))
pca_kick = pickle.load(open('/content/models/pca_kick.sav', 'rb'))
pca_snare = pickle.load(open('/content/models/pca_snare.sav', 'rb'))

In [38]:
#normalize the DFT
def normalized_dft(x) :
  dft = abs(fft(x))
  m = max(dft)
  if(m!=0) :
    return dft/m
  else :
    return dft

In [39]:
#divide the time axis into a beat-based grid
def compute_grid(path, n_beat, beat_res, offset, sound_duration) :
  '''
  detects the beat locations and the tempo of the song and divide it into a beat-based
  time scale to have a time representation independant of the tempo

  path : path of the audio file on which we want to test drum detection
  nbeat : number of beat of the beat-based grid
  best_res : resolution of the beat-based grid 
  offset : shift the grid from a certain factor because beat events are detected
           with an offset
  sound_duration : duration of a sound (has to be the same than for the training data
                   for dimension considerations)
  '''

  x, sr = librosa.load(path) 
  tempo, beat_sample_stamps = librosa.beat.beat_track(x, sr=sr, units='samples')
  onset = librosa.onset.onset_detect(y=x, sr=sr, units='samples')
  grid = []
  onset_grid = []

  for i in range(1, n_beat+1) :
    l = beat_sample_stamps[i+1] - beat_sample_stamps[i]
    beat = x[int(beat_sample_stamps[i]-offset*l):int(beat_sample_stamps[i+1]-offset*l)]
    sub_beat_len = (len(beat)-len(beat)%beat_res)/beat_res
    for k in range(0, beat_res) :
      lower = int(beat_sample_stamps[i]-offset*l + sub_beat_len*k)
      higher = int(beat_sample_stamps[i]-offset*l + sub_beat_len*(k+1))
      sub_beat = x[lower:higher]
      current_onset = np.where(np.logical_and(onset>=lower, onset<=higher))
      if(len(current_onset[0])>0) :
        onset_grid.append(1)
      else :
        onset_grid.append(0)
      #to ensure that the length of the exctracts is the same than for the training
      if(len(sub_beat)-sound_duration < 0) :
        sub_beat = np.append(sub_beat, np.zeros(int(sound_duration-len(sub_beat)))) #we add zeros
      else :
        sub_beat = sub_beat[0:int(sound_duration)]

      grid.append(sub_beat)

  return grid, onset_grid

In [40]:
#computes the beat-based temporal grids
grid_1, onset_grid1 = compute_grid(path1, 4, 4, 0.15, sound_d*sr1) 
grid_2, onset_grid2 = compute_grid(path2, 4, 4, 0.15, sound_d*sr2) 
grid_3, onset_grid3 = compute_grid(path3, 4, 4, 0.15, sound_d*sr3) 

In [41]:
#onset detection
print("onset grid 1 :" + str(onset_grid1))
print("onset grid 2 :" + str(onset_grid2))
print("onset grid 3 :" + str(onset_grid3))

onset grid 1 :[1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0]
onset grid 2 :[1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0]
onset grid 3 :[1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0]


In [42]:
#predicts the drums event with our models
def predict_drum(pca, clf, grid) :
  pred = []
  for g in grid :
    x_dft = normalized_dft(g)
    x_dft = [x_dft]
    x_dft = pca.transform(x_dft)
    pred.append(clf.predict(x_dft))

  predict = []
  for p in pred :
    predict.append(p[0])

  return predict

In [43]:
#drum detection
predict_kick_1 = predict_drum(pca_kick, clf_kick, grid_1)
predict_snare_1 = predict_drum(pca_snare, clf_snare, grid_1)
predict_kick_2 = predict_drum(pca_kick, clf_kick, grid_2)
predict_snare_2 = predict_drum(pca_snare, clf_snare, grid_2)
predict_kick_3 = predict_drum(pca_kick, clf_kick, grid_3)
predict_snare_3 = predict_drum(pca_snare, clf_snare, grid_3)

In [44]:
#print the prediction result
print("kick 1 : " + str(predict_kick_1))
print("snare 1 : " + str(predict_snare_1))
print("kick 2 : " + str(predict_kick_2))
print("snare 2 : " + str(predict_snare_2))
print("kick 3 : " + str(predict_kick_3))
print("snare 3 : " + str(predict_snare_3))

kick 1 : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
snare 1 : [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
kick 2 : [1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0]
snare 2 : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
kick 3 : [1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1]
snare 3 : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [45]:
#onset correction
def onset_correction(predict_drum, onset_grid) :
  '''
  XOR between the drum ML prediction and the onset detection
  '''
  return np.logical_and(predict_drum, onset_grid).astype(int)

In [46]:
#onset correction
predict_kick_1 = onset_correction(predict_kick_1, onset_grid1)
predict_kick_2 = onset_correction(predict_kick_2, onset_grid2)

predict_snare_1 = onset_correction(predict_snare_1, onset_grid1)
predict_snare_2 = onset_correction(predict_snare_2, onset_grid2)

In [47]:
#print the detection result with onset correction
print("kick1 : " + str(predict_kick_1))
print("kick2 : " + str(predict_kick_2))
print("kick3 : " + str(predict_kick_3) + '\n')
print("snare1 : " + str(predict_snare_1))
print("snare2 : " + str(predict_snare_2))
print("snare3 : " + str(predict_snare_3))

kick1 : [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
kick2 : [1 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0]
kick3 : [1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1]

snare1 : [1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
snare2 : [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
snare3 : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


#Playlist Ordering

##Song information extraction

In [48]:
#class to represent the songs of the playlist
class Song :
  def __init__(self, name, x=None, pitch=None, tempo=None, mode=None, kick_array=None, snare_array=None, next=[]) :
    self.name = name
    self.x = x
    self.pitch = pitch
    self.tempo = tempo
    self.mode = mode
    self.kick_array = kick_array
    self.snare_array = snare_array
    self.next = next

In [49]:
#folder that will contain the song of the playlist that we want to order
song_path = '/content/songs'
os.mkdir(song_path)

In [50]:
#Import the song that we want to order in the *song_path* directory juste created (WAV format and no spaces !)
song_array = []
for filename in os.listdir(song_path): #browse in the directory containg the wav file of our given type
      if filename.endswith("wav"): #we get rid of the '.wav' 
        filename = filename[:-4]
        song_array.append(Song(filename))

In [51]:
#print the song names (optional)
for s in song_array :
  print(s.name)

epps_tard_le_soir
wonder_eb
dre_still
royce_boom
break2
pusha_coupes
break3
nas_message
nipsey_fountain
nas_without_drum
gosth_run
break4
redcafe_ill
nas_affirmative
big_party
soul_tilinfinity
break1
fugees_la
mobb_shook
nas_islike
fugees_softly
pete_smooth
dre_themessage


###Key and Mode Detection

We will use averaged chromagrams

In [52]:
#detects the key and the mode of a song
def pitch_mode_detection(x, clf_pith, clf_mode) :
  chroma = librosa.feature.chroma_stft(x)
  averaged_chroma = []
  averaged_chroma.append(np.mean(chroma, axis=1))

  pitch_ = []
  mode_ = []
  pitch = clf_pitch.predict(averaged_chroma)
  mode = clf_mode.predict(averaged_chroma)
  pitch_

  return pitch, mode

###Tempo Detection

In [53]:
#get the tempo of a song
def get_tempo(x) :
  return librosa.beat.tempo(x)

###Drums Detection

In [54]:
#divide the time axis into a beat-based grid
def compute_grid(x_drum, n_beat, beat_res, offset, sound_duration) :
  '''
  detects the beat locations and the tempo of the song and divide it into a beat-based
  time scale to have a time representation independant of the tempo

  path : path of the audio file on which we want to test drum detection
  nbeat : number of beat of the beat-based grid
  best_res : resolution of the beat-based grid 
  offset : shift the grid from a certain factor because beat events are detected
           with an offset
  sound_duration : duration of a sound (has to be the same than for the training data
                   for dimension considerations)
  '''

  x = x_drum
  tempo, beat_sample_stamps = librosa.beat.beat_track(x, sr=22050, units='samples')
  onset = librosa.onset.onset_detect(y=x, sr=22050, units='samples')
  grid = []
  onset_grid = []

  if(len(beat_sample_stamps)>=n_beat) :
    for i in range(1, n_beat+1) :
      l = beat_sample_stamps[i+1] - beat_sample_stamps[i]
      beat = x[int(beat_sample_stamps[i]-offset*l):int(beat_sample_stamps[i+1]-offset*l)]
      sub_beat_len = (len(beat)-len(beat)%beat_res)/beat_res
      for k in range(0, beat_res) :
        lower = int(beat_sample_stamps[i]-offset*l + sub_beat_len*k)
        higher = int(beat_sample_stamps[i]-offset*l + sub_beat_len*(k+1))
        sub_beat = x[lower:higher]
        current_onset = np.where(np.logical_and(onset>=lower, onset<=higher))
        if(len(current_onset[0])>0) :
          onset_grid.append(1)
        else :
          onset_grid.append(0)
        #to ensure that the length of the exctracts is the same than for the training
        if(len(sub_beat)-sound_duration < 0) :
          sub_beat = np.append(sub_beat, np.zeros(int(sound_duration-len(sub_beat)))) #we add zeros
        else :
          sub_beat = sub_beat[0:int(sound_duration)]

        grid.append(sub_beat)

  return grid, onset_grid

In [55]:
#use the trained model to predict drum events in a songs
def predict_drum(pca, clf, grid) :
  pred = []
  for g in grid :
    x_dft = normalized_dft(g)
    x_dft = [x_dft]
    x_dft = pca.transform(x_dft)
    pred.append(clf.predict(x_dft))

  predict = []
  for p in pred :
    predict.append(p[0])

  return predict

In [56]:
#corrects the model's prediction with onset detection
def onset_filtering(predict_drum, onset_grid) :
  return np.logical_and(predict_drum, onset_grid).astype(int)

###Source Separation

In [57]:
#separates the drum content of a song
def get_drum_signal(song_name) :
  path_input = "/content/songs/"+song_name+".wav"
  x, sr = librosa.load(path_input)
  !spleeter separate -p spleeter:4stems -o output/ $path_input
  path_output = "/content/output/"+song_name+"/drums.wav"
  x_drum, sr = librosa.load(path_output)
  
  return x, x_drum

All Together : song information extraction

In [None]:
for s in song_array :
  x, x_drum = get_drum_signal(s.name)
  pitch,mode = pitch_mode_detection(x, clf_pitch, clf_mode)
  tempo = get_tempo(x_drum)
  grid, onset_grid = compute_grid(x_drum, 4, 4, 0.15, sound_d*22050) 
  predict_kick = predict_drum(pca_kick, clf_kick, grid)
  predict_snare = predict_drum(pca_snare, clf_snare, grid)
  predict_kick = onset_filtering(predict_kick, onset_grid)
  predict_snare = onset_filtering(predict_snare, onset_grid)
  
  s.x = x
  s.pitch = pitch
  s.mode = mode
  s.tempo = tempo
  s.kick_array = predict_kick
  s.snare_array = predict_snare

In [59]:
#prints the content of each song (optional)
for s in song_array :
  print("*** " + str(s.name) + " ***")
  print("pitch : " + str(s.pitch))
  print("mode : " + str(s.mode))
  print("tempo : " + str(s.tempo))
  print("kick : " + str(s.kick_array))
  print("snare : " + str(s.snare_array) + "\n")

*** epps_tard_le_soir ***
pitch : [1.]
mode : [1.]
tempo : [161.49902344]
kick : [1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0]
snare : [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

*** wonder_eb ***
pitch : [9.]
mode : [1.]
tempo : [112.34714674]
kick : [0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0]
snare : [1 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0]

*** dre_still ***
pitch : [4.]
mode : [1.]
tempo : [92.28515625]
kick : [0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0]
snare : [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

*** royce_boom ***
pitch : [7.]
mode : [1.]
tempo : [95.703125]
kick : [1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0]
snare : [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

*** break2 ***
pitch : [9.]
mode : [0.]
tempo : [107.66601562]
kick : [0 0 0 1 0 0 1 1 0 0 0 0 1 0 1 0]
snare : [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

*** pusha_coupes ***
pitch : [7.]
mode : [1.]
tempo : [99.38401442]
kick : [1 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0]
snare : [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]

*** break3 ***
pitch : [11.]
mode : [1.]
tempo : [135.99917763]
kick : [0 0 0 1 0 0 0 0 0 0 0 0 

##Song combination

###Possible combinations

In [60]:
#computes all the combinations that respect our constraints
def song_combination(l, song_list, current, N) :
  if(len(l)>=N-1) :
    l = np.append(l,current)
    #for the format
    l_ = []
    l_.append(l)
    global combination_list
    if(len(np.unique(l))==len(l)) : #check if we don't have duplicates
      combination_list = np.append(combination_list, np.array(l_), axis=0)
    return 
  else :
    l = np.append(l, current)
    for n in song_list[current] :
      song_combination(l, song_list, n, N)

In [61]:
#implementations detail (to go from a ditionnary to a song list and vice-versa)
def from_name_list_to_obj_array(name_list, song_array) :
  obj_list = []
  for name in name_list :
    for s in song_array :
      if(s.name==name) :
        obj_list.append(s)
  return obj_list

def from_obj_to_dict(obj_array) :
  song_dict = {}
  for o in obj_array :
    song_dict.update({o.name: o.next})
  return song_dict

In [62]:
#defines all the songs that can consecutively follow a given song in the respect of tempo and key constraints
def define_song_next(song_array, tempo_tol, pitch_tol) :
  for s in song_array :
    s.next = np.array([])
    for s_next in song_array :
      if(abs((s.tempo-s_next.tempo)/s_next.tempo)<=tempo_tol and abs(s.pitch-s_next.pitch)<=pitch_tol and s.name != s_next.name):
        s.next = np.append(s.next, s_next.name) 
  
  return song_array


> N is the number of songs in a playlist



In [71]:
#compute all the possible song combinations that respect the constraints
define_song_next(song_array, 0.10, 1) #song_array is an array of 'Song' objects
song_list=from_obj_to_dict(song_array) #dictionnary with song names and next song names
l = []
N = 6  
if(N<=len(song_list)) :
  combination_list = np.empty((0, N))
  for s in song_list :
    song_combination(l, song_list, s, N)
  print(str(len(combination_list)) + " possible combinations : \n")
  print(combination_list)
else :
  print("N has to be smaller or equal to " + str(len(song_list)))

1296 possible combinations : 

[['royce_boom' 'pusha_coupes' 'big_party' 'fugees_softly' 'nas_message'
  'nas_affirmative']
 ['royce_boom' 'pusha_coupes' 'big_party' 'fugees_softly' 'nas_message'
  'fugees_la']
 ['royce_boom' 'pusha_coupes' 'big_party' 'fugees_softly'
  'nas_affirmative' 'nas_message']
 ...
 ['fugees_softly' 'fugees_la' 'nas_affirmative' 'nas_message'
  'royce_boom' 'pusha_coupes']
 ['fugees_softly' 'fugees_la' 'nas_affirmative' 'nas_message'
  'royce_boom' 'big_party']
 ['fugees_softly' 'fugees_la' 'nas_affirmative' 'nas_message'
  'royce_boom' 'soul_tilinfinity']]


###Similarity score computation

In [64]:
 #computes the score between 2 drum arrays
def compute_similarity_score(predict_drum_1, predict_drum_2) :

  if(len(predict_drum_1)==len(predict_drum_1)) :
    p1 = predict_drum_1
    p2 = predict_drum_2
    scores = []
    for i in range(0, len(p2)) :
      p2 = np.roll(p2, 1) #cyclic permutations
      score_array = np.logical_not(np.logical_xor(p1,p2)).astype(int)
      score = sum(score_array)/len(score_array)
      scores.append(score)
    return max(scores) 
  else :
    print("the drum predictions don't have the same length...")


In [65]:
#compute the total drum similarity between 2 songs
def compute_drum_similarity(s1, s2) :
  #compute the score for the kick and the snare
  kick_score = compute_similarity_score(s1.kick_array, s2.kick_array)
  snare_score = compute_similarity_score(s1.snare_array, s2.snare_array)

  #quadratic mean of the 2 latter scores
  tot_score = np.sqrt(kick_score**2 + snare_score**2) 

  return tot_score

In [66]:
#computes the score between 2 songs
def compute_linked_score(s1, s2) :
  score=0
  if(s1.mode==s2.mode) :
    score+=2
  score+=compute_drum_similarity(s1, s2)
  return score

In [67]:
#computes the total score of a given song list
def compute_tot_list_score(song_list) :
  tot_score=0
  for i in range(0, len(song_list)-1) :
    tot_score+=compute_linked_score(song_list[i], song_list[i+1])
  return tot_score

In [68]:
#print the best song combination
l_max = []
score_max = 0
for l in combination_list :
  l = from_name_list_to_obj_array(l, song_array)
  score = compute_tot_list_score(l)
  if(score >= score_max) :
    score_max = score
    l_max = l
print("the best combination among " + str(len(combination_list)) + " is the following : \n")
for s in l_max :
  print(s.name)
  display(Audio(data=s.x, rate=22050))

the best combination among 1296 is the following : 

soul_tilinfinity


big_party


royce_boom


fugees_la


fugees_softly


nas_affirmative
