# CNN data pre-processing

Import libraries

In [None]:
#Basic imports
import os
import matplotlib.pyplot as plt
import PIL
import essentia.standard as ess
import essentia
import numpy as np
import IPython.display as ipd
from random import randint

Reading the labels and counting the number of files in each label:

In [None]:
# Root folder of the dataset; each immediate, non-hidden sub-folder is one label.
main_dir = '/mnt/kora-asplab/electronic-music-dataset/archivesunzipped/'

# The first tuple yielded by os.walk describes main_dir itself. The default value
# keeps the cell from raising StopIteration when main_dir cannot be walked.
first_level = next(os.walk(main_dir), (main_dir, [], []))
labels = [label for label in first_level[1] if not label.startswith('.')]

# Both keys are created up front so that later cells can always index them,
# even when no label folder was found.
labels_dict = {"name": [], "number_songs": []}

for l in labels:
    # Count the non-hidden .mp3 files found anywhere below the label folder.
    count_songs = 0
    for root, dirs, files in os.walk(os.path.join(main_dir, l)):
        count_songs += sum(
            1 for file in files
            if not file.startswith('.') and file.endswith('.mp3')
        )

    print(l, " songs ", count_songs)
    labels_dict["name"].append(l)
    labels_dict["number_songs"].append(count_songs)

defectedarchive  songs  1560
ninjatunearchive  songs  305
kompaktarchive  songs  329
chillhoparchive  songs  308
suaraarchive  songs  571
bar25archive  songs  422
akbalrchives  songs  304
littlehelpersarchive  songs  360
mordarchive  songs  407
stilvortalentarchive  songs  580


Remove the `akbalrchives` label because the data inside that folder is not usable.

In [None]:
# Drop the 'akbalrchives' label together with its song count.
# The count is removed by position, not by value: list.remove(304) deletes the
# first entry equal to 304, which is only the right one if no earlier label has
# the same number of songs. The membership test makes the cell safe to re-run.
excluded_label = 'akbalrchives'
if excluded_label in labels_dict["name"]:
    excluded_idx = labels_dict["name"].index(excluded_label)
    del labels_dict["name"][excluded_idx]
    del labels_dict["number_songs"][excluded_idx]

# Mel spectrogram calculation

In this section we calculate mel spectrograms for:
- Complete tracks
- 1 minute chunk 
- 30 seconds chunks 

Create folder to save mel spectrogram images

In [None]:
img_dir_name = '/homedtic/jbustos/mel_specs'

# Root folder for the mel-spectrogram outputs; it is created when it does not exist yet.
mel_root_missing = not os.path.exists(img_dir_name)
if mel_root_missing:
    os.mkdir(img_dir_name)
print("Directory ", img_dir_name, " Created " if mel_root_missing else " already exists")

Directory  /homedtic/jbustos/mel_specs  already exists


### Complete track 

Create a folder for the complete-track mel spectrograms, with one sub-folder for the training set and another for the test set.

In [None]:
# Output folders for the complete-track spectrograms: a root plus one sub-folder per split.
track_folder = '/homedtic/jbustos/mel_specs/complete_track/'
track_folder_train = track_folder+'train'
track_folder_test = track_folder+'test'

for folder in (track_folder, track_folder_train, track_folder_test):
    if os.path.exists(folder):
        print("Directory ", folder, " already exists")
    else:
        # makedirs also creates missing parents, so this cell does not depend
        # on the parent folder having been created by an earlier cell.
        os.makedirs(folder)
        print("Directory ", folder, " Created ")

Directory  /homedtic/jbustos/mel_specs/complete_track/  already exists
Directory  /homedtic/jbustos/mel_specs/complete_track/train  Created 
Directory  /homedtic/jbustos/mel_specs/complete_track/test  Created 


Mel spectrogram calculation and saving files 

In [None]:
fs = 44100
# Every label is truncated to the size of the smallest one so all labels
# contribute the same number of tracks.
nfiles = min(labels_dict["number_songs"])
# The first 80% of the tracks of each label go to the training set, the rest to
# the test set. The threshold is derived from nfiles with integer arithmetic
# instead of the hard-coded 244 (80% of 305, the smallest label in the recorded run).
n_train = (nfiles * 4) // 5

# The analysis algorithms are configured once and reused for every frame of every file.
spectrum = ess.Spectrum()
windowing = ess.Windowing(type='blackmanharris62', zeroPadding=2048)
melbands = ess.MelBands(numberBands=96, lowFrequencyBound=0, highFrequencyBound=11000)
amp2db = ess.UnaryOperator(type='lin2db', scale=2)

# Start iteration through labels folders
for l in labels_dict['name']:
    print('ANALIZING NEW LABEL... ',l)

    # Train and test folders are checked independently: one of them may exist
    # without the other after an interrupted run.
    for label_dir in (track_folder_train+'/'+l, track_folder_test+'/'+l):
        if os.path.exists(label_dir):
            print("Directory ", label_dir, " already exists")
        else:
            os.makedirs(label_dir)
            print("Directory ", label_dir, " Created ")

    count = 0  # tracks processed so far for this label
    for root, dirs, files in os.walk(os.path.join(main_dir, l)):
        if count >= nfiles:
            break  # quota reached, no need to keep walking the label folder
        for file in files:
            if count >= nfiles:
                break
            if file.startswith('.') or not file.endswith('.mp3'):
                continue
            count += 1

            # Reading audio file (decoded to mono at fs)
            audio = ess.MonoLoader(filename=root+'/'+file, sampleRate=fs)()

            # One dB-scaled 96-band mel vector per analysis frame
            pool = essentia.Pool()
            for frame in ess.FrameGenerator(audio, frameSize=2048, hopSize=1024):
                frame_mel = melbands(spectrum(windowing(frame)))
                pool.add('mel96_db', amp2db(frame_mel))

            # The pooled frames are transposed before saving; the file name is the
            # running track index within the label.
            if count <= n_train:
                print('Train: ',count)
                np.save(track_folder_train + '/' + l + '/' + str(count), pool['mel96_db'].T)
            else:
                np.save(track_folder_test + '/' + l + '/' + str(count), pool['mel96_db'].T)
                print('Test: ',count)

            if count % 20 == 0:
                print(count, " files processed, current file: ",file)

Directory  /homedtic/jbustos/mel_specs/complete_track/train/defectedarchive  Created 
Directory  /homedtic/jbustos/mel_specs/complete_track/test/defectedarchive  Created 
Train:  1
Train:  2
Train:  3
Train:  4


Once all the spectrograms have been calculated, we save their paths and labels in a CSV file for the training and test sets, which we will use to create our DataLoader in PyTorch.

In [None]:
split = 'train'

# Write a CSV index (path,label) of every spectrogram saved for this split.
split_root = '/homedtic/jbustos/mel_specs/complete_track/'
data_file = split_root+split+'_complete.csv'
with open(data_file, 'w') as writer:
    writer.write('path,label\n')
    # Iterate the filtered label list: `labels` still contains the label that
    # was removed from labels_dict.
    for l in labels_dict["name"]:
        print(l)
        for root, dirs, files in os.walk(split_root+split+'/'+l):
            for file in files:
                # Only the arrays written by np.save are indexed; any other
                # file found in the folder is ignored.
                if file.endswith('.npy'):
                    writer.write(root+'/'+file+','+l+'\n')

In [None]:
split = 'test'

# Write a CSV index (path,label) of every spectrogram saved for this split.
split_root = '/homedtic/jbustos/mel_specs/complete_track/'
data_file = split_root+split+'_complete.csv'
with open(data_file, 'w') as writer:
    writer.write('path,label\n')
    # Iterate the filtered label list: `labels` still contains the label that
    # was removed from labels_dict.
    for l in labels_dict["name"]:
        print(l)
        for root, dirs, files in os.walk(split_root+split+'/'+l):
            for file in files:
                # Only the arrays written by np.save are indexed; any other
                # file found in the folder is ignored.
                if file.endswith('.npy'):
                    writer.write(root+'/'+file+','+l+'\n')

For the following sections we follow the same flow but for different length of chunks

### 1 minute chunk 

Create folder for our 1 minute chunks mel specs and one for the training dataset and another for testing.

In [None]:
# Output folders for the 1-minute-chunk spectrograms: a root plus one sub-folder per split.
minute_1_folder = '/homedtic/jbustos/mel_specs/1_minute_BoC'
minute_1_folder_train = minute_1_folder+'/train'
minute_1_folder_test = minute_1_folder+'/test'

for folder in (minute_1_folder, minute_1_folder_train, minute_1_folder_test):
    if os.path.exists(folder):
        print("Directory ", folder, " already exists")
    else:
        # makedirs also creates missing parents, so this cell does not depend
        # on the parent folder having been created by an earlier cell.
        os.makedirs(folder)
        print("Directory ", folder, " Created ")

Directory  /homedtic/jbustos/mel_specs/1_minute_BoC  already exists
Directory  /homedtic/jbustos/mel_specs/1_minute_BoC/train  Created 
Directory  /homedtic/jbustos/mel_specs/1_minute_BoC/test  Created 


Mel spectrogram calculation and saving files 

In [None]:
fs = 44100
sample_1m = fs*60  # number of samples in a 1-minute chunk at fs
# Every label is truncated to the size of the smallest one so all labels
# contribute the same number of tracks.
nfiles = min(labels_dict["number_songs"])
# The first 80% of the tracks of each label go to the training set, the rest to
# the test set. The threshold is derived from nfiles with integer arithmetic
# instead of the hard-coded 244 (80% of 305, the smallest label in the recorded run).
n_train = (nfiles * 4) // 5

# The analysis algorithms are configured once and reused for every frame of every file.
spectrum = ess.Spectrum()
windowing = ess.Windowing(type='blackmanharris62', zeroPadding=2048)
melbands = ess.MelBands(numberBands=96, lowFrequencyBound=0, highFrequencyBound=11000)
amp2db = ess.UnaryOperator(type='lin2db', scale=2)

# Start iteration through labels folders
for l in labels_dict['name']:
    print('ANALIZING NEW LABEL... ',l)

    # Train and test folders are checked independently: one of them may exist
    # without the other after an interrupted run.
    for label_dir in (minute_1_folder_train+'/'+l, minute_1_folder_test+'/'+l):
        if os.path.exists(label_dir):
            print("Directory ", label_dir, " already exists")
        else:
            os.makedirs(label_dir)
            print("Directory ", label_dir, " Created ")

    count = 0        # tracks processed so far for this label
    count_chunk = 0  # chunks saved so far for this label (also used as file name)
    for root, dirs, files in os.walk(os.path.join(main_dir, l)):
        if count >= nfiles:
            break  # quota reached, no need to keep walking the label folder
        for file in files:
            if count >= nfiles:
                break
            if file.startswith('.') or not file.endswith('.mp3'):
                continue
            count += 1

            # Reading audio file (decoded to mono at fs)
            audio = ess.MonoLoader(filename=root+'/'+file, sampleRate=fs)()

            # Split the track into consecutive full-length chunks; the trailing
            # remainder shorter than one chunk is discarded.
            for i in range(audio.size//sample_1m):
                chunk = audio[i*sample_1m:(i+1)*sample_1m]
                count_chunk += 1

                # One dB-scaled 96-band mel vector per analysis frame of the chunk
                pool = essentia.Pool()
                for frame in ess.FrameGenerator(chunk, frameSize=2048, hopSize=1024):
                    frame_mel = melbands(spectrum(windowing(frame)))
                    pool.add('mel96_db', amp2db(frame_mel))

                # The split is decided per track, so all chunks of one track
                # end up in the same set.
                if count <= n_train:
                    print('Train: ',count,' | Chunk: ', count_chunk)
                    np.save(minute_1_folder_train + '/' + l + '/' + str(count_chunk), pool['mel96_db'].T)
                else:
                    np.save(minute_1_folder_test + '/' + l + '/' + str(count_chunk), pool['mel96_db'].T)
                    print('Test: ',count,' | Chunk: ', count_chunk)

            if count % 20 == 0:
                print(count, " files processed, current file: ",file)
                print(count_chunk, " chunks processed, current file: ",file)
                    



Directory  /homedtic/jbustos/mel_specs/1_minute_BoC/train/defectedarchive  Created 
Directory  /homedtic/jbustos/mel_specs/1_minute_BoC/test/defectedarchive  Created 
Train:  1  | Chunk:  1
Train:  1  | Chunk:  2
Train:  1  | Chunk:  3
Train:  1  | Chunk:  4
Train:  1  | Chunk:  5
Train:  2  | Chunk:  6
Train:  2  | Chunk:  7
Train:  2  | Chunk:  8
Train:  2  | Chunk:  9
Train:  2  | Chunk:  10
Train:  2  | Chunk:  11
Train:  3  | Chunk:  12
Train:  3  | Chunk:  13
Train:  3  | Chunk:  14
Train:  3  | Chunk:  15


Once all the spectrograms have been calculated, we save their paths and labels in a CSV file for the training and test sets, which we will use to create our DataLoader in PyTorch.

In [None]:
split = 'train'

# Write a CSV index (path,label) of every spectrogram saved for this split.
split_root = '/homedtic/jbustos/mel_specs/1_minute_BoC/'
data_file = split_root+split+'_complete.csv'
with open(data_file, 'w') as writer:
    writer.write('path,label\n')
    # Iterate the filtered label list: `labels` still contains the label that
    # was removed from labels_dict.
    for l in labels_dict["name"]:
        print(l)
        for root, dirs, files in os.walk(split_root+split+'/'+l):
            for file in files:
                # Only the arrays written by np.save are indexed; any other
                # file found in the folder is ignored.
                if file.endswith('.npy'):
                    writer.write(root+'/'+file+','+l+'\n')

In [None]:
split = 'test'

# Write a CSV index (path,label) of every spectrogram saved for this split.
split_root = '/homedtic/jbustos/mel_specs/1_minute_BoC/'
data_file = split_root+split+'_complete.csv'
with open(data_file, 'w') as writer:
    writer.write('path,label\n')
    # Iterate the filtered label list: `labels` still contains the label that
    # was removed from labels_dict.
    for l in labels_dict["name"]:
        print(l)
        for root, dirs, files in os.walk(split_root+split+'/'+l):
            for file in files:
                # Only the arrays written by np.save are indexed; any other
                # file found in the folder is ignored.
                if file.endswith('.npy'):
                    writer.write(root+'/'+file+','+l+'\n')

### 30 seconds chunk

Create a folder for the 30-second-chunk mel spectrograms, with one sub-folder for the training set and another for the test set.

In [None]:
# Output folders for the 30-second-chunk spectrograms: a root plus one sub-folder per split.
seconds_30_folder = '/homedtic/jbustos/mel_specs/30_seconds_BoC'
seconds_30_folder_train = seconds_30_folder+'/train'
seconds_30_folder_test = seconds_30_folder+'/test'

for folder in (seconds_30_folder, seconds_30_folder_train, seconds_30_folder_test):
    if os.path.exists(folder):
        print("Directory ", folder, " already exists")
    else:
        # makedirs also creates missing parents, so this cell does not depend
        # on the parent folder having been created by an earlier cell.
        os.makedirs(folder)
        print("Directory ", folder, " Created ")

Directory  /homedtic/jbustos/mel_specs/30_seconds_BoC  already exists
Directory  /homedtic/jbustos/mel_specs/30_seconds_BoC/train  Created 
Directory  /homedtic/jbustos/mel_specs/30_seconds_BoC/test  Created 


Mel spectrogram calculation and saving files 

In [None]:
fs = 44100
sample_30s = fs*30  # number of samples in a 30-second chunk at fs
# Every label is truncated to the size of the smallest one so all labels
# contribute the same number of tracks.
nfiles = min(labels_dict["number_songs"])
# The first 80% of the tracks of each label go to the training set, the rest to
# the test set. The threshold is derived from nfiles with integer arithmetic
# instead of the hard-coded 244 (80% of 305, the smallest label in the recorded run).
n_train = (nfiles * 4) // 5

# The analysis algorithms are configured once and reused for every frame of every file.
spectrum = ess.Spectrum()
windowing = ess.Windowing(type='blackmanharris62', zeroPadding=2048)
melbands = ess.MelBands(numberBands=96, lowFrequencyBound=0, highFrequencyBound=11000)
amp2db = ess.UnaryOperator(type='lin2db', scale=2)

# Start iteration through labels folders
for l in labels_dict['name']:

    # Train and test folders are checked independently: one of them may exist
    # without the other after an interrupted run.
    for label_dir in (seconds_30_folder_train+'/'+l, seconds_30_folder_test+'/'+l):
        if os.path.exists(label_dir):
            print("Directory ", label_dir, " already exists")
        else:
            os.makedirs(label_dir)
            print("Directory ", label_dir, " Created ")

    count = 0        # tracks processed so far for this label
    count_chunk = 0  # chunks saved so far for this label (also used as file name)
    for root, dirs, files in os.walk(os.path.join(main_dir, l)):
        if count >= nfiles:
            break  # quota reached, no need to keep walking the label folder
        for file in files:
            if count >= nfiles:
                break
            if file.startswith('.') or not file.endswith('.mp3'):
                continue
            count += 1

            # Reading audio file (decoded to mono at fs)
            audio = ess.MonoLoader(filename=root+'/'+file, sampleRate=fs)()

            # Split the track into consecutive full-length chunks; the trailing
            # remainder shorter than one chunk is discarded.
            for i in range(audio.size//sample_30s):
                chunk = audio[i*sample_30s:(i+1)*sample_30s]
                count_chunk += 1

                # One dB-scaled 96-band mel vector per analysis frame of the chunk
                pool = essentia.Pool()
                for frame in ess.FrameGenerator(chunk, frameSize=2048, hopSize=1024):
                    frame_mel = melbands(spectrum(windowing(frame)))
                    pool.add('mel96_db', amp2db(frame_mel))

                # The split is decided per track, so all chunks of one track
                # end up in the same set.
                if count <= n_train:
                    np.save(seconds_30_folder_train + '/' + l + '/' + str(count_chunk), pool['mel96_db'].T)
                else:
                    np.save(seconds_30_folder_test + '/' + l + '/' +str(count_chunk), pool['mel96_db'].T)

            if count % 20 == 0:
                print(count, "files processed, current file: ",file)


Directory  /homedtic/jbustos/mel_specs/30_seconds_BoC/train/defectedarchive  Created 
Directory  /homedtic/jbustos/mel_specs/30_seconds_BoC/test/defectedarchive  Created 


Once all the spectrograms have been calculated, we save their paths and labels in a CSV file for the training and test sets, which we will use to create our DataLoader in PyTorch.

In [None]:
split = 'train'

# Write a CSV index (path,label) of every spectrogram saved for this split.
split_root = '/homedtic/jbustos/mel_specs/30_seconds_BoC/'
data_file = split_root+split+'_complete.csv'
with open(data_file, 'w') as writer:
    writer.write('path,label\n')
    # Iterate the filtered label list: `labels` still contains the label that
    # was removed from labels_dict.
    for l in labels_dict["name"]:
        print(l)
        for root, dirs, files in os.walk(split_root+split+'/'+l):
            for file in files:
                # Only the arrays written by np.save are indexed; any other
                # file found in the folder is ignored.
                if file.endswith('.npy'):
                    writer.write(root+'/'+file+','+l+'\n')

In [None]:
split = 'test'

# Write a CSV index (path,label) of every spectrogram saved for this split.
split_root = '/homedtic/jbustos/mel_specs/30_seconds_BoC/'
data_file = split_root+split+'_complete.csv'
with open(data_file, 'w') as writer:
    writer.write('path,label\n')
    # Iterate the filtered label list: `labels` still contains the label that
    # was removed from labels_dict.
    for l in labels_dict["name"]:
        print(l)
        for root, dirs, files in os.walk(split_root+split+'/'+l):
            for file in files:
                # Only the arrays written by np.save are indexed; any other
                # file found in the folder is ignored.
                if file.endswith('.npy'):
                    writer.write(root+'/'+file+','+l+'\n')