# Task 2 - Minz Won & Marc Siquier
This is the code and report for task 2 - ASPMA LAB

### All necessary imports
We will use essentia to extract MFCCs for all audio tracks, and sklearn to train and test the GMMs and to evaluate the results.

In [8]:
import essentia
import os
import json
import operator
import random
from essentia.standard import *
from essentia import pool
import numpy as np
from sklearn import mixture
from sklearn import preprocessing
from sklearn import metrics

### Setup working directory
Please set `inputDir` to the genre folder of the dataset.

for example: `inputDir = '/home/user/datasets/genre'`

This folder should contain one sub-folder per genre class (named after the genre); each sub-folder holds the audio files and the extracted JSON files with their features.

In [9]:
# Root folder of the dataset. It is walked recursively to find the audio and
# JSON files, and the name of the folder holding each JSON file is used as the
# genre label of that track.
inputDir = '/home/marcsiq/SMC/aspma-lab/genre'

### Fetch all mp3 files.
Define a function to fetch files by extension; it will also be reused later to fetch the .json files.

__NOTE:__ If you downloaded this from GitHub, the audio files are not provided, so the number of fetched files will be 0.

In [10]:
def fetchFiles(inputDir, descExt):
    """Recursively collect the files below ``inputDir`` whose name ends in ``descExt``.

    The extension test is case-insensitive and anchored at the end of the
    file name, so ".mp3" selects "a.MP3" but neither "a.mp3.json" nor
    "a.mp3.bak" (a plain substring test would pick those up as well).

    Args:
        inputDir: root directory to walk.
        descExt: file-name suffix to look for, e.g. ".mp3" or ".json".

    Returns:
        A list of ``(directory, filename)`` tuples in ``os.walk`` order.
        The list is empty when nothing matches or ``inputDir`` does not exist.
    """
    # Lowercase the suffix once so callers may pass ".MP3" as well as ".mp3".
    suffix = descExt.lower()
    files = []
    for path, _dirnames, fnames in os.walk(inputDir):
        files.extend(
            (path, fname) for fname in fnames if fname.lower().endswith(suffix)
        )
    return files

# Every (directory, filename) pair for the audio tracks of the dataset.
mp3files = fetchFiles(inputDir, ".mp3")
# A parenthesised single argument prints the same text under Python 2's
# print statement.
print("Number of mp3 files fetched: %d" % len(mp3files))

Number of mp3 files fetched: 438


### Extract and save mfcc for all fetched files
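__NOTE:__ To save time, don't run this cell if the MFCCs have already been extracted to JSON files.
In this cell we compute the MFCCs of every frame of each fetched audio file and save their per-file mean and variance as a JSON file next to the audio.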

In [None]:
# Frame-level analysis chain: Hann window -> magnitude spectrum -> MFCC.
w = Windowing(type = 'hann')
# Spectrum() yields the magnitude spectrum; FFT() would give the complex one.
spectrum = Spectrum()
mfcc = MFCC()

for path, file in mp3files:
    track_path = path + "/" + file
    print(track_path)

    # Decode the track and accumulate the descriptors of every frame.
    audio = essentia.standard.EqloudLoader(filename = track_path)()
    track_pool = essentia.Pool()
    for frame in FrameGenerator(audio, frameSize = 2048, hopSize = 512, startFromZero=True):
        mfcc_bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
        track_pool.add('lowlevel.mfcc', mfcc_coeffs)
        track_pool.add('lowlevel.mfcc_bands', mfcc_bands)

    # Reduce the per-frame values to mean and variance, then store them in a
    # JSON file named after the audio file, in the same folder.
    aggregate = PoolAggregator(defaultStats = [ 'mean', 'var' ])
    json_path = path + "/" + os.path.splitext(file)[0] + ".json"
    YamlOutput(filename = json_path, format = "json")(aggregate(track_pool))

### Fetch and collect all mfcc.json files
Reusing the function `fetchFiles` defined above in the file-fetching cell.

In [11]:
jsonfiles = fetchFiles(inputDir, ".json")
print("Number of json files fetched: " + str(len(jsonfiles)))

# Load every descriptor file and group the resulting pools by genre; the genre
# label is the name of the folder that contains the JSON file.
mfccs = dict()
for path, fname in jsonfiles:
    genre_clas = os.path.basename(os.path.normpath(path))
    pool = YamlInput(filename = path + "/"+ fname, format = "json")()
    mfccs.setdefault(genre_clas, []).append(pool)

# Flat list of all pools, so each row of the feature matrix below can be
# matched to its pool by position instead of by a hand-maintained counter.
all_pools = [pool for genre in mfccs for pool in mfccs[genre]]

# Mean MFCC vector of every track, with the first coefficient dropped.
al = [pool['lowlevel.mfcc.mean'][1:] for pool in all_pools]

# Number of coefficients that are normalised and stored per track.
n_coeffs = 12

normalized = []
if al:  # an empty dataset (no JSON files found) has nothing to normalise
    # L2-normalise each coefficient (column) across all tracks in one call on
    # a 2-D matrix; scikit-learn deprecated 1-D input to normalize().
    feature_matrix = np.array(al)[:, :n_coeffs]
    normalized = preprocessing.normalize(feature_matrix, axis=0)

# Attach the normalised vector to its pool. Pool.add appends, so later cells
# read it back with pool['mfcc_normalized'][0].
for pool, mfcc_norm in zip(all_pools, normalized):
    pool.add('mfcc_normalized', list(mfcc_norm))



Number of json files fetched: 438




### Separate dataset into train and test
As we separate dataset randomly, results vary for each execution.

In [22]:
# Fraction of every genre that goes into the training set (rounded up).
percentage_train = 0.9
mfccs_train = {}
mfccs_test = {}

for class_name, sounds in mfccs.items():
    # Shuffle a copy so the original per-genre ordering in `mfccs` is kept.
    sounds_class = list(sounds)
    n_train = int(np.ceil(len(sounds_class) * percentage_train))
    random.shuffle(sounds_class)
    mfccs_train[class_name], mfccs_test[class_name] = (
        sounds_class[:n_train],
        sounds_class[n_train:],
    )

# Summary table: one row per genre with the train / test / total counts.
print('Created training and testing sets with the following number of sounds:\n\tTrain\tTest\tTotal\tClass')
for class_name, training_sounds in mfccs_train.items():
    testing_sounds = mfccs_test[class_name]
    counts = (len(training_sounds), len(testing_sounds), len(mfccs[class_name]), class_name)
    print('\t%i\t%i\t%i\t%s' % counts)


Created training and testing sets with the following number of sounds:
	Train	Test	Total	Class
	50	5	55	hip
	50	5	55	rhy
	50	5	55	jaz
	50	5	55	dan
	50	5	55	roc
	50	5	55	cla
	49	5	54	pop
	49	5	54	spe


### Compute a GMM with train dataset for each genre

In [23]:
# One single-component Gaussian mixture per genre, trained twice: on the
# normalised MFCC means (gmms) and on the raw MFCC means (gmms_non).
gmms = {}
gmms_non = {}
for genre, train_pools in mfccs_train.items():
    # 'mfcc_normalized' holds one stored vector per pool, hence the [0];
    # the raw means skip their first coefficient.
    features = [pool['mfcc_normalized'][0] for pool in train_pools]
    features_non = [pool['lowlevel.mfcc.mean'][1:] for pool in train_pools]

    model = mixture.GaussianMixture(n_components=1)
    model.fit(features)
    gmms[genre] = model

    model_non = mixture.GaussianMixture(n_components=1)
    model_non.fit(features_non)
    gmms_non[genre] = model_non

### Score test dataset

In [24]:
# Classify every test track with both model families: the predicted genre is
# the one whose GMM gives the track the highest score.
correct = []
predicted = []
predicted_non = []
for genre in mfccs_test:
    for pool in mfccs_test[genre]:
        # Build the two single-sample feature matrices once per track; they do
        # not depend on which genre model is being evaluated.
        x = np.array(pool['mfcc_normalized'][0]).reshape(1, -1)
        x_non = np.array(pool['lowlevel.mfcc.mean'][1:]).reshape(1, -1)

        results = dict()
        results_non = dict()
        for g in gmms:
            results[g] = gmms[g].score(x)
            results_non[g] = gmms_non[g].score(x_non)

        predicted.append(max(results, key=results.get))
        predicted_non.append(max(results_non, key=results_non.get))
        correct.append(genre)

# Parenthesised single-argument prints give the same output under Python 2's
# print statement.
print(".....WITH NORMALIZATION.....\n\nClassification report\n")
print(metrics.classification_report(correct, predicted))
print("Confusion Matrix\n")
print(metrics.confusion_matrix(correct, predicted))

print("\n\n.....WITHOUT NORMALIZATION.....\n\nClassification report\n")
print(metrics.classification_report(correct, predicted_non))
print("Confusion Matrix\n")
print(metrics.confusion_matrix(correct, predicted_non))

.....WITH NORMALIZATION.....

Classification report

             precision    recall  f1-score   support

        cla       1.00      0.60      0.75         5
        dan       0.75      0.60      0.67         5
        hip       0.57      0.80      0.67         5
        jaz       0.44      0.80      0.57         5
        pop       0.75      0.60      0.67         5
        rhy       0.50      0.40      0.44         5
        roc       1.00      0.40      0.57         5
        spe       0.71      1.00      0.83         5

avg / total       0.72      0.65      0.65        40

Confusion Matrix

[[3 0 0 0 0 0 0 2]
 [0 3 0 1 1 0 0 0]
 [0 0 4 1 0 0 0 0]
 [0 0 0 4 0 1 0 0]
 [0 1 1 0 3 0 0 0]
 [0 0 1 2 0 2 0 0]
 [0 0 1 1 0 1 2 0]
 [0 0 0 0 0 0 0 5]]


.....WITHOUT NORMALIZATION.....

Classification report

             precision    recall  f1-score   support

        cla       1.00      0.60      0.75         5
        dan       0.67      0.40      0.50         5
        hip       0.57   

By running the last three cells several times we can see that the results vary a lot depending on the random split of the dataset into train and test sets. Sometimes the normalized features work better and sometimes the non-normalized features work better.