# Challenge SD207 - 2017
# Data processing before classification:

In [30]:
import os
import re
import numpy as np
import pandas as pd
import librosa


# Constants:
data_home = "." #"/tsi/plato/sons/sd207"
training_file = 'audio/train.txt'
valid_file = 'audio/dev.txt'
n_vectors = 10 # Each audio file will be represented by 10 vectors
n_coefs = 20 # We will keep 20 MFCC.

labels_numbers = {"beach": 0,
"bus": 1,
"cafe/restaurant": 2,
"car": 3,
"city_center": 4,
"forest_path": 5,
"grocery_store": 6,
"home": 7,
"library": 8,
"metro_station": 9,
"office": 10,
"park": 11,
"residential_area": 12,
"train": 13,
"tram": 14}

# Retrieving files:
os.listdir(data_home)


training_set = pd.read_csv(data_home + '/' + training_file, sep='\s+', 
                           dtype=str, names=['filename', 'label'])
testing_set    = pd.read_csv(data_home + '/' + valid_file, sep='\s+', 
                           dtype=str, names=['filename', 'label'])

print("Training set: \n ", training_set[:10])
print("...\n(%d samples)" % len(training_set))
print("\nTesting set:\n", testing_set[:10])
print("...\n(%d samples)" % len(testing_set))

Training set: 
                   filename  label
0     audio/b010_0_30.wav  beach
1    audio/b010_60_90.wav  beach
2  audio/b010_150_180.wav  beach
3    audio/b010_30_60.wav  beach
4  audio/b010_120_150.wav  beach
5  audio/b022_120_150.wav  beach
6    audio/b022_60_90.wav  beach
7  audio/b022_180_210.wav  beach
8    audio/b022_30_60.wav  beach
9   audio/b022_90_120.wav  beach
...
(582 samples)

Testing set:
                  filename  label
0    audio/b021_30_60.wav  beach
1  audio/b021_150_180.wav  beach
2   audio/b021_90_120.wav  beach
3  audio/b021_120_150.wav  beach
4    audio/b021_60_90.wav  beach
5  audio/b021_180_210.wav  beach
6     audio/b021_0_30.wav  beach
7  audio/b019_180_210.wav  beach
8  audio/b019_120_150.wav  beach
9   audio/b019_90_120.wav  beach
...
(290 samples)


In [31]:
# Load the audio files:

def get_descr(path, n_coef, n_vectors):
    """Returns a matrix description of an audio file.
    
    :param path: The path to the audio file
    :param n_coef: The numbers of MFCC coefficients
    :param n_vectors: number of vectors
    
    :return: A matrix of n_coefs lines and n_vectors column,
    where each vector is the mean of dataframes and contains 
    n_coefs coefficients."""
    y, sr = librosa.load(path)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_coef)
    X = np.zeros((n_coef, n_vectors))
    # Numbers of samples we will take to make one vector
    p = mfcc.shape[1] // n_vectors 
    for i in range(n_vectors-1):
        X[:,i] = np.mean(mfcc[:,i*p:(i+1)*p-1], axis=1)
    # Last vector:
    i += 1
    X[:,i] = np.mean(mfcc[:,i*p:], axis=1)
    return X


def populate_training(data_home, training_set, n_vectors=10, n_coefs=20):
    """ Create the data matrix and the label vectors for the samples 
    in the training set.
    
    :param data_home: The path to the directory containing the files
    :param training_set: The dataframe linking each filename to its label
    :param n_vectors: number of vectors for each sample
    :param n_coefs: Numbers of MFCC coefficients for each sample
    
    :return: X [shape=(n_coefs, N_samples*n_vectors)], y [shape=(N_samples*n_vectors,)]
            The data matrix and the labels for each vector"""
    n_samples = len(training_set)
    data = np.zeros((n_coefs, n_samples * n_vectors))
    labels = np.zeros(n_samples * n_vectors, dtype=object)
    for i, row in training_set.iterrows():
        if i%10==0:
            print("Processed samples: %d/%d..." % (i, n_samples))
        filename, label = row.filename, row.label
        descr = get_descr(data_home + '/' + filename, n_coefs, n_vectors)
        data[:,i*n_vectors:(i+1)*n_vectors] = descr
        labels[i*n_vectors:(i+1)*n_vectors] = label
    print("Done processing")
    return data, labels
        

In [32]:
if "training_set.txt" not in os.listdir() or "training_labels.txt" not in os.listdir():
    print("First time processing audio files.")
    data, labels = populate_training(data_home, training_set)
    print(data.shape, labels.shape)
    print("Saving the processed data.")
    np.savetxt("training_set.txt", data)
    np.savetxt("training_labels.txt", labels,  fmt="%s")
else:
    print("Audio files already processed, retrieving the file.")
    data = np.loadtxt("training_set.txt")
    labels = np.array(pd.read_csv("training_labels.txt", header=None)).ravel()
    print("Overview of labels: ",labels)

Audio files already processed, retrieving the file.
Overview of labels:  ['beach' 'beach' 'beach' ..., 'tram' 'tram' 'tram']


In [33]:
# Checking the first samples:
filename = training_set.loc[0, 'filename']
print("Checking with sample: %s" % filename)
y, sr = librosa.load(path = filename)
mfcc = librosa.feature.mfcc(y, sr)
n_coefs, n_cols = mfcc.shape
p = n_cols//10

np.set_printoptions(precision=2, threshold=5, formatter={'float_kind':lambda x:"%.2f" % x})
np.set_printoptions()

print("Checking the MFCC sequence against the computed data matrix")
for i in range(10):
    print("-"*20)
    print("MFCC[%d:%d]:" % (i*p, (i+1)*p))
    print(mfcc[:5,i*p:(i+1)*p])
    print("\ndata[%d]:" % i)
    print("(Each term should be the mean of the corresponding line)")
    print(data[:5, i])

Checking with sample: audio/b010_0_30.wav
Checking the MFCC sequence against the computed data matrix
--------------------
MFCC[0:129]:
[[-448.55 -442.7  -440.29 ..., -471.38 -472.87 -471.52]
 [  91.99  110.43  106.02 ...,   94.46   91.11   95.27]
 [  -2.27  -17.39  -33.03 ...,  -32.32  -34.85  -34.73]
 [  52.51   62.08   64.41 ...,   49.     45.88   50.63]
 [   1.33   -4.51  -11.67 ...,  -10.59  -13.02   -9.88]]

data[0]:
(Each term should be the mean of the corresponding line)
[-473.36   95.75  -25.5    51.08  -12.73]
--------------------
MFCC[129:258]:
[[ -4.73e+02  -4.67e+02  -4.64e+02 ...,  -4.89e+02  -4.88e+02  -4.84e+02]
 [  1.00e+02   9.65e+01   9.14e+01 ...,   8.94e+01   8.91e+01   9.32e+01]
 [ -3.19e+01  -3.00e+01  -3.59e+01 ...,  -2.72e+01  -3.11e+01  -3.52e+01]
 [  5.41e+01   5.72e+01   5.80e+01 ...,   4.71e+01   4.04e+01   4.29e+01]
 [ -4.35e-01  -5.87e+00  -1.19e+01 ...,  -7.13e+00  -1.34e+01  -1.01e+01]]

data[1]:
(Each term should be the mean of the corresponding line)


## Let's do the actual training and prediction !

In [34]:
from sklearn.mixture import GaussianMixture

X = data.T
y = np.array([labels_numbers[name] for name in labels])



def most_frequent(array):
    unique, counts = np.unique(array, return_counts=True)
    return unique[counts.argmax()]

def predict(path, votes, clf, n_coefs=20, n_vect=10):
    """ Return the number of the assumed class of an audio file.
    
    :param path: The path to the file
    :param votes: A dictionary {number: label} that represents the association
        between the meaningful numbers returned by the classifier
        and the actual classes of the data.
    :param clf: The classifier object
    :param n_coefs: Number of MFCC coefficients to extract from the audio file
    :param n_vect: Number of vectors (audio frames) the audio file should be 
            split into
    
    :return: The number of the assumed class of this audio file."""
    descr = get_descr(path, n_coefs, n_vect) # n_features * n_samples
    predictions = clf.predict(descr.T)
    print("Predictions for sample %s:" % path)
    print(predictions)
    selection = most_frequent(predictions)
    #TODO: take all the predictions and take weights.
    print("%s seems to belong in class n. '%d'" % (path, selection))
    return votes[selection]
  

def get_votes(labels, predictions, verbose=False):
    """ Return the names of the classes returned by the classifer.
    :param labels: The labels names
    :param predictions: The predictions for the training set
    :return: a dictionary associating classes to label names"""
    corresp = {} # dictionary {number: label name}
    for i in np.unique(predictions):
        # The group of all labels that were assigned to the number i
        group = labels[predictions == i] 
        if verbose:
            print("Samples that were assigned to clas %d came from labels:"% i)
            print(np.unique(group, return_counts=True))
        vote = most_frequent(group)
        corresp[i] = vote
    return corresp

permut = np.random.permutation(len(X))
X = X[permut]
y = y[permut]
labels_permut = labels[permut]


clf = GaussianMixture(n_components=16).fit(X, y=y)

np.set_printoptions(edgeitems=3,infstr='inf',
                    linewidth=75, nanstr='nan', precision=8,
                    suppress=False, threshold=1000, formatter=None)

a = 100
b = 20
print(clf.predict(X[a:a+b]))
print(y[a:a+b])

clf_labels = clf.predict(X)

votes = get_votes(labels_permut, clf_labels)
print(votes)


[11  1  3  7  3  6  2 11  0  7 10  0  9 11 12  2 15  7  3  3]
[ 6  5 12  5 12  4  1 13  1  7  3  1 10  9  8 14  6  7 12  5]
{0: 'bus', 1: 'park', 2: 'tram', 3: 'residential_area', 4: 'cafe/restaurant', 5: 'forest_path', 6: 'city_center', 7: 'home', 8: 'train', 9: 'office', 10: 'car', 11: 'metro_station', 12: 'library', 13: 'car', 14: 'beach', 15: 'grocery_store'}


In [35]:
def populate_testing(data_home, testing_set, n_vectors=10, n_coefs=20):
    """ Create the data matrix and the label vectors for the samples 
    in the testing set.
    
    :param data_home: The path to the directory containing the files
    :param testing_set: The dataframe linking each filename to its label
    :param n_vectors: number of vectors for each sample 
        (number of frames an audio file will be split into)
    :param n_coefs: Numbers of MFCC coefficients for each sample
    
    :return: X [shape=(n_coefs, N_samples*n_vectors)], y [shape=(N_samples*n_vectors,)]
            The data matrix and the labels for each vector"""
    n_samples = len(testing_set)
    data = np.zeros((n_coefs, n_samples * n_vectors))
    labels = np.zeros(n_samples * n_vectors, dtype=object)
    for i, row in testing_set.iterrows():
        if i%10==0:
            print("Processed samples: %d/%d..." % (i, n_samples))
        filename, label = row.filename, row.label
        descr = get_descr(data_home + '/' + filename, n_coefs, n_vectors)
        data[:,i*n_vectors:(i+1)*n_vectors] = descr
        labels[i*n_vectors:(i+1)*n_vectors] = label
    print("Done processing")
    return data, labels
        
    
if "testing_set.txt" not in os.listdir() or "testing_labels.txt" not in os.listdir():
    print("First time processing audio files for testing.")
    test_data, test_labels = populate_testing(data_home, testing_set)
    print("Shape of testing data:", test_data.shape, test_labels.shape)
    print("Saving the processed data.")
    np.savetxt("testing_set.txt", data)
    np.savetxt("testing_labels.txt", labels,  fmt="%s")
else:
    print("Audio files already processed, retrieving the file.")
    test_data = np.loadtxt("testing_set.txt")
    test_labels = np.array(pd.read_csv("testing_labels.txt", header=None)).ravel()
    print("Overview of labels: ", test_labels)

First time processing audio files for testing.
Processed samples: 0/290...
Processed samples: 10/290...
Processed samples: 20/290...
Processed samples: 30/290...
Processed samples: 40/290...
Processed samples: 50/290...
Processed samples: 60/290...
Processed samples: 70/290...
Processed samples: 80/290...
Processed samples: 90/290...
Processed samples: 100/290...
Processed samples: 110/290...
Processed samples: 120/290...
Processed samples: 130/290...
Processed samples: 140/290...
Processed samples: 150/290...
Processed samples: 160/290...
Processed samples: 170/290...
Processed samples: 180/290...
Processed samples: 190/290...
Processed samples: 200/290...
Processed samples: 210/290...
Processed samples: 220/290...
Processed samples: 230/290...
Processed samples: 240/290...
Processed samples: 250/290...
Processed samples: 260/290...
Processed samples: 270/290...
Processed samples: 280/290...
Done processing
Shape of testing data: (20, 2900) (2900,)
Saving the processed data.


In [42]:
successes = 0

for i, row in testing_set.iterrows():
    prediction = predict(row.filename, votes, clf)
    print(prediction)
    print("Should be", row.label)
    if prediction == row.label:
        successes += 1
    print("So far, success rate: %d/%d" % (successes, i+1))
print("Success rate: %d/%d" % (successes, len(testing_set)))

Predictions for sample audio/b021_30_60.wav:
[7 7 7 7 7 7 7 7 7 7]
audio/b021_30_60.wav seems to belong in class n. '7'
home
Should be beach
So far, success rate: 0/1
Predictions for sample audio/b021_150_180.wav:
[14 14 14 14 14  6  6  6  6  6]
audio/b021_150_180.wav seems to belong in class n. '6'
city_center
Should be beach
So far, success rate: 0/2
Predictions for sample audio/b021_90_120.wav:
[14 14  6 14 14 14  6 14 14  6]
audio/b021_90_120.wav seems to belong in class n. '14'
beach
Should be beach
So far, success rate: 1/3
Predictions for sample audio/b021_120_150.wav:
[ 6  6  6  6  7 14 14 14 14 14]
audio/b021_120_150.wav seems to belong in class n. '14'
beach
Should be beach
So far, success rate: 2/4
Predictions for sample audio/b021_60_90.wav:
[14 14 14 14 14 14 14 14 14 14]
audio/b021_60_90.wav seems to belong in class n. '14'
beach
Should be beach
So far, success rate: 3/5
Predictions for sample audio/b021_180_210.wav:
[ 7 14 14 14 14 14 14  7  6  6]
audio/b021_180_210.wav 