# Challenge SD207 - 2017

Acoustic scene classification


# Data processing before classification:

We extract the MFCC (Mel-frequency cepstral coefficient) sequence of each audio sample, so that every audio file is represented by a small matrix whose columns are feature vectors. The classification is then performed on these vectors.

In [4]:
import os
import re
import numpy as np
import pandas as pd
import librosa
from IPython.display import clear_output

# Constants:
data_home = "." #"/tsi/plato/sons/sd207"
training_file = 'audio/train.txt'
valid_file = 'audio/dev.txt'
n_vectors = 10 # Each audio file will be represented by [n_vectors] vectors (default value: 1296)
n_coefs = 20 # We will keep [n_coefs] MFC coefficients.

# Mapping from the label names to the class numbers written in the
# prediction file.
labels_numbers = {"beach": 0,
                  "bus": 1,
                  "cafe/restaurant": 2,
                  "car": 3,
                  "city_center": 4,
                  "forest_path": 5,
                  "grocery_store": 6,
                  "home": 7,
                  "library": 8,
                  "metro_station": 9,
                  "office": 10,
                  "park": 11,
                  "residential_area": 12,
                  "train": 13,
                  "tram": 14}

# Loading text files into pd.Dataframes.
# The separator is a regular expression, so it is written as a raw string:
# '\s' inside a normal string literal is an invalid escape sequence.
training_set = pd.read_csv(os.path.join(data_home, training_file), sep=r'\s+',
                           dtype=str, names=['filename', 'label'])
testing_set = pd.read_csv(os.path.join(data_home, valid_file), sep=r'\s+',
                          dtype=str, names=['filename', 'label'])

# The files to classify come without labels.
unknown_set = pd.read_csv(os.path.join(data_home, 'test_files.txt'), sep=r'\s+',
                          dtype=str, names=['filename'])

print("Training set: \n ", training_set[:10])
print("...\n(%d samples)" % len(training_set))
print("\nTesting set:\n", testing_set[:10])
print("...\n(%d samples)" % len(testing_set))
print("\nUnknown set: \n ", unknown_set[:10])
print("...\n(%d samples)" % len(unknown_set))

Training set: 
                   filename  label
0     audio/b010_0_30.wav  beach
1    audio/b010_60_90.wav  beach
2  audio/b010_150_180.wav  beach
3    audio/b010_30_60.wav  beach
4  audio/b010_120_150.wav  beach
5  audio/b022_120_150.wav  beach
6    audio/b022_60_90.wav  beach
7  audio/b022_180_210.wav  beach
8    audio/b022_30_60.wav  beach
9   audio/b022_90_120.wav  beach
...
(582 samples)

Testing set:
                  filename  label
0    audio/b021_30_60.wav  beach
1  audio/b021_150_180.wav  beach
2   audio/b021_90_120.wav  beach
3  audio/b021_120_150.wav  beach
4    audio/b021_60_90.wav  beach
5  audio/b021_180_210.wav  beach
6     audio/b021_0_30.wav  beach
7  audio/b019_180_210.wav  beach
8  audio/b019_120_150.wav  beach
9   audio/b019_90_120.wav  beach
...
(290 samples)

Unknown set: 
                   filename
0     audio/b053_0_30.wav
1   audio/b035_90_120.wav
2  audio/b089_210_240.wav
3    audio/a034_30_60.wav
4  audio/a045_150_180.wav
5   audio/a058_90_120.wav
6    au

In [20]:
# Load the audio files:

def get_descr(path, n_coef, n_vectors):
    """Returns a fixed-size matrix description of an audio file.

    The MFCC frames of the file are grouped into [n_vectors] consecutive
    chunks of ``n_frames // n_vectors`` frames each (the last chunk also takes
    the remaining frames), and every chunk is averaged into a single column.

    :param path: The path to the audio file
    :param n_coef: The number of MFC coefficients to compute
    :param n_vectors: number of vectors that will represent the audio file.
        This is the number of chunks the MFCC frames are split into.

    :return: ndarray of shape (n_coef, n_vectors)
        where each column is the mean MFCC vector of one chunk of the file.
    :raises ValueError: if [n_vectors] is smaller than 1, or if the file has
        fewer MFCC frames than [n_vectors] (some chunks would be empty and
        their mean would be NaN)."""
    if n_vectors < 1:
        raise ValueError("n_vectors must be at least 1, got %r" % (n_vectors,))
    y, sr = librosa.load(path)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_coef)
    n_frames = mfcc.shape[1]
    if n_frames < n_vectors:
        raise ValueError("%s only has %d MFCC frames, cannot split it into %d vectors"
                         % (path, n_frames, n_vectors))
    # We have too many columns in [mfcc], we average groups of them into [X]
    X = np.zeros((n_coef, n_vectors))
    # Number of frames averaged into one vector
    p = n_frames // n_vectors
    last = n_vectors - 1
    for i in range(last):
        # We collapse the [p] frames into one single column in [X]
        X[:, i] = np.mean(mfcc[:, i * p:(i + 1) * p], axis=1)
    # Last vector: it also takes the frames left over by the integer division.
    # The index is computed explicitly so that n_vectors == 1 works too.
    X[:, last] = np.mean(mfcc[:, last * p:], axis=1)
    return X


def _populate_training(data_home, training_set, n_vectors, n_coefs):
    """ Create the data matrix and the label vectors for the samples 
    in the training set.
    
    :param data_home: The path to the directory containing the files
    :param training_set: The dataframe linking each filename to its label
    :param n_vectors: number of vectors for each sample
    :param n_coefs: Numbers of MFC coefficients for each sample
    
    :return: X [shape=(n_coefs, N_samples*n_vectors)], y [shape=(N_samples*n_vectors,)]
            The data matrix and the labels for each vector"""
    n_samples = len(training_set)
    data = np.zeros((n_coefs, n_samples * n_vectors))
    labels = np.zeros(n_samples * n_vectors, dtype=object)
    for i, row in training_set.iterrows():
        if i%10==0:
            print("Processed samples: %d/%d..." % (i, n_samples))
        filename, label = row.filename, row.label
        descr = get_descr(data_home + '/' + filename, n_coefs, n_vectors)
        data[:,i*n_vectors:(i+1)*n_vectors] = descr
        labels[i*n_vectors:(i+1)*n_vectors] = label
    print("Done processing")
    return data, labels



def get_data_filename(descr, n_coefs, n_vectors):
    """Build the names of the two files in which extracted data is saved.

    :param descr: Prefix describing the data set
    :param n_coefs: Number of MFC coefficients, embedded in the names
    :param n_vectors: Number of vectors per sample, embedded in the names

    :return: set, label
        set   - filename for the audio data
        label - filename for the labels"""
    # Both names share the same ending; only the middle part differs.
    ending = '{0}coef_{1}vect.txt'.format(n_coefs, n_vectors)
    data_name = '{0}_set_{1}'.format(descr, ending)
    labels_name = '{0}_labels_{1}'.format(descr, ending)
    return data_name, labels_name

def extract_features(dataframe, n_vectors, n_coefs, data_home=".",  saving='autosave'):
    """ Extract the features from a batch of audio files and saves them in a file.

    If the two files named after [saving], [n_coefs] and [n_vectors] already
    exist in the current working directory, the data is read back from them
    instead of being recomputed.

    :param dataframe: pd.DataFrame, linking filename to labels
                      (columns 'filename' and 'label')
    :param n_vectors: The number of vectors that represent an audio sample
    :param n_coefs:   The number of MFC coefficients to extract
    :param data_home: The path to the audio files
    :param saving:    The description of the filename saving the data.
                      This function will save two files: the data from
                      the audio files, and the labels.

    :return: data, labels
        - data: ndarray of shape (n_coefs, N_samples * n_vectors) containing
                the MFC coefficients for each audio samples
        - labels: ndarray of shape (N_samples * n_vectors, ) containing the labels
                for each vector."""
    filename_data, filename_labels = get_data_filename(saving, n_coefs, n_vectors)

    # Checking if the data is already extracted:
    already_processed = os.path.isfile(filename_data) and os.path.isfile(filename_labels)
    if not already_processed:
        print("%s or %s not found in the folder." % (filename_data, filename_labels))
        # Not already processed:
        print("First time processing audio files.")
        n_samples = len(dataframe)
        data = np.zeros((n_coefs, n_samples * n_vectors))
        labels = np.zeros(n_samples * n_vectors, dtype=object)
        # Iterate over the dataframe given as argument (not over a global),
        # and use the row position rather than its index label as offset.
        for pos, (_, row) in enumerate(dataframe.iterrows()):
            if pos % 10 == 0:
                print("Processed samples: %d/%d..." % (pos, n_samples))
                # wait=True: the progress line is only erased when the next
                # output is produced
                clear_output(wait=True)
            filename, label = row.filename, row.label
            descr = get_descr(data_home + '/' + filename, n_coefs, n_vectors)
            start, stop = pos * n_vectors, (pos + 1) * n_vectors
            data[:, start:stop] = descr
            # Every vector of a sample carries the label of the sample
            labels[start:stop] = label
        print("Done processing")
        print("You can check the shape of the data and labels: ",data.shape, labels.shape)
        print("Saving the processed data.")
        np.savetxt(filename_data, data)
        np.savetxt(filename_labels, labels,  fmt="%s")
    else:
        print("Audio files already processed, retrieving the file.")
        data = np.loadtxt(filename_data)
        # One label per line, no header
        labels = np.array(pd.read_csv(filename_labels, header=None)).ravel()
        print(data.shape, labels.shape)
        print("Overview of labels: ",labels)

    return data, labels
        
# TODO: threads

# Extract (or reload) the features of the training set, then display how
# many vectors carry each label.
data, labels = extract_features(training_set, n_vectors, n_coefs, data_home=data_home, saving="training")
counts = np.unique(labels, return_counts=True)
for label_name, n_occurrences in zip(*counts):
    print("%s: %d" % (label_name, n_occurrences))

Audio files already processed, retrieving the file.
(20, 5820) (5820,)
Overview of labels:  ['beach' 'beach' 'beach' ..., 'tram' 'tram' 'tram']
beach: 380
bus: 390
cafe/restaurant: 380
car: 400
city_center: 400
forest_path: 420
grocery_store: 380
home: 400
library: 400
metro_station: 380
office: 370
park: 400
residential_area: 380
train: 360
tram: 380


In [6]:
# Checking if a random sample is well described in the data matrix:
rand_id = np.random.randint(0, len(training_set))
# Positional access, to stay consistent with the column offsets in [data]
filename = training_set['filename'].iloc[rand_id]
#TODO: finish
print("Checking with sample: %s" % filename)
# Same path construction and same keyword call as in get_descr
y, sr = librosa.load(data_home + '/' + filename)
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_coefs)
n_cols = mfcc.shape[1]
p = n_cols//n_vectors

np.set_printoptions(precision=2, threshold=5, formatter={'float_kind':lambda x:"%.2f" % x})

print("Checking the MFCC sequence against the computed data matrix")
rand_ints = np.random.permutation(n_coefs)[:5] # random lines
print("rand_id:", rand_id)
print("rand_ints:", rand_ints)
print("data: ", data.shape)
for i in range(n_vectors):
    # The last vector also covers the frames left over by the integer
    # division, exactly as get_descr does.
    stop = (i+1)*p if i < n_vectors - 1 else n_cols
    chunk = mfcc[rand_ints, i*p:stop]
    print("-"*20)
    print("MFCC:")
    print(chunk)
    print("Computed mean: ", np.mean(chunk, axis=1))
    print("\ndata[%d]:" % (rand_id*n_vectors + i))
    print("(Each term should be the mean of the corresponding line)")
    print(data[rand_ints, rand_id*n_vectors + i])


Checking with sample: audio/b082_150_180.wav
Checking the MFCC sequence against the computed data matrix
rand_id: 281
rand_ints: [18 14 17 13  0]
data:  (20, 5820)
--------------------
MFCC:
[[1.35 3.11 -4.74 ..., -2.60 -3.46 -4.60]
 [2.58 5.48 4.46 ..., 10.82 5.57 4.45]
 [2.92 5.05 -1.98 ..., 1.79 -1.49 -3.06]
 [-4.34 6.40 11.92 ..., 6.60 7.04 9.45]
 [-370.15 -369.59 -377.41 ..., -383.53 -386.30 -397.02]]
Computed mean:  [-2.46 7.07 1.17 3.10 -377.36]

data[2810]:
(Each term should be the mean of the corresponding line)
[-2.44 7.09 1.22 3.03 -377.21]
--------------------
MFCC:
[[-3.69 -5.92 -7.05 ..., -7.73 -1.70 -0.67]
 [15.22 14.68 8.22 ..., 12.71 9.60 3.03]
 [-1.51 -0.67 -4.18 ..., -3.08 -2.36 2.20]
 [8.81 1.48 -1.60 ..., 2.65 2.53 -3.25]
 [-378.76 -353.35 -340.50 ..., -420.61 -424.99 -423.57]]
Computed mean:  [-3.46 8.44 0.50 2.53 -378.59]

data[2811]:
(Each term should be the mean of the corresponding line)
[-3.48 8.49 0.48 2.57 -378.24]
--------------------
MFCC:
[[-4.53 -0.26 6

## Let's do the actual training and prediction!

In [23]:
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR

import time

# One row per vector for the estimators, and the numeric class of each vector.
X = np.transpose(data)
y = np.array(list(map(labels_numbers.__getitem__, labels)))



def most_frequent(array):
    """Return the value that occurs most often in [array].

    When several values are tied, the first one in the sorted order of the
    distinct values is returned."""
    values, occurrences = np.unique(array, return_counts=True)
    return values[np.argmax(occurrences)]

def predict(path, votes, clf, n_coefs=20, n_vect=10, verbose=False):
    """ Return the label name of the assumed class of an audio file.

    Each vector describing the file is classified separately; the rounded
    prediction that occurs most often is then translated through [votes].

    :param path: The path to the file
    :param votes: A dictionary {number: label} that represents the association
        between the meaningful numbers returned by the classifier
        and the actual classes of the data.
    :param clf: The classifier object
    :param n_coefs: Number of MFCC coefficients to extract from the audio file
    :param n_vect: Number of vectors (audio frames) the audio file should be
            split into
    :param verbose: If True, print the individual predictions

    :return: The entry of [votes] associated with the assumed class of this
        audio file.
    :raises KeyError: if [votes] is empty."""
    descr = get_descr(path, n_coefs, n_vect) # n_features * n_samples
    # The estimator expects one row per vector, hence the transpose
    predictions = clf.predict(descr.T).round()
    selection = most_frequent(predictions)
    #TODO: take all the predictions and take weights.
    if verbose:
        print("Predictions for sample %s:" % path)
        print(predictions)
        print("%s seems to belong in class n. '%d'" % (path, selection))
    if selection not in votes:
        # The rounded output may be a number that is not a key of [votes];
        # fall back to the closest known number instead of failing.
        if not votes:
            raise KeyError(selection)
        selection = min(votes, key=lambda known: abs(known - selection))
    return votes[selection]
  

def get_votes(labels, predictions, verbose=False):
    """ Associate each number returned by the classifier with a label name.

    The predictions are rounded; for every distinct rounded value, the label
    that is most frequent among the samples that received it is retained.

    :param labels: The label names (array aligned with [predictions])
    :param predictions: The predictions for the training set
    :param verbose: If True, print the label distribution of each group
    :return: a dictionary {number: label name}"""
    rounded = predictions.round()
    mapping = {}
    for class_number in np.unique(rounded):
        # Labels of all the samples that were assigned to this number
        members = labels[rounded == class_number]
        if verbose:
            print("Samples that were assigned to clas %d came from labels:"% class_number)
            print(np.unique(members, return_counts=True))
        mapping[class_number] = most_frequent(members)
    return mapping

# Shuffle the samples. The same permutation is applied to the features, to
# the numeric targets and to the label names so that they stay aligned.
permut = np.random.permutation(len(X))
X, y = X[permut], y[permut]
labels_permut = labels[permut]

# Candidate estimators, each with the parameter grid explored by the
# grid search.
regressors = {
    'knn': dict(
        clf=KNeighborsRegressor(),
        params=dict(
            n_neighbors=np.arange(1, 20, 10, dtype=int),
            weights=('uniform', 'distance',),
            algorithm=('auto', 'ball_tree', ),
        ),
    ),
    'gmm': dict(
        clf=GaussianMixture(n_components=16),
        params=dict(
            n_components=[16],
            covariance_type=('full', 'tied', 'diag', 'spherical'),
            tol=np.logspace(-3, -1, 6),
        ),
    ),
    'svr': dict(
        clf=SVR(),
        params=dict(
            C=np.logspace(-1,0,3),
            epsilon=np.logspace(-2,0,3),
            kernel=('rbf', ),  # only the 'rbf' kernel is searched
        ),
    ),
}


choice = regressors['knn']
gridsearch = True

if not gridsearch:
    # Plain fit with the parameters the estimator was created with
    best_clf = choice['clf'].fit(X, y)
else:
    cv = GridSearchCV(choice['clf'], choice['params'], verbose=2, n_jobs=-1)
    # Resetting the print options:
    np.set_printoptions(edgeitems=3,infstr='inf',
                        linewidth=75, nanstr='nan', precision=8,
                        suppress=False, threshold=1000, formatter=None)
    t0 = time.time()
    print("Fitting the GridSearch classifier:")
    cv.fit(X, y=y)
    print("Finished in %.2f s." % (time.time() - t0))
    print("Best parameters: ",cv.best_params_)
    best_clf = cv.best_estimator_

print("Getting the predictions for the training class")
clf_labels = best_clf.predict(X)

# Associate every number predicted on the training data with a label name
print("Getting the votes:")
votes = get_votes(labels_permut, clf_labels)
print(votes)


Fitting the GridSearch classifier:
Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] weights=uniform, algorithm=auto, n_neighbors=1 ..................
[CV] weights=uniform, algorithm=auto, n_neighbors=1 ..................
[CV] weights=uniform, algorithm=auto, n_neighbors=1 ..................
[CV] weights=distance, algorithm=auto, n_neighbors=1 .................
[CV] ... weights=uniform, algorithm=auto, n_neighbors=1, total=   0.1s
[CV] ... weights=uniform, algorithm=auto, n_neighbors=1, total=   0.1s
[CV] weights=distance, algorithm=auto, n_neighbors=1 .................
[CV] ... weights=uniform, algorithm=auto, n_neighbors=1, total=   0.1s
[CV] .. weights=distance, algorithm=auto, n_neighbors=1, total=   0.1s
[CV] weights=uniform, algorithm=auto, n_neighbors=11 .................
[CV] weights=distance, algorithm=auto, n_neighbors=1 .................
[CV] weights=uniform, algorithm=auto, n_neighbors=11 .................
[CV] .. weights=distance, algorithm=auto, n_neighbors

[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:    2.0s finished


In [27]:
# Compare the numbers returned by the estimator with the real class numbers,
# then measure the success rate on the testing set.
print(labels_numbers)
known_numbers = set(labels_numbers.values())
for k,v in votes.items():
    # Only the numbers that correspond to a real class are displayed
    if k in known_numbers:
        print("Label: '%15s', real number: %2d, guessed number: %2d" % (v, labels_numbers[v], k))
successes = 0

# [n_tested] counts the processed rows by position (not by index label) and
# stays at 0 when the testing set is empty.
n_tested = 0
for n_tested, (_, row) in enumerate(testing_set.iterrows(), start=1):
    # The audio files are located relative to data_home, as in extract_features
    prediction = predict(data_home + '/' + row.filename, votes, best_clf,
                         n_coefs=n_coefs, n_vect=n_vectors)
    if prediction == row.label:
        successes += 1
        print('✓', end=' ')
    else:
        print('✗', end=' ')
    print("Prediction: '%s', should be '%s'" % (prediction, row.label))
    print("\tSo far, success rate: %d/%d (%.2f %%)" % (successes, n_tested, successes/n_tested * 100))
if n_tested:
    print("Done. Success rate: %d/%d (%.2f %%)" % (successes, n_tested, successes/n_tested * 100))
else:
    print("Done. The testing set is empty.")

{'park': 11, 'bus': 1, 'office': 10, 'forest_path': 5, 'tram': 14, 'metro_station': 9, 'library': 8, 'car': 3, 'train': 13, 'cafe/restaurant': 2, 'city_center': 4, 'beach': 0, 'home': 7, 'residential_area': 12, 'grocery_store': 6}
Label: '          beach', real number:  0, guessed number:  0
Label: '            bus', real number:  1, guessed number:  1
Label: 'cafe/restaurant', real number:  2, guessed number:  2
Label: '            car', real number:  3, guessed number:  3
Label: '    city_center', real number:  4, guessed number:  4
Label: '    forest_path', real number:  5, guessed number:  5
Label: '  grocery_store', real number:  6, guessed number:  6
Label: '           home', real number:  7, guessed number:  7
Label: '        library', real number:  8, guessed number:  8
Label: '  metro_station', real number:  9, guessed number:  9
Label: '         office', real number: 10, guessed number: 10
Label: '           park', real number: 11, guessed number: 11
Label: 'residential_area'

## Prediction of unknown data:

In [28]:
# Write one predicted class number per line in "prediction.txt".
# Opening in "w" mode creates the file or empties an existing one, so the
# cell also runs when the file does not exist yet.
n_unknown = len(unknown_set)
with open("prediction.txt", "w") as f:
    for count, (_, row) in enumerate(unknown_set.iterrows()):
        # The audio files are located relative to data_home, as in extract_features
        prediction = predict(data_home + '/' + row.filename, votes, best_clf,
                             n_coefs=n_coefs, n_vect=n_vectors)
        #print("Prediction: '%s'" % prediction)
        f.write(str(labels_numbers[prediction]) + '\n')
        # Keep the partial results on disk if a later file fails
        f.flush()
        if count%10 == 0:
            print("Predicted %d/%d (%.0f %%)" % (count, n_unknown, count/n_unknown * 100))
print("Finished")

Predicted 0/298 (0 %)
Predicted 10/298 (3 %)
Predicted 20/298 (7 %)
Predicted 30/298 (10 %)
Predicted 40/298 (13 %)
Predicted 50/298 (17 %)
Predicted 60/298 (20 %)
Predicted 70/298 (23 %)
Predicted 80/298 (27 %)
Predicted 90/298 (30 %)
Predicted 100/298 (34 %)
Predicted 110/298 (37 %)
Predicted 120/298 (40 %)
Predicted 130/298 (44 %)
Predicted 140/298 (47 %)
Predicted 150/298 (50 %)
Predicted 160/298 (54 %)
Predicted 170/298 (57 %)
Predicted 180/298 (60 %)
Predicted 190/298 (64 %)
Predicted 200/298 (67 %)
Predicted 210/298 (70 %)
Predicted 220/298 (74 %)
Predicted 230/298 (77 %)
Predicted 240/298 (81 %)
Predicted 250/298 (84 %)
Predicted 260/298 (87 %)
Predicted 270/298 (91 %)
Predicted 280/298 (94 %)
Predicted 290/298 (97 %)
