# Challenge SD207 - 2017

Acoustic scene classification

Loïc Herbelot

# Data processing before classification:

We take the audio samples and extract MFCC features from them, so that each audio file is represented by a set of vectors (in fact, a matrix). We then run the classification on these data.

In [1]:
import os
import re
import numpy as np
import pandas as pd
import librosa
from IPython.display import clear_output
import time

# Constants:
data_home = "."  # "/tsi/plato/sons/sd207"
training_file = 'audio/train.txt'
valid_file = 'audio/dev.txt'
# Each audio file will be represented by [n_vectors] vectors.
# Must be 1 < n_vectors < 1296.
# If possible, try to set n_vectors to a divisor of 1296:
# 1, 2, 3, 4, 6, 8, 9, 12, 16, 18, 24, 27, 36, 48, 54, 72, 81, 108, 144, 162, 216, 324, 432, 648
n_vectors = 72
n_coefs = 60  # We will keep [n_coefs] MFC coefficients.

# Mapping from the label names found in the text files to class numbers.
labels_numbers = {"beach": 0,
                  "bus": 1,
                  "cafe/restaurant": 2,
                  "car": 3,
                  "city_center": 4,
                  "forest_path": 5,
                  "grocery_store": 6,
                  "home": 7,
                  "library": 8,
                  "metro_station": 9,
                  "office": 10,
                  "park": 11,
                  "residential_area": 12,
                  "train": 13,
                  "tram": 14}

# Loading text files into pd.DataFrames.
# The separator is a regular expression, so it is written as a raw string:
# '\s' in a normal string literal is an invalid escape sequence.
training_set = pd.read_csv(data_home + '/' + training_file, sep=r'\s+',
                           dtype=str, names=['filename', 'label'])
testing_set = pd.read_csv(data_home + '/' + valid_file, sep=r'\s+',
                          dtype=str, names=['filename', 'label'])

# The files to predict come without labels, hence a single column.
unknown_set = pd.read_csv(data_home + '/test_files.txt', sep=r'\s+',
                          dtype=str, names=['filename'])

# Quick overview of the three sets.
print("Training set: \n ", training_set[:10])
print("...\n(%d samples)" % len(training_set))
print("\nTesting set:\n", testing_set[:10])
print("...\n(%d samples)" % len(testing_set))
print("\nUnknown set: \n ", unknown_set[:10])
print("...\n(%d samples)" % len(unknown_set))

Training set: 
                   filename  label
0     audio/b010_0_30.wav  beach
1    audio/b010_60_90.wav  beach
2  audio/b010_150_180.wav  beach
3    audio/b010_30_60.wav  beach
4  audio/b010_120_150.wav  beach
5  audio/b022_120_150.wav  beach
6    audio/b022_60_90.wav  beach
7  audio/b022_180_210.wav  beach
8    audio/b022_30_60.wav  beach
9   audio/b022_90_120.wav  beach
...
(582 samples)

Testing set:
                  filename  label
0    audio/b021_30_60.wav  beach
1  audio/b021_150_180.wav  beach
2   audio/b021_90_120.wav  beach
3  audio/b021_120_150.wav  beach
4    audio/b021_60_90.wav  beach
5  audio/b021_180_210.wav  beach
6     audio/b021_0_30.wav  beach
7  audio/b019_180_210.wav  beach
8  audio/b019_120_150.wav  beach
9   audio/b019_90_120.wav  beach
...
(290 samples)

Unknown set: 
                   filename
0     audio/b053_0_30.wav
1   audio/b035_90_120.wav
2  audio/b089_210_240.wav
3    audio/a034_30_60.wav
4  audio/a045_150_180.wav
5   audio/a058_90_120.wav
6    au

In [None]:
# Load the audio files:

def get_descr(path, n_coefs, n_vectors):
    """Returns a matrix description of an audio file.

    The MFCC frames of the file are split into [n_vectors] consecutive
    groups of equal size (the last group also takes the remaining frames),
    and each group is summarised by its mean and standard deviation.

    :param path: The path to the audio file
    :param n_coefs: The number of MFC coefficients
    :param n_vectors: number of vectors that will represent the audio file

    :return: ndarray of shape (2 * n_coefs, n_vectors)
        where each column holds the mean (first n_coefs rows) and the
        standard deviation (last n_coefs rows) of the MFCC sequence
        over one group of frames of the audio file.
    :raises ValueError: if n_vectors < 1, or if the audio file has fewer
        MFCC frames than n_vectors (some groups would be empty)."""
    if n_vectors < 1:
        raise ValueError("n_vectors must be at least 1, got %r" % (n_vectors,))
    y, sr = librosa.load(path)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_coefs)
    n_frames = mfcc.shape[1]
    if n_frames < n_vectors:
        # Empty groups would silently produce NaN means and deviations.
        raise ValueError("'%s' has only %d MFCC frames, cannot build %d vectors"
                         % (path, n_frames, n_vectors))
    # We have too many columns in [mfcc], we will group them into [X]
    X = np.zeros((2*n_coefs, n_vectors))
    # Number of frames we will take to make one vector
    p = n_frames // n_vectors
    last = n_vectors - 1
    for i in range(n_vectors):
        # The last group runs to the end so that no frame is dropped when
        # n_frames is not a multiple of n_vectors.
        stop = n_frames if i == last else (i+1)*p
        group = mfcc[:, i*p:stop]
        # We collapse the frames of the group into one single column in [X]
        X[:n_coefs, i] = np.mean(group, axis=1)
        X[n_coefs:2*n_coefs, i] = np.std(group, axis=1)
    return X

def get_data_filename(descr, n_coefs, n_vectors):
    """Builds the names of the two cache files for a given configuration.

    :param descr: Prefix of both file names, as a string
    :param n_coefs: Number of MFC coefficients (part of the file name)
    :param n_vectors: Number of vectors per audio file (part of the file name)

    :return: (set, label)
        set   - name of the file holding the audio data
        label - name of the file holding the labels"""
    # Both names share the same ending, only the middle word differs.
    ending = '_{0}coef_{1}vect.txt'.format(n_coefs, n_vectors)
    set_name = '{0}_set'.format(descr) + ending
    labels_name = '{0}_labels'.format(descr) + ending
    return (set_name, labels_name)

def extract_features(dataframe, n_vectors, n_coefs, data_home=".",  saving='autosave', return_labels=True):
    """Extracts the features from a batch of audio files and caches them in files.

    On the first call for a given (saving, n_coefs, n_vectors) combination
    the audio files are processed and the result is written to text files;
    later calls reload those files instead of processing the audio again.

    :param dataframe:     pd.DataFrame with a 'filename' column (and a 'label'
                          column when return_labels is True). Rows are
                          processed in order; the index values are not used.
    :param n_vectors:     The number of vectors that represent an audio sample
    :param n_coefs:       The number of MFC coefficients to extract
    :param data_home:     The path to the audio files
    :param saving:        The prefix of the file names used to save the data.
                          This function will save two files: the data from
                          the audio files, and the labels.
    :param return_labels: Whether or not to parse and return labels

    :return: data, labels (or data alone if return_labels is False)
        - data: ndarray of shape (2 * n_coefs, N_samples * n_vectors) holding,
                for each vector, the means then the standard deviations
                of the MFC coefficients
        - labels: ndarray of shape (N_samples * n_vectors, ) containing the
                label of each vector. Only if return_labels is True."""
    filename_data, filename_labels = get_data_filename(saving, n_coefs, n_vectors)

    # Test the files themselves rather than the listing of the current
    # folder, so that a [saving] prefix containing a directory also works.
    already_processed = os.path.isfile(filename_data) and (
        not return_labels or os.path.isfile(filename_labels))

    if not already_processed:
        print("%s (or %s) not found in the folder." % (filename_data, filename_labels))
        # Not already processed:
        print("First time processing audio files.")
        n_samples = len(dataframe)
        data = np.zeros((2*n_coefs, n_samples * n_vectors))
        if return_labels:
            labels = np.zeros(n_samples * n_vectors, dtype=object)
        t0 = time.time()
        # Get the description of all audio files.
        # The row position (not the index label) selects the columns, so a
        # filtered or re-indexed dataframe is handled correctly.
        for i, (_, row) in enumerate(dataframe.iterrows()):
            if i % 10 == 0:
                print("Processed samples: %d/%d..." % (i, n_samples))
            descr = get_descr(data_home + '/' + row.filename, n_coefs, n_vectors)
            data[:, i*n_vectors:(i+1)*n_vectors] = descr
            if return_labels:
                # Every vector of the file gets the label of the file
                labels[i*n_vectors:(i+1)*n_vectors] = row.label
        print("Done processing in %.2f" % (time.time() - t0))
        print("You can check the shape of the data:", data.shape)
        if return_labels:
            print("And labels:", labels.shape)
        print("Saving the processed data.")
        np.savetxt(filename_data, data)
        if return_labels:
            np.savetxt(filename_labels, labels, fmt="%s")
    else:
        print("Audio files already processed, retrieving the file.")
        data = np.loadtxt(filename_data)
        print("You can check the shape of the data:", data.shape)
        if return_labels:
            # One label per line, no header
            labels = np.array(pd.read_csv(filename_labels, header=None)).ravel()
            print("And labels:", labels.shape)
            print("Overview of labels: ", labels)

    if return_labels:
        return data, labels
    return data

# Features and labels of the training set (computed once, then read from the cache files)
data, labels = extract_features(training_set, n_vectors, n_coefs,
                                data_home=data_home, saving="training")
# counts[0] holds the distinct labels, counts[1] how many vectors carry each of them
counts = np.unique(labels, return_counts=True)
print("Distribution of training samples:")
for label_name, n_occurrences in zip(*counts):
    print("%s: %d" % (label_name, n_occurrences))

Audio files already processed, retrieving the file.
You can check the shape of the data: (120, 41904)
And labels: (41904,)
Overview of labels:  ['beach' 'beach' 'beach' ..., 'tram' 'tram' 'tram']
beach: 2736
bus: 2808
cafe/restaurant: 2736
car: 2880
city_center: 2880
forest_path: 3024
grocery_store: 2736
home: 2880
library: 2880
metro_station: 2736
office: 2664
park: 2880
residential_area: 2736
train: 2592
tram: 2736


## Let's do the actual training and prediction!

In [None]:
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.lda import LDA
from sklearn.qda import QDA
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingClassifier

def most_frequent(array):
    """Return the value that appears most often in an array-like.

    When several values are tied, the smallest one (first in the sorted
    order used by np.unique) is returned."""
    values, occurrences = np.unique(array, return_counts=True)
    winner = np.argmax(occurrences)
    return values[winner]

# One row per vector: [data] has one column per vector, so it is transposed.
X = data.T
# Class number of each vector, looked up from its label name.
y = np.array([labels_numbers[name] for name in labels])

# Shuffle the vectors and their targets with the same random permutation.
permut = np.random.permutation(len(X))
X = X[permut]
y = y[permut]

# Candidate estimators, keyed by a short name. For each one:
#   'clf'    - the estimator instance handed to GridSearchCV
#   'params' - the parameter grid explored for that estimator
# NOTE(review): 'knn', 'svr' and 'random_forest' are regressors fitted on the
# integer class numbers, while the others are classifiers, and all of them are
# later combined in a VotingClassifier - confirm this mix is intended.
regressors = {'knn':{'clf': KNeighborsRegressor(),
                     # np.arange(1, 20, 10) yields only 1 and 11
                     'params': {'n_neighbors': np.arange(1, 20, 10, dtype=int),
                              'weights': ('uniform', 'distance',),
                              'algorithm': ('auto', 'ball_tree', )}
                    },
              'svr':{'clf':SVR(),
                     'params': {'C' : np.logspace(-1,0,3),
                                'epsilon' : np.logspace(-2,0,3),
                                'kernel' : ('rbf', )}# 'poly', 'rbf')}
                    },
              'logit':{'clf':LogisticRegression(),
                     'params': {'C' : np.logspace(-2,1,4),
                                'penalty' : ('l2',),
                                'solver': ('newton-cg', 'lbfgs', 'liblinear', 'sag')
                               }
                    },
              'lda':{'clf':LDA(),
                     'params': {'solver' : ('svd', 'lsqr', 'eigen'),
                                'n_components': np.arange(1, 15, 2),
                               }
                    },
              # Empty grid: a single candidate with the default parameters
              'qda':{'clf':QDA(),
                     'params': {
                               }
                    },
              'random_forest':{'clf':RandomForestRegressor(),
                     'params': {'n_estimators': np.arange(5, 50, 10),
                                'max_features': ('auto', ),#'sqrt', 'log2', None),
                                'max_depth': np.logspace(1, 1.6, 3, dtype=int),
                                'bootstrap': (True, ),#False),
                                
                               }
                    }
}


# Short names of the estimators to tune, in the order they are fitted.
choices = ['lda', 'qda', 'knn', 'logit', 'svr', 'random_forest']
# Selecting the best parameters for the estimators and saving them
# into [estimators] as (short name, best estimator) pairs
estimators = []
for shortname in choices:
    print("\nSelection: '%s'" % shortname)
    # No [cv] or [scoring] argument is given, so the GridSearchCV defaults
    # apply (the recorded output of this cell reports 3 folds).
    cv = GridSearchCV(regressors[shortname]['clf'], regressors[shortname]['params'], 
                      verbose=1, n_jobs=-1)
    t0 = time.time()
    print("Fitting the GridSearch classifier:")
    cv.fit(X, y=y)
    print("Finished in %.2f s." % (time.time() - t0))
    print("Best parameters: ",cv.best_params_)
    estimators.append((shortname, cv.best_estimator_))
    

# The tuned estimators are combined and fitted again on the whole training data.
print("\n\nFitting the VotingClassifer")
voting_clf = VotingClassifier(estimators, n_jobs=-1).fit(X, y)
# Disabled check on the training data. NOTE(review): [get_votes] and
# [labels_permut] are not defined in the visible part of this notebook.
# print("Getting the predictions for the training class")
# clf_labels = voting_clf.predict(X)

# print("Getting the votes:")
# votes = get_votes(labels_permut, clf_labels)
# print(votes)


Selection: 'lda'
Fitting the GridSearch classifier:
Fitting 3 folds for each of 21 candidates, totalling 63 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done  63 out of  63 | elapsed:   23.3s finished


Finished in 24.41 s.
Best parameters:  {'solver': 'svd', 'n_components': 1}

Selection: 'qda'
Fitting the GridSearch classifier:
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    6.7s finished


Finished in 7.62 s.
Best parameters:  {}

Selection: 'knn'
Fitting the GridSearch classifier:
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  8.1min finished


Finished in 483.94 s.
Best parameters:  {'algorithm': 'auto', 'n_neighbors': 1, 'weights': 'uniform'}

Selection: 'logit'
Fitting the GridSearch classifier:
Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed: 55.3min finished


Finished in 3476.08 s.
Best parameters:  {'penalty': 'l2', 'C': 0.01, 'solver': 'newton-cg'}

Selection: 'svr'
Fitting the GridSearch classifier:
Fitting 3 folds for each of 9 candidates, totalling 27 fits


In [None]:
# Checking our classifier on the testing dataset:

# Retrieve data
dev_data, dev_labels = extract_features(testing_set, n_vectors, n_coefs, data_home=data_home, saving="testing")

# The estimators predict class numbers while the dataframe holds label names:
# build the reverse lookup so that both can be compared.
numbers_labels = {number: name for name, number in labels_numbers.items()}

n_files = len(testing_set)
successes = 0
# Iterate over the audio files. The row position (not the index label)
# selects the columns of [dev_data] belonging to the file.
for i, (_, row) in enumerate(testing_set.iterrows()):
    # Retrieve the description of this audio file
    descr = dev_data[:, i*n_vectors:(i+1)*n_vectors]
    predictions = voting_clf.predict(descr.T).round()
    # Select the most frequent prediction and translate it to a label name.
    # A number outside the known classes is reported as its plain value.
    predicted_number = int(most_frequent(predictions))
    prediction = numbers_labels.get(predicted_number, str(predicted_number))
    # eye candy:
    if prediction == row.label:
        successes += 1
        print('✓', end=' ')
    else:
        print('✗', end=' ')
    print("Prediction: '%s', should be '%s'" % (prediction, row.label), end="")
    print("\tSo far, success rate: %d/%d (%.2f %%)" % (successes, i+1, successes/(i+1) * 100))
# Use the size of the set rather than the loop variable, which does not
# exist when the set is empty.
final_rate = successes/n_files * 100 if n_files else 0.0
print("Done. Success rate: %d/%d (%.2f %%)" % (successes, n_files, final_rate))

## Prediction of unknown data:

In [None]:
# Keep the previous predictions (if any) as a backup. On the very first run
# there is nothing to rename, which must not stop the cell.
if os.path.exists("prediction.txt"):
    os.replace("prediction.txt", "old_prediction.txt")

# Check if the 'unknown' features are already extracted in a file:
unknown_data = extract_features(unknown_set, n_vectors, n_coefs,
                                data_home=data_home, saving="unknown", return_labels=False)

n_unknown = len(unknown_set)
# The output file is opened once for the whole loop; it holds one class
# number per line, in the order of [unknown_set].
with open("prediction.txt", "w") as f:
    # Iterate over the audio files, by position
    for i in range(n_unknown):
        # Retrieve the description of this audio file
        descr = unknown_data[:, i*n_vectors:(i+1)*n_vectors]
        predictions = voting_clf.predict(descr.T).round()
        # Select the most frequent prediction
        prediction = most_frequent(predictions)
        # Output to the file: the prediction already is a class number
        # (the keys of [labels_numbers] are label names, not numbers).
        f.write(str(int(prediction)) + '\n')
        if i%10 == 0:
            print("Predicted %d/%d (%.0f %%)" % (i, n_unknown, i/n_unknown * 100))
print("Finished")