# Challenge SD207 - 2017

Acoustic scene classification


# Data processing before classification:

For each audio sample, we extract its MFCC sequence so that every file is represented by a set of feature vectors (in fact, a matrix). The classification is then performed on this data.

In [31]:
import os
import re
import numpy as np
import pandas as pd
import librosa
from IPython.display import clear_output
import time

# Constants:
data_home = "."  # "/tsi/plato/sons/sd207"
training_file = 'audio/train.txt'
valid_file = 'audio/dev.txt'

# Each audio file will be represented by [n_vectors] vectors.
# Author's note: must be 1 < n_vectors < 1296, and if possible a divisor of 1296:
# 1, 2, 3, 4, 6, 8, 9, 12, 16, 18, 24, 27, 36, 48, 54, 72, 81, 108, 144, 162, 216, 324, 432, 648
n_vectors = 36

# We will keep [n_coefs] MFC coefficients.
n_coefs = 20

# Class name -> class number; the number is the position of the name in this tuple.
labels_numbers = {
    label_name: label_number
    for label_number, label_name in enumerate((
        "beach",
        "bus",
        "cafe/restaurant",
        "car",
        "city_center",
        "forest_path",
        "grocery_store",
        "home",
        "library",
        "metro_station",
        "office",
        "park",
        "residential_area",
        "train",
        "tram",
    ))
}

# Loading text files into pd.Dataframes.
# Each file is whitespace-separated. The separator is written as a raw string:
# '\s' inside a normal string literal is an invalid escape sequence and
# triggers a warning on recent Python versions (the regex itself is unchanged).
training_set = pd.read_csv(data_home + '/' + training_file, sep=r'\s+',
                           dtype=str, names=['filename', 'label'])
testing_set = pd.read_csv(data_home + '/' + valid_file, sep=r'\s+',
                          dtype=str, names=['filename', 'label'])

# The files to predict have no label column.
unknown_set = pd.read_csv(data_home + '/test_files.txt', sep=r'\s+',
                          dtype=str, names=['filename'])

# Quick overview: first ten rows and the size of each set.
print("Training set: \n ", training_set[:10])
print("...\n(%d samples)" % len(training_set))
print("\nTesting set:\n", testing_set[:10])
print("...\n(%d samples)" % len(testing_set))
print("\nUnknown set: \n ", unknown_set[:10])
print("...\n(%d samples)" % len(unknown_set))

Training set: 
                   filename  label
0     audio/b010_0_30.wav  beach
1    audio/b010_60_90.wav  beach
2  audio/b010_150_180.wav  beach
3    audio/b010_30_60.wav  beach
4  audio/b010_120_150.wav  beach
5  audio/b022_120_150.wav  beach
6    audio/b022_60_90.wav  beach
7  audio/b022_180_210.wav  beach
8    audio/b022_30_60.wav  beach
9   audio/b022_90_120.wav  beach
...
(582 samples)

Testing set:
                  filename  label
0    audio/b021_30_60.wav  beach
1  audio/b021_150_180.wav  beach
2   audio/b021_90_120.wav  beach
3  audio/b021_120_150.wav  beach
4    audio/b021_60_90.wav  beach
5  audio/b021_180_210.wav  beach
6     audio/b021_0_30.wav  beach
7  audio/b019_180_210.wav  beach
8  audio/b019_120_150.wav  beach
9   audio/b019_90_120.wav  beach
...
(290 samples)

Unknown set: 
                   filename
0     audio/b053_0_30.wav
1   audio/b035_90_120.wav
2  audio/b089_210_240.wav
3    audio/a034_30_60.wav
4  audio/a045_150_180.wav
5   audio/a058_90_120.wav
6    au

In [69]:
# Load the audio files:

def get_descr(path, n_coefs, n_vectors):
    """Returns a fixed-size matrix description of an audio file.

    The MFCC frames of the file are split into [n_vectors] consecutive groups
    of equal length (the last group also takes the leftover frames). Each
    group is summarised by the mean and the standard deviation of every
    coefficient over the frames of the group.

    :param path: The path to the audio file
    :param n_coefs: The number of MFC coefficients
    :param n_vectors: number of vectors that will represent the audio file.
        This is the number of frame groups the audio file is split into.
        Must be at least 1.

    :return: ndarray of shape (2 * n_coefs, n_vectors).
        Rows [0, n_coefs) hold the per-coefficient means of each group,
        rows [n_coefs, 2 * n_coefs) hold the standard deviations.
    :raises ValueError: if n_vectors < 1, or if the file has fewer MFCC
        frames than n_vectors (some groups would be empty)."""
    if n_vectors < 1:
        raise ValueError("n_vectors must be >= 1, got %r" % (n_vectors,))

    y, sr = librosa.load(path)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_coefs)

    n_frames = mfcc.shape[1]
    # Number of MFCC frames collapsed into one vector
    p = n_frames // n_vectors
    if p == 0:
        # Empty groups would silently produce NaN columns in the description.
        raise ValueError("%s has only %d MFCC frames, cannot split it into %d vectors"
                         % (path, n_frames, n_vectors))

    # We have too many columns in [mfcc], we will stack them into [X]
    X = np.zeros((2*n_coefs, n_vectors))
    for i in range(n_vectors):
        start = i * p
        # The last group runs to the end so that no frame is dropped.
        stop = (i + 1) * p if i < n_vectors - 1 else n_frames
        chunk = mfcc[:, start:stop]
        X[:n_coefs, i] = np.mean(chunk, axis=1)
        X[n_coefs:2*n_coefs, i] = np.std(chunk, axis=1)
    return X

def get_data_filename(descr, n_coefs, n_vectors):
    """Builds the names of the two files used to save a processed dataset.

    :param descr: prefix describing the dataset (inserted first in both names)
    :param n_coefs: number of MFC coefficients (inserted in both names)
    :param n_vectors: number of vectors per audio file (inserted in both names)

    :return: set, label
        set   - filename for the audio data
        label - filename for the labels"""
    templates = ('{0}_set_{1}coef_{2}vect.txt',
                 '{0}_labels_{1}coef_{2}vect.txt')
    return tuple(template.format(descr, n_coefs, n_vectors) for template in templates)

def extract_features(dataframe, n_vectors, n_coefs, data_home=".",  saving='autosave', return_labels=True):
    """ Extract the features from a batch of audio files and saves them in a file.

    If the files named by get_data_filename(saving, n_coefs, n_vectors) already
    exist, the data is read back from them instead of being recomputed.

    :param dataframe: pd.DataFrame with a 'filename' column and, when
                      'return_labels' is True, a 'label' column. Rows are
                      processed in order; the index of the frame is not used.
    :param n_vectors: The number of vectors that represent an audio sample
    :param n_coefs:   The number of MFC coefficients to extract
    :param data_home: The path to the audio files
    :param saving:    The description of the filename saving the data.
                      This function will save two files: the data from
                      the audio files, and the labels.
    :param return_labels: Whether the labels are built, saved and returned.

    :return: data, labels (or data alone if 'return_labels' is False)
        - data: ndarray of shape (2 * n_coefs, N_samples * n_vectors) containing
                the description (see get_descr) of each audio sample
        - labels: ndarray of shape (N_samples * n_vectors, ) containing the labels
                for each vector. """
    filename_data, filename_labels = get_data_filename(saving, n_coefs, n_vectors)

    # The cache is usable only if every file we need to read is present.
    cache_complete = os.path.isfile(filename_data) and (
        not return_labels or os.path.isfile(filename_labels))

    if not cache_complete:
        print("%s (or %s) not found in the folder." % (filename_data, filename_labels))
        # Not already processed:
        print("First time processing audio files.")
        n_samples = len(dataframe)
        data = np.zeros((2*n_coefs, n_samples * n_vectors))
        t0 = time.time()
        # Positional counter: the column offset must not depend on the index
        # labels of the frame (a filtered or shuffled frame has arbitrary labels).
        for pos, filename in enumerate(dataframe['filename']):
            if pos % 10 == 0:
                print("Processed samples: %d/%d..." % (pos, n_samples))
                clear_output(wait=True)
            descr = get_descr(data_home + '/' + filename, n_coefs, n_vectors)
            data[:, pos*n_vectors:(pos+1)*n_vectors] = descr
        if return_labels:
            # Every vector of a sample carries the label of that sample.
            labels = np.repeat(dataframe['label'].values.astype(object), n_vectors)
        print("Done processing in %.2f" % (time.time() - t0))
        print("You can check the shape of the data:", data.shape)
        if return_labels:
            print("And labels:", labels.shape)
        print("Saving the processed data.")
        np.savetxt(filename_data, data)
        if return_labels:
            np.savetxt(filename_labels, labels, fmt="%s")
    else:
        print("Audio files already processed, retrieving the file.")
        data = np.loadtxt(filename_data)
        print("You can check the shape of the data:", data.shape)
        if return_labels:
            labels = np.array(pd.read_csv(filename_labels, header=None)).ravel()
            print("And labels:", labels.shape)
            print("Overview of labels: ", labels)

    if return_labels:
        return data, labels
    return data

# Extract (or reload from the saved files) the features of the training set,
# then print how many feature vectors each class contributes.
data, labels = extract_features(training_set, n_vectors, n_coefs, data_home=data_home, saving="training")
counts = np.unique(labels, return_counts=True)
for i, class_name in enumerate(counts[0]):
    print("%s: %d" % (class_name, counts[1][i]))

Audio files already processed, retrieving the file.
You can check the shape of the data: (40, 20952)
And labels: (20952,)
Overview of labels:  ['beach' 'beach' 'beach' ..., 'tram' 'tram' 'tram']
beach: 1368
bus: 1404
cafe/restaurant: 1368
car: 1440
city_center: 1440
forest_path: 1512
grocery_store: 1368
home: 1440
library: 1440
metro_station: 1368
office: 1332
park: 1440
residential_area: 1368
train: 1296
tram: 1368


## Let's do the actual training and prediction!

In [None]:
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
# The sklearn.lda and sklearn.qda modules were removed from scikit-learn;
# the same estimators are provided by sklearn.discriminant_analysis.
# They are aliased to LDA / QDA so the rest of the notebook is unchanged.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.ensemble import VotingClassifier

# One row per feature vector (data is stored with one column per vector).
X = data.T
# Class number of each vector, looked up from its label name.
y = np.array([labels_numbers[name] for name in labels])



def most_frequent(array):
    """Return the value that occurs most often in [array].

    np.unique returns the distinct values sorted, and argmax keeps the first
    maximum, so a tie is resolved in favour of the smallest value."""
    values, occurrences = np.unique(array, return_counts=True)
    winner = np.argmax(occurrences)
    return values[winner]

def predict(path, votes, clf, n_coefs=20, n_vect=10, verbose=False):
    """ Return the assumed class label of an audio file.

    The file is described with get_descr, every vector of the description is
    classified, the predictions are rounded, and the most frequent rounded
    prediction is looked up in [votes].

    :param path: The path to the file
    :param votes: A dictionary {number: label} that represents the association
        between the numbers returned by the classifier
        and the actual classes of the data.
    :param clf: The classifier object (its predict method is called)
    :param n_coefs: Number of MFCC coefficients to extract from the audio file
    :param n_vect: Number of vectors (audio frames) the audio file should be
            split into
    :param verbose: If True, print the per-vector predictions and the selection

    :return: votes[selection], i.e. the entry of [votes] associated with the
        most frequent prediction for this audio file."""
    samples = get_descr(path, n_coefs, n_vect).T  # one row per vector
    predictions = clf.predict(samples).round()
    selection = most_frequent(predictions)
    #TODO: take all the predictions and take weights.
    if not verbose:
        return votes[selection]
    print("Predictions for sample %s:" % path)
    print(predictions)
    print("%s seems to belong in class n. '%d'" % (path, selection))
    return votes[selection]
  

def get_votes(labels, predictions, verbose=False):
    """ Associate each number returned by the classifier with a label name.

    For every distinct rounded prediction, the label name that is most
    frequent among the samples given that prediction is kept.

    :param labels: The label names, aligned with [predictions]
    :param predictions: The predictions for the training set
    :param verbose: If True, print the label counts of every group
    :return: a dictionary {number: label name}"""
    rounded = predictions.round()
    corresp = {}
    for class_number in np.unique(rounded):
        # All the labels of the samples that were assigned to this number
        members = labels[rounded == class_number]
        if verbose:
            print("Samples that were assigned to clas %d came from labels:"% class_number)
            print(np.unique(members, return_counts=True))
        corresp[class_number] = most_frequent(members)
    return corresp

# Shuffle the samples (features, class numbers and label names together).
# A dedicated, seeded generator is used so that the permutation, and therefore
# the cross-validation folds and the fitted models, are the same on every run.
shuffle_rng = np.random.RandomState(0)
permut = shuffle_rng.permutation(len(X))
X = X[permut]
y = y[permut]
labels_permut = labels[permut]

# Candidate estimators, keyed by a short name. Each entry holds the estimator
# ('clf') and the parameter grid ('params') explored by the grid search.
regressors = dict(
    knn=dict(
        clf=KNeighborsRegressor(),
        params=dict(
            n_neighbors=np.arange(1, 20, 10, dtype=int),
            weights=('uniform', 'distance',),
            algorithm=('auto', 'ball_tree', ),
        ),
    ),
    gmm=dict(
        clf=GaussianMixture(n_components=16),
        params=dict(
            n_components=[16],
            covariance_type=('full', 'tied', 'diag', 'spherical'),
            tol=np.logspace(-3, -1, 6),
        ),
    ),
    svr=dict(
        clf=SVR(),
        params=dict(
            C=np.logspace(-1, 0, 3),
            epsilon=np.logspace(-2, 0, 3),
            kernel=('rbf', ),  # 'poly', 'rbf')
        ),
    ),
    logit=dict(
        clf=LogisticRegression(),
        params=dict(
            C=np.logspace(-2, 1, 4),
            penalty=('l2',),
            solver=('newton-cg', 'lbfgs', 'liblinear', 'sag'),
        ),
    ),
    lda=dict(
        clf=LDA(),
        params=dict(
            solver=('svd', 'lsqr', 'eigen'),
        ),
    ),
    qda=dict(
        clf=QDA(),
        params=dict(),  # nothing to tune
    ),
)


# Short names of the estimators that take part in the vote.
choices = ['lda', 'qda', 'knn']
estimators = []
for shortname in choices:
    print("\nSelection: '%s'" % shortname)
    candidate = regressors[shortname]
    cv = GridSearchCV(candidate['clf'], candidate['params'],
                      verbose=1, n_jobs=-1)
    t0 = time.time()
    print("Fitting the GridSearch classifier:")
    cv.fit(X, y=y)
    elapsed = time.time() - t0
    print("Finished in %.2f s." % elapsed)
    print("Best parameters: ", cv.best_params_)
    # Keep the best estimator found by the grid search for the final vote.
    estimators.append((shortname, cv.best_estimator_))


print("Fitting the VotingClassifer")
voting_clf = VotingClassifier(estimators, n_jobs=-1)
voting_clf.fit(X, y)
print("Getting the predictions for the training class")
clf_labels = voting_clf.predict(X)

print("Getting the votes:")
# Map every predicted number to the label name it most often corresponds to.
votes = get_votes(labels_permut, clf_labels)
print(votes)

Selection: 'lda'
Fitting the GridSearch classifier:
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Finished in 0.70 s.
Best parameters:  {'solver': 'svd'}
Selection: 'qda'
Fitting the GridSearch classifier:
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    0.5s finished


Finished in 0.98 s.
Best parameters:  {}
Selection: 'knn'
Fitting the GridSearch classifier:
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.7s finished


In [95]:
# Show which label name each predicted number was associated with.
for k, v in votes.items():
    if not 0 <= k <= 14:
        continue
    print("Label: '%16s', real number: %2d, guessed number: %2d" % (v, labels_numbers[v], k))
successes = 0

# Check if the 'dev' features are already extracted in a file:
dev_data, dev_labels = extract_features(testing_set, n_vectors, n_coefs, data_home=data_home, saving="testing")

# Do the prediction on the dataset
for i, row in testing_set.iterrows():
    # Columns holding the vectors of sample i
    first, last = i*n_vectors, (i+1)*n_vectors
    descr = dev_data[:, first:last]
    predictions = voting_clf.predict(descr.T).round()
    # The file gets the class predicted for most of its vectors.
    selection = most_frequent(predictions)
    prediction = votes[selection]
    is_correct = prediction == row.label
    if is_correct:
        successes += 1
    print('✓' if is_correct else '✗', end=' ')
    print("Prediction: '%s', should be '%s'" % (prediction, row.label), end="")
    print("\tSo far, success rate: %d/%d (%.2f %%)" % (successes, i+1, successes/(i+1) * 100))
print("Done. Success rate: %d/%d (%.2f %%)" % (successes, i+1, successes/(i+1) * 100))

Label: '           beach', real number:  0, guessed number:  0
Label: '             bus', real number:  1, guessed number:  1
Label: ' cafe/restaurant', real number:  2, guessed number:  2
Label: '             car', real number:  3, guessed number:  3
Label: '     city_center', real number:  4, guessed number:  4
Label: '     forest_path', real number:  5, guessed number:  5
Label: '   grocery_store', real number:  6, guessed number:  6
Label: '            home', real number:  7, guessed number:  7
Label: '         library', real number:  8, guessed number:  8
Label: '   metro_station', real number:  9, guessed number:  9
Label: '          office', real number: 10, guessed number: 10
Label: '            park', real number: 11, guessed number: 11
Label: 'residential_area', real number: 12, guessed number: 12
Label: '           train', real number: 13, guessed number: 13
Label: '            tram', real number: 14, guessed number: 14
Audio files already processed, retrieving the file.
You

## Prediction of unknown data:

In [96]:
# Check if the 'unknown' features are already extracted in a file:
unknown_data = extract_features(unknown_set, n_vectors, n_coefs,
                                data_home=data_home, saving="unknown", return_labels=False)

# Do the prediction on the dataset.
# The output file is opened once in "w" mode: this truncates a previous
# prediction file if there is one and does not fail when there is none
# (os.remove raised FileNotFoundError on the first run).
with open("prediction.txt", "w") as f:
    for i, row in unknown_set.iterrows():
        # Columns holding the vectors of sample i
        descr = unknown_data[:, i*n_vectors:(i+1)*n_vectors]
        # Use the classifier fitted and evaluated in the previous cells
        # ('best_clf' is not defined anywhere in this notebook).
        predictions = voting_clf.predict(descr.T).round()
        selection = most_frequent(predictions)
        prediction = votes[selection]
        # One class number per line, in the order of the unknown set.
        f.write(str(labels_numbers[prediction]) + '\n')
        if i%10 == 0:
            print("Predicted %d/%d (%.0f %%)" % (i, len(unknown_set), i/len(unknown_set) * 100))
print("Finished")
    

# for i, row in unknown_set.iterrows():
#     prediction = predict(row.filename, votes, best_clf, n_coefs=n_coefs, n_vect=n_vectors)
#     #print("Prediction: '%s'" % prediction)
#     with open("prediction.txt", "a+") as f:
#         f.write(str(labels_numbers[prediction]) + '\n')
#     if i%10 == 0:
#         print("Predicted %d/%d (%.0f %%)" % (i, len(unknown_set), i/len(unknown_set) * 100))
#         clear_output(wait=True)
# print("Finished")

Audio files already processed, retrieving the file.
You can check the shape of the data: (40, 10728)
Predicted 0/298 (0 %)
Predicted 10/298 (3 %)
Predicted 20/298 (7 %)
Predicted 30/298 (10 %)
Predicted 40/298 (13 %)
Predicted 50/298 (17 %)
Predicted 60/298 (20 %)
Predicted 70/298 (23 %)
Predicted 80/298 (27 %)
Predicted 90/298 (30 %)
Predicted 100/298 (34 %)
Predicted 110/298 (37 %)
Predicted 120/298 (40 %)
Predicted 130/298 (44 %)
Predicted 140/298 (47 %)
Predicted 150/298 (50 %)
Predicted 160/298 (54 %)
Predicted 170/298 (57 %)
Predicted 180/298 (60 %)
Predicted 190/298 (64 %)
Predicted 200/298 (67 %)
Predicted 210/298 (70 %)
Predicted 220/298 (74 %)
Predicted 230/298 (77 %)
Predicted 240/298 (81 %)
Predicted 250/298 (84 %)
Predicted 260/298 (87 %)
Predicted 270/298 (91 %)
Predicted 280/298 (94 %)
Predicted 290/298 (97 %)
Finished
