Prepare the audio dataset and set the key hyperparameters

In [1]:
import pandas as pd

# Root folder of the dataset; the split manifests below are tab-separated files
# read relative to it.
datapath = "audioset/"
train_df, dev_df, eval_df = (
    pd.read_csv(datapath + csv_name, sep="\t")
    for csv_name in ("train.csv", "dev.csv", "eval.csv")
)

# Scene classes. A label's position in this list is its integer class id.
unique_labels = [
    'airport', 'bus', 'metro', 'metro_station', 'park',
    'public_square', 'shopping_mall', 'street_pedestrian',
    'street_traffic', 'tram',
]
label_dict = {name: idx for idx, name in enumerate(unique_labels)}

# Feature-extraction settings (passed to extract_mfcc in later cells).
n_mels = 40
feature_option = "mfcc"

# Settings shared by every per-class Gaussian mixture model.
gmm_n_components = 8
gmm_covariance_type = 'diag'

Train

In [2]:
# extract feature
import numpy as np
from Extract_feature import extract_mfcc

# Training split: every clip's feature array is flattened into a single row
# (reshape(1, -1)) and the rows are concatenated into one 2-D matrix.
train_files = list(train_df["filename"])
train_labels = list(train_df["scene_label"])
train_features = np.concatenate(
    [
        extract_mfcc(datapath + name, n_mels=n_mels, option=feature_option).reshape(1, -1)
        for name in train_files
    ],
    axis=0,
)
print(f"train feature size: {train_features.shape}")

# Dev split: same flattening; labels are converted to integer class ids
# through label_dict.
dev_files = list(dev_df["filename"])
dev_labels = [label_dict[name] for name in dev_df["scene_label"]]
dev_features = np.concatenate(
    [
        extract_mfcc(datapath + name, n_mels=n_mels, option=feature_option).reshape(1, -1)
        for name in dev_files
    ],
    axis=0,
)
print(f"dev feature size: {dev_features.shape}")

train feature size: (3008, 6487)
dev feature size: (379, 6487)


In [3]:
# train GMMs for each class
import numpy as np
from sklearn.mixture import GaussianMixture

# Fit one GaussianMixture per scene class, stored under the class's integer id.
#
# Rows of train_features are in the same order as the rows of train_df (they were
# extracted by iterating train_df.filename), so a boolean mask built from
# train_df.scene_label picks out exactly the feature rows of one class. This does
# not depend on train.csv being grouped by class or on the classes appearing in
# the same order as unique_labels.
gmm_classifier_dict = {}
for ind, c in enumerate(unique_labels):
    class_mask = (train_df.scene_label == c).to_numpy()
    gmm = GaussianMixture(n_components=gmm_n_components, max_iter=100, n_init=3, tol=1e-3,
                          covariance_type=gmm_covariance_type, init_params="kmeans", random_state=0)
    gmm.fit(train_features[class_mask])
    # Number of EM iterations used by the best of the n_init runs.
    print(gmm.n_iter_)
    gmm_classifier_dict[ind] = gmm



11




11




9




12




8




4




8




10




8




6


In [4]:
# dev
from sklearn.metrics import confusion_matrix

# Score every dev clip under each class GMM and predict the best-scoring class.
class_ids = list(label_dict.values())
pre_scores_list = np.stack(
    [gmm_classifier_dict[i].score_samples(dev_features) for i in class_ids],
    axis=0,
)  # one row of per-clip log-likelihoods per class
# argmax yields a row position; map it back to the class id explicitly so the
# prediction does not rely on ids happening to equal their position.
dev_pre_labels = np.asarray(class_ids)[np.argmax(pre_scores_list, axis=0)]

dev_true_labels = np.asarray(dev_labels)
# labels= pins the matrix to one row/column per known class, in label_dict order,
# even if some class occurs in neither the true nor the predicted labels.
dev_cm = confusion_matrix(dev_true_labels, dev_pre_labels, labels=class_ids)
dev_cm_df = pd.DataFrame(dev_cm, columns=list(label_dict), index=list(label_dict))
dev_cm_df

Unnamed: 0,airport,bus,metro,metro_station,park,public_square,shopping_mall,street_pedestrian,street_traffic,tram
airport,28,0,1,0,0,2,3,3,0,0
bus,0,32,4,0,2,0,0,1,0,1
metro,0,4,19,8,0,1,0,0,0,5
metro_station,2,2,2,13,2,3,5,4,1,3
park,0,2,0,0,34,1,0,0,1,1
public_square,1,3,0,4,4,15,1,10,1,0
shopping_mall,8,1,0,3,1,0,22,1,0,0
street_pedestrian,8,0,0,1,0,7,0,21,0,1
street_traffic,0,0,0,1,1,3,0,0,32,2
tram,0,4,7,0,1,2,0,1,0,22


In [5]:
# Per-class accuracy on the dev set: the diagonal entry of dev_cm divided by the
# total of that class's row, laid out as a one-row table with one column per class.
dev_acc_df = pd.DataFrame(
    {name: [dev_cm[idx, idx] / np.sum(dev_cm[idx])] for name, idx in label_dict.items()}
)
# Unweighted mean of the per-class values, appended as a final column.
per_class_row = dev_acc_df.iloc[0]
dev_acc_df["average"] = sum(per_class_row) / len(per_class_row)
dev_acc_df

Unnamed: 0,airport,bus,metro,metro_station,park,public_square,shopping_mall,street_pedestrian,street_traffic,tram,average
0,0.756757,0.8,0.513514,0.351351,0.871795,0.384615,0.611111,0.552632,0.820513,0.594595,0.625688


Test on eval set

In [6]:
# extract feature
# Eval split: labels are converted to integer class ids through label_dict, and
# every clip's feature array is flattened into a single row (reshape(1, -1))
# before the rows are concatenated into one 2-D matrix.
eval_files = list(eval_df["filename"])
eval_labels = [label_dict[name] for name in eval_df["scene_label"]]
eval_features = np.concatenate(
    [
        extract_mfcc(datapath + name, n_mels=n_mels, option=feature_option).reshape(1, -1)
        for name in eval_files
    ],
    axis=0,
)
print(f"eval feature size: {eval_features.shape}")

eval feature size: (330, 6487)


In [7]:
from sklearn.metrics import confusion_matrix

# Score every eval clip under each class GMM and predict the best-scoring class.
class_ids = list(label_dict.values())
pre_scores_list = np.stack(
    [gmm_classifier_dict[i].score_samples(eval_features) for i in class_ids],
    axis=0,
)  # one row of per-clip log-likelihoods per class
# argmax yields a row position; map it back to the class id explicitly so the
# prediction does not rely on ids happening to equal their position.
eval_pre_labels = np.asarray(class_ids)[np.argmax(pre_scores_list, axis=0)]

eval_true_labels = np.asarray(eval_labels)
# labels= pins the matrix to one row/column per known class, in label_dict order,
# even if some class occurs in neither the true nor the predicted labels.
eval_cm = confusion_matrix(eval_true_labels, eval_pre_labels, labels=class_ids)
eval_cm_df = pd.DataFrame(eval_cm, columns=list(label_dict), index=list(label_dict))
eval_cm_df

Unnamed: 0,airport,bus,metro,metro_station,park,public_square,shopping_mall,street_pedestrian,street_traffic,tram
airport,16,0,0,6,0,2,5,4,0,0
bus,0,19,5,1,1,0,0,0,0,7
metro,2,3,12,9,0,0,0,1,1,5
metro_station,1,0,7,10,1,3,4,4,0,3
park,0,0,1,0,26,1,0,0,2,3
public_square,2,0,2,0,4,14,0,4,6,1
shopping_mall,9,0,0,3,0,1,15,5,0,0
street_pedestrian,4,1,0,0,1,11,4,10,1,1
street_traffic,0,1,0,1,0,3,0,0,28,0
tram,0,9,13,3,0,0,0,0,0,8


In [8]:
# Per-class accuracy on the eval set: the diagonal entry of eval_cm divided by the
# total of that class's row, laid out as a one-row table with one column per class.
eval_acc_df = pd.DataFrame(
    {name: [eval_cm[idx, idx] / np.sum(eval_cm[idx])] for name, idx in label_dict.items()}
)
# Unweighted mean of the per-class values, appended as a final column.
per_class_row = eval_acc_df.iloc[0]
eval_acc_df["average"] = sum(per_class_row) / len(per_class_row)
eval_acc_df

Unnamed: 0,airport,bus,metro,metro_station,park,public_square,shopping_mall,street_pedestrian,street_traffic,tram,average
0,0.484848,0.575758,0.363636,0.30303,0.787879,0.424242,0.454545,0.30303,0.848485,0.242424,0.478788
