Prepare audioset

In [2]:
import pandas as pd

# Directory that holds the split CSVs; it is also prepended to every audio
# file name when features are extracted.
datapath = "audioset/"

# Tab-separated metadata tables, one per split (train / dev / eval).
train_df, dev_df, eval_df = (
    pd.read_csv(datapath + split_csv, sep="\t")
    for split_csv in ("train.csv", "dev.csv", "eval.csv")
)

# Scene classes; a label's position in this list is its integer class index.
unique_labels = [
    'airport', 'bus', 'metro', 'metro_station', 'park',
    'public_square', 'shopping_mall', 'street_pedestrian',
    'street_traffic', 'tram',
]

# Lookup table: scene-label string -> integer class index.
label_dict = dict(zip(unique_labels, range(len(unique_labels))))

Train

In [3]:
# extract feature
import numpy as np
from Extract_feature import extract_mfcc

# Training split: file names and scene labels, in train_df row order.
train_files = list(train_df.filename)
train_labels = list(train_df.scene_label)
# One row per clip: reshape(1, -1) flattens whatever extract_mfcc returns
# into a single (1, N) row so the clips can be stacked into a 2-D matrix.
train_rows = [
    extract_mfcc(datapath + wav_name, option="mfcc").reshape(1, -1)
    for wav_name in train_files
]
train_features = np.concatenate(train_rows, axis=0)
print(f"train feature size: {train_features.shape}")

# Development split: same procedure as for the training split.
dev_files = list(dev_df.filename)
dev_labels = list(dev_df.scene_label)
dev_rows = [
    extract_mfcc(datapath + wav_name, option="mfcc").reshape(1, -1)
    for wav_name in dev_files
]
dev_features = np.concatenate(dev_rows, axis=0)
print(f"dev feature size: {dev_features.shape}")

train feature size: (3008, 6487)
dev feature size: (379, 6487)


In [64]:
# train
from sklearn.cluster import KMeans

# Fit for a single iteration only to obtain an initialised KMeans object; the
# centers found by this fit are overwritten below.
model_km1 = KMeans(n_clusters=len(unique_labels), max_iter=1, random_state=0).fit(train_features)

# Replace every cluster center with the mean training feature vector of one
# class, so that cluster index `ind` stands for unique_labels[ind].
# Rows are picked with a per-class boolean mask rather than running start/end
# offsets: offsets are only correct when train_df happens to be grouped by
# label in exactly the order of unique_labels, and silently average the wrong
# rows otherwise. train_features rows are in train_df row order, so the mask
# built from train_df lines up with the feature matrix.
train_scene_labels = train_df.scene_label.to_numpy()
for ind, c in enumerate(unique_labels):
    class_mask = train_scene_labels == c
    if not class_mask.any():
        # A mean over zero rows would store a NaN center without failing.
        raise ValueError(f"no training samples found for class {c!r}")
    model_km1.cluster_centers_[ind] = np.mean(train_features[class_mask], axis=0)

In [44]:
# dev
from sklearn.metrics import confusion_matrix

# Lookup table: scene-label string -> integer class index (position in
# unique_labels). Rebuilt here so this cell does not depend on an earlier one.
label_dict = {c: ind for ind, c in enumerate(unique_labels)}

# Encode the dev labels from dev_df instead of from dev_labels itself.
# Overwriting dev_labels with its own encoded version made the cell fail with
# KeyError on a second run, because the list then already held integers.
dev_labels = [label_dict[c] for c in dev_df.scene_label]

In [65]:
# Assign each dev clip to the nearest class-mean center; the predicted cluster
# index is used directly as the predicted class index.
dev_pre_labels = model_km1.predict(dev_features)
dev_true_labels = np.asarray(dev_labels)

# Pass the full list of class indices explicitly. Without `labels`, the matrix
# only covers classes that occur in the truth or the predictions, so a class
# missing from both would shrink it and break the 10-name DataFrame below as
# well as any later positional indexing such as dev_cm[i, i].
class_ids = list(label_dict.values())
dev_cm = confusion_matrix(dev_true_labels, dev_pre_labels, labels=class_ids)

# Labelled view of the matrix: rows are true classes, columns are predictions.
dev_cm_df = pd.DataFrame(dev_cm, columns=label_dict.keys(), index=label_dict.keys())
dev_cm_df

Unnamed: 0,airport,bus,metro,metro_station,park,public_square,shopping_mall,street_pedestrian,street_traffic,tram
airport,8,2,0,3,7,8,3,0,5,1
bus,2,8,2,2,16,0,0,4,1,5
metro,2,5,11,2,1,0,1,1,13,1
metro_station,0,4,6,0,3,3,2,2,13,4
park,1,2,2,1,32,0,0,0,0,1
public_square,0,3,2,3,8,7,1,0,8,7
shopping_mall,1,1,0,1,5,3,6,2,15,2
street_pedestrian,2,5,0,1,6,4,3,2,8,7
street_traffic,0,0,0,3,2,5,1,1,24,3
tram,2,3,4,0,9,6,0,1,2,10


In [66]:
# Per-class accuracy on the dev set: the diagonal entry of the confusion
# matrix divided by the total of that class's row, stored as one column per
# class in a single-row frame.
dev_class_acc = {
    name: [dev_cm[idx, idx] / np.sum(dev_cm[idx])]
    for name, idx in label_dict.items()
}
dev_acc_df = pd.DataFrame(dev_class_acc)

# Unweighted mean of the per-class values, appended as an extra column.
dev_acc_row = dev_acc_df.iloc[0]
dev_acc_df["average"] = sum(dev_acc_row) / len(dev_acc_row)
dev_acc_df

Unnamed: 0,airport,bus,metro,metro_station,park,public_square,shopping_mall,street_pedestrian,street_traffic,tram,average
0,0.216216,0.2,0.297297,0.0,0.820513,0.179487,0.166667,0.052632,0.615385,0.27027,0.281847


Test on eval set

In [68]:
# extract feature
import numpy as np
from Extract_feature import extract_mfcc

# Evaluation split: file names and scene labels, in eval_df row order.
eval_files = list(eval_df.filename)
eval_labels = list(eval_df.scene_label)

# One row per clip: reshape(1, -1) flattens whatever extract_mfcc returns
# into a single (1, N) row so the clips can be stacked into a 2-D matrix.
eval_rows = [
    extract_mfcc(datapath + wav_name, option="mfcc").reshape(1, -1)
    for wav_name in eval_files
]
eval_features = np.concatenate(eval_rows, axis=0)
# The message used to read "dev feature size" (copied from the dev cell),
# which mislabelled the eval matrix in the notebook output.
print(f"eval feature size: {eval_features.shape}")

dev feature size: (330, 6487)


In [69]:
from sklearn.metrics import confusion_matrix

# Encode the eval labels from eval_df instead of from eval_labels itself.
# Overwriting eval_labels with its own encoded version made the cell fail with
# KeyError on a second run, because the list then already held integers.
eval_labels = [label_dict[c] for c in eval_df.scene_label]

# Assign each eval clip to the nearest class-mean center; the predicted
# cluster index is used directly as the predicted class index.
eval_pre_labels = model_km1.predict(eval_features)
eval_true_labels = np.asarray(eval_labels)

# Pass the full list of class indices explicitly. Without `labels`, the matrix
# only covers classes that occur in the truth or the predictions, so a class
# missing from both would shrink it and break the 10-name DataFrame below as
# well as any later positional indexing such as eval_cm[i, i].
class_ids = list(label_dict.values())
eval_cm = confusion_matrix(eval_true_labels, eval_pre_labels, labels=class_ids)

# Labelled view of the matrix: rows are true classes, columns are predictions.
eval_cm_df = pd.DataFrame(eval_cm, columns=label_dict.keys(), index=label_dict.keys())
eval_cm_df

Unnamed: 0,airport,bus,metro,metro_station,park,public_square,shopping_mall,street_pedestrian,street_traffic,tram
airport,3,2,0,6,1,1,8,1,8,3
bus,2,8,1,1,10,2,0,0,0,9
metro,1,1,10,2,2,0,0,0,13,4
metro_station,0,1,2,0,5,3,5,0,11,6
park,1,5,0,0,26,1,0,0,0,0
public_square,0,0,0,2,9,7,0,0,12,3
shopping_mall,4,1,0,1,0,3,7,2,12,3
street_pedestrian,3,2,0,1,0,12,2,1,5,7
street_traffic,1,0,0,1,1,4,3,1,22,0
tram,1,8,5,1,6,3,0,0,2,7


In [70]:
# Per-class accuracy on the eval set: the diagonal entry of the confusion
# matrix divided by the total of that class's row, stored as one column per
# class in a single-row frame.
eval_class_acc = {
    name: [eval_cm[idx, idx] / np.sum(eval_cm[idx])]
    for name, idx in label_dict.items()
}
eval_acc_df = pd.DataFrame(eval_class_acc)

# Unweighted mean of the per-class values, appended as an extra column.
eval_acc_row = eval_acc_df.iloc[0]
eval_acc_df["average"] = sum(eval_acc_row) / len(eval_acc_row)
eval_acc_df

Unnamed: 0,airport,bus,metro,metro_station,park,public_square,shopping_mall,street_pedestrian,street_traffic,tram,average
0,0.090909,0.242424,0.30303,0.0,0.787879,0.212121,0.212121,0.030303,0.666667,0.212121,0.275758
