# Exploratory Data Analysis

Comes from: https://www.kaggle.com/dude431/beginner-s-visualization-and-removing-uniformative

In [1]:
import numpy as np
import os
from tqdm import tqdm
import wave
from scipy.io import wavfile
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train = pd.read_csv("data/train_curated.csv")
train_noisy = pd.read_csv("data/train_noisy.csv")
test = pd.read_csv("data/sample_submission.csv")

In [3]:
print(train.shape)
print(train_noisy.shape)
print(test.shape)

(4970, 2)
(19815, 2)
(1120, 81)


## Class Distribution

We would like to understand how the classes/labels are distributed in both the curated and noisy training sets. Are there any problems with class balance that we should be aware of? We will start by looking at the labels individually below.

In [24]:
def breakLabelsApart(dataDf, structureDf):

    columns = structureDf.columns
    new_df = pd.DataFrame(data=None, columns=columns, index=dataDf.index)

    for index, row in dataDf.iterrows():
        fname = row['fname']
        allLabels = row['labels']
        splitLabels = allLabels.split(',')

        columnDict = {}
        for i in columns:
            columnDict[i] = 0

        columnDict['fname'] = fname
        for label in splitLabels:
            columnDict[label] = 1

        new_df = new_df.append(columnDict, ignore_index=True)
    
    return new_df

In [26]:
%time train_one_hot = breakLabelsApart(train, test)

CPU times: user 2min 14s, sys: 18.5 s, total: 2min 32s
Wall time: 2min 32s


In [None]:
%time train_noisy_one_hot = breakLabelsApart(train_noisy, test)

In [None]:
plt.figure(figsize=(15,25))
audio_type = train_one_hot.loc[:, 'Accelerating_and_revving_and_vroom':'Zipper_(clothing)'].sum(axis = 0).sort_values(ascending=False)
sns.barplot(audio_type.values, audio_type.index)
for i, v in enumerate(audio_type.values):
    plt.text(0.8,i,v,color='k',fontsize=12)
plt.xticks(rotation='vertical')
plt.xlabel('Frequency')
plt.ylabel('Label Name')
plt.title("Labels with their frequencies in training data")
plt.show()

In [None]:
plt.figure(figsize=(15,25))
audio_type = train_noisy_one_hot.loc[:, 'Accelerating_and_revving_and_vroom':'Zipper_(clothing)'].sum(axis = 0).sort_values(ascending=False)
sns.barplot(audio_type.values, audio_type.index)
for i, v in enumerate(audio_type.values):
    plt.text(0.8,i,v,color='k',fontsize=12)
plt.xticks(rotation='vertical')
plt.xlabel('Frequency')
plt.ylabel('Label Name')
plt.title("Labels with their frequencies in training data")
plt.show()

Some audio samples contain multiple noise sources. We can look at the distribution for the combination of these sources as well.

In [None]:
plt.figure(figsize=(15,44))
audio_type = train['labels'].value_counts()
sns.barplot(audio_type.values, audio_type.index)
for i, v in enumerate(audio_type.values):
    plt.text(0.8,i,v,color='k',fontsize=12)
plt.xticks(rotation='vertical')
plt.xlabel('Frequency')
plt.ylabel('Label Name')
plt.title("Top 30 labels with their frequencies in training data")
plt.show()