In [None]:
from ibm_dataset import IBMDebater
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import librosa
import os

In [None]:
# Build the train and test datasets from the same root folder, with text
# loading enabled and audio loading disabled.
data_path = 'data/ibm_debater/full'
data_train, data_test = (
    IBMDebater(data_path, split_name, load_text=True, load_audio=False,
               text_transform=None, chunk_length=None)
    for split_name in ('train', 'test')
)

In [None]:
# Number of samples in each data split
sns.set(color_codes=True)
train_metadata, test_metadata = data_train.annotations, data_test.annotations

# Tag every annotation row with the split it belongs to, then stack both tables
concatenated = pd.concat(
    [
        metadata.assign(dataset=split_name)
        for split_name, metadata in (('train', train_metadata), ('test', test_metadata))
    ]
)

sns.countplot(data=concatenated, x='dataset')
plt.show()

In [None]:
# Motion polarity counts, drawn side by side with one panel per data split
_, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
for split, ax in zip(np.unique(concatenated['dataset'].values), axes.flat):
    sns.countplot(
        x='speech-to-motion-polarity',
        data=concatenated.loc[concatenated['dataset'] == split],
        ax=ax,
    )
    ax.set_title(f'{split} split')
plt.show()

In [None]:
# Distribution of text lengths, measured in words.
# str.split() without an argument splits on any run of whitespace, so repeated
# spaces do not produce empty tokens and tabs/newlines still separate words
# (split(' ') got both of these cases wrong).
lentext = [len(sample[0].split()) for sample in data_train]

plt.figure(figsize=(15, 7.5))
sns.histplot(lentext, kde=True, stat="density", linewidth=0)
plt.xlabel('words per text')
plt.show()

In [None]:
# Show the k most frequent capitalized words (an uppercase letter followed by word characters)

from collections import Counter
import re
def get_k_most_freq_cased(data, k):
    """Return the ``k`` most frequent capitalised tokens found in ``data``.

    A token is an ASCII uppercase letter followed by one or more word
    characters, so single-letter words such as "I" are not counted.

    Args:
        data: iterable of samples; element ``0`` of each sample must be a string.
        k: number of entries to return, forwarded to ``Counter.most_common``.

    Returns:
        A ``(keys, values)`` tuple of two parallel lists: the tokens in
        descending order of frequency, and their occurrence counts.
    """
    texts = ' '.join(sample[0] for sample in data)

    # Raw string: in a plain literal "\w" is an invalid escape sequence, which
    # recent Python versions warn about at compile time.
    matches = re.findall(r'([A-Z]\w+)', texts)

    items = Counter(matches).most_common(k)
    keys = [word for word, _ in items]
    values = [count for _, count in items]

    return keys, values

# Bar chart of the 50 most frequent capitalised words returned for data_train
words, freq = get_k_most_freq_cased(data_train, k=50)

_, ax = plt.subplots(figsize=(25, 10))
sns.barplot(ax=ax, x=words, y=freq)
# Tilt the tick labels so that neighbouring words do not overlap
plt.xticks(rotation=45)
plt.show()

In [None]:
# Show the different sample rates of the audio files in the dataset
def get_sr_and_durations(data):
    """Collect the sample rate and duration of every audio file listed in ``data``.

    File names are read from ``data['wav-file-name']`` and resolved against
    ``<data_path>/wav``, where ``data_path`` is the module-level dataset root.

    Args:
        data: mapping or DataFrame exposing a ``'wav-file-name'`` entry that
            iterates over audio file names.

    Returns:
        A ``(sample_rates, seconds)`` tuple of two parallel lists, one entry
        per file, holding the values returned by ``librosa.get_samplerate``
        and ``librosa.get_duration``.
    """
    sample_rates = []
    seconds = []
    for audio_file in data['wav-file-name']:
        audio_path = os.path.join(data_path, 'wav', audio_file)
        sample_rates.append(librosa.get_samplerate(audio_path))
        try:
            # librosa >= 0.10 names this argument `path`; `filename` is
            # deprecated there and slated for removal.
            duration = librosa.get_duration(path=audio_path)
        except TypeError:
            # Older librosa releases only know the `filename` keyword.
            duration = librosa.get_duration(filename=audio_path)
        seconds.append(duration)
    return sample_rates, seconds

train_metadata = data_train.annotations
sample_rates, seconds = get_sr_and_durations(train_metadata)

# Percentage of files found at each distinct sample rate
sample_rates = (
    pd.DataFrame(sample_rates, columns=['sr'])
    .value_counts(normalize=True)
    .mul(100)
    .rename('percentage')
    .reset_index()
)
percentages = sample_rates['percentage']

# One pastel colour per distinct sample rate
colors = sns.color_palette('pastel')[:len(sample_rates)]

# Pie chart whose slices are labelled with their sample rate
plt.figure(figsize=(8, 8))
plt.pie(percentages, labels=sample_rates['sr'], colors=colors, autopct='%.2f%%')
plt.show()

In [None]:
# Show the distribution of the audio lengths for our dataset
seconds = pd.DataFrame(data=seconds, columns=['sc'])

plt.figure(figsize=(15, 7.5))
sns.histplot(data=seconds, kde=True, stat="density", linewidth=0)
plt.show()