# Set up environment

## Import libraries

In [None]:
import numpy as np
import os
import matplotlib.pyplot as plt
import json
from Paths import output_dir_absolute_path, root_dir

## Define functions

In [None]:
def Deserialize(name):
    """Load ``<name>.json`` from the output directory and return its parsed content.

    Args:
        name: File name without the ``.json`` extension; it is joined onto
            ``output_dir_absolute_path``.

    Returns:
        Whatever ``json.load`` produces for the file's content.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    path = os.path.join(output_dir_absolute_path, name + ".json")
    # JSON text is UTF-8 by specification; passing the encoding explicitly
    # keeps the result independent of the platform's default locale encoding.
    with open(path, 'r', encoding="utf-8") as file:
        return json.load(file)

In [None]:
def SetLabels(xlabel, ylabel, title):
    """Apply an x-axis label, a y-axis label and a title through pyplot.

    Args:
        xlabel: Text passed to ``plt.xlabel``.
        ylabel: Text passed to ``plt.ylabel``.
        title: Text passed to ``plt.title``.
    """
    # Pair each pyplot setter with its text and apply them in order.
    labelling = (
        (plt.xlabel, xlabel),
        (plt.ylabel, ylabel),
        (plt.title, title),
    )
    for apply_text, text in labelling:
        apply_text(text)

## Execute EDA Scripts

This step may be skipped if the needed files are already in the *Outputs* folder. 

In [None]:
import glob
import runpy

# Collect every Python script in "<root_dir>/Source/EDA scripts".
# glob returns matches in a filesystem-dependent order, so sort them to make
# the execution order reproducible between runs and machines.
scripts = sorted(glob.glob(os.path.join(root_dir, "Source", "EDA scripts", '*.py')))

for script in scripts:
    print(f'Executing: {script}')
    # run_name='__main__' runs each file as if it had been launched directly,
    # so code behind an `if __name__ == "__main__":` guard is executed too.
    runpy.run_path(script, run_name='__main__')

# Create Charts

## Speakers

In [None]:
# Histogram of the values stored in 'speakers_counts'
# (labelled below as the number of recordings per participant).
speakers_counts = Deserialize('speakers_counts')

# One set of edges drives both the histogram bins and the x-axis ticks.
# NOTE: values outside the outermost edges are not counted by the histogram.
speaker_bin_edges = range(0, 200, 10)

plt.figure(figsize=(10, 5))
n, bins, patches = plt.hist(
    speakers_counts.values(),
    bins=speaker_bin_edges,
    edgecolor='black',
)
plt.xticks(speaker_bin_edges)
plt.yscale('log')
plt.bar_label(patches)  # print each bar's count next to it
SetLabels(
    "Number of recordings",
    "Number of speakers",
    "Number of recordings per participant",
)
plt.show()

Most participants recorded around 30-40 recordings, but there are also large groups of participants who recorded more than 100 recordings or fewer than 30 recordings. This shows us that the dataset is **unbalanced**, which may lead to **biased** predictions.

## Word frequency

In [None]:
# Horizontal bar chart with one bar per key of 'labels_counts';
# the bar length is the value stored under that key.
labels_counts = Deserialize('labels_counts')
bar = plt.barh(
    list(labels_counts),
    list(labels_counts.values()),
    edgecolor='black',
)

plt.xlim(0, 50000)  # fixed x-axis range
plt.bar_label(bar, padding=5)  # value printed next to each bar
SetLabels(
    "Number of recordings",
    "Word",
    "Frequency of each word in recordings",
)
plt.show()

The number of recordings per keyword is **balanced**. However, there is a low number of recordings for "silence", which may lead to problems in detecting it. There is also a very high number of "unknown" recordings, which can result in a tendency to mistakenly classify data as belonging to this class.

## Length

### Overall

In [None]:
# Histogram of the values stored in 'lengths' (labelled below as seconds).
lengths = Deserialize('lengths')

# Bin edges 0.0, 0.1, ..., 1.0.
# NOTE: values outside the outermost edges are not counted by the histogram.
length_bin_edges = np.array(range(0, 110, 10)) / 100

n, bins, patches = plt.hist(lengths, bins=length_bin_edges, edgecolor='black')
plt.xticks(bins)  # put a tick on every bin edge
plt.yscale("log")
plt.bar_label(patches)  # print each bar's count next to it
SetLabels(
    "Recording lengths in seconds",
    "Number of recordings",
    "Number of recordings per length",
)
plt.show()

Most recordings have the same length of ~1 second. Some of them are shorter, which may be a result of bad processing.

### Length by Word 

In [None]:
# Two side-by-side charts built from 'lengths_by_word':
# a box plot per key (left) and the mean of each key's values (right).
lengths_word = Deserialize("lengths_by_word")

# Mean per key; a key with no values is given a mean of 0.
mean_lengths = {
    word: (np.mean(durations) if durations else 0)
    for word, durations in lengths_word.items()
}

plt.figure(figsize=(12, 6))

# Left panel: distribution of the values per key.
plt.subplot(1, 2, 1)
plt.boxplot(
    lengths_word.values(),
    tick_labels=lengths_word.keys(),
    orientation='horizontal',
)
SetLabels("Duration [s]", "Word", "Distribution of duration per word")

# Right panel: mean per key on a log-scaled x axis.
plt.subplot(1, 2, 2)
plt.barh(mean_lengths.keys(), mean_lengths.values(), edgecolor="black")
plt.xscale("log")
SetLabels("Average Duration [s]", "Word", "Average Duration of words")

plt.tight_layout()
plt.show()

The first plot shows that each class has many **outliers** - these may be mistakenly classified recordings or audio files that were cut too short. The second plot shows that there are significant differences between the mean lengths of the words, which may be helpful in classifying them. We have to keep in mind that the mean duration may be heavily influenced by outliers.

## Amplitude

### Mean Amplitude

In [None]:
# Histogram of the values stored in 'mean_amplitudes'.
mean_amplitude = Deserialize("mean_amplitudes")

# Bin edges 0, 2000, ..., 16000.
# NOTE: values outside the outermost edges are not counted by the histogram.
mean_amplitude_bin_edges = range(0, 18000, 2000)

n, bins, patches = plt.hist(
    mean_amplitude,
    bins=mean_amplitude_bin_edges,
    edgecolor='black',
)
plt.xticks(bins)  # put a tick on every bin edge
plt.yscale("log")
plt.bar_label(patches)  # print each bar's count next to it
SetLabels(
    "Mean amplitude",
    "Number of recordings",
    "Mean amplitudes of recordings",
)
plt.show()


The high number of recordings with a mean amplitude smaller than 2000 shows us that most audio files have a similar volume level, and that this level is not very high.

### Root Mean Square Amplitude

In [None]:
# Histogram of the values stored in 'rms_amplitudes'.
rms_amplitude = Deserialize("rms_amplitudes")

# Bin edges 0, 2000, ..., 22000.
# NOTE: values outside the outermost edges are not counted by the histogram.
rms_bin_edges = range(0, 24000, 2000)

plt.figure(figsize=(10, 6))
n, bins, patches = plt.hist(rms_amplitude, bins=rms_bin_edges, edgecolor='black')
plt.xticks(bins)  # put a tick on every bin edge
plt.yscale("log")
plt.bar_label(patches)  # print each bar's count next to it
SetLabels(
    "RMS amplitude",
    "Number of recordings",
    "RMS amplitudes of recordings",
)
plt.show()

The RMS amplitude histogram shows that most recordings have a **lower** volume. This may suggest that there are many recordings with silent parts or just complete silence. Other audio files have an RMS amplitude reaching up to 22000, which may be caused by background noise. Such big differences in amplitude suggest that **normalizing** the volume might be a good idea.

The RMS amplitude histogram and the mean amplitude histogram are quite similar, which implies a **low Peak-to-RMS ratio** - this means that the audio in the recordings is more **uniform** and may contain **noise**.

### RMS by Word

In [None]:
# Two side-by-side charts built from 'rms_by_word':
# a box plot per key (left) and the mean of each key's values (right).
rms_word = Deserialize("rms_by_word")

# Mean per key; a key with no values is given a mean of 0.
mean_rms = {
    word: (np.mean(rms_values) if rms_values else 0)
    for word, rms_values in rms_word.items()
}

plt.figure(figsize=(12, 6))

# Left panel: distribution of the values per key, with a grid.
plt.subplot(1, 2, 1)
plt.boxplot(
    rms_word.values(),
    tick_labels=rms_word.keys(),
    orientation='horizontal',
)
SetLabels("Amplitude", "Word", "Distribution of RMS Amplitude per word")
plt.grid()

# Right panel: mean per key on a log-scaled x axis.
plt.subplot(1, 2, 2)
plt.barh(mean_rms.keys(), mean_rms.values(), edgecolor="black")
plt.xscale("log")
SetLabels("Average RMS Amplitude", "Word", "Average RMS Amplitude of words")

plt.tight_layout()
plt.show()


Most words have fairly similar box plots. The many **outliers** may suggest mistakenly classified data. The average values show that there may be **differences** in RMS amplitude between words, which may be helpful in classifying them, but we have to keep in mind that the outliers may heavily influence this chart.