# Data preparation 1
This first version feeds the data to the model in as raw a form as possible, without any adjustments or tuned parameters. The goal is to obtain the "worst" model that can reasonably be built from this data. Such a baseline helps identify weaknesses in the data and highlights areas where more attention or additional data collection may be needed. The idea is that subsequent versions should only improve on it as we tune more parameters and refine the way we prepare the data, becoming more accurate and effective at predicting emotions. In essence, the first version serves as a benchmark: it shows how far the model has come from its initial state and gives insight into the progress made over time.

First, we extract the data into four separate lists. The first list contains the audio waveforms, which are split into 2-second chunks so that they can be fed to the model. 
The remaining three lists contain the labels, which appear to come in three distinct interpretations. For now, we concentrate on the first of these lists only.

In [None]:
import matplotlib.pyplot as plt  #MAKE SURE TO IMPORT MATPLOTLIB BEFORE LIBROSA, otherwise matplolib will return errors somehow..
import os
import librosa
import librosa.display
import re
from sklearn.preprocessing import OneHotEncoder
import numpy as np

label_dir = "../labels/"
audio_dir = "../audio/"

chunk_seconds = 2        # duration of one chunk, in seconds
short_clip_ratio = 1.25  # clips shorter than this many chunk lengths are kept whole

all_labels_version1 = []
all_labels_version2 = []
all_labels_version3 = []
all_chunks = []

all_audios = []  #only here for stats

#Data processing - audio chunking - labels onehot encoding


def _read_label_file(path):
    """Parse one label file and return (utterance_names, emotions).

    Each non-blank line is expected to look like
    ``Ses01F_impro02_F005 :Sadness; ()``: the first word is the utterance
    name and the word following the first ':' is the emotion.
    A non-blank line without a ':<word>' part raises TypeError.
    """
    names = []
    emotions = []
    with open(path) as handle:
        for line in handle:
            if not line.strip():
                # blank lines carry no label; skipping them keeps the files aligned
                continue
            emotions.append(re.search(r':(\w+)', line)[1])
            names.append(re.search(r'(\w+)', line)[1])
    return names, emotions


# List the directory once and sort it: os.listdir() returns entries in arbitrary
# order, so grouping "by three" is only reproducible on a sorted listing.
# Only .txt files are label files; anything else would shift the groups.
# NOTE(review): assumes the three files of one recording sort next to each other - confirm.
label_files = sorted(name for name in os.listdir(label_dir) if name.endswith('.txt'))
leftover = len(label_files) % 3
if leftover:
    print(f"warning: ignoring {leftover} label file(s) that do not form a complete group of three")

# iterate over the label files, grouped by three
for group_start in range(0, len(label_files) - leftover, 3):
    path1, path2, path3 = (
        os.path.join(label_dir, name)
        for name in label_files[group_start:group_start + 3]
    )
    # the utterance names are taken from the first file of the group only
    tmp_wav_file, tmp_emotion1 = _read_label_file(path1)
    _, tmp_emotion2 = _read_label_file(path2)
    _, tmp_emotion3 = _read_label_file(path3)

    #example  utterance name -> Ses01F_impro02_F005 :Sadness; ()
    #example2                -> Ses01F_script01_1_F000 :Fear; ()
    for utt_index, utt_name in enumerate(tmp_wav_file):
        # Labels of THIS utterance. They are bound once here so that the
        # chunk loop below cannot pick labels by chunk index.
        utt_labels = (tmp_emotion1[utt_index], tmp_emotion2[utt_index], tmp_emotion3[utt_index])

        reg_session = re.search(r'Ses(\d+)', utt_name)[1]
        reg_impro = re.search(r'\d{2}([A-Za-z].*?_[A-Za-z].*?)_[A-Z]', utt_name).group(1)
        # "01" -> "Session1": int() drops the leading zero
        audio_file = os.path.join(
            audio_dir,
            f"Session{int(reg_session)}/sentences/wav/Ses{reg_session}{reg_impro}/",
            utt_name + ".wav",
        )

        # sr=None keeps the file's native sampling rate; mono=True already
        # down-mixes, so no extra to_mono() call is needed.
        y, sr = librosa.load(audio_file, sr=None, mono=True)

        all_audios.append(y)

        # Calculate the duration of each chunk in samples
        chunk_length_samples = sr * chunk_seconds

        # Number of complete chunks; a trailing partial chunk is dropped
        chunks = len(y) // chunk_length_samples

        if len(y) / chunk_length_samples < short_clip_ratio:
            # Short clip: keep it whole, and do NOT also add its first chunk
            # (that would store nearly the same samples twice).
            pieces = [y]
        else:
            pieces = [
                y[k * chunk_length_samples:(k + 1) * chunk_length_samples]
                for k in range(chunks)
            ]

        for piece in pieces:
            all_chunks.append(piece)
            all_labels_version1.append(utt_labels[0])
            all_labels_version2.append(utt_labels[1])
            all_labels_version3.append(utt_labels[2])

        print(audio_file, *utt_labels, end='\n   ')
        print(f"{round(len(y)/sr,2)}s divisé en {chunks}")
print("audio chunking finished")
# The same encoder object is re-fitted for every label version, so after this
# cell it only remembers the categories of version 3.
encoder = OneHotEncoder()
encoded_labels1 = encoder.fit_transform([[label] for label in all_labels_version1]).toarray()
encoded_labels2 = encoder.fit_transform([[label] for label in all_labels_version2]).toarray()
encoded_labels3 = encoder.fit_transform([[label] for label in all_labels_version3]).toarray()
print("finished !")



We then add zero padding to the audio chunks, since they do not all have exactly the same length, as this plot shows:

<img src="plots/line_plot_chunked_length_sorted.png" alt="line plot chunked length sorted" width="30%">


In [None]:
#Data padding 

import numpy as np

# the longest chunk sets the common row width
max_length = max(len(soundwave) for soundwave in all_chunks)

# Start from an all-zero float32 matrix with one row per chunk, then copy each
# chunk into the left part of its row; the untouched tail is the zero padding.
padded_soundwaves = np.zeros((len(all_chunks), max_length), dtype='float32')
for row, soundwave in zip(padded_soundwaves, all_chunks):
    row[:len(soundwave)] = soundwave




In [None]:
#Save the padded soundwaves in - processing_dataset/
import os
import numpy as np

# np.save does not create missing directories, so make sure the target exists
os.makedirs("processing_dataset", exist_ok=True)

np.save("processing_dataset/padded_soundwaves.npy", padded_soundwaves)
print("Padded_soundwaves saved!") 


In [None]:
#Builds one dB-scaled MEL spectrogram per row of padded_soundwaves

# NOTE(review): the audio was loaded with sr=None (native rate) in the first
# cell, while 22050 is hard-coded here - confirm both rates really match.
sr = 22050
n_fft = 2048
hop_length = int(n_fft/2)  # 50% overlap between consecutive frames
mel_spec_array = []
for soundwave in padded_soundwaves:
    power_spec = librosa.feature.melspectrogram(y=soundwave, sr=sr, n_fft=n_fft, hop_length=hop_length)
    # decibel scale, relative to the maximum of this very spectrogram
    mel_spec_db = librosa.power_to_db(power_spec, ref=np.max)
    mel_spec_array.append(mel_spec_db)

# Display a single example: the last spectrogram computed by the loop above
plt.figure(figsize=(10, 4))
librosa.display.specshow(mel_spec_db, sr=sr, hop_length=hop_length, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
plt.title('MEL Spectrogram')
plt.tight_layout()
plt.show()

In [None]:
#Save the spectrograms and the one-hot encoded labels in - final_dataset/
import os
import numpy as np

# np.save / np.savez do not create missing directories, so make sure the target exists
os.makedirs("final_dataset", exist_ok=True)

# stack the list of per-chunk spectrograms into a single array before saving
mel_spec_array = np.array(mel_spec_array)
np.save("final_dataset/spectrograms.npy", mel_spec_array)
print("spectrograms saved!") 

# the three label versions go into one archive under the keys labels1/labels2/labels3
np.savez('final_dataset/labels.npz', labels1=encoded_labels1, labels2=encoded_labels2, labels3=encoded_labels3)
print("labels saved!") 


In [None]:
#load the data
import csv
import numpy as np

# Open the label archive and pull out the three one-hot encoded label arrays
data = np.load('final_dataset/labels.npz')
labels1, labels2, labels3 = (data[key] for key in ('labels1', 'labels2', 'labels3'))
print("labels load finished\n",labels1[:5])

# The spectrograms were saved as one plain .npy array
spectrograms = np.load("final_dataset/spectrograms.npy")
print("Spectrograms load finished")


In [None]:

#This script deletes all the .anvil files in a specified directory since those are useless for our use in this project. 

import glob
import os

# Directory to clean up. It gets its own name so that the global `audio_dir`
# (the audio root used by the data-processing cell) is not overwritten.
# NOTE(review): the data-processing cell reads labels from "../labels/" while
# this path is "labels/" - confirm which one is intended.
anvil_dir = "labels/"

# List all the .anvil files in the directory
anvil_files = glob.glob(os.path.join(anvil_dir, "*.anvil"))

# Loop over all the .anvil files and delete them
for file_path in anvil_files:
    os.remove(file_path)
