In [1]:
import os
import csv
from tqdm import tqdm
from scipy.io import wavfile
import numpy as np
import librosa
from sklearn.preprocessing import MinMaxScaler
import pickle
from vosk import Model, KaldiRecognizer
import json
import soundfile as sf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

root_dir = '/home/mabikana/Documents/PhD/Datasets/RECOLA/'
emotional_data_dir = root_dir + 'RECOLA-Annotation/emotional_behaviour/'
valence_data_dir = emotional_data_dir + '/valence/'
arousal_data_dir = emotional_data_dir + '/arousal/'
audio_data_dir = root_dir + 'RECOLA-Audio-recordings/'
chunks_dir = root_dir + 'audio_chunks/'
resampled_chunks_dir = root_dir + 'audio_chunks_reduced_sr/'
pickle_input_file = '/home/mabikana/Documents/PhD/SER Code/RECOLA Processing/melspectrograms.pickle'
text_input_file = '/home/mabikana/Documents/PhD/SER Code/RECOLA Processing/text.pickle'
tokenizer_file = '/home/mabikana/Documents/PhD/SER Code/RECOLA Processing/tokenizer_file.pickle'
pickle_valence_file = '/home/mabikana/Documents/PhD/SER Code/RECOLA Processing/valence.pickle'
pickle_arousal_file = '/home/mabikana/Documents/PhD/SER Code/RECOLA Processing/arousal.pickle'
pickle_spectrogram_scaler_file = '/home/mabikana/Documents/PhD/SER Code/RECOLA Processing/melspectrograms_scaler.pickle'
pickle_arousal_scaler_file = '/home/mabikana/Documents/PhD/SER Code/RECOLA Processing/arousal_scaler.pickle'
pickle_valence_scaler_file = '/home/mabikana/Documents/PhD/SER Code/RECOLA Processing/valence_scaler.pickle'
speech_to_text_model_dir = '/home/mabikana/Documents/PhD/SER Code/EmergencyOutcomePrediction/speech_to_text/model'
speech_to_text_model = Model(speech_to_text_model_dir)

speech_rate = 44100
dur = 4
dur_multiplier = 25
target_speech_rate = 8000

# Large vocabulary free form recognition
rec = KaldiRecognizer(speech_to_text_model, target_speech_rate)

In [2]:
all_data = {}

for audio_file in tqdm(os.listdir(audio_data_dir)):
    all_data[audio_file.split(".")[0]] = {}


for valence_file in tqdm(os.listdir(valence_data_dir)):
    valence_data = {}
    with open(valence_data_dir + valence_file) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=';')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                line_count += 1
            else:
                valence_data[row[0]] = {}
                valence_data[row[0]]["valence"] = float(row[1]) # extract time and only first annotator's values
                line_count += 1

    all_data[valence_file.split(".")[0]] = valence_data



for arousal_file in tqdm(os.listdir(arousal_data_dir)):
    arousal_data = {}
    with open(arousal_data_dir + arousal_file) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=';')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                line_count += 1
            else:
                all_data[arousal_file.split(".")[0]][row[0]]["arousal"] = float(row[1])
                line_count += 1


100%|██████████| 23/23 [00:00<00:00, 382813.46it/s]
100%|██████████| 23/23 [00:00<00:00, 140.06it/s]
100%|██████████| 23/23 [00:00<00:00, 149.23it/s]


In [3]:
file = list(all_data.keys())[0]
# print("Values for " + file)
# print(all_data[file])

In [4]:
# create and save spectrograms
def get_log_mel_spectrogram(path, n_fft, hop_length, n_mels, file_name):
    y, sr = librosa.load(path, sr=speech_rate, duration=dur)
    signal = librosa.resample(y, orig_sr=speech_rate, target_sr=target_speech_rate)
    resampled_chunk_name = resampled_chunks_dir + file_name
    sf.write(resampled_chunk_name, signal, target_speech_rate)
        
    audio_len = speech_rate * dur
    
    file_length = np.size(y)
    if file_length != audio_len:
        y = np.concatenate((y, np.zeros(audio_len-file_length)), axis=0)
    
    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram)
    log_mel_spectrogram = log_mel_spectrogram.reshape(-1,1)
    

    return log_mel_spectrogram, resampled_chunk_name


In [5]:
def chunk_to_text(chunk_path):
    textResults = []
    wf = open(chunk_path, "rb")
    wf.read(44) # skip header

    while True:
        data = wf.read(2000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            res = json.loads(rec.Result())
    
    resultDict = json.loads(rec.FinalResult())
    textResults.append(resultDict.get("text", ""))
    
    return ' '.join(textResults)

In [6]:
all_spectrograms = []
num_rows = dur * dur_multiplier
valence = []
arousal = []
wav_text_data = []
max_text_len = 0
min_text_len = 999999
all_lens = []

for file in tqdm(os.listdir(audio_data_dir)):
    chunk_length_ms = dur * 1000
    frequency, signal = wavfile.read(audio_data_dir + '/' + file)

    slice_length = int(chunk_length_ms / 1000) # in seconds
    overlap =  int((chunk_length_ms / 1000) / 2) # in seconds
    slices = np.arange(0, len(signal)/frequency, slice_length-overlap, dtype=int)
    chunks = []
    i = 0
    valence_arousal_values = all_data[file.split(".")[0]]
    chunk_counter = 0

    for start, end in zip(slices[:-1], slices[1:]):
        start_audio = start * frequency
        end_audio = (end + overlap)* frequency
        audio_slice = signal[int(start_audio): int(end_audio)]
        chunks.append(audio_slice)
        chunk_name = chunks_dir + '/' + file + '_{0}.wav'.format(i)
        wavfile.write(chunk_name, speech_rate, audio_slice)
        chunk_spectrogram, resampled_chunk_name = get_log_mel_spectrogram(path=chunk_name, \
                                                                          n_fft=2048, hop_length=512, n_mels=128,\
                                                                          file_name=file + '_{0}.wav'.format(i))
        all_spectrograms.append(chunk_spectrogram)
        chunks_row_start = int(chunk_counter)
        chunks_row_end = int(num_rows + chunk_counter)
        chunk_valence_arousal_values = list(valence_arousal_values.values())[chunks_row_end]
        chunk_valence_values = chunk_valence_arousal_values["valence"]
        chunk_arousal_values = chunk_valence_arousal_values["arousal"]
        valence.append(chunk_valence_values)
        arousal.append(chunk_arousal_values)
#         print("For file " + file + " and chunk start ", chunks_row_start, " and end ", chunks_row_end)
#         print("valence ", chunk_valence_values ," and arousal ", chunk_arousal_values, "\n")
        chunk_counter += num_rows / overlap


        # Extract text
        text = chunk_to_text(resampled_chunk_name)

        if(len(text) > max_text_len):
            max_text_len = len(text)
        if(len(text) < min_text_len):
            min_text_len = len(text)
            
#         print("\nFor chunk ", chunk_name, " text is: ", text)
        all_lens.append(len(text))
        wav_text_data.append(text)
    
        i += 1


print('Max text length = ' + str(max_text_len))
print('Min text length = ' + str(min_text_len))
print('Avg text length = ' + str(np.mean(all_lens)))

# normalize spectrograms
all_spectrograms = np.array(all_spectrograms) 
scaler = MinMaxScaler(feature_range=(0, 1))
arousal_scaler = MinMaxScaler(feature_range=(-1, 1))
valence_scaler = MinMaxScaler(feature_range=(-1, 1))

all_spectrograms = all_spectrograms.reshape(len(all_spectrograms),-1)
scaler.fit(all_spectrograms)
valence = np.array(valence)
arousal = np.array(arousal)

valence = valence.reshape(len(valence),-1)
arousal = arousal.reshape(len(arousal),-1)

arousal_scaler.fit(arousal)
valence_scaler.fit(valence)

normalized_melspectrograms = scaler.transform(all_spectrograms)
normalized_valence = valence_scaler.transform(valence)
normalized_arousal = arousal_scaler.transform(arousal)

input_melspectrograms = np.reshape(normalized_melspectrograms,(len(normalized_melspectrograms),128, -1,1))
print(np.amax(all_spectrograms))
print(np.amax(normalized_melspectrograms))

print(np.amax(arousal))
print(np.amax(normalized_arousal))

input_melspectrograms.shape 
# (400, 128, 427, 1)

100%|██████████| 23/23 [1:01:46<00:00, 161.13s/it]


Max text length = 115
Min text length = 0
Avg text length = 19.175663845929385
37.669827
1.0000001
0.9
0.9999999999999999


(3427, 128, 345, 1)

In [7]:
# Preprocess text

tokenizer = Tokenizer()
tokenizer.fit_on_texts(wav_text_data)
vocab_size = len(tokenizer.word_index) + 1
tokenized_text = []
max_seq_len = 20

for i in range(0, len(wav_text_data)):
    new_token_text = tokenizer.texts_to_sequences([wav_text_data[i]])
    new_text = pad_sequences(new_token_text, maxlen=max_seq_len)
    tokenized_text.append(new_text)

input_text = np.array(tokenized_text)

In [8]:
with open(pickle_input_file, "wb") as f:
    pickle.dump(input_melspectrograms, f, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(pickle_valence_file, "wb") as f:
    pickle.dump(normalized_valence, f, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(pickle_arousal_file, "wb") as f:
    pickle.dump(normalized_arousal, f, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(pickle_spectrogram_scaler_file, "wb") as f:
    pickle.dump(scaler, f, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(pickle_arousal_scaler_file, "wb") as f:
    pickle.dump(arousal_scaler, f, protocol=pickle.HIGHEST_PROTOCOL)

with open(pickle_valence_scaler_file, "wb") as f:
    pickle.dump(valence_scaler, f, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(text_input_file, "wb") as f:
    pickle.dump(input_text, f, protocol=pickle.HIGHEST_PROTOCOL)

with open(tokenizer_file, "wb") as f:
    pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
# input_melspectrograms[0]
normalized_arousal.shape

(3427, 1)

In [10]:
# speech to text and save text