In [None]:
# Installations
!pip install malaya_speech
!pip install pydub

# Imports
import pandas as pd
import os
import json
import numpy as np
from google.colab import files
import librosa as librosa
import librosa.display
from malaya_speech import Pipeline
import malaya_speech
import matplotlib.pyplot as plt
from pydub import AudioSegment
from pydub.silence import split_on_silence
import math




---


# Files Uploading
The files were either uploaded from the PC's local storage or read from the mounted Google Drive.

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive
# (the recorded output below confirms the mount point).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Upload the caller recordings archive through Colab's upload helper
# (the recorded output shows calleraudio.zip being saved).
caller_audio = files.upload()

Saving calleraudio.zip to calleraudio.zip


In [None]:
# Upload the agent recordings archive through Colab's upload helper
# (the recorded output shows agentaudio.zip being saved).
agent_audio = files.upload()

Saving agentaudio.zip to agentaudio.zip


In [None]:
# Upload the JSON transcripts archive through Colab's upload helper
# (the recorded output shows transcripts.zip being saved).
transcripts = files.upload()

Saving transcripts.zip to transcripts.zip


In [None]:
# Extract the caller recordings quietly (-q); presumably this yields the
# ./calleraudio/ folder that the feature extraction reads later - verify.
!unzip -q calleraudio.zip

In [None]:
# Extract the agent recordings quietly (-q); presumably this yields the
# ./agentaudio/ folder that the feature extraction reads later - verify.
!unzip -q agentaudio.zip

In [None]:
# Extract the JSON transcripts quietly (-q); presumably this yields the
# transcripts/ folder that the feature extraction reads later - verify.
!unzip -q transcripts.zip



---


# JSON Data Feature Extraction 
The JSON files are transcripts that contain data regarding the speech itself, the words spoken, and the scores of the emotions shown in the speech. 
The JSON files were created by using this trained library:

[gridspace-stanford-harper-valley
](https://github.com/cricketclub/gridspace-stanford-harper-valley/tree/master/experiments/src/datasets)

The features we decided to extract from the transcripts are:
* Average sentence speaking duration.
* Average word speaking duration.
* Average Negative emotion score in a conversation.
* Average Positive emotion score in a conversation.
* Average Neutral emotion score in a conversation.
* Average sentence length in a conversation (amount of words spoken).

In [None]:
def calc_word_dur(lst):
  """
  Calculates the average word speaking duration in a single sentence.

  Args: 
    lst (list): A list of measured speech duration of each word extracted from the JSON file.

  Returns: 
    float: The average word speaking duration in a specific sentence,
      or 0.0 when the sentence has no word durations at all.
  """
  # A sentence without any measured words has no meaningful average;
  # return 0.0 instead of failing with ZeroDivisionError.
  if len(lst) == 0:
    return 0.0

  # Use the builtin sum() directly instead of shadowing it with a local accumulator.
  return sum(lst) / len(lst)


In [None]:

def calc_data(data, speaker):
  """
  Calculates, for a single call and a single speaker:
    1. Average Neutral, Negative and Positive emotion scores.
    2. Average sentence speech duration ('duration_ms').
    3. Average word speech duration (mean of the per-sentence word averages).
    4. Average sentence length (amount of whitespace-separated words in 'transcript').

  Args: 
    data (loaded json file): A loaded json file (a list of utterance entries) that holds data of an entire call.
    speaker (string): Either the bank's employee ('agent') or the customer ('caller');
      compared against each entry's 'speaker_role'.
  
  Returns: 
    list: [neutral, negative, positive, sentence_duration, word_duration, sentence_length],
      each averaged over the speaker's utterances. When the speaker has no
      utterances in the call, every value is NaN (the averages are undefined).
  """
  counter = 0
  neu = 0
  neg = 0
  pos = 0
  total_dur = 0
  word_dur = 0
  sentence_len = 0
  for entry in data:
    if entry['speaker_role'] != speaker:
      continue
    counter += 1
    neu += float(entry['emotion']['neutral'])
    neg += float(entry['emotion']['negative'])
    pos += float(entry['emotion']['positive'])
    total_dur += int(entry['duration_ms'])

    # An utterance without word durations contributes 0 instead of
    # dividing by zero inside calc_word_dur.
    word_durations = entry['word_durations_ms']
    if len(word_durations) > 0:
      # int() truncates the per-sentence average, as the original code did.
      word_dur += int(calc_word_dur(word_durations))

    sentence_len += len(entry['transcript'].split())

  # The speaker never spoke in this call: averages are undefined, so report
  # them as missing rather than raising ZeroDivisionError.
  if counter == 0:
    return [float('nan')] * 6

  return [
      neu / counter,
      neg / counter,
      pos / counter,
      total_dur / counter,
      word_dur / counter,
      sentence_len / counter,
  ]

In [None]:
def makeTenSecondsAudio(filename):
  """
  Removes dead (silent) recorded parts, then trims to a maximum of 10 seconds of recorded speech.

  Args:
    filename (string): The name (path) of the file to load and process.

  Returns:
    tuple: A tuple containing the processed signal and its sampling rate.

  """

  # Load audio (librosa's default loading options are used)
  y, sr = librosa.load(filename)

  # Cut silence from the beginning and the end of the call
  y_trim = librosa.effects.trim(y, top_db=20)[0]

  # Cut dead audio moments: pydub works on integer PCM, so convert first
  y_int = malaya_speech.astype.float_to_int(y_trim)
  audio = AudioSegment(
      y_int.tobytes(),
      frame_rate = sr,
      sample_width = y_int.dtype.itemsize,
      channels = 1
  )
  audio_chunks = split_on_silence(
      audio,
      min_silence_len = 200,
      silence_thresh = -30,
      keep_silence = 100,
  )

  # If nothing rises above the silence threshold, no chunks are returned.
  # sum([]) would then be the int 0 and the sample extraction below would
  # fail, so fall back to the whole trimmed recording.
  if not audio_chunks:
    audio_chunks = [audio]

  # Concatenate all active audio parts into a single segment
  final_y = sum(audio_chunks)
  final_y = np.array(final_y.get_array_of_samples())
  final_y = malaya_speech.astype.int_to_float(final_y)

  # If longer than 10 seconds, shorten to 10; otherwise it remains the same
  final_y = final_y[:sr*10]

  return final_y, sr

---
# Audio Feature Extractions
* **AE - Amplitude Envelope** - Refers to the changes in the amplitude of a sound over time. An important property of sound as it allows us to identify sounds and uniquely distinguish them from other sounds.

* **RMS - Root Mean Square** - Used to describe the average magnitude of a function or a series of values.

* **ZCR - Zero Crossing Rate** - The rate at which a signal changes from positive to zero to negative OR from negative to zero to positive. Widely used in speech recognition and a key feature for classifying percussive sounds.

* **Log-Mel Spectrogram** -
  1. The Fourier Transform - A mathematical formula that allows us to decompose a signal into its individual frequencies and each frequency's amplitude.
  2. FFT - Fast Fourier Transform - An algorithm that can compute the Fourier Transform efficiently.
  3. The Spectrogram - Using the FFT, we can analyze the frequency content of a signal. By computing it on overlapping windowed segments of the signal, we get what is called a spectrogram. This method is called the short-time Fourier Transform.
  4. Mel Spectrogram - A spectrogram where the frequencies are converted to the mel scale. The mel scale is a transformation of frequency, computed by a mathematical formula, that is designed to match how humans differentiate between pitches, as studies have shown that humans do not perceive frequencies on a linear scale.
  5. Log-Mel Spectrogram - A logarithmic computation applied to the Mel Spectrogram for clearer visualization of the spectrogram.
  

* **MFCC** - Mel Frequency Cepstrum Coefficients - A small set of features which concisely describe the overall shape of a spectral envelope. Often used to describe timbre, in MIR (music information retrieval) for example. After calculating the Log-Mel Spectrogram, the MFCCs are obtained by applying a discrete cosine transform to it.

For each of those, we calculated the following scores:
* Average.
* Maximum point.
* Minimum point.
* Standard deviation.
* Median.



In [None]:
def amplitude_envelope(signal, frame_size, hop_length):
  """
  Computes the amplitude envelope of the trimmed recording and summarizes it.

  The envelope is the maximum sample value of every frame; frames start
  every hop_length samples and span up to frame_size samples.

  Args:
    signal (ndarray): The trimmed (at most 10 seconds long) conversation record.
    frame_size (int): Number of samples per frame.
    hop_length (int): Number of samples between consecutive frame starts.

  Returns:
    dictionary: Summary statistics of the envelope under the keys
      'average', 'max', 'min', 'std' and 'median' (in that order).
  """
  # One envelope value per frame: the largest sample inside the frame
  frame_starts = range(0, len(signal), hop_length)
  envelope = np.array([max(signal[start:start + frame_size]) for start in frame_starts])

  return {
      'average': np.average(envelope),
      'max': np.max(envelope),
      'min': np.min(envelope),
      'std': np.std(envelope),
      'median': np.median(envelope),
  }

In [None]:
def rms(signal, frame_size, hop_length):
  """
  Computes the frame-wise Root Mean Square of the trimmed recording and summarizes it.

  Every frame starts hop_length samples after the previous one and spans up
  to frame_size samples; the squared sum is always divided by frame_size,
  including for the shorter frames at the end of the signal.

  Args:
    signal (ndarray): The trimmed (at most 10 seconds long) conversation record.
    frame_size (int): Number of samples per frame.
    hop_length (int): Number of samples between consecutive frame starts.

  Returns:
    dictionary: Summary statistics of the RMS values under the keys
      'average', 'max', 'min', 'std' and 'median' (in that order).
  """
  frame_rms = []
  for start in range(0, len(signal), hop_length):
    frame = signal[start:start + frame_size]
    energy = np.sum(frame**2)
    frame_rms.append(np.sqrt(energy/frame_size))

  frame_rms = np.array(frame_rms)

  return {
      'average': np.average(frame_rms),
      'max': np.max(frame_rms),
      'min': np.min(frame_rms),
      'std': np.std(frame_rms),
      'median': np.median(frame_rms),
  }

In [None]:
def zcr(signal, frame_size, hop_length):
  """
  Computes the frame-wise Zero Crossing Rate of the trimmed recording and summarizes it.

  Args:
    signal (ndarray): The trimmed (at most 10 seconds long) conversation record.
    frame_size (int): Frame length handed to librosa.
    hop_length (int): Hop length handed to librosa.

  Returns:
    dictionary: Summary statistics of the zero crossing rates under the keys
      'average', 'max', 'min', 'std' and 'median' (in that order).
  """
  # librosa returns a 2-D result; only its first row is used
  crossing_rates = librosa.feature.zero_crossing_rate(
      y=signal,
      frame_length=frame_size,
      hop_length=hop_length,
  )[0]

  return {
      'average': np.average(crossing_rates),
      'max': np.max(crossing_rates),
      'min': np.min(crossing_rates),
      'std': np.std(crossing_rates),
      'median': np.median(crossing_rates),
  }

In [None]:
def log_mel_spectrogram_data(signal, sr, n_fft, hop_length):
  """
  Calculates the logarithmic mel spectrogram of the trimmed record and data extracted from it:
    1. Average.
    2. Maximum point.
    3. Minimum point.
    4. Standard deviation.
    5. Median.


  Args:
    signal (ndarray): The trimmed (at most 10 seconds long) conversation record.
    sr (int): The sampling rate of the signal.
    n_fft (int): The FFT size; also used as the analysis window length.
    hop_length (int): The number of samples between consecutive frames.

  Returns:
    dictionary: A dictionary that holds the data calculated, under the keys
      'average', 'max', 'min', 'std' and 'median' (in that order).
  """
  # Create the mel spectrogram. hop_length is now forwarded: previously the
  # argument was accepted but ignored, so librosa silently used its own default hop.
  mel_spectrogram = librosa.feature.melspectrogram(
      y=signal,
      sr=sr,
      n_fft=n_fft,
      win_length=n_fft,
      hop_length=hop_length,
  )

  # Create the log-mel spectrogram (dB relative to the spectrogram's maximum)
  log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

  dictionary = {}
  dictionary['average'] = np.average(log_mel_spectrogram)
  dictionary['max'] = np.max(log_mel_spectrogram)
  dictionary['min'] = np.min(log_mel_spectrogram)
  dictionary['std'] = np.std(log_mel_spectrogram)
  dictionary['median'] = np.median(log_mel_spectrogram)

  return dictionary

In [None]:
def mfcc(signal, n_mfcc, sr):
  """
  Calculates the Mel-Frequency Cepstrum Coefficients of the trimmed record and data extracted from them:
    1. Average.
    2. Maximum point.
    3. Minimum point.
    4. Standard deviation.
    5. Median.

  Args:
    signal (ndarray): The trimmed (at most 10 seconds long) conversation record.
    n_mfcc (int): The number of MFCCs to compute.
    sr (int): The sampling rate of the signal.

  Returns:
    dictionary: A dictionary that holds the data calculated, under the keys
      'average', 'max', 'min', 'std' and 'median' (in that order).
  """
  # Pass the signal by keyword (newer librosa releases reject it positionally)
  # and honour n_mfcc instead of the previously hard-coded 13.
  coefficients = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)

  dictionary = {}
  dictionary['average'] = np.average(coefficients)
  dictionary['max'] = np.max(coefficients)
  dictionary['min'] = np.min(coefficients)
  dictionary['std'] = np.std(coefficients)
  dictionary['median'] = np.median(coefficients)

  return dictionary


In [None]:
def createAudioDataRow(dict1, dict2, dict3, dict4, dict5):
  """
  Flattens the values of 5 dictionaries into a single row.

  Arg:
    dict1, dict2, dict3, dict4, dict5 (dictionary): Dictionaries whose values are the data to collect.
  
  Returns:
    ndarray: The values of dict1..dict5, in that order, each dictionary
      contributing its values in insertion order.
  """
  row = np.append(list(dict1.values()), list(dict2.values()))
  # Extend the row with the remaining dictionaries, one after another
  for remaining in (dict3, dict4, dict5):
    row = np.append(row, list(remaining.values()))

  return row
  

In [None]:
def audio_data_calc(path, name):
  """
  Extracts the audio features of a single recording.

  The file is first reduced by makeTenSecondsAudio(filename), then the
  time-domain and time-frequency-domain features are computed on the result.

  Args: 
    path (string): The path of the audio file's folder; it is concatenated
      directly with name, so it must end with a path separator.
    name (string): The audio file's name.

  Returns:
    The extracted features of the audio file, in the order: amplitude
    envelope, RMS, ZCR, log-mel spectrogram, MFCC.
  """
  # Analysis settings
  frame_size = 256
  n_fft = 256
  hop_length = 128
  n_mfcc = 13

  signal, sr = makeTenSecondsAudio(path + name)
  framing = {'frame_size': frame_size, 'hop_length': hop_length}

  feature_dicts = (
      # Time-domain features
      amplitude_envelope(signal, **framing),
      rms(signal, **framing),
      zcr(signal, **framing),
      # Time-frequency-domain features
      log_mel_spectrogram_data(signal, sr, n_fft=n_fft, hop_length=hop_length),
      mfcc(signal, n_mfcc=n_mfcc, sr=sr),
  )

  return createAudioDataRow(*feature_dicts)

---
# Full Feature Extraction
Each conversation recorded in our dataset was made of the following parts:
 1. JSON FILE containing data of the conversation as explained above.
 2. The recorded part of the bank's employee (agent).
 3. The recorded part of the customer (caller).

Each JSON file (transcript) and 2 audio records, each in a different folder, combine to hold data of a single full conversation.

For example:
*   Call5.json - transcript of Call5, found in the transcripts folder.
*   Call5.wav - agent's record of Call5, found in the **agent** folder.
*   Call5.wav - caller's record of Call5, found in the **caller** folder.



In [None]:
def extraction(key):
  """
  The extraction of transcript features and audio features of a single conversation.

  Args:
    key (string): The conversation's transcript file name (e.g. 'Call5.json'),
      looked up inside the 'transcripts' folder.

  Returns:
    tuple- A tuple that contains 2 rows of data:
      A row of the agent's data in the conversation, and a row of the caller's data.
      Each row holds the transcript features, then the audio features, then the label
      (1 for the agent, 0 for the caller).
  """
  # Load json file (transcript)
  filename = str(key)
  with open(os.path.join('transcripts', filename), 'r') as f:
    data = json.load(f)

  # Extract transcript data
  agent_data = calc_data(data, 'agent')
  caller_data = calc_data(data, 'caller')

  # Swap the extension for .wav. splitext removes only the last extension,
  # so names that contain additional dots are no longer cut at the first dot.
  wav_name = os.path.splitext(filename)[0] + '.wav'

  # Extract audio features data
  agent_audio_data = audio_data_calc('./agentaudio/', wav_name)
  caller_audio_data = audio_data_calc('./calleraudio/', wav_name)

  # Build one row per speaker for each < json : (agent wav, caller wav) >
  agent_row = np.append(agent_data, agent_audio_data)
  caller_row = np.append(caller_data, caller_audio_data)

  # Y label
  # Assuming that the agent is the higher status in this dataset
  agent_row = np.append(agent_row, [1])
  caller_row = np.append(caller_row, [0])

  # Return the rows for the dataframe
  return agent_row, caller_row

---
# Dataset Creation
Extracting the features from all 2892 records in the dataset - 
1446 conversations in total, each divided into 2 records (agent and caller) and 1 transcript (.json) file.

For this experiment, we decided to label the agent as the higher status (1) and the caller as the lower status (0), in order to learn the relationship between the features we extracted and the status labels we assigned to the participants.

In [None]:
def createDataset():
  """
  Creates the entire dataset from all recorded conversations.

  Every .json file in the 'transcripts' folder yields two rows: the agent's
  row followed by the caller's row. Files are processed in sorted order so
  that the row order and the IDs are reproducible between runs.

  Args:
    None.

  Returns:
    Labeled dataframe with extracted features. 'ID' and 'Label' are integers,
    'Filename' is a string and every feature column is a float.
  """
  # Column names, in the order the rows are built
  labels = ['ID','Filename', 'Neutral', 'Negative', 'Positive', 'Avg_Sentence_Duration', 'Avg_Word_Duration','Avg_Sentence_Length',
            'Amp_Env_Avg', 'Amp_Env_Max', 'Amp_Env_Min','Amp_Env_STD', 'Amp_Env_Median',
            'RMS_Avg', 'RMS_Max', 'RMS_Min','RMS_STD', 'RMS_Median',
            'ZCR_Avg', 'ZCR_Max', 'ZCR_Min','ZCR_STD', 'ZCR_Median',
            'Mel_Spec_Avg', 'Mel_Spec_Max', 'Mel_Spec_Min','Mel_Spec_STD', 'Mel_Spec_Median',
            'MFCC_Avg', 'MFCC_Max', 'MFCC_Min','MFCC_STD', 'MFCC_Median','Label']

  directory = 'transcripts'

  # Only transcript files are processed; other directory entries (e.g. hidden
  # folders created by the environment) would make the extraction fail.
  transcript_files = sorted(
      entry for entry in os.listdir(directory) if entry.lower().endswith('.json')
  )

  # Rows are collected as plain Python lists and the dataframe is built once.
  # Mixing the ID / filename with the features inside a numpy array would
  # convert every value to a string, and growing the dataframe row by row
  # is needlessly slow.
  rows = []
  for file_index, filename in enumerate(transcript_files):
    print(f"{file_index} : {filename}")
    agent_features, caller_features = extraction(filename)
    call_name = os.path.splitext(filename)[0]

    for features in (agent_features, caller_features):
      # The last element of every extracted row is the label
      *values, label = features
      # len(rows) is the running row ID: 0, 1, 2, ...
      rows.append([len(rows), call_name, *(float(value) for value in values), int(label)])

  return pd.DataFrame(rows, columns=labels)

In [None]:
# Build the full feature dataframe by running createDataset() over the
# transcripts and their matching audio records.
df = createDataset()

In [None]:
# After the feature extraction is finished, save the dataframe to an Excel
# file (BankersExtractedData.xlsx) to work with later.
df.to_excel('BankersExtractedData.xlsx')