# Perception Pipeline

## Installs and Imports

In [None]:
# Local Whisper setup
!pip install git+https://github.com/openai/whisper.git
!pip install --upgrade --no-deps --force-reinstall git+https://github.com/openai/whisper.git
!apt update && apt install ffmpeg
!pip install setuptools-rust

# Huggingface
!pip install git+https://github.com/huggingface/datasets.git
!pip install git+https://github.com/huggingface/transformers.git

# Install core audio processing libraries
!pip install librosa numpy==1.26.4 soundfile resampy==0.4.3 pydub

# Install speech recognition and analysis libraries
!pip install SpeechRecognition==3.10.4 pyAudioAnalysis

# Install machine learning and data science libraries
!pip install tensorflow scikit-learn pandas hmmlearn imblearn

# Install visualization and utility libraries
!pip install streamlit plotly eyed3 feat

# Install system dependencies
!apt-get install -y ffmpeg


Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-v_d317f4
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-v_d317f4
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper==20240930)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-

In [None]:
import whisper
import torch
import os
import json
import time
import librosa
import IPython.display as ipd
import numpy as np
import pandas as pd
import soundfile as sf
import streamlit as st
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from pydub import AudioSegment
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import load_model
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, AutoConfig
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForCTC

## Data Import

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Path to folder containing CREMA voice files.
# Each collaborator keeps their own Drive layout; leave exactly one pair of
# assignments uncommented.
# Sofia's path
# crema_folder_path = '/content/drive/My Drive/Conversational Agents/data/CremaEqual'
# output_path = '/content/drive/My Drive/Conversational Agents/data/output/'


# Lemon's path
crema_folder_path = '/content/drive/My Drive/TU Delft/Conversational Agents/data/CremaEqual'
output_path = '/content/drive/My Drive/TU Delft/Conversational Agents/data/output/'

# Test file
# The path is derived from test_file_name so the clip that gets analysed and
# the name used for the saved JSON output cannot drift apart.
test_file_name = '1013_TSI_ANG_XX'
test_file_path = os.path.join(crema_folder_path, f'{test_file_name}.wav')

# Setting
# Make sure the output folder exists and pick the GPU when one is available.
os.makedirs(output_path, exist_ok=True)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cpu


## ASR using Whisper turbo

In [None]:
# Load the Whisper "turbo" checkpoint onto `device` (CUDA when available,
# otherwise CPU — chosen in the Data Import cell)
model_turbo = whisper.load_model("turbo", device)

100%|█████████████████████████████████████| 1.51G/1.51G [00:17<00:00, 94.5MiB/s]
  checkpoint = torch.load(fp, map_location=device)


In [None]:
def whisper_local_transcribe(model, file_path, language="en", fp16=False):
    '''
    Transcribe an audio file with the given Whisper model and time the call.

    Args:
        model: Object exposing ``transcribe(path, language=..., fp16=...)``
            whose result can be indexed with ``"text"`` (for example the
            model returned by ``whisper.load_model``).
        file_path: Path of the audio file to transcribe.
        language: Language hint forwarded to ``model.transcribe``. Defaults
            to ``"en"``, the value that used to be hard-coded.
        fp16: Forwarded to ``model.transcribe``. Defaults to ``False``, the
            value that used to be hard-coded.

    Returns:
        A ``(text, elapsed_seconds)`` tuple: the ``"text"`` entry of the
        transcription result and the duration of the ``transcribe`` call.
    '''
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # (or go negative) if the system clock is adjusted mid-transcription.
    start_time = time.perf_counter()
    hypothesis = model.transcribe(file_path, language=language, fp16=fp16)
    elapsed_time = time.perf_counter() - start_time

    return hypothesis["text"], elapsed_time

In [None]:
# Run ASR on the sample clip, then report the transcript and how long it took
transcription_result = whisper_local_transcribe(model_turbo, test_file_path)
hypothesis, elapsed_time = transcription_result
print(f'Hypothesis: {hypothesis}')
print(f'Time: {elapsed_time}')

Hypothesis:  The surface is slick!
Time: 50.04418873786926


## Emotion Detection

### Method 1: Librosa

In [None]:
# Class index -> emotion name for the librosa-based classifier.
# This mapping comes from the CREMA setup the classifier was originally used
# with; the evaluation here also uses CREMA, so it is kept unchanged.
label_mapping = dict(enumerate((
    'angry',
    'excited',
    'fear',
    'happy',
    'neutral',
    'sad',
)))

# Feature Extraction Module
class FeatureExtractor:
    """
    Hand-crafted acoustic features computed with librosa.

    Reference: https://huggingface.co/spaces/Rashmiranjan28/Speech_Emotion_Recognition/tree/main
    """

    @staticmethod
    def librosa_features_extractor(file_name):
        """
        Load an audio file and summarise it as one flat feature vector.

        Four descriptors are computed in this order: MFCC (``n_mfcc=25``),
        zero-crossing rate, chroma (STFT) and mel spectrogram. Each result is
        averaged over its second axis (presumably the time-frame axis — see
        the librosa docs) and the four averages are concatenated.

        Args:
            file_name: Path handed to ``librosa.load`` (``res_type='kaiser_fast'``).

        Returns:
            The array produced by ``np.hstack`` over the four averaged
            descriptors.
        """
        def _collapse(matrix):
            # Mean over the rows of the transpose, i.e. one value per
            # coefficient/band of the descriptor.
            return np.mean(matrix.T, axis=0)

        signal, rate = librosa.load(file_name, res_type='kaiser_fast')

        averaged_descriptors = (
            _collapse(librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=25)),
            _collapse(librosa.feature.zero_crossing_rate(y=signal)),
            _collapse(librosa.feature.chroma_stft(y=signal, sr=rate)),
            _collapse(librosa.feature.melspectrogram(y=signal, sr=rate)),
        )
        return np.hstack(averaged_descriptors)

# Emotion Prediction Module
class EmotionPredictor:
    """
    Segment-wise emotion classifier built on a saved Keras model.

    Reference: https://huggingface.co/spaces/Rashmiranjan28/Speech_Emotion_Recognition/tree/main
    """

    def __init__(self, model_path, feature_extractor):
        """
        Args:
            model_path: Path passed to ``load_model`` to restore the classifier.
            feature_extractor: Callable that takes an audio file path and
                returns a NumPy array of features; the array is reshaped to a
                single-row batch before being passed to the model.
        """
        self.model = load_model(model_path)
        self.feature_extractor = feature_extractor

    def predict_emotions(self, audio_path, interval=3.0):
        """
        Classify an audio file in consecutive windows of ``interval`` seconds.

        The file is read with ``soundfile``, cut into windows (the last one is
        shortened to the end of the audio), and each window is written to a
        scratch WAV file so the path-based feature extractor can load it.

        Args:
            audio_path: Path of the audio file to analyse.
            interval: Window length in seconds.

        Returns:
            A list of ``(start, end, label)`` tuples, one per window, where
            ``start``/``end`` are in seconds and ``label`` is looked up in the
            module-level ``label_mapping`` from the arg-max class index.
        """
        # Local import keeps this block self-contained without touching the
        # notebook's import cell.
        import tempfile

        audio_data, samplerate = sf.read(audio_path)
        duration = len(audio_data) / samplerate
        emotions = []

        # The scratch file lives in a private temporary directory instead of
        # the current working directory: it cannot collide with a real
        # 'segment.wav' or with a concurrent run, and the directory (and the
        # file in it) is removed even if extraction or prediction raises.
        with tempfile.TemporaryDirectory() as scratch_dir:
            segment_path = os.path.join(scratch_dir, 'segment.wav')

            for start in np.arange(0, duration, interval):
                # Clamp the final window to the end of the recording.
                end = min(start + interval, duration)
                segment = audio_data[int(start * samplerate):int(end * samplerate)]
                sf.write(segment_path, segment, samplerate)

                feat = self.feature_extractor(segment_path)
                feat = feat.reshape(1, -1)  # single-row batch for the model
                predictions = self.model.predict(feat)
                predicted_label = np.argmax(predictions, axis=1)
                emotions.append((start, end, label_mapping[predicted_label[0]]))

        return emotions

In [None]:
# Saved CNN-LSTM emotion classifier on Drive
cnn_lstm_model_path = '/content/drive/My Drive/TU Delft/Conversational Agents/data/cnn_lstm.keras'

# Plug the librosa feature function into the segment-wise predictor
feature_extractor = FeatureExtractor.librosa_features_extractor
emotion_predictor = EmotionPredictor(
    model_path=cnn_lstm_model_path,
    feature_extractor=feature_extractor,
)

### Method 2: Whisper

In [None]:
def preprocess_audio(audio_path, feature_extractor, max_duration=30.0):
    """
    Load an audio file and turn it into model inputs of a fixed length.

    The audio is loaded at ``feature_extractor.sampling_rate``, then either
    cut or zero-padded at the end so that it holds exactly
    ``int(sampling_rate * max_duration)`` samples before it is passed to
    ``feature_extractor``.

    Args:
        audio_path: Path of the audio file, read with ``librosa.load``.
        feature_extractor: Callable with a ``sampling_rate`` attribute; it is
            called with the waveform plus ``sampling_rate``, ``max_length``,
            ``truncation=True`` and ``return_tensors="pt"``.
        max_duration: Target clip length in seconds.

    Returns:
        Whatever ``feature_extractor`` returns for the fixed-length waveform.
    """
    target_rate = feature_extractor.sampling_rate
    waveform, _ = librosa.load(audio_path, sr=target_rate)

    n_target = int(target_rate * max_duration)
    shortfall = n_target - len(waveform)
    if shortfall < 0:
        # Too long: keep only the leading n_target samples.
        waveform = waveform[:n_target]
    else:
        # Too short (or exact): right-pad with zeros up to n_target samples.
        waveform = np.pad(waveform, (0, shortfall))

    return feature_extractor(
        waveform,
        sampling_rate=target_rate,
        max_length=n_target,
        truncation=True,
        return_tensors="pt",
    )

def predict_emotion(audio_path, model, feature_extractor, id2label, max_duration=30.0):
    """
    Classify the emotion of one audio file with an audio-classification model.

    Args:
        audio_path: Path of the audio file.
        model: Model object; it is moved with ``.to(device)`` and called with
            the preprocessed inputs as keyword arguments, and its output must
            expose ``.logits``.
        feature_extractor: Passed through to ``preprocess_audio``.
        id2label: Mapping from the arg-max class index to a label.
        max_duration: Passed through to ``preprocess_audio``.

    Returns:
        The ``id2label`` entry for the highest-scoring class.
    """
    features = preprocess_audio(audio_path, feature_extractor, max_duration)

    # Run on the GPU when one is present, otherwise on the CPU.
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(target_device)
    batch = {name: tensor.to(target_device) for name, tensor in features.items()}

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**batch).logits

    best_class = torch.argmax(logits, dim=-1).item()
    return id2label[best_class]

In [None]:
# Hugging Face Hub id of the audio-classification checkpoint used as the
# second emotion detector
whisper_model_id = "firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3"
whisper_model = AutoModelForAudioClassification.from_pretrained(whisper_model_id)

# Feature extractor loaded from the same checkpoint, with do_normalize=True
whisper_feature_extractor = AutoFeatureExtractor.from_pretrained(whisper_model_id, do_normalize=True)
# Mapping stored in the model config; predict_emotion uses it to turn the
# arg-max class index into a label
id2label = whisper_model.config.id2label

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.55G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

### Connotation

In [None]:
# Coarse polarity ("negative" / "positive" / "neutral") of every emotion label
# that either detector may emit; used to decide whether two labels conflict.
connotation_dict = dict(
    angry="negative",
    disgust="negative",
    fear="negative",
    fearful="negative",
    happy="positive",
    neutral="neutral",
    sad="negative",
    surprised="positive",
    excited="positive",
)

def conflict_detection(test_file_path):
    """
    Run both emotion detectors on one file and pick a single label.

    The librosa-based predictor contributes the label of its first segment
    only; the Whisper-based classifier contributes one label for the clip.
    Both labels are mapped to a polarity through ``connotation_dict``. When
    one polarity is "negative" and the other "positive" the librosa label is
    returned; in every other case the Whisper label is returned.

    Args:
        test_file_path: Path of the audio file to analyse.

    Returns:
        The selected emotion label.
    """
    librosa_emotion = emotion_predictor.predict_emotions(test_file_path)[0][2]
    whisper_emotion = predict_emotion(test_file_path, whisper_model, whisper_feature_extractor, id2label)
    print(f"Librosa emotion: {librosa_emotion}, Whisper emotion: {whisper_emotion}")

    polarity_pair = (connotation_dict[librosa_emotion], connotation_dict[whisper_emotion])
    # A conflict is strictly negative-vs-positive; "neutral" never conflicts.
    opposed = polarity_pair in (("negative", "positive"), ("positive", "negative"))

    if not opposed:
        print(f"No conflict, use the text emotion: {whisper_emotion}")
        return whisper_emotion

    print(f"Conflict detected, use the speech emotion: {librosa_emotion}")
    return librosa_emotion

In [None]:
# Run both emotion detectors on the sample clip and keep the label selected by
# the conflict rule
final_emotion = conflict_detection(test_file_path)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170ms/step
Librosa emotion: excited, Whisper emotion: angry
Conflict detected, use the speech emotion: excited


## Output Storage

In [None]:
# The perception module stores its result as one JSON record per audio file
# with the keys file_name, text (the transcription) and emotion; these records
# are used further on by the memory module.

class PerceptionOutput:
    """One perception result that can write itself to a JSON file."""

    def __init__(self, file_name, text, emotion):
        """Keep the file name (without extension), the transcript and the emotion label."""
        self.file_name, self.text, self.emotion = file_name, text, emotion

    def save_output(self, output_path):
        """
        Dump the instance attributes to ``<output_path>/<file_name>.json``.

        Args:
            output_path: Directory the JSON file is written into.
        """
        output_file_path = os.path.join(output_path, self.file_name + ".json")
        with open(output_file_path, 'w') as handle:
            json.dump(vars(self), handle)
        print(f"Output saved to {output_file_path}")

In [None]:
# Bundle the transcript and the selected emotion for the sample clip, then
# write the record to the output folder
final_output = PerceptionOutput(
    file_name=test_file_name,
    text=hypothesis,
    emotion=final_emotion,
)
final_output.save_output(output_path)

Output saved to /content/drive/My Drive/TU Delft/Conversational Agents/data/output/1013_TSI_ANG_XX.json
