# LibriSpeech Dataset

In [None]:
!wget -q https://www.openslr.org/resources/12/dev-clean.tar.gz
!tar -xvzf dev-clean.tar.gz --wildcards --no-anchored "*.flac" -C /content/
audio_file_path = "/content/LibriSpeech/dev-clean/1272/128104/1272-128104-0000.flac"

LibriSpeech/dev-clean/2277/149896/2277-149896-0026.flac
LibriSpeech/dev-clean/2277/149896/2277-149896-0005.flac
LibriSpeech/dev-clean/2277/149896/2277-149896-0033.flac
LibriSpeech/dev-clean/2277/149896/2277-149896-0006.flac
LibriSpeech/dev-clean/2277/149896/2277-149896-0018.flac
LibriSpeech/dev-clean/2277/149896/2277-149896-0034.flac
LibriSpeech/dev-clean/2277/149896/2277-149896-0021.flac
LibriSpeech/dev-clean/2277/149896/2277-149896-0015.flac
LibriSpeech/dev-clean/2277/149896/2277-149896-0012.flac
LibriSpeech/dev-clean/2277/149896/2277-149896-0027.flac
LibriSpeech/dev-clean/2277/149896/2277-149896-0007.flac
LibriSpeech/dev-clean/2277/149896/2277-149896-0030.flac
LibriSpeech/dev-clean/2277/149896/2277-149896-0011.flac
LibriSpeech/dev-clean/2277/149896/2277-149896-0009.flac
LibriSpeech/dev-clean/2277/149896/2277-149896-0003.flac
LibriSpeech/dev-clean/2277/149896/2277-149896-0004.flac
LibriSpeech/dev-clean/2277/149896/2277-149896-0017.flac
LibriSpeech/dev-clean/2277/149896/2277-149896-00

# Observe the Audio Shape

In [None]:
import torchaudio

# Load a single utterance and report the tensor shape and sampling rate that
# torchaudio returns for it.
sample_flac_path = '/content/LibriSpeech/dev-clean/422/122949/422-122949-0000.flac'
signal, sample_rate = torchaudio.load(sample_flac_path)

print(f"Shape of audio signal: {signal.shape}")
print(f"Sample rate: {sample_rate}")

Shape of audio signal: torch.Size([1, 323520])
Sample rate: 16000


# Convert Audio to 16 kHz Mono with a Fixed Duration

In [None]:
!pip install torchaudio scipy


import torchaudio
import torch
from scipy.io.wavfile import write
import os


def download_sample_audio():
    """Download the sample WAV clip once and return its local path.

    The clip is fetched with ``wget`` into a temporary ``.part`` file that is
    renamed to ``sample_audio.wav`` only after ``wget`` exits successfully. A
    failed or interrupted transfer therefore never leaves behind a file that a
    later call would mistake for a finished download.

    Returns:
        str: Path of the local WAV file (``"sample_audio.wav"``).

    Raises:
        subprocess.CalledProcessError: If ``wget`` exits with a non-zero status.
        FileNotFoundError: If the ``wget`` executable is not available.
    """
    import subprocess  # local import: only this helper needs it

    url = "https://www2.cs.uic.edu/~i101/SoundFiles/preamble10.wav"
    audio_path = "sample_audio.wav"
    if not os.path.exists(audio_path):
        partial_path = audio_path + ".part"
        try:
            # Argument list (no shell) and check=True so a failed download raises.
            subprocess.run(["wget", "-q", "-O", partial_path, url], check=True)
            os.replace(partial_path, audio_path)
        finally:
            # wget -O creates the target even on failure; never leave it around.
            if os.path.exists(partial_path):
                os.remove(partial_path)
    return audio_path


def convert_audio(input_path, output_path, desired_duration=5.0, desired_sample_rate=16000):
    """Convert an audio file to a mono, fixed-length WAV at a target sample rate.

    The audio is resampled to ``desired_sample_rate`` when its rate differs,
    mixed down to one channel by averaging, and then truncated or zero-padded
    at the end to exactly ``int(desired_sample_rate * desired_duration)``
    samples before being written with ``scipy.io.wavfile.write``.

    Args:
        input_path: Path of the audio file to read with ``torchaudio.load``.
        output_path: Path of the WAV file to write.
        desired_duration: Length of the output in seconds.
        desired_sample_rate: Sample rate of the output in Hz.

    Returns:
        The ``output_path`` that was written.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
        ValueError: If ``desired_sample_rate`` is not positive or
            ``desired_duration`` is negative.
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"The specified audio file does not exist: {input_path}")
    if desired_sample_rate <= 0:
        raise ValueError(f"desired_sample_rate must be positive, got {desired_sample_rate}")
    if desired_duration < 0:
        # A negative length would otherwise silently trim samples from the end.
        raise ValueError(f"desired_duration must not be negative, got {desired_duration}")

    signal, sample_rate = torchaudio.load(input_path)

    if sample_rate != desired_sample_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=desired_sample_rate)
        signal = resampler(signal)

    # Average the channels; keepdim keeps the (channels, samples) layout.
    if signal.shape[0] > 1:
        signal = signal.mean(dim=0, keepdim=True)

    num_samples = int(desired_sample_rate * desired_duration)
    current_samples = signal.shape[1]
    if current_samples > num_samples:
        signal = signal[:, :num_samples]
    elif current_samples < num_samples:
        signal = torch.nn.functional.pad(signal, (0, num_samples - current_samples))

    # Index the channel axis directly instead of squeeze()/unsqueeze(): squeeze()
    # also drops a length-1 sample axis, which turned a one-sample clip into a
    # scalar that wavfile.write cannot handle.
    write(output_path, desired_sample_rate, signal[0].numpy())

    print(f"Audio saved to: {output_path}")
    return output_path


# Fetch the demo clip, then convert it to a 5-second, 16 kHz mono WAV.
input_audio_path = download_sample_audio()
output_audio_path = "converted_audio.wav"
converted_audio_path = convert_audio(
    input_path=input_audio_path,
    output_path=output_audio_path,
    desired_duration=5.0,
    desired_sample_rate=16000,
)

--2024-11-08 16:07:17--  https://www2.cs.uic.edu/~i101/SoundFiles/preamble10.wav
Resolving www2.cs.uic.edu (www2.cs.uic.edu)... 131.193.32.16
Connecting to www2.cs.uic.edu (www2.cs.uic.edu)|131.193.32.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 423364 (413K) [audio/x-wav]
Saving to: ‘sample_audio.wav’


2024-11-08 16:07:18 (1.45 MB/s) - ‘sample_audio.wav’ saved [423364/423364]

Audio saved to: converted_audio.wav


# Speech to Emotion with Time Interval

In [None]:
!pip install transformers torchaudio gradio

Collecting gradio
  Downloading gradio-5.5.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.4-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.2 (from gradio)
  Downloading gradio_client-1.4.2-py3-none-any.whl.metadata (7.1 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart==0.0.12 (from gradio)
  Downloading python_multipart-0.0.12-py3-none-any.whl.metadata (

In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, pipeline
import torch
import torchaudio
import gradio as gr

In [None]:
# Load the speech-recognition processor and CTC model from the same
# Hugging Face checkpoint.
ASR_CHECKPOINT = "facebook/wav2vec2-base-960h"
asr_processor = Wav2Vec2Processor.from_pretrained(ASR_CHECKPOINT)
asr_model = Wav2Vec2ForCTC.from_pretrained(ASR_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

In [None]:
# Text-based emotion classifier applied to the ASR transcriptions.
# return_all_scores=True is kept as-is: predict_emotion_from_audio indexes the
# result as emotions[0] and takes the max over the per-label scores in it.
EMOTION_MODEL = "j-hartmann/emotion-english-distilroberta-base"
emotion_recognition = pipeline(
    task="text-classification",
    model=EMOTION_MODEL,
    return_all_scores=True,
)

config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



In [None]:
def _dominant_emotion_for_chunk(chunk, sampling_rate=16000, min_samples=400):
    """Transcribe one mono audio chunk and return its top emotion as (label, score)."""
    # The wav2vec2-base feature encoder (conv kernels 10,3,3,3,3,2,2 with strides
    # 5,2,2,2,2,2,2) needs at least 400 samples to produce a single frame; shorter
    # input makes the model raise. Zero-pad short chunks instead of failing.
    if chunk.shape[-1] < min_samples:
        chunk = torch.nn.functional.pad(chunk, (0, min_samples - chunk.shape[-1]))

    input_values = asr_processor(
        chunk.numpy(), sampling_rate=sampling_rate, return_tensors="pt"
    ).input_values
    with torch.no_grad():
        logits = asr_model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = asr_processor.decode(predicted_ids[0])

    # The classifier returns one list of {label, score} dicts per input text.
    emotions = emotion_recognition(transcription)
    dominant_emotion = max(emotions[0], key=lambda entry: entry['score'])
    return dominant_emotion['label'], dominant_emotion['score']


def predict_emotion_from_audio(audio, chunk_duration=1.0):
    """Report the dominant text emotion for each time slice of an audio file.

    The file is loaded, resampled to 16 kHz when needed, mixed down to mono and
    cut into consecutive chunks of ``chunk_duration`` seconds (the final chunk
    may be shorter). Each chunk is transcribed with the wav2vec2 ASR model and
    the transcription is scored by the emotion classifier; the highest-scoring
    label is reported per chunk.

    Args:
        audio: Path of the audio file to analyse (as supplied by the Gradio
            ``Audio`` input with ``type="filepath"``); ``None`` when no file
            was provided.
        chunk_duration: Length of each analysed slice in seconds.

    Returns:
        str: A report with one ``Time ...s: Emotion: ..., Score: ...`` line per
        chunk, or a human-readable error message. Errors are returned as text
        rather than raised so they can be shown in the UI output box.
    """
    target_rate = 16000

    if audio is None:
        return "File error: no audio file was provided."

    try:
        chunk_samples = int(target_rate * chunk_duration)
        if chunk_samples <= 0:
            return (
                f"Invalid chunk_duration: {chunk_duration!r} "
                f"(must cover at least one sample at {target_rate} Hz)."
            )

        signal, fs = torchaudio.load(audio)

        if fs != target_rate:
            resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=target_rate)
            signal = resampler(signal)

        # Collapse (channels, samples) to a 1-D mono signal; for a single
        # channel the mean is that channel unchanged.
        mono = signal.mean(dim=0)

        time_emotions = []
        # range() with a step also yields the shorter final chunk, so the tail
        # needs no separate copy of the per-chunk logic.
        for index, start in enumerate(range(0, mono.shape[0], chunk_samples)):
            label, score = _dominant_emotion_for_chunk(
                mono[start:start + chunk_samples], sampling_rate=target_rate
            )
            time_emotions.append((index * chunk_duration, label, score))

        report_lines = [
            f"Time {start_time:.1f}s: Emotion: {emotion}, Score: {score:.2f}\n"
            for start_time, emotion, score in time_emotions
        ]
        return "Time vs Dominant Emotion:\n\n" + "".join(report_lines)

    except OSError as e:
        return f"File error: {e}"
    except AttributeError as e:
        return f"Model or processing error: {e}"
    except Exception as e:
        return f"An unexpected error occurred: {e}"

In [None]:
# Gradio UI: the uploaded clip reaches the callback as a file path and the
# returned report is shown as plain text.
audio_input = gr.Audio(type="filepath")
iface = gr.Interface(
    fn=predict_emotion_from_audio,
    inputs=audio_input,
    outputs="text",
    title="Emotion Detection from Speech",
    description="Upload an audio file to detect dominant emotion for each second of the speech.",
)

In [None]:
# Start the Gradio app. Per the recorded output, Colab turns on share=True
# automatically, which exposes the app on a temporary public URL.
iface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b48064d9ce828aee3c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


