# Creating a audio pipeline for Urban Sounds

###  Goal of the notebook
This notebook is a pipeline for audio classification. With the notebook you can record an audio sample and do audio classification with CLAP.

### The Pipeline
1. Record an audio sample
2. Audio classification using CLAP general large model
3. Store the results (to do)

### Contents
0. Install packages & check versions
1. Take audio sample
2. Classify the sample
3. Store the sample and the result in a database


## 0. Install packages

In [None]:
#!pip install pyaudio

In [2]:
#!pip install soundfile

In [3]:
#!pip install librosa

In [1]:
#!pip install datasets

In [4]:
#%pip install datasets\[audio\]

In [1]:
pip install numpy==1.26

Note: you may need to restart the kernel to use updated packages.


In [1]:
#check python version
import platform
print(platform.python_version())

3.12.2


In [11]:
import numpy as np
print(f'numpy={np.__version__}')
import soundfile
print(f'soundfile={soundfile.__version__}')
import librosa
print(f'librosa={librosa.__version__}')
import IPython
print(f'ipython={IPython.__version__}')
import pyaudio
print(f'pyaudio= {pyaudio.__version__}')


numpy=1.26.0
soundfile=0.9.0
librosa=0.10.2.post1
ipython=8.27.0
pyaudio= 0.2.14


In [3]:
import torch
print(f'torch= {torch.__version__}')
import datasets
print(f'datasets={datasets.__version__}')
import transformers
print(f'transformers={transformers.__version__}')

torch= 2.2.2
datasets=3.1.0
transformers=4.46.3


## 1. Take audio sample and save it

In [4]:

import pyaudio
import wave
import datetime
import os

#set the filename based on current time
current_time = datetime.datetime.now()
WAVE_OUTPUT_FILENAME = current_time.strftime("%Y-%m-%d_%H-%M-%S") + ".wav"

# Audio recording parameters
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 48100
RECORD_SECONDS = 10

# Initialize PyAudio
audio = pyaudio.PyAudio()

# Open stream
stream = audio.open(format=FORMAT, channels=CHANNELS,
                    rate=RATE, input=True,
                    frames_per_buffer=CHUNK)

print("Recording...")

frames = []

# Record for RECORD_SECONDS
for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
    data = stream.read(CHUNK)
    frames.append(data)

print("Recording finished.")

# Stop and close the stream
stream.stop_stream()
stream.close()
audio.terminate()

# Save the recorded data as a WAV file

# Ensure the "samples" folder exists
os.makedirs("samples", exist_ok=True)

# Define the output filename with the "samples" folder
WAVE_OUTPUT_FILENAME = os.path.join("samples", WAVE_OUTPUT_FILENAME)

wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
wf.setnchannels(CHANNELS)
wf.setsampwidth(audio.get_sample_size(FORMAT))
wf.setframerate(RATE)
wf.writeframes(b''.join(frames))
wf.close()

print(f"Audio sample saved as {WAVE_OUTPUT_FILENAME}")


Recording...
Recording finished.
Audio sample saved as samples/2025-02-10_13-45-37.wav


In [5]:
import IPython
IPython.display.Audio(WAVE_OUTPUT_FILENAME)

## 2. Classify the audio sample


In [6]:
#Convert the recorded .wav file to a numpy array for use in CLAP
import soundfile as sf
import numpy as np
import IPython

# Path to the .wav file
wav_file_path = WAVE_OUTPUT_FILENAME
#wav_file_path = "./samples/2025-02-10_12-14-34.wav"  # Replace with your .wav file path

# Read the .wav file using soundfile
audio, samplerate = sf.read(wav_file_path)

# Display the NumPy array and sample rate
print(f"Audio data: {audio}")
print("Sample Rate:", samplerate)
IPython.display.Audio(audio, rate=48000)

Audio data: [-0.00061035 -0.00140381 -0.00128174 ... -0.03311157 -0.03424072
 -0.03457642]
Sample Rate: 48100


In [7]:
# With the similarity search approach of CLAP we can now use a whole range of labels 
labels_list =['Gunshot', 'Alarm', 'Moped', 'Car', 'Motorcycle', 'Claxon', 'Slamming door', 'Screaming', 'Talking','Music', 'Birds', 'Airco', 'Noise', 'Silence']

print(labels_list)

['Gunshot', 'Alarm', 'Moped', 'Car', 'Motorcycle', 'Claxon', 'Slamming door', 'Screaming', 'Talking', 'Music', 'Birds', 'Airco', 'Noise', 'Silence']


In [14]:
#%%timeit
#larger_clap_general
from transformers import pipeline
import IPython

audio_classifier = pipeline(task="zero-shot-audio-classification", model="laion/larger_clap_general")
output = audio_classifier(audio, candidate_labels=labels_list)

print(f'First result is {output[0]['label']}: {output[0]['score']}')
print(f'Second result is {output[1]['label']}: {output[1]['score']}')
print(f'Third result is {output[2]['label']}: {output[2]['score']}')
IPython.display.Audio(audio, rate=48000)

First result is Talking: 0.5139683485031128
Second result is Silence: 0.39317864179611206
Third result is Airco: 0.08884799480438232
