In [1]:
import io
import requests
import numpy as np
import sounddevice as sd
from pydub import AudioSegment
from pydub.utils import mediainfo

In [None]:
# Constants
API_URL = "http://localhost:8001/predict_instrument"
DURATION = 2
SAMPLE_RATE = 44100

In [8]:
def record_audio(duration, sample_rate):
    """Record audio from the microphone."""
    print(f"Recording for {duration} seconds...")
    audio_data = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1, dtype='float32')
    sd.wait()  # Wait for the recording to finish
    return audio_data.flatten()  # Flatten the array

def prepare_audio(audio_data, sample_rate):
    """Prepare audio data for the API."""
    # Convert audio data to PCM format using pydub
    audio_segment = AudioSegment(
        np.int16(audio_data * 32767).tobytes(),  # Scale float32 to int16
        frame_rate=sample_rate,
        sample_width=2,
        channels=1
    )
    buffer = io.BytesIO()
    audio_segment.export(buffer, format="wav")
    buffer.seek(0)

    # Check audio info (optional)
    info = mediainfo(buffer)
    print("Audio info:", info)

    return buffer.getvalue()

In [6]:
audio_data = record_audio(DURATION, SAMPLE_RATE)

Recording for 2 seconds...


In [54]:
import pyaudio
import wave

# Parameters
chunk = 1024  # Record in chunks of 1024 samples
sample_format = pyaudio.paInt16  # 16 bits per sample
channels = 2  # Stereo
sample_rate = 44100  # Sample rate
duration = 5  # Duration of recording in seconds
filename = "recording.wav"

# Initialize PyAudio
p = pyaudio.PyAudio()

# Open a stream
stream = p.open(format=sample_format,
                channels=channels,
                rate=sample_rate,
                frames_per_buffer=chunk,
                input=True)

print("Recording...")
frames = []

# Record in chunks
for _ in range(0, int(sample_rate / chunk * duration)):
    data = stream.read(chunk)
    frames.append(data)

# Stop and close the stream
stream.stop_stream()
stream.close()
p.terminate()

# Save as a WAV file
with wave.open(filename, 'wb') as wf:
    wf.setnchannels(channels)
    wf.setsampwidth(p.get_sample_size(sample_format))
    wf.setframerate(sample_rate)
    wf.writeframes(b''.join(frames))

ALSA lib pcm_dsnoop.c:601:(snd_pcm_dsnoop_open) unable to open slave
ALSA lib pcm_dmix.c:1032:(snd_pcm_dmix_open) unable to open slave
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
Cannot connect to server socket err = No such file or directory
Cannot connect to server request channel
jack server is not running or cannot be started
JackShmReadWritePtr::~JackShmReadWritePtr - Init not done for -1, skipping unlock
JackShmReadWritePtr::~JackShmReadWritePtr - Init not done for -1, skipping unlock
Cannot connect to server socket err = No such file or directory
Cannot connect to server request channel
jack server is not running or cannot be started
JackShmReadWritePtr::~JackShmReadWritePtr - Init not done for -1, skipping unlock
JackShmReadWritePtr::~JackShmReadWritePtr - Init not done for -1, skipping unlock
ALSA lib pcm

Recording...


In [15]:
payload = {
    "audio": audio_data.tolist(),
    "sampling_rate": SAMPLE_RATE
}

In [16]:
response = requests.post(API_URL, json=payload)

In [1]:
import torch
from AudioClassifier import AudioClassifier

In [2]:
# Model configuration
model = AudioClassifier()
model.load_state_dict(torch.load("../src/server/model.pth", weights_only=True, map_location=torch.device('cpu')))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

AudioClassifier(
  (conv1): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=1344, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=4, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [3]:
# Define constants
N_MFCC = 15
TARGET_SR = 44100
DURATION = 2

In [4]:
import torchaudio
from torchaudio.transforms import Resample, MFCC

def preprocess_wav(file_path, target_sr=TARGET_SR, n_mfcc=N_MFCC, duration=DURATION):
    # Load audio
    waveform, sr = torchaudio.load(file_path)

    # Resample if necessary
    if sr != target_sr:
        resample = Resample(orig_freq=sr, new_freq=target_sr)
        waveform = resample(waveform)

    # Duration in samples
    num_samples = target_sr * duration

    # Trim or pad waveform
    if waveform.shape[1] > num_samples:
        waveform = waveform[:, :num_samples]  # Truncate
    else:
        padding = num_samples - waveform.shape[1]
        waveform = torch.nn.functional.pad(waveform, (0, padding))  # Pad with zeros

    # Extract MFCCs
    mfcc = MFCC(
        sample_rate=target_sr,
        n_mfcc=n_mfcc,
        melkwargs={"n_fft": 1024, "hop_length": 512, "n_mels": 64}
    )(waveform)

    # Ensure single channel dimension
    mfcc = mfcc.unsqueeze(1)  # Adding channel dimension (1)

    return mfcc

In [5]:
mfcc_features = preprocess_wav('recording.wav', TARGET_SR, N_MFCC, DURATION)

In [None]:
# Inference
with torch.no_grad():
    outputs = model(mfcc_features)
    _, predicted_class = torch.max(outputs, 1)

In [None]:
import librosa
librosa.load(io.BytesIO(sound))

1

In [9]:
predicted_class

tensor([1, 1])

In [7]:
outputs

tensor([[-0.6716,  1.3333,  0.8122, -0.6638],
        [-0.5137,  0.8274,  0.6384, -0.5072]])

In [8]:
predicted_class

tensor([1, 1])

In [74]:
import librosa
audio_data, sample_rate = librosa.load('recording.wav')

In [75]:
# Convert to PyTorch tensor
audio_tensor = torch.tensor(audio_data, dtype=torch.float32)

In [76]:
# Resample if needed
target_sr = 44100
if sample_rate != target_sr:
    resample_transform = Resample(orig_freq=sample_rate, new_freq=target_sr)
    audio_tensor = resample_transform(audio_tensor)

In [77]:
# Find the loudest part of the audio
max_idx = torch.argmax(audio_tensor)
start_idx = max(0, max_idx - target_sr)
end_idx = min(audio_tensor.size(0), max_idx + target_sr)

audio_tensor = audio_tensor[start_idx:end_idx]

In [78]:
num_samples = target_sr * 2

if audio_tensor.shape[0] > num_samples:
    audio_tensor = audio_tensor[:num_samples]
else:
    padding = num_samples - audio_tensor.shape[0]
    audio_tensor = torch.nn.functional.pad(audio_tensor, (0, padding))

In [90]:
mfcc_transform = MFCC(
    sample_rate=target_sr,
    n_mfcc=15,
    melkwargs={
        "n_fft": 1024,
        "hop_length": 512,
        "n_mels": 64
    }
)
# Convert to MFCC
mfcc = mfcc_transform(audio_tensor).unsqueeze(0)