# Audio Recording and Vocoder Demo

This notebook lets you record audio, convert it to a mel spectrogram, and synthesize audio using a vocoder.

In [None]:
# Install required packages into the kernel's environment (uncomment if needed)
# %pip install sounddevice torch torchaudio numpy matplotlib ipython

import sounddevice as sd
import numpy as np
import torchaudio
import torch
from IPython.display import Audio, display
import matplotlib.pyplot as plt

In [None]:
# Record a short mono clip from the microphone.
# SAMPLE_RATE, DURATION and `audio` are reused by the cells below.
SAMPLE_RATE = 22050  # Match vocoder's expected sample rate
DURATION = 5  # seconds
num_frames = int(DURATION * SAMPLE_RATE)

print(f"Recording for {DURATION} seconds...")
audio = sd.rec(
    num_frames,
    samplerate=SAMPLE_RATE,
    channels=1,
    dtype='float32',
)
sd.wait()  # wait for the recording to finish before reporting completion
print("Recording complete.")

# Last expression of the cell: an inline player for the raw recording.
# The array is transposed before being handed to the Audio widget.
Audio(audio.T, rate=SAMPLE_RATE)

In [None]:
# Turn the recording into a mel spectrogram and plot it on a dB scale.

# Wrap the transposed recording in a tensor; add a leading axis if it is 1-D.
audio_tensor = torch.from_numpy(audio.T)
audio_tensor = audio_tensor.unsqueeze(0) if audio_tensor.dim() == 1 else audio_tensor

# Mel analysis settings: 1024-point FFT, hop of 256 samples, 80 mel bins.
mel_settings = dict(
    sample_rate=SAMPLE_RATE,
    n_fft=1024,
    hop_length=256,
    n_mels=80,
)
mel_transform = torchaudio.transforms.MelSpectrogram(**mel_settings)
mel_spec = mel_transform(audio_tensor)

# Decibel version, used only for the plot below.
mel_spec_db = torchaudio.functional.amplitude_to_DB(
    mel_spec,
    multiplier=10.0,
    amin=1e-10,
    db_multiplier=0,
)

# Explicit Figure/Axes interface; origin='lower' puts mel bin 0 at the bottom.
fig, ax = plt.subplots(figsize=(10, 4))
mel_image = ax.imshow(mel_spec_db.squeeze().numpy(), aspect='auto', origin='lower')
ax.set_title('Mel Spectrogram (dB)')
ax.set_xlabel('Frame')
ax.set_ylabel('Mel Bin')
fig.colorbar(mel_image, ax=ax, format='%+2.0f dB')
plt.show()

In [None]:
# Load pretrained HiFi-GAN vocoder from torchaudio.
#
# torchaudio.pipelines does not expose a HIFIGAN_VOCODER_VCTK bundle, so the
# previous lookup raised AttributeError. The pretrained HiFi-GAN bundle that
# torchaudio publishes is HIFIGAN_VOCODER_V3_LJSPEECH, under the prototype
# namespace. It is imported here rather than in the imports cell because the
# prototype package is not present in every torchaudio release; a missing
# package then fails in this cell, next to the code that needs it.
from torchaudio.prototype.pipelines import HIFIGAN_VOCODER_V3_LJSPEECH

bundle = HIFIGAN_VOCODER_V3_LJSPEECH
# Downloads the pretrained weights on first use; eval() switches the module
# to inference mode.
vocoder = bundle.get_vocoder().eval()

In [None]:
# Generate audio from the recording with the vocoder and play it.
#
# The vocoder must be fed features computed the same way as during its
# training. The `mel_spec` from the plotting cell is a raw power mel
# spectrogram and does not match that, so the vocoder input is built here with
# the bundle's own mel transform, applied to the recorded waveform.
# Assumes SAMPLE_RATE equals the vocoder's training sample rate (see the
# comment where SAMPLE_RATE is defined).
with torch.no_grad():
    vocoder_mel_transform = bundle.get_mel_transform()
    mel_for_vocoder = vocoder_mel_transform(audio_tensor)
    # HiFi-GAN expects (batch, n_mels, frames)
    if mel_for_vocoder.dim() == 2:
        mel_for_vocoder = mel_for_vocoder.unsqueeze(0)
    audio_out = vocoder(mel_for_vocoder)

# Play the generated audio (squeezed down to a plain 1-D waveform)
Audio(audio_out.squeeze().cpu().numpy(), rate=SAMPLE_RATE)