In [None]:
import time
from pathlib import Path

import numpy as np
import sounddevice as sd
import torch
import torchaudio
import torchaudio.transforms as T
from scipy.io.wavfile import write

from Utils.models import ConvTasNet  # adjust if your path differs


In [None]:
# --- Capture / STFT parameters ---------------------------------------------
SAMPLE_RATE = 48_000   # Hz, used for both recording and the saved WAV
N_FFT = 1_024          # FFT size passed to the (inverse) spectrogram transforms
HOP_LENGTH = 256       # hop between successive STFT frames, in samples
DURATION = 3           # length of the live capture, in seconds

# --- File locations ----------------------------------------------------------
# NOTE(review): the leading ".Project" (no slash) names a dot-prefixed
# directory; confirm this is intended and not a typo for "./Project".
MODEL_PATH = ".Project/Models/ConvTasNet_dynamic.pth"   # trained checkpoint
OUTPUT_PATH = "Output/wav/real_time_denoised.wav"       # denoised result


In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device: {device}")

# Build the network, restore the trained weights onto `device`, and switch to
# evaluation mode.
# NOTE(review): torch.load unpickles the checkpoint, which can execute
# arbitrary code -- only load files from a trusted source (recent PyTorch
# versions offer `weights_only=True` for this).
model = ConvTasNet()
state_dict = torch.load(MODEL_PATH, map_location=device)
model.load_state_dict(state_dict)
model.to(device)
model.eval()

# Forward and inverse STFT share the same framing; power=None keeps the
# forward transform's output complex-valued.
stft_kwargs = dict(n_fft=N_FFT, hop_length=HOP_LENGTH)
spectrogram = T.Spectrogram(power=None, **stft_kwargs).to(device)
inverse_spectrogram = T.InverseSpectrogram(**stft_kwargs).to(device)


In [None]:
# Capture DURATION seconds of mono float32 audio from the default input device.
# (The status strings previously contained mis-decoded UTF-8 emoji; they are
# restored to the intended characters here.)
print(f"🎤 Recording {DURATION} seconds of audio...")
recording = sd.rec(int(DURATION * SAMPLE_RATE), samplerate=SAMPLE_RATE, channels=1, dtype='float32')
sd.wait()  # sd.rec returns immediately; block here until the capture finishes
print("✅ Recording complete.")

# Transpose so the single channel becomes the leading axis, then move the
# samples onto the inference device.
noisy_waveform = torch.tensor(recording.T, dtype=torch.float32).to(device)  # shape: (1, samples)


In [None]:
# Complex STFT of the captured waveform (the transform was built with power=None).
noisy_spec = spectrogram(noisy_waveform)

# Standardise with the global mean / standard deviation of the spectrogram.
# NOTE(review): this scaling is never undone after inference -- presumably the
# model was trained on inputs normalised the same way; confirm against the
# training pipeline.
noisy_spec = (noisy_spec - noisy_spec.mean()) / (noisy_spec.std() + 1e-6)

# The model takes the real and imaginary parts as separate tensors, each with
# a singleton axis inserted at position 1.
noisy_real = noisy_spec.real.unsqueeze(1)
noisy_imag = noisy_spec.imag.unsqueeze(1)

# Time the forward pass. CUDA kernels are launched asynchronously, so the
# device must be synchronised before reading the clock on either side of the
# call; otherwise only the launch overhead is measured. perf_counter() is a
# monotonic, high-resolution clock intended for interval timing.
on_cuda = device.type == "cuda"
if on_cuda:
    torch.cuda.synchronize(device)
start_time = time.perf_counter()
with torch.no_grad():
    denoised_real, denoised_imag = model(noisy_real, noisy_imag)
if on_cuda:
    torch.cuda.synchronize(device)
end_time = time.perf_counter()

inference_time = end_time - start_time
print(f"⏱️ Inference time: {inference_time:.4f} seconds")


In [None]:
# Drop the singleton axis added before inference and recombine the two model
# outputs into one complex spectrogram.
denoised_spec = torch.complex(denoised_real.squeeze(1), denoised_imag.squeeze(1))

# Back to the time domain. Passing `length` makes the ISTFT return exactly as
# many samples as were captured; without it the result is cut to a whole
# number of hops and the tail of the recording is lost.
denoised_waveform = inverse_spectrogram(denoised_spec, length=noisy_waveform.shape[-1])

# Keep samples inside the valid float-WAV range [-1, 1].
denoised_waveform = torch.clamp(denoised_waveform, min=-1.0, max=1.0)

# Make sure the destination folder exists, then write the audio with samples
# along the first axis (hence the transpose).
Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)
write(OUTPUT_PATH, SAMPLE_RATE, denoised_waveform.detach().cpu().numpy().T.astype(np.float32))
print(f"✅ Denoised audio saved at: {OUTPUT_PATH}")


In [None]:
# Real-time factor (RTF) = processing time / audio duration; a value below 1
# means the audio was processed faster than it was recorded.
# NOTE: `inference_time` covers only the model forward pass -- the STFT,
# inverse STFT and file I/O are outside the timed region, so the end-to-end
# factor is somewhat higher than the figure printed here.
print(f"📈 Real-time factor (RTF): {inference_time:.4f} / {DURATION:.2f} = {inference_time / DURATION:.4f}")
if inference_time < DURATION:
    print("✅ This implementation qualifies for real-time deployment.")
else:
    print("⚠️ May not be real-time. Consider reducing model complexity or batching.")
