# Video AI Audio Cleaner

Enter the following parameters:

In [None]:
VIDEO_PATH = "C:\\Users\\javiv\\Videos\\Películas\\In The Heights - AAG.mp4"
EXTRACTED_AUDIO_PATH = "extracted_audio.wav"
# TEMP_COMPATIBLE_AUDIO_PATH = "audio_compatible.wav"
CLEANED_AUDIO_PATH = "cleaned_audio.wav" # Intermediate file after cleaning
FINAL_AUDIO_PATH = "audio_final.mp3"
FINAL_VIDEO_PATH = "final_video.mp4"

## Previous installations

Execute this command:

pip install moviepy torch torchaudio soundfile demucs pydub

In [25]:
import os
from moviepy.editor import VideoFileClip, AudioFileClip
import torchaudio
import soundfile as sf
import torch
from demucs.pretrained import get_model
from demucs.apply import apply_model
from pydub import AudioSegment

## Function definitions

In [17]:
def extract_audio(video_path, output_audio_path):
    print("[1] Extrayendo audio del vídeo...")
    video = VideoFileClip(video_path)
    audio = video.audio
    audio.write_audiofile(output_audio_path, codec='pcm_s16le')  # WAV sin compresión
    print("Audio extraído.")

In [None]:
'''
def convert_to_torchaudio_compatible(input_path, output_path):
    audio = AudioSegment.from_file(input_path)
    audio.export(output_path, format="wav", codec="pcm_s16le")
'''

In [26]:
def clean_audio_with_demucs(input_audio_path, output_audio_path):
    print("[2] Limpiando audio con Demucs...")
    model = get_model(name="htdemucs")  # usa el modelo "htdemucs" preentrenado

    # Cargar audio con soundfile
    wav, sr = sf.read(input_audio_path)
    wav = torch.tensor(wav.T).float().unsqueeze(0)  # Convertir a [channels, samples] y float

    with torch.no_grad():
        sources = apply_model(model, wav, sr, split=True, overlap=0.25, progress=True)

    # Extraer la pista de voz ("vocals")
    vocals = sources[model.sources.index("vocals")]

    # Asegurar tipo y normalización (por si acaso)
    vocals = vocals.clamp(-1.0, 1.0)  # Evitar clipping
    torchaudio.save(output_audio_path, vocals, sample_rate=sr)
    print("Audio limpio guardado.")

In [10]:
def export_audio(input_wav_path, output_path_mp3=None):
    print("[3] Exportando audio final...")
    audio = AudioSegment.from_wav(input_wav_path)
    if output_path_mp3:
        audio.export(output_path_mp3, format="mp3")
        print(f"Audio exportado como MP3: {output_path_mp3}")
    else:
        print("Audio WAV ya listo.")

In [None]:
def replace_audio_in_video(original_video_path, cleaned_audio_path, output_video_path):
    print("[4] Generando nuevo vídeo con audio limpio...")
    video = VideoFileClip(original_video_path)
    audio = AudioSegment.from_wav(cleaned_audio_path)
    
    # Guardamos el audio limpio temporalmente como WAV compatible
    temp_wav_path = "temp_clean.wav"
    audio.export(temp_wav_path, format="wav")
    
    # Cargar el audio limpio y reemplazar en el vídeo
    cleaned_audio = AudioFileClip(temp_wav_path)
    final_video = video.set_audio(cleaned_audio)
    final_video.write_videofile(output_video_path, codec="libx264", audio_codec="aac")

    # Limpiar temporal
    os.remove(temp_wav_path)
    print(f"Nuevo vídeo exportado: {output_video_path}")

## Execution

In [11]:
extract_audio(VIDEO_PATH, EXTRACTED_AUDIO_PATH)

[1] Extrayendo audio del vídeo...
MoviePy - Writing audio in extracted_audio.wav


                                                                          

MoviePy - Done.
Audio extraído.




In [None]:
# convert_to_torchaudio_compatible(EXTRACTED_AUDIO_PATH, TEMP_COMPATIBLE_AUDIO_PATH)

In [27]:
clean_audio_with_demucs(EXTRACTED_AUDIO_PATH, CLEANED_AUDIO_PATH)

[2] Limpiando audio con Demucs...


100%|██████████████████████████████████████████████| 7815.599999999999/7815.599999999999 [2:00:26<00:00,  1.08seconds/s]
100%|██████████████████████████████████████████████| 7815.599999999999/7815.599999999999 [1:50:10<00:00,  1.18seconds/s]
100%|██████████████████████████████████████████████| 7815.599999999999/7815.599999999999 [1:49:25<00:00,  1.19seconds/s]
 40%|██████████████████████▍                                 | 3129.75/7815.599999999999 [44:15<1:06:15,  1.18seconds/s]


KeyboardInterrupt: 

In [None]:
export_audio(CLEANED_AUDIO_PATH, FINAL_AUDIO_PATH)

In [None]:
replace_audio_in_video(VIDEO_PATH, CLEANED_AUDIO_PATH, FINAL_VIDEO_PATH)