<a href="https://colab.research.google.com/github/HaddoucheMilissa/Django_bootcamp/blob/main/Audio_pipline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Audio Pipeline

# Step 1: Preprocess Audio (Denoising & Conversion)

In [10]:
# 🚀 System Dependencies
!apt update && apt install -y ffmpeg  # Install FFmpeg for audio processing

# 🛠️ Install Compatible Python Dependencies
# numpy and pandas are pinned to exact versions; every other package below is unpinned.
!pip install --upgrade --no-cache-dir numpy==1.26.4 pandas==2.2.2
# torch/torchaudio are pulled from the PyTorch "cu121" wheel index instead of PyPI.
!pip install --no-cache-dir torch torchaudio --index-url https://download.pytorch.org/whl/cu121
# Remove any installed distribution named `whisper` before installing faster-whisper.
!pip uninstall -y whisper
!pip install --no-cache-dir faster-whisper  # Install faster-whisper
# Remaining download / audio-processing packages.
!pip install --no-cache-dir yt-dlp pydub librosa soundfile noisereduce pytube deepfilternet

[33m0% [Working][0m            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
41 packages can be upgraded. Run 'apt list --upgradable' to see them.
[1;33mW: [0mSkipping acquire of configured file 'main/source/Sources' as repository 

# **DOWNLOAD THE VIDEO AND EXTRACT AUDIO**

In [11]:
import os
import torch
import yt_dlp
import numpy as np
from df import enhance, init_df
from pytube import YouTube
import torchaudio
from pydub import AudioSegment, silence
import soundfile as sf  # Use sf.write() instead of librosa.output.write_wav (deprecated)
from faster_whisper import WhisperModel  # Use faster-whisper instead of openai-whisper

# ✅ Load Faster-Whisper model
# Instantiates the "medium" checkpoint on the CUDA device with float32 compute.
# The resulting object is bound to the module-level name `model`.
model = WhisperModel("medium", device="cuda", compute_type="float32")  # Use fp32 precision


In [12]:
# Step 0: Download Video and Extract Audio
def download_video(video_url, output_audio):
    """Download a video with yt-dlp and extract its audio track as WAV.

    Args:
        video_url (str): URL handed to ``yt_dlp.YoutubeDL.download``.
        output_audio (str): Desired path of the extracted audio, e.g.
            ``"input.wav"``. A trailing ``.wav`` extension is stripped to
            build the yt-dlp output template; the ``FFmpegExtractAudio``
            post-processor is configured with ``preferredcodec='wav'``.

    Returns:
        None
    """
    # Strip only the trailing extension. A plain str.replace('.wav', '') would
    # also eat ".wav" occurring earlier in the path (e.g. "clips.wav_raw/a.wav").
    base, ext = os.path.splitext(output_audio)
    out_template = base if ext == ".wav" else output_audio

    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
        'outtmpl': out_template,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])

    print(f"Downloaded and extracted audio: {output_audio}")

In [13]:
def split_audio(file_path, chunk_length=30000):
    """Cut a WAV file into consecutive fixed-length pieces.

    Existing ``chunk_*.wav`` files next to ``file_path`` are deleted first,
    then the audio is loaded and written out as ``chunk_0.wav``,
    ``chunk_1.wav``, ... in the same directory.

    Args:
        file_path (str): Path of the WAV file to split.
        chunk_length (int): Length of each slice, in the units used by
            ``AudioSegment`` slicing (default 30000).

    Returns:
        list[str]: Paths of the exported chunk files, in order.
    """
    target_dir = os.path.dirname(file_path)
    if not target_dir:
        # A bare filename has no directory part: work in the current directory.
        target_dir = os.getcwd()

    # Clear chunks left behind by an earlier run.
    for name in os.listdir(target_dir):
        is_old_chunk = name.startswith("chunk_") and name.endswith(".wav")
        if is_old_chunk:
            os.remove(os.path.join(target_dir, name))

    recording = AudioSegment.from_file(file_path, format="wav")
    total_length = len(recording)

    chunk_files = []
    for idx, offset in enumerate(range(0, total_length, chunk_length)):
        destination = os.path.join(target_dir, f"chunk_{idx}.wav")
        piece = recording[offset:offset + chunk_length]
        piece.export(destination, format="wav")
        chunk_files.append(destination)

    print(f"✅ Split into {len(chunk_files)} chunks.")
    return chunk_files


In [14]:
# Step 2: Denoise Each Chunk

def load_audio(file_path, target_sr=48000):
    """Read an audio file and return its samples at ``target_sr``.

    Args:
        file_path (str): Path passed to ``torchaudio.load``.
        target_sr (int): Desired sample rate; resampling is skipped when the
            file already has this rate.

    Returns:
        numpy.ndarray: The waveform converted to NumPy with size-1
        dimensions squeezed out.
    """
    samples, source_sr = torchaudio.load(file_path)
    needs_resample = source_sr != target_sr
    if needs_resample:
        samples = torchaudio.transforms.Resample(source_sr, target_sr)(samples)
    as_numpy = samples.numpy()
    return as_numpy.squeeze()


import torch
import soundfile as sf
from df import enhance, init_df

def denoise_audio_parallel(chunk_files):
    """Denoise each audio chunk with DeepFilterNet and save the results.

    Chunks are enhanced one at a time rather than stacked into one tensor:
    the final chunk of a split is usually shorter than the others, so the
    chunks cannot be stacked, and ``enhance`` is fed one ``[channels, samples]``
    tensor per call. The name is kept for backward compatibility.

    Args:
        chunk_files (list[str]): Paths of the chunk WAV files to denoise.

    Returns:
        list[str]: Names of the written files (``enhanced_<chunk basename>``,
        created in the current working directory), in input order. Empty when
        ``chunk_files`` is empty or DeepFilterNet fails to initialise.
    """
    if not chunk_files:
        # Nothing to do: skip the (expensive) model initialisation entirely.
        return []

    df_model, df_state, _ = init_df(model_base_dir="DeepFilterNet3")
    if df_model is None or df_state is None:
        print("❌ Failed to initialize DeepFilterNet.")
        return []

    # Use the rate the model state reports for both loading and saving,
    # instead of repeating a hard-coded constant in two places.
    sample_rate = df_state.sr()

    enhanced_files = []
    for chunk_file in chunk_files:
        audio = torch.from_numpy(load_audio(chunk_file, target_sr=sample_rate))
        if audio.ndim == 1:
            # load_audio squeezes mono audio to 1-D; restore the channel axis.
            audio = audio.unsqueeze(0)

        enhanced = enhance(df_model, df_state, audio)
        enhanced_np = enhanced.detach().cpu().numpy()

        # soundfile expects (frames, channels); the tensor is (channels, frames).
        if enhanced_np.ndim == 2:
            enhanced_np = enhanced_np.T

        output_file = f"enhanced_{os.path.basename(chunk_file)}"
        sf.write(output_file, enhanced_np, sample_rate, format="WAV", subtype="PCM_16")
        enhanced_files.append(output_file)

    return enhanced_files


In [15]:
def concatenate_audios(output_path="output.wav"):
    """
    Concatenates all audio files that start with 'enhanced_' into a single WAV file.

    Files are read from the current working directory and joined in natural
    (numeric-aware) order, so ``enhanced_chunk_2.wav`` comes before
    ``enhanced_chunk_10.wav``. A plain lexicographic sort would interleave
    them and scramble the audio once there are ten or more chunks.

    Args:
        output_path (str): Path to save the final concatenated audio.

    Returns:
        str | None: ``output_path`` once the file has been written, or
        ``None`` when no enhanced files were found.
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    def natural_key(name):
        # "enhanced_chunk_10.wav" -> ["enhanced_chunk_", 10, ".wav"]; the
        # capturing split alternates text/digits so element types always line up.
        return [int(part) if part.isdigit() else part for part in re.split(r"(\d+)", name)]

    # Get all files starting with "enhanced_" and ending with ".wav"
    audio_files = sorted(
        (file for file in os.listdir() if file.startswith("enhanced_") and file.endswith(".wav")),
        key=natural_key,
    )

    if not audio_files:
        print("❌ No enhanced audio files found to concatenate.")
        return None

    # Initialize an empty audio segment and append each enhanced file in order
    final_audio = AudioSegment.empty()
    for file in audio_files:
        final_audio += AudioSegment.from_wav(file)

    # Export as WAV (16-bit PCM, 16kHz for Whisper compatibility)
    final_audio.export(output_path, format="wav", codec="pcm_s16le", parameters=["-ar", "16000"])
    print(f"✅ Concatenated audio saved as: {output_path}")
    return output_path


In [16]:
from faster_whisper import WhisperModel

# Load the Faster-Whisper model that transcribe_audio() below reads as the global `model`.
# NOTE(review): no chunking option is passed to the constructor here, and this repeats
# the identical WhisperModel(...) call made in the setup cell — consider reusing that instance.
model = WhisperModel("medium", device="cuda", compute_type="float32")

# Transcribe the entire audio file
def transcribe_audio(input_audio):
    """
    Transcribes an entire audio file using the global Faster-Whisper ``model``.

    The transcript is also written to ``full_transcript.txt`` (UTF-8) in the
    current working directory.

    Args:
        input_audio (str): Path to the input audio file.

    Returns:
        str: Full transcription text, with segment texts stripped and joined
        by single spaces.

    Raises:
        ValueError: If ``input_audio`` is empty or ``None`` (for example when
            an upstream step produced no file).
    """
    if not input_audio:
        raise ValueError("transcribe_audio() needs a path to an audio file, got: %r" % (input_audio,))

    print(f"🎙️ Transcribing {input_audio} with chunk_size=30s...")

    # `chunk_size` is not a keyword of WhisperModel.transcribe and made the call
    # raise TypeError; the model's own default windowing is used instead.
    segments, _ = model.transcribe(input_audio)

    # Segment texts typically start with a space; strip before joining so the
    # transcript does not end up with doubled spaces.
    full_transcript = " ".join(segment.text.strip() for segment in segments)

    # Save the full transcript (explicit encoding: transcripts may be non-ASCII)
    with open("full_transcript.txt", "w", encoding="utf-8") as f:
        f.write(full_transcript)

    print("\n📄 Transcription complete! Full transcript saved as full_transcript.txt")
    return full_transcript


In [8]:
# Remove temporary chunk files
def cleanup_temp_files():
    """Delete intermediate ``chunk_*.wav`` / ``enhanced_*.wav`` files from the cwd.

    Only regular files with a ``.wav`` extension are removed, matching the
    filter ``split_audio`` uses for stale chunks. Unrelated entries that merely
    share the prefix (other file types, directories) are left alone, so a
    matching directory no longer makes ``os.remove`` raise.
    """
    for name in os.listdir():
        is_temp_wav = name.startswith(("chunk_", "enhanced_")) and name.endswith(".wav")
        if is_temp_wav and os.path.isfile(name):
            os.remove(name)
    print("Temporary files removed.")

In [9]:
# Clear chunk_/enhanced_ files left in the working directory by a previous run.
cleanup_temp_files()

Temporary files removed.


In [None]:
# ✅ Run Full Pipeline
video_url = "https://youtu.be/ua8zZy5tTxE?si=-vKlgBB5i47fs_wB"
input_audio = "input.wav"
# Name the merged file explicitly instead of relying on concatenate_audios()'s
# return value, so the transcription step always receives a real path.
final_audio = "output.wav"

download_video(video_url, input_audio)
chunks = split_audio(input_audio)  # Split for denoising
enhanced_chunks = denoise_audio_parallel(chunks)  # Denoise chunk by chunk
if not enhanced_chunks:
    # Fail here with a clear message rather than transcribing a missing/stale file.
    raise RuntimeError("Denoising produced no files; nothing to concatenate or transcribe.")
concatenate_audios(final_audio)  # Merge clean chunks into final_audio
transcript = transcribe_audio(final_audio)  # Transcribe full audio

[youtube] Extracting URL: https://youtu.be/ua8zZy5tTxE?si=-vKlgBB5i47fs_wB
[youtube] ua8zZy5tTxE: Downloading webpage
[youtube] ua8zZy5tTxE: Downloading tv client config
[youtube] ua8zZy5tTxE: Downloading player 20830619
[youtube] ua8zZy5tTxE: Downloading tv player API JSON
[youtube] ua8zZy5tTxE: Downloading ios player API JSON
[youtube] ua8zZy5tTxE: Downloading m3u8 information
[info] ua8zZy5tTxE: Downloading 1 format(s): 251
[download] input has already been downloaded
[download] 100% of  530.35MiB
[ExtractAudio] Destination: input.wav
Deleting original file input (pass -k to keep)
Downloaded and extracted audio: input.wav


In [None]:
pip install whisper-timestamped


In [None]:
import whisper_timestamped as whisper

# Load model
# NOTE(review): this rebinds the global `model` that transcribe_audio() reads;
# re-run the Faster-Whisper cell before calling transcribe_audio() again.
model = whisper.load_model("medium")

# Transcribe with word-level timestamps.
# whisper_timestamped exposes transcription as a module-level function that
# takes the model; its word entries carry the 'text' / 'start' / 'end' keys
# used below (model.transcribe(..., word_timestamps=True) yields 'word' instead).
result = whisper.transcribe(model, "output.wav")

# Group words into sentences
SENTENCE_ENDINGS = (".", "?", "!")
CLOSING_MARKS = "\"')]”’»"  # may trail the terminal punctuation, e.g. `done."`

sentences = []
current_sentence = []
sentence_start = None
sentence_end = None

for segment in result['segments']:
    for word in segment.get('words', []):
        token = word['text'].strip()
        if not token:
            continue
        if sentence_start is None:
            sentence_start = word['start']  # Mark sentence start
        current_sentence.append(token)
        sentence_end = word['end']

        # Punctuation is attached to the word ("world."), so test the suffix
        # rather than comparing the whole token against "." / "?" / "!".
        # Limitation: abbreviations such as "Mr." also end a sentence here.
        if token.rstrip(CLOSING_MARKS).endswith(SENTENCE_ENDINGS):
            sentences.append((" ".join(current_sentence), sentence_start, sentence_end))
            current_sentence = []
            sentence_start = None

# Keep trailing words that never reached terminal punctuation
if current_sentence:
    sentences.append((" ".join(current_sentence), sentence_start, sentence_end))
    current_sentence = []
    sentence_start = None

# Save sentence-aligned transcript
with open("sentence_aligned_transcript.txt", "w", encoding="utf-8") as f:
    for sentence, start, end in sentences:
        f.write(f"[{start:.2f}s - {end:.2f}s] {sentence}\n")

print("✅ Sentence-level alignment complete! Results saved in sentence_aligned_transcript.txt")
