# Step 0: Import Dataset and check the files in each folder

In [2]:
import os

# Directory whose entries are reported below as the available datasets.
DATASETS_PATH = "/kaggle/input"

# Header first, then the directory listing itself.
print("📂 Available datasets in /kaggle/input:")
attached_datasets = os.listdir(DATASETS_PATH)
print(attached_datasets)


📂 Available datasets in /kaggle/input:
['testfiles-demucs', 'musan-data', 'demucs_epoch_8', 'compressedtest5', 'reconstructed-demucs', 'compressed-data', 'testonyoutubetedtalk']


In [3]:
DATA_PATH = "/kaggle/input/musan-data/musan"  # Adjust if needed

# Report a missing dataset first; otherwise list what the dataset folder contains.
if not os.path.exists(DATA_PATH):
    print("❌ Dataset not found! Double-check the path.")
else:
    print("✅ Dataset found! Listing contents:")
    print(os.listdir(DATA_PATH))


✅ Dataset found! Listing contents:
['noise', 'README', 'music', 'speech']


In [4]:
# Preview the top-level entries of each category folder.
for category in ("music", "noise", "speech"):
    folder_path = os.path.join(DATA_PATH, category)
    if not os.path.exists(folder_path):
        print(f"⚠️ Folder '{category}' not found!")
        continue

    entries = os.listdir(folder_path)
    # The count covers every directory entry (sub-folders and README included).
    print(f"\n📂 {category} - {len(entries)} files")
    print(entries[:5])  # Show first 5 entries



📂 music - 6 files
['README', 'jamendo', 'hd-classical', 'fma', 'fma-western-art']

📂 noise - 3 files
['README', 'sound-bible', 'free-sound']

📂 speech - 3 files
['README', 'librivox', 'us-gov']


In [5]:
# For every sub-folder of each category, print its entry count and up to 5 audio file names.
for category in ("music", "noise", "speech"):
    folder_path = os.path.join(DATA_PATH, category)

    for subfolder in os.listdir(folder_path):
        subfolder_path = os.path.join(folder_path, subfolder)
        if not os.path.isdir(subfolder_path):
            continue  # Plain files (e.g. README) are not sub-folders

        entries = os.listdir(subfolder_path)
        audio_preview = [name for name in entries if name.endswith(('.wav', '.mp3'))][:5]
        # The count is over all entries; the preview is restricted to .wav/.mp3 names.
        print(f"\n📂 {category}/{subfolder} - {len(entries)} files")
        print(audio_preview)



📂 music/jamendo - 219 files
['music-jamendo-0044.wav', 'music-jamendo-0208.wav', 'music-jamendo-0093.wav', 'music-jamendo-0131.wav', 'music-jamendo-0132.wav']

📂 music/hd-classical - 77 files
['music-hd-0073.wav', 'music-hd-0063.wav', 'music-hd-0040.wav', 'music-hd-0056.wav', 'music-hd-0015.wav']

📂 music/fma - 130 files
['music-fma-0039.wav', 'music-fma-0087.wav', 'music-fma-0037.wav', 'music-fma-0109.wav', 'music-fma-0068.wav']

📂 music/fma-western-art - 95 files
['music-fma-wa-0032.wav', 'music-fma-wa-0051.wav', 'music-fma-wa-0028.wav', 'music-fma-wa-0009.wav', 'music-fma-wa-0088.wav']

📂 music/rfm - 150 files
['music-rfm-0044.wav', 'music-rfm-0014.wav', 'music-rfm-0022.wav', 'music-rfm-0003.wav', 'music-rfm-0125.wav']

📂 noise/sound-bible - 88 files
['noise-sound-bible-0017.wav', 'noise-sound-bible-0023.wav', 'noise-sound-bible-0019.wav', 'noise-sound-bible-0021.wav', 'noise-sound-bible-0057.wav']

📂 noise/free-sound - 845 files
['noise-free-sound-0626.wav', 'noise-free-sound-0162

# Step 1: Preprocessing: Checking the sample rate and channel count of every file

> Librosa: a Python package designed for music and audio analysis. It provides tools for extracting meaningful features from audio signals, such as spectrograms and MFCCs (Mel-frequency cepstral coefficients), which are crucial for tasks like music information retrieval and sound classification.

## Exception Handling (i)

> The code below also handles exceptions so that a single corrupt file cannot crash the program.
> Some files may be corrupt, be in a format that librosa can't read, or carry unexpected metadata.
> Even one bad file would otherwise abort the entire loop; with this block, the problematic file is skipped and a message showing the file path and the error is printed.

In [1]:
# tells how many files at each sample rate and tells how many mono and stereo files

import os
import librosa

DATA_PATH = "/kaggle/input/musan-data/musan/"  # Update this with your actual dataset path

# Function to check sample rate & channels
def check_audio_properties(root_folder):
    """Count the sample rates and channel counts of every .wav file under a folder.

    Args:
        root_folder: Directory that is walked recursively.

    Returns:
        A tuple ``(sample_rates, channel_counts)``. Each is a dict mapping a
        sample rate (or a number of channels) to the number of files having it.
        Files that fail to load are reported on stdout and left out of both dicts.
    """
    sample_rates = {}
    channel_counts = {}

    for root, _, files in os.walk(root_folder):
        for file in files:
            if not file.endswith(".wav"):  # Only process WAV files
                continue
            file_path = os.path.join(root, file)

            try:
                # sr=None and mono=False ask librosa not to resample or down-mix,
                # so the values reflect the file as stored.
                audio, sr = librosa.load(file_path, sr=None, mono=False)

                # A 1-D array is mono; otherwise the first axis has one row per
                # channel, so its length is the real channel count (not always 2).
                channels = 1 if audio.ndim == 1 else audio.shape[0]
            except Exception as e:
                # One unreadable file must not abort the whole scan.
                print(f"Error processing {file_path}: {e}")
                continue

            sample_rates[sr] = sample_rates.get(sr, 0) + 1
            channel_counts[channels] = channel_counts.get(channels, 0) + 1

    return sample_rates, channel_counts
    

# Scan the whole dataset once and unpack the two tallies
audio_summary = check_audio_properties(DATA_PATH)
sample_rates, channel_counts = audio_summary

# Report how many files share each sample rate / channel count
print("🎵 Sample Rate Distribution:", sample_rates)
print("🎧 Channel Distribution (1=Mono, 2=Stereo):", channel_counts)


🎵 Sample Rate Distribution: {16000: 2016}
🎧 Channel Distribution (1=Mono, 2=Stereo): {1: 2016}


## Exception Handling (ii) 
> ffprobe returns the bit rate info as JSON objects
> The exception handling prevents crashes from: Malformed JSON output, Missing keys like "streams" or "bit_rate", Empty or unexpected structures
> Keeps the script robust and fault-tolerant



In [6]:
# Analyzing the bit rate of the first 5 files found in each of the 3 categories (music, noise, speech)

import os
import subprocess #to run shell commands ( use it to call ffprobe)
import json
from collections import defaultdict

# Define dataset path
DATA_PATH = "/kaggle/input/musan-data/musan"
CATEGORIES = ["music", "noise", "speech"]
file_count = defaultdict(int)
MAX_FILES = 5  # Limit to 5 files per category

# Function to extract bit rate
def get_bit_rate(file_path):
    """Return the bit rate ffprobe reports for the first stream of a media file.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The ``bit_rate`` value of the first stream as given in ffprobe's JSON
        output, or the string "N/A" when ffprobe cannot be run, its output is
        not valid JSON, or the expected fields are missing.
    """
    # An argument list (no shell) passes the path verbatim, so file names that
    # contain quotes, spaces or other shell metacharacters are handled correctly.
    cmd = [
        "ffprobe", "-v", "quiet",
        "-show_entries", "stream=bit_rate",
        "-of", "json",
        "-i", str(file_path),
    ]

    try:
        # capture_output=True: stores stdout and stderr ; text=True: return output as strings
        result = subprocess.run(cmd, capture_output=True, text=True)
        data = json.loads(result.stdout)  # Convert JSON string to dictionary
        streams = data["streams"]
        if not streams:
            return "N/A"
        return streams[0]["bit_rate"]
    # OSError: ffprobe could not be started
    # JSONDecodeError: output is not valid JSON (e.g. empty output on failure)
    # KeyError / IndexError / TypeError: "streams" or "bit_rate" missing or malformed
    except (OSError, json.JSONDecodeError, KeyError, IndexError, TypeError):
        return "N/A"

# Print the bit rate of the first MAX_FILES .wav files encountered for each category
for root, _, files in os.walk(DATA_PATH):
    for file in files:
        if not file.endswith(".wav"):
            continue
        file_path = os.path.join(root, file)

        # Identify category from folder structure
        for category in CATEGORIES:
            if f"/{category}/" in file_path and file_count[category] < MAX_FILES:
                bit_rate = get_bit_rate(file_path)
                print(f"🔍 {category.upper()} | {file}: {bit_rate} bps")

                file_count[category] += 1
                break  # Move to next file

    # Test every category explicitly. Iterating file_count.values() would only
    # see categories already touched, so the check would pass while some
    # categories are still empty and could skip a directory by accident.
    if all(file_count[category] >= MAX_FILES for category in CATEGORIES):
        break  # All quotas are filled: stop walking the rest of the tree

print(f"\n✅ Bit rate extraction complete for {MAX_FILES} files per category!")


🔍 NOISE | noise-sound-bible-0017.wav: 256000 bps
🔍 NOISE | noise-sound-bible-0023.wav: 256000 bps
🔍 NOISE | noise-sound-bible-0019.wav: 256000 bps
🔍 NOISE | noise-sound-bible-0021.wav: 256000 bps
🔍 NOISE | noise-sound-bible-0057.wav: 256000 bps
🔍 MUSIC | music-jamendo-0044.wav: 256000 bps
🔍 MUSIC | music-jamendo-0208.wav: 256000 bps
🔍 MUSIC | music-jamendo-0093.wav: 256000 bps
🔍 MUSIC | music-jamendo-0131.wav: 256000 bps
🔍 MUSIC | music-jamendo-0132.wav: 256000 bps
🔍 SPEECH | speech-librivox-0014.wav: 256000 bps
🔍 SPEECH | speech-librivox-0134.wav: 256000 bps
🔍 SPEECH | speech-librivox-0129.wav: 256000 bps
🔍 SPEECH | speech-librivox-0066.wav: 256000 bps
🔍 SPEECH | speech-librivox-0003.wav: 256000 bps

✅ Bit rate extraction complete for 5 files per category!


## Since all files have the same sample rate of 16 kHz and all are mono, we don't have to perform explicit standardization

## Since all files show a high bit-rate of 256kbps, we compress to different low bit-rates


# Step 2: Compression at different bit rates for all files


In [8]:
!apt-get install -y ffmpeg


The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 129 not upgraded.


> ffmpeg - It is a free and open source software project that offers many tools for video and audio processing. It's designed to run on a command line interface, and has many different libraries and programs to manipulate and handle video files.

In [None]:
import os
import subprocess
import shutil

# Paths
DATA_PATH = "/kaggle/input/musan-data/musan/music"
OUTPUT_PATH = "/kaggle/working/compressed_music_sample"

# Bitrates for music compression (folder label -> value passed to ffmpeg's -b:a)
BITRATES = {"16kbps": "16000", "32kbps": "32000", "64kbps": "64000"}

# Ensure output directory exists
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Get a subfolder inside 'music'. Directory listings come back in arbitrary
# order, so sort them to make the chosen sample reproducible between runs.
music_subfolders = sorted(f.path for f in os.scandir(DATA_PATH) if f.is_dir())
if not music_subfolders:
    print("❌ No subfolders found in music.")
else:
    music_folder = music_subfolders[0]  # Choose the first subfolder
    music_files = sorted(f for f in os.listdir(music_folder) if f.endswith(".wav"))[:5]  # Pick 5 files

    failed_outputs = []  # Outputs ffmpeg could not produce

    # Compress 5 files at every bitrate
    for file in music_files:
        input_file = os.path.join(music_folder, file)
        for bitrate_name, bitrate_value in BITRATES.items():
            output_dir = os.path.join(OUTPUT_PATH, f"compressed_{bitrate_name}")
            os.makedirs(output_dir, exist_ok=True)
            # splitext swaps only the final extension, never a ".wav" inside the name
            output_file = os.path.join(output_dir, os.path.splitext(file)[0] + ".opus")

            # Compress using ffmpeg. The argument list (no shell) keeps paths
            # with quotes or spaces intact; -y overwrites existing outputs.
            cmd = ["ffmpeg", "-y", "-i", input_file, "-c:a", "libopus", "-b:a", bitrate_value, output_file]
            try:
                result = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
                succeeded = result.returncode == 0
            except OSError:  # ffmpeg could not be started
                succeeded = False
            if not succeeded:
                failed_outputs.append(output_file)

    # ffmpeg's own output is discarded, so surface failures explicitly
    if failed_outputs:
        print(f"⚠️ ffmpeg failed for {len(failed_outputs)} output file(s), e.g. {failed_outputs[0]}")
    print("✅ Compression complete! Files saved in:", OUTPUT_PATH)


In [None]:
import shutil

# Destination of the archive
ZIP_PATH = "/kaggle/working/compressed_music_sample.zip"

# make_archive appends the ".zip" suffix itself, so hand it the path without it
archive_base = ZIP_PATH.replace(".zip", "")
shutil.make_archive(archive_base, 'zip', "/kaggle/working/compressed_music_sample")

print(f"✅ Zip file created: {ZIP_PATH}")


# Step 3.1: Compressing the speech folder applying 3 bit rates corresponding to speech data


>### The compression code is shown here for speech data. Compression was carried out in the same way for the music files (5 subfolders - fma, fma-western-art, hd-classical, jamendo, rfm), taking the first 100 files in each subfolder, and for the noise data too (taking the first 100 files from the 2 subfolders - free-sound and sound-bible)

In [None]:
import os
import subprocess
import shutil

# Paths
DATA_PATH = "/kaggle/input/musan-data/musan/speech"
OUTPUT_PATH = "/kaggle/working/compressed_speech"

# Bitrates for speech compression
BITRATES = {
    "3kbps": "3000",
    "6kbps": "6000",
    "12kbps": "12000"
}

# Create output directory if not exists
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Function to compress a single file
def compress_audio(input_file, output_file, bitrate):
    """Encode one audio file to Opus with ffmpeg at the requested bitrate.

    Args:
        input_file: Path of the audio file to read.
        output_file: Path of the Opus file to write; an existing file is overwritten.
        bitrate: Target audio bitrate passed to ffmpeg's ``-b:a`` (e.g. "6000").

    Returns:
        None. ffmpeg's own output is discarded; a failed encode is reported
        with a one-line warning on stdout instead of passing silently.
    """
    # An argument list (no shell) passes paths verbatim, so names containing
    # quotes or spaces cannot break the command.
    cmd = [
        "ffmpeg", "-y",
        "-i", str(input_file),
        "-c:a", "libopus",
        "-b:a", str(bitrate),
        str(output_file),
    ]
    try:
        result = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except OSError as e:  # ffmpeg could not be started
        print(f"⚠️ Could not run ffmpeg for {input_file}: {e}")
        return
    if result.returncode != 0:
        print(f"⚠️ ffmpeg failed (exit code {result.returncode}) for {input_file}")

# Resume support: the checkpoint file stores the path of the last file that was
# written at a 50-file boundary during a previous run.
resume_file = "/kaggle/working/speech_last_processed.txt"
last_processed = None
if os.path.exists(resume_file):
    with open(resume_file, "r") as f:
        last_processed = f.read().strip()

# Counters / state for this run
processed_count = 0
resume_found = last_processed is None  # Without a checkpoint, start immediately

for root, _, files in os.walk(DATA_PATH):
    # Only the 'librivox' folder is compressed in this cell
    if 'librivox' not in root:
        continue

    for file in sorted(files):  # Sorted so every run visits files in the same order
        if not file.endswith(".wav"):
            continue

        input_file = os.path.join(root, file)
        relative_path = os.path.relpath(root, DATA_PATH)  # Mirror the input hierarchy

        # Skip everything up to and including the checkpointed file
        if not resume_found:
            resume_found = input_file == last_processed
            continue

        # One Opus file per configured bitrate
        for bitrate_label, bitrate_value in BITRATES.items():
            output_dir = os.path.join(OUTPUT_PATH, f"compressed_{bitrate_label}", relative_path)
            os.makedirs(output_dir, exist_ok=True)

            output_file = os.path.join(output_dir, file.replace(".wav", ".opus"))
            compress_audio(input_file, output_file, bitrate_value)

        processed_count += 1
        if processed_count % 50 != 0:
            continue

        # Every 50 files: record the checkpoint and refresh the backup archive
        with open(resume_file, "w") as f:
            f.write(input_file)

        zip_path = "/kaggle/working/compressed_speech.zip"
        shutil.make_archive(zip_path.replace(".zip", ""), 'zip', OUTPUT_PATH)
        print("✅ 50 files compressed & backup saved!")

print("✅ Speech compression complete!")

# Archive the complete output tree once more at the end
final_zip_path = "/kaggle/working/compressed_speech_final.zip"
shutil.make_archive(final_zip_path.replace(".zip", ""), 'zip', OUTPUT_PATH)
print(f"✅ Final backup saved at: {final_zip_path}")


## Compressed all 175 files from LibriVox and first 150 files from US-GOV folder 

# Step 3.2: Compressing speech subfolder : us-gov 


In [None]:
import os
import subprocess
import shutil

# Paths
DATA_PATH = "/kaggle/input/musan-data/musan/speech/us-gov"
OUTPUT_PATH = "/kaggle/working/usgov_compressed_speech"

# Bitrates for speech compression
BITRATES = {
    "3kbps": "3000",
    "6kbps": "6000",
    "12kbps": "12000"
}

# Create output directory if not exists
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Function to compress a single file
def compress_audio(input_file, output_file, bitrate):
    """Encode one audio file to Opus with ffmpeg at the requested bitrate.

    Args:
        input_file: Path of the audio file to read.
        output_file: Path of the Opus file to write; an existing file is overwritten.
        bitrate: Target audio bitrate passed to ffmpeg's ``-b:a`` (e.g. "6000").

    Returns:
        None. ffmpeg's own output is discarded; a failed encode is reported
        with a one-line warning on stdout instead of passing silently.
    """
    # An argument list (no shell) passes paths verbatim, so names containing
    # quotes or spaces cannot break the command.
    cmd = [
        "ffmpeg", "-y",
        "-i", str(input_file),
        "-c:a", "libopus",
        "-b:a", str(bitrate),
        str(output_file),
    ]
    try:
        result = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except OSError as e:  # ffmpeg could not be started
        print(f"⚠️ Could not run ffmpeg for {input_file}: {e}")
        return
    if result.returncode != 0:
        print(f"⚠️ ffmpeg failed (exit code {result.returncode}) for {input_file}")

# Resume check: Load last processed file if exists
resume_file = "/kaggle/working/usgov_last_processed.txt"
last_processed = None
if os.path.exists(resume_file):
    with open(resume_file, "r") as f:
        # An empty checkpoint file is treated as "nothing processed yet"
        last_processed = f.read().strip() or None

USGOV_FILE_LIMIT = 150    # Only the first 150 files (in sorted order) are compressed
CHECKPOINT_INTERVAL = 50  # Save progress and refresh the backup zip every 50 files

# Process files
# processed_count is the position in the file sequence and also counts files
# skipped while resuming, so the limit applies to the dataset as a whole rather
# than to each individual run.
processed_count = 0
resume_found = last_processed is None  # If no resume file, start immediately

for root, dirs, files in os.walk(DATA_PATH):
    dirs.sort()  # Fixed traversal order so a resumed run sees the same sequence
    if processed_count >= USGOV_FILE_LIMIT:
        break  # Limit reached: stop walking altogether, not just this folder

    for file in sorted(files):  # Sorting ensures consistent processing order
        if not file.endswith(".wav"):
            continue
        if processed_count >= USGOV_FILE_LIMIT:
            break

        input_file = os.path.join(root, file)
        relative_path = os.path.relpath(root, DATA_PATH)  # Maintain hierarchy
        processed_count += 1

        if not resume_found:
            if input_file == last_processed:
                resume_found = True  # Resume processing from the next file
            continue

        # Apply compression for each bitrate
        for bitrate_label, bitrate_value in BITRATES.items():
            output_dir = os.path.join(OUTPUT_PATH, f"usgov_compressed_{bitrate_label}", relative_path)
            os.makedirs(output_dir, exist_ok=True)  # Create directories if missing

            output_file = os.path.join(output_dir, file.replace(".wav", ".opus"))
            compress_audio(input_file, output_file, bitrate_value)

        # Checkpoint at every interval and also when the limit is hit, so the
        # final batch is recorded before the loop stops.
        if processed_count % CHECKPOINT_INTERVAL == 0 or processed_count >= USGOV_FILE_LIMIT:
            with open(resume_file, "w") as f:
                f.write(input_file)

            # Refresh the backup archive of everything compressed so far
            zip_path = "/kaggle/working/usgov_compressed_speech.zip"
            shutil.make_archive(zip_path.replace(".zip", ""), 'zip', OUTPUT_PATH)
            print(f"✅ {CHECKPOINT_INTERVAL} files compressed & backup saved!")

if not resume_found:
    # The checkpointed path never appeared, so every file was skipped
    print(f"⚠️ Resume marker not found in dataset, nothing was compressed: {last_processed}")

print("✅ US-GOV speech compression complete!")

# Final dataset backup
final_zip_path = "/kaggle/working/usgov_compressed_speech_final.zip"
shutil.make_archive(final_zip_path.replace(".zip", ""), 'zip', OUTPUT_PATH)
print(f"✅ Final backup saved at: {final_zip_path}")

# Step 4: Analyzing the speech signal quality before and after compression using SNR, SDR, STOI and PESQ


In [None]:
pip install pystoi


In [None]:
#analyzing using SNR,SDR,STOI and PESQ for librivox data of 3kbps

import os
import librosa
import numpy as np
from pesq import pesq
from pystoi import stoi
from mir_eval.separation import bss_eval_sources

# === Constants ===
SAMPLE_RATE = 16000
DURATION = 30
TARGET_SAMPLES = SAMPLE_RATE * DURATION

# === Evaluation Functions ===
def fix_length(audio, target_len):
    """Return ``audio`` cut or zero-padded at the end to exactly ``target_len`` samples."""
    shortfall = target_len - len(audio)
    if shortfall < 0:
        # Too long: keep only the leading target_len samples
        return audio[:target_len]
    # Short enough: append `shortfall` zeros (possibly none)
    return np.pad(audio, (0, shortfall))

def snr(original, compressed):
    """Signal-to-noise ratio in dB of ``compressed`` relative to ``original``.

    Both inputs are cut to the shorter length; the noise is their sample-wise
    difference. A tiny constant is added to the noise energy so identical
    signals do not cause a division by zero.
    """
    n = min(len(original), len(compressed))
    reference = original[:n]
    residual = reference - compressed[:n]
    signal_energy = np.sum(reference ** 2)
    noise_energy = np.sum(residual ** 2) + 1e-10
    return 10 * np.log10(signal_energy / noise_energy)

# === Paths ===
ORIGINAL_PATH = "/kaggle/input/musan-data/musan/speech/librivox"
COMPRESSED_PATH = "/kaggle/input/compressed-data/compressed_musan/speech/librivox/compressed_3kbps/librivox"

# === Metric Storage ===
# The four lists stay index-aligned: a file contributes to all of them or to none.
snr_values, sdr_values, stoi_values, pesq_values = [], [], [], []

# === Evaluation Loop ===
for root, _, files in os.walk(ORIGINAL_PATH):
    for file in sorted(files):
        if not file.endswith(".wav"):
            continue

        original_file = os.path.join(root, file)
        # The compressed copy is looked up by file name directly inside COMPRESSED_PATH
        compressed_file = os.path.join(COMPRESSED_PATH, file.replace(".wav", ".opus"))

        try:
            orig_audio, _ = librosa.load(original_file, sr=SAMPLE_RATE)
            comp_audio, _ = librosa.load(compressed_file, sr=SAMPLE_RATE)

            # Trim / zero-pad both signals to the same fixed number of samples
            orig_audio = fix_length(orig_audio, TARGET_SAMPLES)
            comp_audio = fix_length(comp_audio, TARGET_SAMPLES)

            # Compute every metric before storing any of them, so a failure in a
            # later metric cannot leave the lists with different lengths (which
            # would make the averages below cover different sets of files).
            # NOTE(review): the sample-wise metrics assume the decoded Opus audio
            # is time-aligned with the original - confirm.
            snr_value = snr(orig_audio, comp_audio)
            sdr, _, _, _ = bss_eval_sources(orig_audio[None, :], comp_audio[None, :])
            stoi_value = stoi(orig_audio, comp_audio, SAMPLE_RATE, extended=False)
            pesq_score = pesq(SAMPLE_RATE, orig_audio, comp_audio, 'wb')
        except Exception as e:
            print(f"⚠️ Error processing {file}: {e}")
            continue

        snr_values.append(snr_value)
        sdr_values.append(sdr[0])
        stoi_values.append(stoi_value)
        pesq_values.append(pesq_score)

        print(f"✅ {file}: SNR={snr_value:.2f}, SDR={sdr[0]:.2f}, STOI={stoi_value:.3f}, PESQ={pesq_score:.3f}")

# === Print Averages ===
if snr_values:
    print("\n📊 Compressed Audio Evaluation (vs Original):")
    print(f"✅ Avg SNR :  {np.mean(snr_values):.2f} dB")
    print(f"✅ Avg SDR :  {np.mean(sdr_values):.2f} dB")
    print(f"✅ Avg STOI:  {np.mean(stoi_values):.3f}")
    print(f"✅ Avg PESQ:  {np.mean(pesq_values):.3f}")
else:
    print("\n⚠ No valid files processed for metric calculation.")


In [None]:
#analyzing using SNR librivox data of 12kbps


import os
import librosa
import numpy as np

# Function to compute SNR
def snr(original, compressed):
    """Signal-to-noise ratio in dB of ``compressed`` relative to ``original``.

    Both inputs are cut to the shorter length; the noise is their sample-wise
    difference. A tiny constant is added to the noise energy so identical
    signals do not cause a division by zero.
    """
    n = min(len(original), len(compressed))
    reference = original[:n]
    residual = reference - compressed[:n]
    signal_energy = np.sum(reference ** 2)
    noise_energy = np.sum(residual ** 2) + 1e-10
    return 10 * np.log10(signal_energy / noise_energy)

# Where the originals and their 12 kbps Opus counterparts live
ORIGINAL_PATH = "/kaggle/input/musan-data/musan/speech/librivox"
COMPRESSED_PATH = "/kaggle/input/compressed-data/compressed_musan/speech/librivox/compressed_12kbps/librivox"

# Per-file SNR results, in processing order
snr_values = []

# Score every original .wav against the same-named .opus file
for root, _, files in os.walk(ORIGINAL_PATH):
    for file in sorted(files):
        if not file.endswith(".wav"):
            continue

        original_file = os.path.join(root, file)
        compressed_file = os.path.join(COMPRESSED_PATH, file.replace(".wav", ".opus"))

        try:
            # Both signals are loaded at 16 kHz so they can be compared sample by sample
            original_audio, _ = librosa.load(original_file, sr=16000)
            compressed_audio, _ = librosa.load(compressed_file, sr=16000)

            snr_value = snr(original_audio, compressed_audio)
            snr_values.append(snr_value)

            print(f"SNR for {file}: {snr_value:.2f} dB")
        except Exception as e:
            # A missing or unreadable file is reported and skipped
            print(f"Error processing {file}: {e}")

# Summarise with the mean over all successfully scored files
if not snr_values:
    print("\n⚠ No valid files processed for SNR calculation.")
else:
    avg_snr = np.mean(snr_values)
    print(f"\n✅ Average SNR for compressed dataset: {avg_snr:.2f} dB")


In [None]:
#analyzing using SNR,SDR,STOI and PESQ for librivox data of 12kbps

import os
import librosa
import numpy as np
from pesq import pesq
from pystoi import stoi
from mir_eval.separation import bss_eval_sources

# === Constants ===
SAMPLE_RATE = 16000
DURATION = 30
TARGET_SAMPLES = SAMPLE_RATE * DURATION

# === Evaluation Functions ===
def fix_length(audio, target_len):
    """Return ``audio`` cut or zero-padded at the end to exactly ``target_len`` samples."""
    shortfall = target_len - len(audio)
    if shortfall < 0:
        # Too long: keep only the leading target_len samples
        return audio[:target_len]
    # Short enough: append `shortfall` zeros (possibly none)
    return np.pad(audio, (0, shortfall))

def snr(original, compressed):
    """Signal-to-noise ratio in dB of ``compressed`` relative to ``original``.

    Both inputs are cut to the shorter length; the noise is their sample-wise
    difference. A tiny constant is added to the noise energy so identical
    signals do not cause a division by zero.
    """
    n = min(len(original), len(compressed))
    reference = original[:n]
    residual = reference - compressed[:n]
    signal_energy = np.sum(reference ** 2)
    noise_energy = np.sum(residual ** 2) + 1e-10
    return 10 * np.log10(signal_energy / noise_energy)

# === Paths ===
ORIGINAL_PATH = "/kaggle/input/musan-data/musan/speech/librivox"
COMPRESSED_PATH = "/kaggle/input/compressed-data/compressed_musan/speech/librivox/compressed_12kbps/librivox"

# === Metric Storage ===
# The four lists stay index-aligned: a file contributes to all of them or to none.
snr_values, sdr_values, stoi_values, pesq_values = [], [], [], []

# === Evaluation Loop ===
for root, _, files in os.walk(ORIGINAL_PATH):
    for file in sorted(files):
        if not file.endswith(".wav"):
            continue

        original_file = os.path.join(root, file)
        # The compressed copy is looked up by file name directly inside COMPRESSED_PATH
        compressed_file = os.path.join(COMPRESSED_PATH, file.replace(".wav", ".opus"))

        try:
            orig_audio, _ = librosa.load(original_file, sr=SAMPLE_RATE)
            comp_audio, _ = librosa.load(compressed_file, sr=SAMPLE_RATE)

            # Trim / zero-pad both signals to the same fixed number of samples
            orig_audio = fix_length(orig_audio, TARGET_SAMPLES)
            comp_audio = fix_length(comp_audio, TARGET_SAMPLES)

            # Compute every metric before storing any of them, so a failure in a
            # later metric cannot leave the lists with different lengths (which
            # would make the averages below cover different sets of files).
            # NOTE(review): the sample-wise metrics assume the decoded Opus audio
            # is time-aligned with the original - confirm.
            snr_value = snr(orig_audio, comp_audio)
            sdr, _, _, _ = bss_eval_sources(orig_audio[None, :], comp_audio[None, :])
            stoi_value = stoi(orig_audio, comp_audio, SAMPLE_RATE, extended=False)
            pesq_score = pesq(SAMPLE_RATE, orig_audio, comp_audio, 'wb')
        except Exception as e:
            print(f"⚠️ Error processing {file}: {e}")
            continue

        snr_values.append(snr_value)
        sdr_values.append(sdr[0])
        stoi_values.append(stoi_value)
        pesq_values.append(pesq_score)

        print(f"✅ {file}: SNR={snr_value:.2f}, SDR={sdr[0]:.2f}, STOI={stoi_value:.3f}, PESQ={pesq_score:.3f}")

# === Print Averages ===
if snr_values:
    print("\n📊 Compressed Audio Evaluation (vs Original):")
    print(f"✅ Avg SNR :  {np.mean(snr_values):.2f} dB")
    print(f"✅ Avg SDR :  {np.mean(sdr_values):.2f} dB")
    print(f"✅ Avg STOI:  {np.mean(stoi_values):.3f}")
    print(f"✅ Avg PESQ:  {np.mean(pesq_values):.3f}")
else:
    print("\n⚠ No valid files processed for metric calculation.")


In [None]:
#analyzing using SNR for us-gov data of 12kbps


import os
import librosa
import numpy as np

# Function to compute SNR
def snr(original, compressed):
    """Signal-to-noise ratio in dB of ``compressed`` relative to ``original``.

    Both inputs are cut to the shorter length; the noise is their sample-wise
    difference. A tiny constant is added to the noise energy so identical
    signals do not cause a division by zero.
    """
    n = min(len(original), len(compressed))
    reference = original[:n]
    residual = reference - compressed[:n]
    signal_energy = np.sum(reference ** 2)
    noise_energy = np.sum(residual ** 2) + 1e-10
    return 10 * np.log10(signal_energy / noise_energy)

# Where the originals and their 12 kbps Opus counterparts live
ORIGINAL_PATH = "/kaggle/input/musan-data/musan/speech/us-gov"
COMPRESSED_PATH = "/kaggle/input/compressed-data/compressed_musan/speech/us-gov/usgov_compressed_12kbps"

# Per-file SNR results plus the number of files scored so far
snr_values = []
file_count = 0
MAX_FILES = 150  # Stop scoring once this many files have succeeded

# Score original .wav files against the same-named .opus files
for root, _, files in os.walk(ORIGINAL_PATH):
    for file in sorted(files):
        if not file.endswith(".wav"):
            continue
        if file_count >= MAX_FILES:
            break  # Quota reached for this folder's remaining files

        original_file = os.path.join(root, file)
        compressed_file = os.path.join(COMPRESSED_PATH, file.replace(".wav", ".opus"))

        try:
            # Both signals are loaded at 16 kHz so they can be compared sample by sample
            original_audio, _ = librosa.load(original_file, sr=16000)
            compressed_audio, _ = librosa.load(compressed_file, sr=16000)

            snr_value = snr(original_audio, compressed_audio)
            snr_values.append(snr_value)

            print(f"SNR for {file}: {snr_value:.2f} dB")
            file_count += 1  # Only successfully scored files count towards the quota
        except Exception as e:
            # A missing or unreadable file is reported and skipped
            print(f"Error processing {file}: {e}")

# Summarise with the mean over all successfully scored files
if not snr_values:
    print("\n⚠ No valid files processed for SNR calculation.")
else:
    avg_snr = np.mean(snr_values)
    print(f"\n✅ Average SNR for compressed dataset (first {MAX_FILES} files): {avg_snr:.2f} dB")


In [None]:
#analyzing using SNR,SDR,STOI and PESQ for us-gov data of 12kbps


import os
import librosa
import numpy as np
from pesq import pesq
from pystoi import stoi
from mir_eval.separation import bss_eval_sources

# === Constants ===
SAMPLE_RATE = 16000
DURATION = 30
TARGET_SAMPLES = SAMPLE_RATE * DURATION

# === Evaluation Functions ===
def fix_length(audio, target_len):
    """Return ``audio`` cut or zero-padded at the end to exactly ``target_len`` samples."""
    shortfall = target_len - len(audio)
    if shortfall < 0:
        # Too long: keep only the leading target_len samples
        return audio[:target_len]
    # Short enough: append `shortfall` zeros (possibly none)
    return np.pad(audio, (0, shortfall))

def snr(original, compressed):
    """Signal-to-noise ratio in dB of ``compressed`` relative to ``original``.

    Both inputs are cut to the shorter length; the noise is their sample-wise
    difference. A tiny constant is added to the noise energy so identical
    signals do not cause a division by zero.
    """
    n = min(len(original), len(compressed))
    reference = original[:n]
    residual = reference - compressed[:n]
    signal_energy = np.sum(reference ** 2)
    noise_energy = np.sum(residual ** 2) + 1e-10
    return 10 * np.log10(signal_energy / noise_energy)

# === Paths ===
ORIGINAL_PATH = "/kaggle/input/musan-data/musan/speech/us-gov"
COMPRESSED_PATH = "/kaggle/input/compressed-data/compressed_musan/speech/us-gov/usgov_compressed_12kbps"

# === Metric Storage ===
# The four lists stay index-aligned: a file contributes to all of them or to none.
snr_values, sdr_values, stoi_values, pesq_values = [], [], [], []

# === Evaluation Loop ===
for root, _, files in os.walk(ORIGINAL_PATH):
    for file in sorted(files):
        if not file.endswith(".wav"):
            continue

        original_file = os.path.join(root, file)
        # The compressed copy is looked up by file name directly inside COMPRESSED_PATH
        compressed_file = os.path.join(COMPRESSED_PATH, file.replace(".wav", ".opus"))

        try:
            orig_audio, _ = librosa.load(original_file, sr=SAMPLE_RATE)
            comp_audio, _ = librosa.load(compressed_file, sr=SAMPLE_RATE)

            # Trim / zero-pad both signals to the same fixed number of samples
            orig_audio = fix_length(orig_audio, TARGET_SAMPLES)
            comp_audio = fix_length(comp_audio, TARGET_SAMPLES)

            # Compute every metric before storing any of them, so a failure in a
            # later metric cannot leave the lists with different lengths (which
            # would make the averages below cover different sets of files).
            # NOTE(review): the sample-wise metrics assume the decoded Opus audio
            # is time-aligned with the original - confirm.
            snr_value = snr(orig_audio, comp_audio)
            sdr, _, _, _ = bss_eval_sources(orig_audio[None, :], comp_audio[None, :])
            stoi_value = stoi(orig_audio, comp_audio, SAMPLE_RATE, extended=False)
            pesq_score = pesq(SAMPLE_RATE, orig_audio, comp_audio, 'wb')
        except Exception as e:
            print(f"⚠️ Error processing {file}: {e}")
            continue

        snr_values.append(snr_value)
        sdr_values.append(sdr[0])
        stoi_values.append(stoi_value)
        pesq_values.append(pesq_score)

        print(f"✅ {file}: SNR={snr_value:.2f}, SDR={sdr[0]:.2f}, STOI={stoi_value:.3f}, PESQ={pesq_score:.3f}")

# === Print Averages ===
if snr_values:
    print("\n📊 Compressed Audio Evaluation (vs Original):")
    print(f"✅ Avg SNR :  {np.mean(snr_values):.2f} dB")
    print(f"✅ Avg SDR :  {np.mean(sdr_values):.2f} dB")
    print(f"✅ Avg STOI:  {np.mean(stoi_values):.3f}")
    print(f"✅ Avg PESQ:  {np.mean(pesq_values):.3f}")
else:
    print("\n⚠ No valid files processed for metric calculation.")


In [None]:
#analyzing using SNR for us-gov data of 3kbps


import os
import librosa
import numpy as np

# Function to compute SNR
def snr(original, compressed):
    """Signal-to-noise ratio in dB of ``compressed`` relative to ``original``.

    Both inputs are cut to the shorter length; the noise is their sample-wise
    difference. A tiny constant is added to the noise energy so identical
    signals do not cause a division by zero.
    """
    n = min(len(original), len(compressed))
    reference = original[:n]
    residual = reference - compressed[:n]
    signal_energy = np.sum(reference ** 2)
    noise_energy = np.sum(residual ** 2) + 1e-10
    return 10 * np.log10(signal_energy / noise_energy)

# Where the originals and their 3 kbps Opus counterparts live
ORIGINAL_PATH = "/kaggle/input/musan-data/musan/speech/us-gov"
COMPRESSED_PATH = "/kaggle/input/compressed-data/compressed_musan/speech/us-gov/usgov_compressed_3kbps"

# Per-file SNR results plus the number of files scored so far
snr_values = []
file_count = 0
MAX_FILES = 150  # Stop scoring once this many files have succeeded

# Score original .wav files against the same-named .opus files
for root, _, files in os.walk(ORIGINAL_PATH):
    for file in sorted(files):
        if not file.endswith(".wav"):
            continue
        if file_count >= MAX_FILES:
            break  # Quota reached for this folder's remaining files

        original_file = os.path.join(root, file)
        compressed_file = os.path.join(COMPRESSED_PATH, file.replace(".wav", ".opus"))

        try:
            # Both signals are loaded at 16 kHz so they can be compared sample by sample
            original_audio, _ = librosa.load(original_file, sr=16000)
            compressed_audio, _ = librosa.load(compressed_file, sr=16000)

            snr_value = snr(original_audio, compressed_audio)
            snr_values.append(snr_value)

            print(f"SNR for {file}: {snr_value:.2f} dB")
            file_count += 1  # Only successfully scored files count towards the quota
        except Exception as e:
            # A missing or unreadable file is reported and skipped
            print(f"Error processing {file}: {e}")

# Summarise with the mean over all successfully scored files
if not snr_values:
    print("\n⚠ No valid files processed for SNR calculation.")
else:
    avg_snr = np.mean(snr_values)
    print(f"\n✅ Average SNR for compressed dataset (first {MAX_FILES} files): {avg_snr:.2f} dB")


In [None]:
#analyzing using SNR,SDR,STOI and PESQ for us-gov data of 3kbps


import os
import librosa
import numpy as np
from pesq import pesq
from pystoi import stoi
from mir_eval.separation import bss_eval_sources

# === Constants ===
SAMPLE_RATE = 16000
DURATION = 30
TARGET_SAMPLES = SAMPLE_RATE * DURATION

# === Evaluation Functions ===
def fix_length(audio, target_len):
    """Force `audio` to exactly `target_len` samples.

    Longer input is trimmed from the end; shorter (or equal-length) input is
    right-padded with zeros.
    """
    shortfall = target_len - len(audio)
    if shortfall < 0:
        # Longer than the target: keep only the leading samples.
        return audio[:target_len]
    # Shorter than (or equal to) the target: append `shortfall` zeros.
    return np.pad(audio, (0, shortfall))

def snr(original, compressed):
    """Signal-to-noise ratio in dB, treating `original - compressed` as the noise.

    The two inputs are first truncated to their common length.
    """
    length = min(len(original), len(compressed))
    clean, degraded = original[:length], compressed[:length]
    error_energy = np.sum(np.square(clean - degraded))
    # The small constant guards against division by zero for identical signals.
    energy_ratio = np.sum(np.square(clean)) / (error_energy + 1e-10)
    return 10 * np.log10(energy_ratio)

# === Paths ===
ORIGINAL_PATH = "/kaggle/input/musan-data/musan/speech/us-gov"
COMPRESSED_PATH = "/kaggle/input/compressed-data/compressed_musan/speech/us-gov/usgov_compressed_3kbps"

# === Metric Storage ===
# The four lists are kept index-aligned: entry i of each list belongs to the same file.
snr_values, sdr_values, stoi_values, pesq_values = [], [], [], []

# === Evaluation Loop ===
for root, _, files in os.walk(ORIGINAL_PATH):
    for file in sorted(files):
        if not file.endswith(".wav"):
            continue

        original_file = os.path.join(root, file)
        compressed_file = os.path.join(COMPRESSED_PATH, file.replace(".wav", ".opus"))

        try:
            orig_audio, _ = librosa.load(original_file, sr=SAMPLE_RATE)
            comp_audio, _ = librosa.load(compressed_file, sr=SAMPLE_RATE)

            # Trim / zero-pad both clips to the same 30 s window
            orig_audio = fix_length(orig_audio, TARGET_SAMPLES)
            comp_audio = fix_length(comp_audio, TARGET_SAMPLES)

            # SNR: strength of the desired signal relative to the
            # background noise (undesired signal).
            snr_score = snr(orig_audio, comp_audio)

            # SDR
            sdr, _, _, _ = bss_eval_sources(orig_audio[None, :], comp_audio[None, :])

            # STOI
            stoi_score = stoi(orig_audio, comp_audio, SAMPLE_RATE, extended=False)

            # PESQ (wide-band mode)
            pesq_score = pesq(SAMPLE_RATE, orig_audio, comp_audio, 'wb')

        except Exception as e:
            print(f"⚠️ Error processing {file}: {e}")
            continue

        # Record the scores only once ALL metrics succeeded. Appending each one
        # as it is computed would leave the lists with different lengths when a
        # later metric raises, so the averages would cover different files.
        snr_values.append(snr_score)
        sdr_values.append(sdr[0])
        stoi_values.append(stoi_score)
        pesq_values.append(pesq_score)

        print(f"✅ {file}: SNR={snr_values[-1]:.2f}, SDR={sdr[0]:.2f}, STOI={stoi_values[-1]:.3f}, PESQ={pesq_score:.3f}")

# === Print Averages ===
if snr_values:
    print("\n📊 Compressed Audio Evaluation (vs Original):")
    print(f"✅ Avg SNR :  {np.mean(snr_values):.2f} dB")
    print(f"✅ Avg SDR :  {np.mean(sdr_values):.2f} dB")
    print(f"✅ Avg STOI:  {np.mean(stoi_values):.3f}")
    print(f"✅ Avg PESQ:  {np.mean(pesq_values):.3f}")
else:
    print("\n⚠ No valid files processed for metric calculation.")


> **SNR (signal-to-noise ratio):** how much useful signal is present relative to noise.
> - A basic, energy-based metric: it compares the energy of the original signal with the energy of the difference between the two signals.
> - Useful as a quick check of degradation in systems such as audio compression.
> - Higher SNR → better quality.
>
> **SDR (signal-to-distortion ratio):** used mainly in source separation tasks.
> - Measures how much of the desired signal is preserved relative to distortion (including artifacts, interference, and noise).
> - It decomposes the estimate as: estimated signal = true source + interference + noise + artifacts. So SDR captures more nuanced errors than SNR.
> - Interpretation: higher SDR = better reconstruction/separation.
> - It is meant to reflect structured distortion, not just a raw energy difference. It is still not a perceptual model; PESQ and STOI fill that role.
> - Often used in blind source separation (BSS) and enhancement tasks.

# Step 5: Aligning Original and compressed pairs for training the model

In [None]:
import os
import librosa
import torch
from torch.utils.data import Dataset, DataLoader
#PyTorch's torch.utils.data.Dataset is an abstract class that 
#helps you define how your data is structured and how to retrieve it.

# Define paths
ORIGINAL_SPEECH_PATH = "/kaggle/input/musan-data/musan/speech"
COMPRESSED_SPEECH_PATH = "/kaggle/input/compressed-data/compressed_musan/speech"
BITRATES = ["3kbps", "6kbps", "12kbps"]
SUBSETS = ["librivox", "us-gov"]
US_GOV_COMPRESSED_LIMIT = 150  # Only first 150 files are compressed

# Custom dataset class.
# Each example is an (original, compressed) pair whose two files live in different
# directory trees. The class locates the pairs, loads them, converts them to
# tensors and hands them to training in a structured way.
class SpeechDataset(Dataset):
    """Pairs each original speech clip with its Opus-compressed counterparts.

    One dataset item is an ``(original, compressed)`` tuple of tensors, each of
    shape ``(1, samples)``. An original file contributes one item per bitrate
    for which a compressed file exists on disk.
    """

    # Subsets whose compressed-folder layout is known (see `_compressed_file_for`).
    _KNOWN_SUBSETS = ("librivox", "us-gov")

    def __init__(self, original_path, compressed_path, bitrates, subsets, sr=16000):
        """Store the configuration and index every available file pair.

        Args:
            original_path: root folder holding one sub-folder of .wav files per subset.
            compressed_path: root folder holding the compressed .opus files.
            bitrates: bitrate labels (e.g. "3kbps") used in the compressed folder names.
            subsets: subset names to include; must be "librivox" and/or "us-gov".
            sr: sampling rate passed to `librosa.load` for both files.

        Raises:
            ValueError: if a subset has no known compressed-folder layout, or
                if no (original, compressed) pair is found at all.
        """
        self.original_path = original_path
        self.compressed_path = compressed_path
        self.bitrates = bitrates
        self.subsets = subsets
        self.sr = sr
        self.file_pairs = self._get_file_pairs()

        if len(self.file_pairs) == 0:
            raise ValueError("❌ No matching file pairs found! Check your paths.")

    def _compressed_file_for(self, subset, bitrate, file):
        """Return the path where the compressed copy of `file` is expected."""
        opus_name = file.replace(".wav", ".opus")
        if subset == "librivox":
            # librivox layout: <root>/librivox/compressed_<bitrate>/librivox/<name>.opus
            return os.path.join(
                self.compressed_path, subset, f"compressed_{bitrate}", subset, opus_name
            )
        # us-gov layout: <root>/us-gov/usgov_compressed_<bitrate>/<name>.opus
        return os.path.join(
            self.compressed_path, subset, f"usgov_compressed_{bitrate}", opus_name
        )

    def _get_file_pairs(self):
        """Build the list of (original_file, compressed_file) path tuples."""
        file_pairs = []
        for subset in self.subsets:
            # Fail loudly on an unknown subset. Without this check the path
            # variable would be unbound, or would silently reuse the path from
            # a previous subset and pair unrelated files.
            if subset not in self._KNOWN_SUBSETS:
                raise ValueError(
                    f"❌ Unknown subset '{subset}': expected one of {self._KNOWN_SUBSETS}."
                )

            subset_path = os.path.join(self.original_path, subset)
            file_list = sorted([f for f in os.listdir(subset_path) if f.endswith(".wav")])

            if subset == "us-gov":
                file_list = file_list[:US_GOV_COMPRESSED_LIMIT]  # Limit to 150 files

            for file in file_list:
                original_file = os.path.join(subset_path, file)

                for bitrate in self.bitrates:
                    compressed_file = self._compressed_file_for(subset, bitrate, file)

                    # Keep the pair only if the compressed file is actually on disk.
                    if os.path.exists(compressed_file):
                        file_pairs.append((original_file, compressed_file))
                    else:
                        print(f"❌ Missing file: {compressed_file}")  # Debugging output

        return file_pairs

    def __len__(self):
        """Number of (original, compressed) pairs."""
        return len(self.file_pairs)

    def __getitem__(self, idx):
        """Load pair `idx` and return it as two tensors of shape (1, samples)."""
        orig_file, comp_file = self.file_pairs[idx]
        # Load both original and compressed audio at the configured sampling rate.
        orig_audio, _ = librosa.load(orig_file, sr=self.sr)
        comp_audio, _ = librosa.load(comp_file, sr=self.sr)

        # Add a leading channel axis: (samples,) → (1, samples)
        orig_audio = torch.tensor(orig_audio).unsqueeze(0)  # (1, samples)
        comp_audio = torch.tensor(comp_audio).unsqueeze(0)  # (1, samples)

        return orig_audio, comp_audio

# Collate function to pad all audio to the longest sample in the batch
def collate_fn(batch):
    """Zero-pad every clip in the batch to the longest one and stack the results.

    `batch` is a list of (original, compressed) tensor pairs, each tensor shaped
    (1, samples). Returns two tensors shaped (batch, 1, samples).
    """
    originals, compressed = zip(*batch)

    # Longest clip across BOTH sides, so originals and compressed end up equal length.
    longest = max(
        max(clip.shape[1] for clip in originals),
        max(clip.shape[1] for clip in compressed),
    )

    def _pad_and_stack(clips):
        # Right-pad each clip with zeros up to `longest`, then add the batch axis.
        padded = [
            torch.nn.functional.pad(clip, (0, longest - clip.shape[1]))
            for clip in clips
        ]
        return torch.stack(padded)  # (batch, 1, samples)

    return _pad_and_stack(originals), _pad_and_stack(compressed)

# ✅ Load dataset into DataLoader
# The DataLoader draws items from the dataset in shuffled order, groups them into
# batches of 4, and uses collate_fn to pad the variable-length clips in each batch.

dataset = SpeechDataset(ORIGINAL_SPEECH_PATH, COMPRESSED_SPEECH_PATH, BITRATES, SUBSETS)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

# ✅ Check dataset loading: print the tensor shapes of a single batch as a smoke test
for orig, comp in dataloader:
    print(f"✅ After Fix - Original Shape: {orig.shape}, Compressed Shape: {comp.shape}")
    break  # Only check the first batch


# Step 6 : Model Training using demucs

In [None]:
!pip install -q demucs


In [None]:
!pip install -q demucs
import os
import librosa
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from demucs.hdemucs import HDemucs  # Corrected import

# ✅ Select the compute device: CUDA when available, otherwise fall back to CPU.
# (This does not force GPU usage; check the printed device before a long run.)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🔥 Using device: {device}")


In [None]:
# Paths
ORIGINAL_SPEECH_PATH = "/kaggle/input/musan-data/musan/speech"
COMPRESSED_SPEECH_PATH = "/kaggle/input/compressed-data/compressed_musan/speech"
BITRATES = ["3kbps", "6kbps", "12kbps"]
SUBSETS = ["librivox", "us-gov"]
SAMPLE_RATE = 16000
DURATION = 30  # 30 seconds
TARGET_SAMPLES = SAMPLE_RATE * DURATION  # every clip is trimmed / padded to this many samples

class SpeechDataset(Dataset):
    """Fixed-length (original, compressed) speech pairs for training.

    Each item is a tuple of two 1-D float32 tensors of exactly TARGET_SAMPLES
    samples: the original clip and one of its Opus-compressed versions.
    """

    def __init__(self, original_path, compressed_path, bitrates, subsets, sr=SAMPLE_RATE):
        """Store the configuration and index every pair that exists on disk.

        Raises:
            ValueError: if no (original, compressed) pair is found.
        """
        self.original_path = original_path
        self.compressed_path = compressed_path
        self.bitrates = bitrates
        self.subsets = subsets
        self.sr = sr
        self.file_pairs = self._get_file_pairs()

        if len(self.file_pairs) == 0:
            raise ValueError("❌ No matching file pairs found! Check your paths.")

    def _compressed_file_for(self, subset, bitrate, file):
        """Return the expected path of the compressed copy of `file`."""
        opus_name = file.replace(".wav", ".opus")
        if subset == "us-gov":
            # us-gov is stored flat under usgov_compressed_<bitrate>/ (the layout
            # the SNR cells and the alignment cell read). Using the generic
            # layout below for it matches no file, which silently drops the
            # whole subset from training.
            return os.path.join(
                self.compressed_path, subset, f"usgov_compressed_{bitrate}", opus_name
            )
        # Generic layout: <root>/<subset>/compressed_<bitrate>/<subset>/<name>.opus
        return os.path.join(
            self.compressed_path, subset, f"compressed_{bitrate}", subset, opus_name
        )

    def _get_file_pairs(self):
        """Build the list of (original_file, compressed_file) path tuples."""
        file_pairs = []
        for subset in self.subsets:
            subset_path = os.path.join(self.original_path, subset)
            file_list = sorted([f for f in os.listdir(subset_path) if f.endswith(".wav")])

            for file in file_list:
                original_file = os.path.join(subset_path, file)

                for bitrate in self.bitrates:
                    compressed_file = self._compressed_file_for(subset, bitrate, file)

                    # Originals without a compressed counterpart are skipped silently.
                    if os.path.exists(compressed_file):
                        file_pairs.append((original_file, compressed_file))

        return file_pairs

    def _load_audio(self, path):
        """Load `path` as mono audio and return a float32 tensor of TARGET_SAMPLES samples.

        Longer clips are trimmed; shorter clips are right-padded with zeros.
        The result is always a tensor, so every item has one consistent type.
        """
        audio, _ = librosa.load(path, sr=self.sr, mono=True)
        audio = torch.as_tensor(audio, dtype=torch.float32)[:TARGET_SAMPLES]
        return F.pad(audio, (0, TARGET_SAMPLES - audio.shape[0]))

    def __len__(self):
        """Number of (original, compressed) pairs."""
        return len(self.file_pairs)

    def __getitem__(self, idx):
        """Return the fixed-length (original, compressed) audio for pair `idx`."""
        orig_file, comp_file = self.file_pairs[idx]
        orig_audio = self._load_audio(orig_file)
        comp_audio = self._load_audio(comp_file)
        return orig_audio, comp_audio


In [None]:
import torch

def collate_fn(batch):
    """Turn a list of mono (original, compressed) pairs into two stereo batches.

    Each element of a pair is a 1-D audio clip, given either as a NumPy array or
    as a tensor; all clips are expected to have the same length. Returns two
    float32 tensors of shape (B, 2, T).
    """
    orig_batch, comp_batch = zip(*batch)  # Unpack pairs

    def stereo(audio):
        # as_tensor accepts both NumPy arrays and tensors; torch.tensor() on an
        # existing tensor emits a copy-construct UserWarning for every clip.
        audio = torch.as_tensor(audio, dtype=torch.float32)
        return torch.stack([audio, audio], dim=0)  # Duplicate mono signal to stereo

    # Stack the per-clip (2, T) tensors into batch tensors
    orig_batch = torch.stack([stereo(x) for x in orig_batch])  # (B, 2, T)
    comp_batch = torch.stack([stereo(x) for x in comp_batch])  # (B, 2, T)

    return orig_batch, comp_batch



In [None]:
# Build the training dataset and wrap it in a shuffling DataLoader (batches of 4);
# collate_fn duplicates each mono clip into two channels before batching.
dataset = SpeechDataset(ORIGINAL_SPEECH_PATH, COMPRESSED_SPEECH_PATH, BITRATES, SUBSETS)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

# Sanity check: number of (original, compressed) pairs that were found on disk
print(f"✅ Loaded {len(dataset)} audio pairs")


In [None]:
# Build an HDemucs model with a single output source ("speech") and move it to the device.
# NOTE(review): no checkpoint is loaded in this cell, so training starts from the
# constructor's initial weights rather than from pretrained Demucs weights.
model = HDemucs(sources=["speech"]).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)  # Adam, learning rate 1e-4
loss_fn = nn.MSELoss()  # sample-wise mean squared error between waveforms


In [None]:
EPOCHS = 10  # Adjust as needed

# Supervised training: the model receives the compressed waveform and is trained
# to reproduce the original waveform.
for epoch in range(EPOCHS):
    total_loss = 0  # sum of per-batch losses for this epoch
    model.train()
    
    for orig_wave, comp_wave in dataloader:
        orig_wave, comp_wave = orig_wave.to(device), comp_wave.to(device)

        optimizer.zero_grad()  # clear gradients left over from the previous step
        enhanced_wave = model(comp_wave)  # model's estimate of the original audio

        # The model output may carry an extra "sources" axis, i.e. (B, 1, 2, T);
        # drop it so the prediction has the same (B, 2, T) layout as the target.
        if enhanced_wave.ndim == 4:
            enhanced_wave = enhanced_wave.squeeze(1)  # From (B,1,2,T) → (B,2,T)

        # Sanity check: prediction and target must have identical shapes
        assert enhanced_wave.shape == orig_wave.shape, \
            f"Shape mismatch: got {enhanced_wave.shape}, expected {orig_wave.shape}"

        loss = loss_fn(enhanced_wave, orig_wave)  # MSE between prediction and original

        loss.backward()  # gradient computation
        optimizer.step()  # weight update
        total_loss += loss.item()

    # Report the mean per-batch loss for the epoch
    print(f"📢 Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss / len(dataloader):.6f}")


> ## Training was stopped forcefully since it was taking longer than expected to complete 10 epochs  

In [None]:

# Save the model's current state_dict (its parameters and buffers at this point
# in training) to a file in the current working directory.
# NOTE(review): the file name says "epoch_8" but nothing in the code records the
# epoch that was actually reached — confirm against the training log.
torch.save(model.state_dict(), "demucs_epoch_8.pth")
print("✅ Model saved successfully!")


# Step 7: Reconstructing using the model on the unseen data

In [None]:
#compressing 150th - 180th file in us-gov folder which was not used for training earlier

import os
from tqdm import tqdm

def compress_files(input_folder, output_folder, bitrate_kbps=6, start=150, stop=180):
    """Compress a slice of the .wav files in `input_folder` to Opus with ffmpeg.

    The .wav files are sorted by name and the slice ``[start:stop]`` is taken;
    the defaults select the 150th-180th files (30 files).

    Args:
        input_folder: folder containing the source .wav files.
        output_folder: folder for the .opus files (created if missing).
        bitrate_kbps: target Opus bitrate in kbit/s.
        start: index of the first file (in sorted order) to compress.
        stop: index one past the last file to compress.

    Returns:
        Paths of the .opus files that ffmpeg produced successfully, in sorted
        order. Files whose conversion failed are reported and left out.
    """
    import subprocess  # local import so the block does not depend on the cell's import list

    os.makedirs(output_folder, exist_ok=True)
    files = sorted([f for f in os.listdir(input_folder) if f.endswith(".wav")])[start:stop]

    compressed_paths = []
    for file in tqdm(files, desc=f"Compressing to {bitrate_kbps}kbps"):
        input_path = os.path.join(input_folder, file)
        output_path = os.path.join(output_folder, file.replace(".wav", ".opus"))

        # Pass an argument list instead of a shell string, so paths containing
        # quotes or other shell metacharacters are handled safely.
        command = [
            "ffmpeg", "-y", "-loglevel", "error",
            "-i", input_path,
            "-c:a", "libopus",
            "-b:a", f"{bitrate_kbps}k",
            output_path,
        ]
        try:
            exit_code = subprocess.run(command, check=False).returncode
        except OSError as exc:
            # e.g. ffmpeg is not installed; keep going (best effort) but say so
            print(f"⚠️ Could not run ffmpeg for {file}: {exc}")
            continue

        if exit_code != 0:
            print(f"⚠️ ffmpeg failed for {file} (exit code {exit_code})")
            continue

        compressed_paths.append(output_path)

    return compressed_paths

# Define paths
ORIG_UNSEEN_PATH = "/kaggle/input/musan-data/musan/speech/us-gov"  # source .wav files
COMPRESSED_TEST_PATH = "./test_compressed/us-gov"  # relative to the current working directory

# Run compression at 6 kbps; the returned list holds the paths of the .opus outputs
compressed_files = compress_files(ORIG_UNSEEN_PATH, COMPRESSED_TEST_PATH, bitrate_kbps=6)


In [None]:
#reconstructing using demucs model

import os
import torch
import torchaudio
import torch.nn.functional as F
from torchaudio.transforms import Resample
from tqdm import tqdm
from demucs.hdemucs import HDemucs
import soundfile as sf

# === CONSTANTS ===
SAMPLE_RATE = 16000  # Hz; inputs are resampled to this rate
DURATION = 30  # seconds of audio per clip
TARGET_SAMPLES = SAMPLE_RATE * DURATION  # clip length in samples after trim / pad
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === PATHS ===
COMPRESSED_FOLDER = "/kaggle/working/test_compressed/us-gov"  # .opus inputs
SAVE_RECONSTRUCTED = "/kaggle/working/reconstructed_usgov_test"  # .wav outputs
os.makedirs(SAVE_RECONSTRUCTED, exist_ok=True)

# === LOAD MODEL ===
# Rebuild the same architecture that was trained, then restore the saved weights.
model = HDemucs(sources=["speech"]).to(DEVICE)
# map_location remaps the stored tensors onto DEVICE (works on CPU-only machines too)
model.load_state_dict(torch.load("/kaggle/input/demucs_epoch_8/pytorch/default/1/demucs_epoch_8.pth", map_location=DEVICE))
model.eval()  # inference mode
print("✅ Model loaded.")

# === HELPER FUNCTION ===
def prepare_input(file_path):
    """Load an audio file and shape it into a model-ready (1, 2, T) tensor on DEVICE.

    The audio is resampled to SAMPLE_RATE when needed, averaged over its first
    axis to get one channel, trimmed or zero-padded to TARGET_SAMPLES, and then
    duplicated into two identical channels.
    """
    audio, native_sr = torchaudio.load(file_path)  # (1, T) for mono
    if native_sr != SAMPLE_RATE:
        audio = Resample(native_sr, SAMPLE_RATE)(audio)

    mono = audio.mean(dim=0)  # collapse the channel axis
    n_samples = mono.shape[0]
    if n_samples > TARGET_SAMPLES:
        mono = mono[:TARGET_SAMPLES]
    else:
        mono = F.pad(mono, (0, TARGET_SAMPLES - n_samples))

    # Duplicate the mono signal into two channels, then add the batch axis.
    batch = torch.stack([mono, mono], dim=0).unsqueeze(0)  # (1, 2, T)
    return batch.to(DEVICE)

# === SELECT FILES ===
# Full paths of the .opus files in COMPRESSED_FOLDER, sorted by name, capped at 30
compressed_files = sorted([
    os.path.join(COMPRESSED_FOLDER, f)
    for f in os.listdir(COMPRESSED_FOLDER)
    if f.endswith(".opus")
])[:30]  # First 30

# === RECONSTRUCT LOOP ===
for comp_path in tqdm(compressed_files, desc="🔊 Reconstructing"):
    input_tensor = prepare_input(comp_path)  # (1, 2, T) on DEVICE

    with torch.no_grad():  # inference only, no gradients needed
        output = model(input_tensor)

        # Drop the extra axis when the output is (B,1,2,T)
        if output.ndim == 4:
            output = output.squeeze(1)  # (B, 2, T)

        output = output.squeeze(0).cpu()  # (2, T)

    # Save as (T, 2): soundfile expects (frames, channels)
    out_path = os.path.join(
        SAVE_RECONSTRUCTED,
        os.path.basename(comp_path).replace(".opus", "_reconstructed.wav")
    )
    sf.write(out_path, output.permute(1, 0).numpy(), SAMPLE_RATE)

# NOTE(review): the message always says 30, even if fewer files were found above.
print("✅ Done reconstructing and saving 30 test files.")


In [None]:
#zipping the compressed data (prepared for testing ) and reconstructed
#samples from the working directory

import zipfile
import os

def zip_folder(folder_path, zip_path):
    """Write every file under `folder_path` into a deflate-compressed zip at `zip_path`.

    Archive member names are relative to `folder_path`, so the folder's own name
    is not part of the stored paths.
    """
    archive = zipfile.ZipFile(zip_path, mode='w', compression=zipfile.ZIP_DEFLATED)
    with archive:
        for current_dir, _subdirs, filenames in os.walk(folder_path):
            for filename in filenames:
                source = os.path.join(current_dir, filename)
                member_name = os.path.relpath(source, start=folder_path)
                archive.write(source, arcname=member_name)

# Paths of the two folders to archive
reconstructed_folder = "/kaggle/working/reconstructed_usgov_test"
compressed_folder = "/kaggle/working/test_compressed"

# Output ZIP paths (written next to the folders, not inside them)
reconstructed_zip = "/kaggle/working/reconstructed_usgov_test.zip"
compressed_zip = "/kaggle/working/test_compressed.zip"

# Zip both folders
zip_folder(reconstructed_folder, reconstructed_zip)
zip_folder(compressed_folder, compressed_zip)

print("✅ Both folders zipped and saved:")
print(f"📁 {reconstructed_zip}")
print(f"📁 {compressed_zip}")


In [None]:
!pip install pystoi
!pip install pesq


In [None]:
!pip install mir_eval


# Step 8: Evaluating the model on STOI, SDR and PESQ

In [None]:
import os
import torch
import torchaudio
from torchaudio.transforms import Resample
from tqdm import tqdm
from pesq import pesq
from pystoi import stoi
from mir_eval.separation import bss_eval_sources
import numpy as np

# === CONFIG ===
SAMPLE_RATE = 16000
ORIGINAL_PATH = "/kaggle/input/musan-data/musan/speech/us-gov"
RECONSTRUCTED_PATH = "/kaggle/working/reconstructed_usgov_test"

# === EVALUATION METRICS ===
# The three lists are kept index-aligned: entry i of each belongs to the same file.
sdr_scores = []
stoi_scores = []
pesq_scores = []

# === FILE LIST ===
reconstructed_files = sorted([
    f for f in os.listdir(RECONSTRUCTED_PATH) if f.endswith("_reconstructed.wav")
])

for rec_file in tqdm(reconstructed_files, desc="🎧 Evaluating"):
    # Match original: "<name>_reconstructed.wav" → "<name>.wav"
    original_name = rec_file.replace("_reconstructed.wav", ".wav")
    orig_path = os.path.join(ORIGINAL_PATH, original_name)
    rec_path = os.path.join(RECONSTRUCTED_PATH, rec_file)

    if not os.path.exists(orig_path):
        print(f"❌ Missing original: {original_name}")
        continue

    # One bad file (unreadable audio, or a metric that rejects the clip) must
    # not abort the whole evaluation — same policy as the compressed-audio
    # evaluation loop earlier in the notebook.
    try:
        # Load both files
        orig_audio, sr1 = torchaudio.load(orig_path)
        rec_audio, sr2 = torchaudio.load(rec_path)

        # Convert to mono
        orig_audio = orig_audio.mean(dim=0)
        rec_audio = rec_audio.mean(dim=0)

        # Resample if needed
        if sr1 != SAMPLE_RATE:
            orig_audio = Resample(sr1, SAMPLE_RATE)(orig_audio)
        if sr2 != SAMPLE_RATE:
            rec_audio = Resample(sr2, SAMPLE_RATE)(rec_audio)

        # Truncate both signals to the shorter length (no padding is applied)
        min_len = min(orig_audio.shape[-1], rec_audio.shape[-1])
        orig_audio = orig_audio[:min_len]
        rec_audio = rec_audio[:min_len]

        # Convert to numpy
        orig_np = orig_audio.numpy()
        rec_np = rec_audio.numpy()

        # === Metrics ===
        # SDR
        sdr, _, _, _ = bss_eval_sources(orig_np[None], rec_np[None])

        # STOI
        stoi_val = stoi(orig_np, rec_np, SAMPLE_RATE, extended=False)

        # PESQ (wide-band mode)
        pesq_val = pesq(SAMPLE_RATE, orig_np, rec_np, 'wb')
    except Exception as e:
        print(f"⚠️ Error processing {rec_file}: {e}")
        continue

    # Record scores only after all three metrics succeeded, so the lists stay aligned.
    sdr_scores.append(sdr[0])
    stoi_scores.append(stoi_val)
    pesq_scores.append(pesq_val)

# === AVERAGES ===
if sdr_scores:
    # Report the number of files actually scored instead of a hard-coded count.
    print(f"\n📊 Evaluation Results on {len(sdr_scores)} Files:")
    print(f"✅ SDR  (Signal-to-Distortion Ratio): {np.mean(sdr_scores):.2f} dB")
    print(f"✅ STOI (Speech Intelligibility):     {np.mean(stoi_scores):.3f}")
    print(f"✅ PESQ (Perceptual Quality):        {np.mean(pesq_scores):.3f}")
else:
    print("\n⚠ No valid files processed for metric calculation.")


# Step 9: Test on real-world speech data


In [None]:
pip install yt-dlp


In [None]:
!yt-dlp -x --audio-format wav -o "yt_originalTalk.%(ext)s" https://www.youtube.com/watch?v=t2oOFs4WgI0


In [None]:
import os

# List all .wav files in the current directory
for f in os.listdir():
    if f.endswith(".wav"):
        print(f)


In [None]:
!apt-get install -y ffmpeg  # If not already installed

# Compress to 6kbps opus
!ffmpeg -i yt_originalTalk.wav -c:a libopus -b:a 6k yt_compressedTalk.opus


In [None]:
!ffmpeg -i yt_compressedTalk.opus -ac 1 -ar 16000 yt_compressedTalk_16k_mono.wav


In [None]:
import torchaudio
import torch
from torchaudio.transforms import Resample
import soundfile as sf
from demucs.hdemucs import HDemucs

# Load full audio (no trimming or padding — the whole file goes through the model in one pass)
waveform, sr = torchaudio.load("yt_compressedTalk_16k_mono.wav")
if sr != 16000:
    waveform = Resample(sr, 16000)(waveform)

# Convert to mono if not already: average over the first (channel) axis
waveform = waveform.mean(dim=0)

# Convert mono to stereo and add a batch axis → (1, 2, T); placed on GPU when available
stereo_wave = torch.stack([waveform, waveform], dim=0).unsqueeze(0).to("cuda" if torch.cuda.is_available() else "cpu")

# Load model: rebuild the architecture and restore the saved weights on the same device as the input
model = HDemucs(sources=["speech"]).to(stereo_wave.device)
model.load_state_dict(torch.load("/kaggle/input/demucs_epoch_8/pytorch/default/1/demucs_epoch_8.pth", map_location=stereo_wave.device))
model.eval()

# Run inference
# NOTE(review): the other cells feed the model 30-second clips; passing an entire
# talk at once may need a lot of memory — confirm this fits on the target device.
with torch.no_grad():
    output = model(stereo_wave)
    output = output.squeeze()  # drops every size-1 axis; presumably leaves (2, T) — TODO confirm

# Save output: permute (2, T) → (T, 2), the (frames, channels) layout written by soundfile
sf.write("yt_reconstructed_full.wav", output.permute(1, 0).cpu().numpy(), 16000)
print("✅ Reconstructed audio saved as yt_reconstructed_full.wav")


In [None]:
from IPython.display import Audio

print("Original (compressed):")
display(Audio("yt_compressedTalk_16k_mono.wav"))

print("Reconstructed:")
display(Audio("yt_reconstructed_full.wav"))
