# Step 0: Import Dataset and check the files in each folder

In [None]:
import os

DATASETS_PATH = "/kaggle/input"

print("📂 Available datasets in /kaggle/input:")
print(os.listdir(DATASETS_PATH))


In [None]:
DATA_PATH = "/kaggle/input/musan-data/musan"  # Adjust if needed

if os.path.exists(DATA_PATH):
    print("✅ Dataset found! Listing contents:")
    print(os.listdir(DATA_PATH))
else:
    print("❌ Dataset not found! Double-check the path.")


In [None]:
for category in ["music", "noise", "speech"]:
    folder_path = os.path.join(DATA_PATH, category)
    if os.path.exists(folder_path):
        files = os.listdir(folder_path)[:5]  # Show first 5 files
        print(f"\n📂 {category} - {len(os.listdir(folder_path))} files")
        print(files)
    else:
        print(f"⚠️ Folder '{category}' not found!")


In [None]:
for category in ["music", "noise", "speech"]:
    folder_path = os.path.join(DATA_PATH, category)

    for subfolder in os.listdir(folder_path):
        subfolder_path = os.path.join(folder_path, subfolder)
        
        if os.path.isdir(subfolder_path):  # Ensure it's a directory
            audio_files = [f for f in os.listdir(subfolder_path) if f.endswith(('.wav', '.mp3'))][:5]
            print(f"\n📂 {category}/{subfolder} - {len(os.listdir(subfolder_path))} files")
            print(audio_files)


# Step 1: Preprocessing: Checking all files to know its sample rates and channel

In [None]:
import os
import librosa

DATA_PATH = "/kaggle/input/musan-data/musan/"  # Update this with your actual dataset path

# Function to check sample rate & channels
def check_audio_properties(root_folder):
    sample_rates = {}
    channel_counts = {}

    for root, _, files in os.walk(root_folder):
        for file in files:
            if file.endswith(".wav"):  # Only process WAV files
                file_path = os.path.join(root, file)
                
                try:
                    audio, sr = librosa.load(file_path, sr=None, mono=False)  # Load in original format
                    channels = 1 if len(audio.shape) == 1 else 2  # Determine mono or stereo

                    # Store results
                    if sr not in sample_rates:
                        sample_rates[sr] = 0
                    sample_rates[sr] += 1

                    if channels not in channel_counts:
                        channel_counts[channels] = 0
                    channel_counts[channels] += 1

                except Exception as e:
                    print(f"Error processing {file_path}: {e}")

    return sample_rates, channel_counts

# Run check
sample_rates, channel_counts = check_audio_properties(DATA_PATH)

# Print results
print("🎵 Sample Rate Distribution:", sample_rates)
print("🎧 Channel Distribution (1=Mono, 2=Stereo):", channel_counts)


In [None]:
import os
import subprocess
import json
from collections import defaultdict

# Define dataset path
DATA_PATH = "/kaggle/input/musan-data/musan"
CATEGORIES = ["music", "noise", "speech"]
file_count = defaultdict(int)
MAX_FILES = 5  # Limit to 5 files per category

# Function to extract bit rate
def get_bit_rate(file_path):
    cmd = f"ffprobe -i '{file_path}' -show_entries stream=bit_rate -of json -v quiet"
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    
    try:
        data = json.loads(result.stdout)  # Convert JSON string to dictionary
        return data["streams"][0]["bit_rate"] if "streams" in data and data["streams"] else "N/A"
    except (json.JSONDecodeError, KeyError, IndexError):
        return "N/A"

# Process only 5 files from each category
for root, _, files in os.walk(DATA_PATH):
    for file in files:
        if file.endswith(".wav"):
            file_path = os.path.join(root, file)

            # Identify category from folder structure
            for category in CATEGORIES:
                if f"/{category}/" in file_path and file_count[category] < MAX_FILES:
                    bit_rate = get_bit_rate(file_path)
                    print(f"🔍 {category.upper()} | {file}: {bit_rate} bps")

                    file_count[category] += 1
                    break  # Move to next file

        if all(count >= MAX_FILES for count in file_count.values()):
            break

print("\n✅ Bit rate extraction complete for 5 files per category!")


## Since all files have the same sample rate of 16kHz and all are mono, we dont have to perform the explicit standardization

## Since all files show a high bit-rate of 256kbps, we compress to different low bit-rates


# Step 2: Compression at different bit rates for all files


In [None]:
!apt-get install -y ffmpeg


In [None]:
import os
dataset_path = "/kaggle/input/compressed-sample1"
if os.path.exists(dataset_path):
    print("✅ Dataset exists! Contents:")
    print(os.listdir(dataset_path))
else:
    print("❌ Dataset not found. Check the dataset name.")


In [None]:
import os
import subprocess
import shutil

# Paths
DATA_PATH = "/kaggle/input/musan-data/musan/music"
OUTPUT_PATH = "/kaggle/working/compressed_music_sample"

# Bitrates for music compression
BITRATES = {"16kbps": "16000", "32kbps": "32000", "64kbps": "64000"}

# Ensure output directory exists
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Get a subfolder inside 'music'
music_subfolders = [f.path for f in os.scandir(DATA_PATH) if f.is_dir()]
if not music_subfolders:
    print("❌ No subfolders found in music.")
else:
    music_folder = music_subfolders[0]  # Choose the first subfolder
    music_files = [f for f in os.listdir(music_folder) if f.endswith(".wav")][:5]  # Pick 5 files

    # Compress 5 files
    for file in music_files:
        input_file = os.path.join(music_folder, file)
        for bitrate_name, bitrate_value in BITRATES.items():
            output_dir = os.path.join(OUTPUT_PATH, f"compressed_{bitrate_name}")
            os.makedirs(output_dir, exist_ok=True)
            output_file = os.path.join(output_dir, file.replace(".wav", ".opus"))
            
            # Compress using ffmpeg
            cmd = f"ffmpeg -i '{input_file}' -c:a libopus -b:a {bitrate_value} '{output_file}' -y"
            subprocess.run(cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    print("✅ Compression complete! Files saved in:", OUTPUT_PATH)


In [None]:
import shutil

# Path to save the zip file
ZIP_PATH = "/kaggle/working/compressed_music_sample.zip"

# Create zip archive
shutil.make_archive(ZIP_PATH.replace(".zip", ""), 'zip', "/kaggle/working/compressed_music_sample")

print(f"✅ Zip file created: {ZIP_PATH}")


# Step 3.1: Compressing the speech folder applying 3 bit rates corresponding to speech data


>### Here the compressing code is shown for speech data, similarly compression is been carried out for music files (5 subfolders - fma, fma-western-art, hd-classical, jamendo, rfm) by taking the first 100 files in each subfolder and for noise data too (by taking first 100 files from the 2 subfolders - free-sound and sound-bible)

In [1]:
import os
import subprocess
import shutil

# Paths
DATA_PATH = "/kaggle/input/musan-data/musan/speech"
OUTPUT_PATH = "/kaggle/working/compressed_speech"

# Bitrates for speech compression
BITRATES = {
    "3kbps": "3000",
    "6kbps": "6000",
    "12kbps": "12000"
}

# Create output directory if not exists
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Function to compress a single file
def compress_audio(input_file, output_file, bitrate):
    cmd = f"ffmpeg -i '{input_file}' -c:a libopus -b:a {bitrate} '{output_file}' -y"
    subprocess.run(cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

# Resume check: Load last processed file if exists
resume_file = "/kaggle/working/speech_last_processed.txt"
last_processed = None
if os.path.exists(resume_file):
    with open(resume_file, "r") as f:
        last_processed = f.read().strip()

# Process files
processed_count = 0
resume_found = last_processed is None  # If no resume file, start immediately

for root, _, files in os.walk(DATA_PATH):
    # Only process files in 'librivox' folder
    if 'librivox' in root:
        for file in sorted(files):  # Sorting ensures consistent processing order
            if file.endswith(".wav"):
                input_file = os.path.join(root, file)
                relative_path = os.path.relpath(root, DATA_PATH)  # Maintain hierarchy
                
                if not resume_found:
                    if input_file == last_processed:
                        resume_found = True  # Resume processing from the next file
                    continue
                
                # Apply compression for each bitrate
                for bitrate_label, bitrate_value in BITRATES.items():
                    output_dir = os.path.join(OUTPUT_PATH, f"compressed_{bitrate_label}", relative_path)
                    os.makedirs(output_dir, exist_ok=True)  # Create directories if missing
                    
                    output_file = os.path.join(output_dir, file.replace(".wav", ".opus"))
                    compress_audio(input_file, output_file, bitrate_value)

                processed_count += 1
                
                # Save progress every 50 files
                if processed_count % 50 == 0:
                    with open(resume_file, "w") as f:
                        f.write(input_file)
                    
                    # Zip and upload files every 50 processed
                    zip_path = "/kaggle/working/compressed_speech.zip"
                    shutil.make_archive(zip_path.replace(".zip", ""), 'zip', OUTPUT_PATH)
                    print("✅ 50 files compressed & backup saved!")

print("✅ Speech compression complete!")

# Final dataset backup
final_zip_path = "/kaggle/working/compressed_speech_final.zip"
shutil.make_archive(final_zip_path.replace(".zip", ""), 'zip', OUTPUT_PATH)
print(f"✅ Final backup saved at: {final_zip_path}")


✅ 50 files compressed & backup saved!
✅ 50 files compressed & backup saved!
✅ 50 files compressed & backup saved!
✅ Speech compression complete!
✅ Final backup saved at: /kaggle/working/compressed_speech_final.zip


# Step 3.2: Compressing speech subfolder : us-gov 


In [2]:
import os
import subprocess
import shutil

# Paths
DATA_PATH = "/kaggle/input/musan-data/musan/speech/us-gov"
OUTPUT_PATH = "/kaggle/working/usgov_compressed_speech"

# Bitrates for speech compression
BITRATES = {
    "3kbps": "3000",
    "6kbps": "6000",
    "12kbps": "12000"
}

# Create output directory if not exists
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Function to compress a single file
def compress_audio(input_file, output_file, bitrate):
    cmd = f"ffmpeg -i '{input_file}' -c:a libopus -b:a {bitrate} '{output_file}' -y"
    subprocess.run(cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

# Resume check: Load last processed file if exists
resume_file = "/kaggle/working/usgov_last_processed.txt"
last_processed = None
if os.path.exists(resume_file):
    with open(resume_file, "r") as f:
        last_processed = f.read().strip()

# Process files
processed_count = 0
resume_found = last_processed is None  # If no resume file, start immediately

for root, _, files in os.walk(DATA_PATH):
    for file in sorted(files):  # Sorting ensures consistent processing order
        if file.endswith(".wav"):
            input_file = os.path.join(root, file)
            relative_path = os.path.relpath(root, DATA_PATH)  # Maintain hierarchy

            if not resume_found:
                if input_file == last_processed:
                    resume_found = True  # Resume processing from the next file
                continue

            # Apply compression for each bitrate
            for bitrate_label, bitrate_value in BITRATES.items():
                output_dir = os.path.join(OUTPUT_PATH, f"usgov_compressed_{bitrate_label}", relative_path)
                os.makedirs(output_dir, exist_ok=True)  # Create directories if missing

                output_file = os.path.join(output_dir, file.replace(".wav", ".opus"))
                compress_audio(input_file, output_file, bitrate_value)

            processed_count += 1

            # Stop after compressing 150 files
            if processed_count >= 150:
                break

            # Save progress every 50 files
            if processed_count % 50 == 0:
                with open(resume_file, "w") as f:
                    f.write(input_file)
                
                # Zip and upload files every 50 processed
                zip_path = "/kaggle/working/usgov_compressed_speech.zip"
                shutil.make_archive(zip_path.replace(".zip", ""), 'zip', OUTPUT_PATH)
                print("✅ 50 files compressed & backup saved!")

print("✅ US-GOV speech compression complete!")

# Final dataset backup
final_zip_path = "/kaggle/working/usgov_compressed_speech_final.zip"
shutil.make_archive(final_zip_path.replace(".zip", ""), 'zip', OUTPUT_PATH)
print(f"✅ Final backup saved at: {final_zip_path}")

✅ 50 files compressed & backup saved!
✅ 50 files compressed & backup saved!
✅ US-GOV speech compression complete!
✅ Final backup saved at: /kaggle/working/usgov_compressed_speech_final.zip


# Step 4: Analyzing the speech signal quality before and after compression using SNR


In [6]:
pip install pystoi


Collecting pystoi
  Downloading pystoi-0.4.1-py2.py3-none-any.whl.metadata (4.0 kB)
Downloading pystoi-0.4.1-py2.py3-none-any.whl (8.2 kB)
Installing collected packages: pystoi
Successfully installed pystoi-0.4.1
Note: you may need to restart the kernel to use updated packages.


In [21]:
#analyzing using SNR,SDR,STOI and PESQ for librivox data of 3kbps

import os
import librosa
import numpy as np
from pesq import pesq
from pystoi import stoi
from mir_eval.separation import bss_eval_sources

# === Constants ===
SAMPLE_RATE = 16000
DURATION = 30
TARGET_SAMPLES = SAMPLE_RATE * DURATION

# === Evaluation Functions ===
def fix_length(audio, target_len):
    if len(audio) > target_len:
        return audio[:target_len]
    return np.pad(audio, (0, target_len - len(audio)))

def snr(original, compressed):
    min_len = min(len(original), len(compressed))
    original = original[:min_len]
    compressed = compressed[:min_len]
    noise = original - compressed
    eps = 1e-10
    return 10 * np.log10(np.sum(original ** 2) / (np.sum(noise ** 2) + eps))

# === Paths ===
ORIGINAL_PATH = "/kaggle/input/musan-data/musan/speech/librivox"
COMPRESSED_PATH = "/kaggle/input/compressed-data/compressed_musan/speech/librivox/compressed_3kbps/librivox"

# === Metric Storage ===
snr_values, sdr_values, stoi_values, pesq_values = [], [], [], []

# === Evaluation Loop ===
for root, _, files in os.walk(ORIGINAL_PATH):
    for file in sorted(files):
        if file.endswith(".wav"):
            original_file = os.path.join(root, file)
            compressed_file = os.path.join(COMPRESSED_PATH, file.replace(".wav", ".opus"))

            try:
                orig_audio, _ = librosa.load(original_file, sr=SAMPLE_RATE)
                comp_audio, _ = librosa.load(compressed_file, sr=SAMPLE_RATE)

                orig_audio = fix_length(orig_audio, TARGET_SAMPLES)
                comp_audio = fix_length(comp_audio, TARGET_SAMPLES)

                # SNR
                snr_values.append(snr(orig_audio, comp_audio))

                # SDR
                sdr, _, _, _ = bss_eval_sources(orig_audio[None, :], comp_audio[None, :])
                sdr_values.append(sdr[0])

                # STOI
                stoi_values.append(stoi(orig_audio, comp_audio, SAMPLE_RATE, extended=False))

                # PESQ
                pesq_score = pesq(SAMPLE_RATE, orig_audio, comp_audio, 'wb')
                pesq_values.append(pesq_score)

                print(f"✅ {file}: SNR={snr_values[-1]:.2f}, SDR={sdr[0]:.2f}, STOI={stoi_values[-1]:.3f}, PESQ={pesq_score:.3f}")

            except Exception as e:
                print(f"⚠️ Error processing {file}: {e}")

# === Print Averages ===
if snr_values:
    print("\n📊 Compressed Audio Evaluation (vs Original):")
    print(f"✅ Avg SNR :  {np.mean(snr_values):.2f} dB")
    print(f"✅ Avg SDR :  {np.mean(sdr_values):.2f} dB")
    print(f"✅ Avg STOI:  {np.mean(stoi_values):.3f}")
    print(f"✅ Avg PESQ:  {np.mean(pesq_values):.3f}")
else:
    print("\n⚠ No valid files processed for metric calculation.")


	Deprecated as of mir_eval version 0.8.
	It will be removed in mir_eval version 0.9.
  sdr, _, _, _ = bss_eval_sources(orig_audio[None, :], comp_audio[None, :])


✅ speech-librivox-0000.wav: SNR=0.39, SDR=-3.40, STOI=0.747, PESQ=1.641
✅ speech-librivox-0001.wav: SNR=0.83, SDR=-4.23, STOI=0.717, PESQ=1.442
✅ speech-librivox-0002.wav: SNR=0.65, SDR=-3.31, STOI=0.803, PESQ=1.419
✅ speech-librivox-0003.wav: SNR=-1.02, SDR=-7.24, STOI=0.696, PESQ=1.357
✅ speech-librivox-0004.wav: SNR=0.75, SDR=-3.26, STOI=0.683, PESQ=1.369
✅ speech-librivox-0005.wav: SNR=-0.56, SDR=-3.79, STOI=0.588, PESQ=1.359
✅ speech-librivox-0006.wav: SNR=1.37, SDR=-2.36, STOI=0.688, PESQ=1.547
✅ speech-librivox-0007.wav: SNR=-1.07, SDR=-6.40, STOI=0.752, PESQ=1.332
✅ speech-librivox-0008.wav: SNR=1.78, SDR=0.18, STOI=0.781, PESQ=1.508
✅ speech-librivox-0009.wav: SNR=1.01, SDR=-2.09, STOI=0.731, PESQ=1.590
✅ speech-librivox-0010.wav: SNR=1.90, SDR=-0.51, STOI=0.790, PESQ=1.547
✅ speech-librivox-0011.wav: SNR=-4.17, SDR=-11.31, STOI=0.779, PESQ=1.757
✅ speech-librivox-0012.wav: SNR=0.61, SDR=-4.49, STOI=0.676, PESQ=1.344
✅ speech-librivox-0013.wav: SNR=-1.44, SDR=-7.45, STOI=0.699

In [13]:
#analyzing using SNR librivox data of 12kbps


import os
import librosa
import numpy as np

# Function to compute SNR
def snr(original, compressed):
    min_len = min(len(original), len(compressed))
    original = original[:min_len]
    compressed = compressed[:min_len]
    
    noise = original - compressed
    eps = 1e-10  # Small value to prevent division by zero
    
    return 10 * np.log10(np.sum(original ** 2) / (np.sum(noise ** 2) + eps))

# Paths
ORIGINAL_PATH = "/kaggle/input/musan-data/musan/speech/librivox"
COMPRESSED_PATH = "/kaggle/input/compressed-data/compressed_musan/speech/librivox/compressed_12kbps/librivox"

# List to store SNR values
snr_values = []

# Process all files for SNR evaluation
for root, _, files in os.walk(ORIGINAL_PATH):
    for file in sorted(files):
        if file.endswith(".wav"):
            original_file = os.path.join(root, file)
            compressed_file = os.path.join(COMPRESSED_PATH, file.replace(".wav", ".opus"))
            
            # Load original and compressed audio
            try:
                original_audio, _ = librosa.load(original_file, sr=16000)  # Standard sampling rate
                compressed_audio, _ = librosa.load(compressed_file, sr=16000)
                
                # Compute SNR
                snr_value = snr(original_audio, compressed_audio)
                snr_values.append(snr_value)
                
                print(f"SNR for {file}: {snr_value:.2f} dB")
            except Exception as e:
                print(f"Error processing {file}: {e}")

# Compute and print the average SNR
if snr_values:
    avg_snr = np.mean(snr_values)
    print(f"\n✅ Average SNR for compressed dataset: {avg_snr:.2f} dB")
else:
    print("\n⚠ No valid files processed for SNR calculation.")


SNR for speech-librivox-0000.wav: 10.78 dB
SNR for speech-librivox-0001.wav: 13.62 dB
SNR for speech-librivox-0002.wav: 9.91 dB
SNR for speech-librivox-0003.wav: 7.61 dB
SNR for speech-librivox-0004.wav: 13.27 dB
SNR for speech-librivox-0005.wav: 7.68 dB
SNR for speech-librivox-0006.wav: 14.23 dB
SNR for speech-librivox-0007.wav: 9.42 dB
SNR for speech-librivox-0008.wav: 11.65 dB
SNR for speech-librivox-0009.wav: 13.85 dB
SNR for speech-librivox-0010.wav: 13.30 dB
SNR for speech-librivox-0011.wav: 4.72 dB
SNR for speech-librivox-0012.wav: 12.18 dB
SNR for speech-librivox-0013.wav: 8.22 dB
SNR for speech-librivox-0014.wav: 12.56 dB
SNR for speech-librivox-0015.wav: 10.46 dB
SNR for speech-librivox-0016.wav: 14.19 dB
SNR for speech-librivox-0017.wav: 12.34 dB
SNR for speech-librivox-0018.wav: 13.17 dB
SNR for speech-librivox-0019.wav: 15.15 dB
SNR for speech-librivox-0020.wav: 11.77 dB
SNR for speech-librivox-0021.wav: 20.24 dB
SNR for speech-librivox-0022.wav: 15.34 dB
SNR for speech-li

In [None]:
#analyzing using SNR,SDR,STOI and PESQ for librivox data of 12kbps

import os
import librosa
import numpy as np
from pesq import pesq
from pystoi import stoi
from mir_eval.separation import bss_eval_sources

# === Constants ===
SAMPLE_RATE = 16000
DURATION = 30
TARGET_SAMPLES = SAMPLE_RATE * DURATION

# === Evaluation Functions ===
def fix_length(audio, target_len):
    if len(audio) > target_len:
        return audio[:target_len]
    return np.pad(audio, (0, target_len - len(audio)))

def snr(original, compressed):
    min_len = min(len(original), len(compressed))
    original = original[:min_len]
    compressed = compressed[:min_len]
    noise = original - compressed
    eps = 1e-10
    return 10 * np.log10(np.sum(original ** 2) / (np.sum(noise ** 2) + eps))

# === Paths ===
ORIGINAL_PATH = "/kaggle/input/musan-data/musan/speech/librivox"
COMPRESSED_PATH = "/kaggle/input/compressed-data/compressed_musan/speech/librivox/compressed_12kbps/librivox"

# === Metric Storage ===
snr_values, sdr_values, stoi_values, pesq_values = [], [], [], []

# === Evaluation Loop ===
for root, _, files in os.walk(ORIGINAL_PATH):
    for file in sorted(files):
        if file.endswith(".wav"):
            original_file = os.path.join(root, file)
            compressed_file = os.path.join(COMPRESSED_PATH, file.replace(".wav", ".opus"))

            try:
                orig_audio, _ = librosa.load(original_file, sr=SAMPLE_RATE)
                comp_audio, _ = librosa.load(compressed_file, sr=SAMPLE_RATE)

                orig_audio = fix_length(orig_audio, TARGET_SAMPLES)
                comp_audio = fix_length(comp_audio, TARGET_SAMPLES)

                # SNR
                snr_values.append(snr(orig_audio, comp_audio))

                # SDR
                sdr, _, _, _ = bss_eval_sources(orig_audio[None, :], comp_audio[None, :])
                sdr_values.append(sdr[0])

                # STOI
                stoi_values.append(stoi(orig_audio, comp_audio, SAMPLE_RATE, extended=False))

                # PESQ
                pesq_score = pesq(SAMPLE_RATE, orig_audio, comp_audio, 'wb')
                pesq_values.append(pesq_score)

                print(f"✅ {file}: SNR={snr_values[-1]:.2f}, SDR={sdr[0]:.2f}, STOI={stoi_values[-1]:.3f}, PESQ={pesq_score:.3f}")

            except Exception as e:
                print(f"⚠️ Error processing {file}: {e}")

# === Print Averages ===
if snr_values:
    print("\n📊 Compressed Audio Evaluation (vs Original):")
    print(f"✅ Avg SNR :  {np.mean(snr_values):.2f} dB")
    print(f"✅ Avg SDR :  {np.mean(sdr_values):.2f} dB")
    print(f"✅ Avg STOI:  {np.mean(stoi_values):.3f}")
    print(f"✅ Avg PESQ:  {np.mean(pesq_values):.3f}")
else:
    print("\n⚠ No valid files processed for metric calculation.")


	Deprecated as of mir_eval version 0.8.
	It will be removed in mir_eval version 0.9.
  sdr, _, _, _ = bss_eval_sources(orig_audio[None, :], comp_audio[None, :])


✅ speech-librivox-0000.wav: SNR=10.79, SDR=11.22, STOI=0.978, PESQ=4.174
✅ speech-librivox-0001.wav: SNR=13.07, SDR=13.49, STOI=0.974, PESQ=4.003
✅ speech-librivox-0002.wav: SNR=9.75, SDR=10.12, STOI=0.975, PESQ=3.720
✅ speech-librivox-0003.wav: SNR=6.81, SDR=7.15, STOI=0.972, PESQ=4.014
✅ speech-librivox-0004.wav: SNR=13.96, SDR=14.50, STOI=0.966, PESQ=3.896
✅ speech-librivox-0005.wav: SNR=9.81, SDR=10.32, STOI=0.835, PESQ=3.767
✅ speech-librivox-0006.wav: SNR=15.46, SDR=16.11, STOI=0.968, PESQ=4.107
✅ speech-librivox-0007.wav: SNR=8.84, SDR=8.94, STOI=0.980, PESQ=3.665
✅ speech-librivox-0008.wav: SNR=10.52, SDR=11.37, STOI=0.977, PESQ=3.751
✅ speech-librivox-0009.wav: SNR=13.14, SDR=13.64, STOI=0.957, PESQ=3.867
✅ speech-librivox-0010.wav: SNR=14.02, SDR=14.40, STOI=0.980, PESQ=3.848
✅ speech-librivox-0011.wav: SNR=6.19, SDR=5.63, STOI=0.963, PESQ=3.874
✅ speech-librivox-0012.wav: SNR=11.78, SDR=12.35, STOI=0.971, PESQ=3.941
✅ speech-librivox-0013.wav: SNR=9.54, SDR=9.79, STOI=0.970,

In [1]:
#analyzing using SNR for us-gov data of 12kbps


import os
import librosa
import numpy as np

# Function to compute SNR
def snr(original, compressed):
    min_len = min(len(original), len(compressed))
    original = original[:min_len]
    compressed = compressed[:min_len]
    
    noise = original - compressed
    eps = 1e-10  # Small value to prevent division by zero
    
    return 10 * np.log10(np.sum(original ** 2) / (np.sum(noise ** 2) + eps))

# Paths
ORIGINAL_PATH = "/kaggle/input/musan-data/musan/speech/us-gov"
COMPRESSED_PATH = "/kaggle/input/compressed-data/compressed_musan/speech/us-gov/usgov_compressed_12kbps"

# List to store SNR values
snr_values = []
file_count = 0
MAX_FILES = 150  # Limit to first 150 files

# Process files for SNR evaluation
for root, _, files in os.walk(ORIGINAL_PATH):
    for file in sorted(files):
        if file.endswith(".wav"):
            if file_count >= MAX_FILES:
                break  # Stop after processing 150 files
            
            original_file = os.path.join(root, file)
            compressed_file = os.path.join(COMPRESSED_PATH, file.replace(".wav", ".opus"))
            
            # Load original and compressed audio
            try:
                original_audio, _ = librosa.load(original_file, sr=16000)  # Standard sampling rate
                compressed_audio, _ = librosa.load(compressed_file, sr=16000)
                
                # Compute SNR
                snr_value = snr(original_audio, compressed_audio)
                snr_values.append(snr_value)
                
                print(f"SNR for {file}: {snr_value:.2f} dB")
                file_count += 1
            except Exception as e:
                print(f"Error processing {file}: {e}")

# Compute and print the average SNR
if snr_values:
    avg_snr = np.mean(snr_values)
    print(f"\n✅ Average SNR for compressed dataset (first {MAX_FILES} files): {avg_snr:.2f} dB")
else:
    print("\n⚠ No valid files processed for SNR calculation.")


SNR for speech-us-gov-0000.wav: 15.61 dB
SNR for speech-us-gov-0001.wav: 15.80 dB
SNR for speech-us-gov-0002.wav: 13.65 dB
SNR for speech-us-gov-0003.wav: 13.93 dB
SNR for speech-us-gov-0004.wav: 12.47 dB
SNR for speech-us-gov-0005.wav: 9.54 dB
SNR for speech-us-gov-0006.wav: 10.81 dB
SNR for speech-us-gov-0007.wav: 9.85 dB
SNR for speech-us-gov-0008.wav: 10.54 dB
SNR for speech-us-gov-0009.wav: 11.35 dB
SNR for speech-us-gov-0010.wav: 11.34 dB
SNR for speech-us-gov-0011.wav: 10.44 dB
SNR for speech-us-gov-0012.wav: 10.65 dB
SNR for speech-us-gov-0013.wav: 9.88 dB
SNR for speech-us-gov-0014.wav: 9.36 dB
SNR for speech-us-gov-0015.wav: 12.77 dB
SNR for speech-us-gov-0016.wav: 13.95 dB
SNR for speech-us-gov-0017.wav: 12.58 dB
SNR for speech-us-gov-0018.wav: 12.63 dB
SNR for speech-us-gov-0019.wav: 12.85 dB
SNR for speech-us-gov-0020.wav: 15.87 dB
SNR for speech-us-gov-0021.wav: 12.67 dB
SNR for speech-us-gov-0022.wav: 12.40 dB
SNR for speech-us-gov-0023.wav: 12.50 dB
SNR for speech-us-go

In [None]:
#analyzing using SNR,SDR,STOI and PESQ for us-gov data of 12kbps


import os
import librosa
import numpy as np
from pesq import pesq
from pystoi import stoi
from mir_eval.separation import bss_eval_sources

# === Constants ===
SAMPLE_RATE = 16000
DURATION = 30
TARGET_SAMPLES = SAMPLE_RATE * DURATION

# === Evaluation Functions ===
def fix_length(audio, target_len):
    if len(audio) > target_len:
        return audio[:target_len]
    return np.pad(audio, (0, target_len - len(audio)))

def snr(original, compressed):
    min_len = min(len(original), len(compressed))
    original = original[:min_len]
    compressed = compressed[:min_len]
    noise = original - compressed
    eps = 1e-10
    return 10 * np.log10(np.sum(original ** 2) / (np.sum(noise ** 2) + eps))

# === Paths ===
ORIGINAL_PATH = "/kaggle/input/musan-data/musan/speech/us-gov"
COMPRESSED_PATH = "/kaggle/input/compressed-data/compressed_musan/speech/us-gov/usgov_compressed_12kbps"

# === Metric Storage ===
snr_values, sdr_values, stoi_values, pesq_values = [], [], [], []

# === Evaluation Loop ===
for root, _, files in os.walk(ORIGINAL_PATH):
    for file in sorted(files):
        if file.endswith(".wav"):
            original_file = os.path.join(root, file)
            compressed_file = os.path.join(COMPRESSED_PATH, file.replace(".wav", ".opus"))

            try:
                orig_audio, _ = librosa.load(original_file, sr=SAMPLE_RATE)
                comp_audio, _ = librosa.load(compressed_file, sr=SAMPLE_RATE)

                orig_audio = fix_length(orig_audio, TARGET_SAMPLES)
                comp_audio = fix_length(comp_audio, TARGET_SAMPLES)

                # SNR
                snr_values.append(snr(orig_audio, comp_audio))

                # SDR
                sdr, _, _, _ = bss_eval_sources(orig_audio[None, :], comp_audio[None, :])
                sdr_values.append(sdr[0])

                # STOI
                stoi_values.append(stoi(orig_audio, comp_audio, SAMPLE_RATE, extended=False))

                # PESQ
                pesq_score = pesq(SAMPLE_RATE, orig_audio, comp_audio, 'wb')
                pesq_values.append(pesq_score)

                print(f"✅ {file}: SNR={snr_values[-1]:.2f}, SDR={sdr[0]:.2f}, STOI={stoi_values[-1]:.3f}, PESQ={pesq_score:.3f}")

            except Exception as e:
                print(f"⚠️ Error processing {file}: {e}")

# === Print Averages ===
if snr_values:
    print("\n📊 Compressed Audio Evaluation (vs Original):")
    print(f"✅ Avg SNR :  {np.mean(snr_values):.2f} dB")
    print(f"✅ Avg SDR :  {np.mean(sdr_values):.2f} dB")
    print(f"✅ Avg STOI:  {np.mean(stoi_values):.3f}")
    print(f"✅ Avg PESQ:  {np.mean(pesq_values):.3f}")
else:
    print("\n⚠ No valid files processed for metric calculation.")


In [2]:
#analyzing using SNR for us-gov data of 3kbps


import os
import librosa
import numpy as np

# Function to compute SNR
def snr(original, compressed):
    min_len = min(len(original), len(compressed))
    original = original[:min_len]
    compressed = compressed[:min_len]
    
    noise = original - compressed
    eps = 1e-10  # Small value to prevent division by zero
    
    return 10 * np.log10(np.sum(original ** 2) / (np.sum(noise ** 2) + eps))

# Paths
ORIGINAL_PATH = "/kaggle/input/musan-data/musan/speech/us-gov"
COMPRESSED_PATH = "/kaggle/input/compressed-data/compressed_musan/speech/us-gov/usgov_compressed_3kbps"

# List to store SNR values
snr_values = []
file_count = 0
MAX_FILES = 150  # Limit to first 150 files

# Process files for SNR evaluation
for root, _, files in os.walk(ORIGINAL_PATH):
    for file in sorted(files):
        if file.endswith(".wav"):
            if file_count >= MAX_FILES:
                break  # Stop after processing 150 files
            
            original_file = os.path.join(root, file)
            compressed_file = os.path.join(COMPRESSED_PATH, file.replace(".wav", ".opus"))
            
            # Load original and compressed audio
            try:
                original_audio, _ = librosa.load(original_file, sr=16000)  # Standard sampling rate
                compressed_audio, _ = librosa.load(compressed_file, sr=16000)
                
                # Compute SNR
                snr_value = snr(original_audio, compressed_audio)
                snr_values.append(snr_value)
                
                print(f"SNR for {file}: {snr_value:.2f} dB")
                file_count += 1
            except Exception as e:
                print(f"Error processing {file}: {e}")

# Compute and print the average SNR
if snr_values:
    avg_snr = np.mean(snr_values)
    print(f"\n✅ Average SNR for compressed dataset (first {MAX_FILES} files): {avg_snr:.2f} dB")
else:
    print("\n⚠ No valid files processed for SNR calculation.")


SNR for speech-us-gov-0000.wav: 1.62 dB
SNR for speech-us-gov-0001.wav: 1.46 dB
SNR for speech-us-gov-0002.wav: 1.07 dB
SNR for speech-us-gov-0003.wav: 0.40 dB
SNR for speech-us-gov-0004.wav: -0.61 dB
SNR for speech-us-gov-0005.wav: -2.82 dB
SNR for speech-us-gov-0006.wav: -0.72 dB
SNR for speech-us-gov-0007.wav: -1.63 dB
SNR for speech-us-gov-0008.wav: -1.24 dB
SNR for speech-us-gov-0009.wav: -0.21 dB
SNR for speech-us-gov-0010.wav: -0.89 dB
SNR for speech-us-gov-0011.wav: -1.35 dB
SNR for speech-us-gov-0012.wav: -1.08 dB
SNR for speech-us-gov-0013.wav: -1.43 dB
SNR for speech-us-gov-0014.wav: -1.86 dB
SNR for speech-us-gov-0015.wav: -0.19 dB
SNR for speech-us-gov-0016.wav: 0.69 dB
SNR for speech-us-gov-0017.wav: -0.03 dB
SNR for speech-us-gov-0018.wav: -0.38 dB
SNR for speech-us-gov-0019.wav: -0.38 dB
SNR for speech-us-gov-0020.wav: 0.91 dB
SNR for speech-us-gov-0021.wav: -0.56 dB
SNR for speech-us-gov-0022.wav: -0.89 dB
SNR for speech-us-gov-0023.wav: -0.34 dB
SNR for speech-us-gov-

In [5]:
#analyzing using SNR,SDR,STOI and PESQ for us-gov data of 3kbps


import os
import librosa
import numpy as np
from pesq import pesq
from pystoi import stoi
from mir_eval.separation import bss_eval_sources

# === Constants ===
SAMPLE_RATE = 16000
DURATION = 30
TARGET_SAMPLES = SAMPLE_RATE * DURATION

# === Evaluation Functions ===
def fix_length(audio, target_len):
    if len(audio) > target_len:
        return audio[:target_len]
    return np.pad(audio, (0, target_len - len(audio)))

def snr(original, compressed):
    min_len = min(len(original), len(compressed))
    original = original[:min_len]
    compressed = compressed[:min_len]
    noise = original - compressed
    eps = 1e-10
    return 10 * np.log10(np.sum(original ** 2) / (np.sum(noise ** 2) + eps))

# === Paths ===
ORIGINAL_PATH = "/kaggle/input/musan-data/musan/speech/us-gov"
COMPRESSED_PATH = "/kaggle/input/compressed-data/compressed_musan/speech/us-gov/usgov_compressed_3kbps"

# === Metric Storage ===
snr_values, sdr_values, stoi_values, pesq_values = [], [], [], []

# === Evaluation Loop ===
for root, _, files in os.walk(ORIGINAL_PATH):
    for file in sorted(files):
        if file.endswith(".wav"):
            original_file = os.path.join(root, file)
            compressed_file = os.path.join(COMPRESSED_PATH, file.replace(".wav", ".opus"))

            try:
                orig_audio, _ = librosa.load(original_file, sr=SAMPLE_RATE)
                comp_audio, _ = librosa.load(compressed_file, sr=SAMPLE_RATE)

                orig_audio = fix_length(orig_audio, TARGET_SAMPLES)
                comp_audio = fix_length(comp_audio, TARGET_SAMPLES)

                # SNR
                snr_values.append(snr(orig_audio, comp_audio))

                # SDR
                sdr, _, _, _ = bss_eval_sources(orig_audio[None, :], comp_audio[None, :])
                sdr_values.append(sdr[0])

                # STOI
                stoi_values.append(stoi(orig_audio, comp_audio, SAMPLE_RATE, extended=False))

                # PESQ
                pesq_score = pesq(SAMPLE_RATE, orig_audio, comp_audio, 'wb')
                pesq_values.append(pesq_score)

                print(f"✅ {file}: SNR={snr_values[-1]:.2f}, SDR={sdr[0]:.2f}, STOI={stoi_values[-1]:.3f}, PESQ={pesq_score:.3f}")

            except Exception as e:
                print(f"⚠️ Error processing {file}: {e}")

# === Print Averages ===
if snr_values:
    print("\n📊 Compressed Audio Evaluation (vs Original):")
    print(f"✅ Avg SNR :  {np.mean(snr_values):.2f} dB")
    print(f"✅ Avg SDR :  {np.mean(sdr_values):.2f} dB")
    print(f"✅ Avg STOI:  {np.mean(stoi_values):.3f}")
    print(f"✅ Avg PESQ:  {np.mean(pesq_values):.3f}")
else:
    print("\n⚠ No valid files processed for metric calculation.")


	Deprecated as of mir_eval version 0.8.
	It will be removed in mir_eval version 0.9.
  sdr, _, _, _ = bss_eval_sources(orig_audio[None, :], comp_audio[None, :])


✅ speech-us-gov-0000.wav: SNR=1.38, SDR=-0.73, STOI=0.587, PESQ=1.534
✅ speech-us-gov-0001.wav: SNR=1.10, SDR=-1.52, STOI=0.554, PESQ=1.306
✅ speech-us-gov-0002.wav: SNR=1.03, SDR=-2.70, STOI=0.539, PESQ=1.465
✅ speech-us-gov-0003.wav: SNR=-0.33, SDR=-5.64, STOI=0.452, PESQ=1.299
✅ speech-us-gov-0004.wav: SNR=-0.29, SDR=-6.55, STOI=0.386, PESQ=1.210
✅ speech-us-gov-0005.wav: SNR=-2.82, SDR=-10.94, STOI=0.480, PESQ=1.291
✅ speech-us-gov-0006.wav: SNR=-1.30, SDR=-11.92, STOI=0.502, PESQ=1.286
✅ speech-us-gov-0007.wav: SNR=-1.94, SDR=-11.59, STOI=0.527, PESQ=1.252
✅ speech-us-gov-0008.wav: SNR=-1.24, SDR=-10.07, STOI=0.524, PESQ=1.301
✅ speech-us-gov-0009.wav: SNR=0.09, SDR=-7.19, STOI=0.581, PESQ=1.201
✅ speech-us-gov-0010.wav: SNR=-0.25, SDR=-7.15, STOI=0.572, PESQ=1.256
✅ speech-us-gov-0011.wav: SNR=-1.03, SDR=-9.26, STOI=0.593, PESQ=1.208
✅ speech-us-gov-0012.wav: SNR=-0.54, SDR=-9.11, STOI=0.582, PESQ=1.228
✅ speech-us-gov-0013.wav: SNR=-0.38, SDR=-8.26, STOI=0.542, PESQ=1.256
✅ spee

  comp_audio, _ = librosa.load(compressed_file, sr=SAMPLE_RATE)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


⚠️ Error processing speech-us-gov-0150.wav: [Errno 2] No such file or directory: '/kaggle/input/compressed-data/compressed_musan/speech/us-gov/usgov_compressed_3kbps/speech-us-gov-0150.opus'
⚠️ Error processing speech-us-gov-0151.wav: [Errno 2] No such file or directory: '/kaggle/input/compressed-data/compressed_musan/speech/us-gov/usgov_compressed_3kbps/speech-us-gov-0151.opus'
⚠️ Error processing speech-us-gov-0152.wav: [Errno 2] No such file or directory: '/kaggle/input/compressed-data/compressed_musan/speech/us-gov/usgov_compressed_3kbps/speech-us-gov-0152.opus'
⚠️ Error processing speech-us-gov-0153.wav: [Errno 2] No such file or directory: '/kaggle/input/compressed-data/compressed_musan/speech/us-gov/usgov_compressed_3kbps/speech-us-gov-0153.opus'
⚠️ Error processing speech-us-gov-0154.wav: [Errno 2] No such file or directory: '/kaggle/input/compressed-data/compressed_musan/speech/us-gov/usgov_compressed_3kbps/speech-us-gov-0154.opus'
⚠️ Error processing speech-us-gov-0155.wav: [

# Step 5: Aligning Original and compressed pairs for training the model

In [26]:
import os
import librosa
import torch
from torch.utils.data import Dataset, DataLoader

# Define paths
ORIGINAL_SPEECH_PATH = "/kaggle/input/musan-data/musan/speech"
COMPRESSED_SPEECH_PATH = "/kaggle/input/compressed-data/compressed_musan/speech"
BITRATES = ["3kbps", "6kbps", "12kbps"]
SUBSETS = ["librivox", "us-gov"]
US_GOV_COMPRESSED_LIMIT = 150  # Only first 150 files are compressed

class SpeechDataset(Dataset):
    def __init__(self, original_path, compressed_path, bitrates, subsets, sr=16000):
        self.original_path = original_path
        self.compressed_path = compressed_path
        self.bitrates = bitrates
        self.subsets = subsets
        self.sr = sr
        self.file_pairs = self._get_file_pairs()
        
        if len(self.file_pairs) == 0:
            raise ValueError("❌ No matching file pairs found! Check your paths.")

    def _get_file_pairs(self):
        file_pairs = []
        for subset in self.subsets:
            subset_path = os.path.join(self.original_path, subset)
            file_list = sorted([f for f in os.listdir(subset_path) if f.endswith(".wav")])

            if subset == "us-gov":
                file_list = file_list[:US_GOV_COMPRESSED_LIMIT]  # Limit to 150 files

            for file in file_list:
                original_file = os.path.join(subset_path, file)

                for bitrate in self.bitrates:
                    if subset == "librivox":
                        compressed_file = os.path.join(
                            self.compressed_path, subset, f"compressed_{bitrate}", subset, file.replace(".wav", ".opus")
                        )
                    elif subset == "us-gov":
                        compressed_file = os.path.join(
                            self.compressed_path, subset, f"usgov_compressed_{bitrate}", file.replace(".wav", ".opus")
                        )

                    if os.path.exists(compressed_file):
                        file_pairs.append((original_file, compressed_file))
                    else:
                        print(f"❌ Missing file: {compressed_file}")  # Debugging output
                
        return file_pairs

    def __len__(self):
        return len(self.file_pairs)

    def __getitem__(self, idx):
        orig_file, comp_file = self.file_pairs[idx]
        orig_audio, _ = librosa.load(orig_file, sr=self.sr)
        comp_audio, _ = librosa.load(comp_file, sr=self.sr)

        # ✅ Ensure correct format (channels, samples) → (1, samples)
        orig_audio = torch.tensor(orig_audio).unsqueeze(0)  # (1, samples)
        comp_audio = torch.tensor(comp_audio).unsqueeze(0)  # (1, samples)

        return orig_audio, comp_audio

# Collate function to pad all audio to the longest sample in the batch
def collate_fn(batch):
    orig_batch, comp_batch = zip(*batch)
    
    max_len = max(max(x.shape[1] for x in orig_batch), max(x.shape[1] for x in comp_batch))
    
    def pad_audio(audio):
        return torch.nn.functional.pad(audio, (0, max_len - audio.shape[1]))

    orig_batch = torch.stack([pad_audio(x) for x in orig_batch])  # (batch, 1, samples)
    comp_batch = torch.stack([pad_audio(x) for x in comp_batch])  # (batch, 1, samples)
    
    return orig_batch, comp_batch

# ✅ Load dataset into DataLoader
dataset = SpeechDataset(ORIGINAL_SPEECH_PATH, COMPRESSED_SPEECH_PATH, BITRATES, SUBSETS)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

# ✅ Check dataset loading
for orig, comp in dataloader:
    print(f"✅ After Fix - Original Shape: {orig.shape}, Compressed Shape: {comp.shape}")
    break  # Only check the first batch


✅ After Fix - Original Shape: torch.Size([4, 1, 9599687]), Compressed Shape: torch.Size([4, 1, 9599687])


# Step 6 : Model Training using demucs

In [3]:
!pip install -q demucs


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.1/87.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.6/59.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m248.9/248.9 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.0/40.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
!pip install -q demucs
import os
import librosa
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from demucs.hdemucs import HDemucs  # Corrected import

# ✅ Ensure GPU Usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🔥 Using device: {device}")


🔥 Using device: cuda


In [12]:
# Paths
ORIGINAL_SPEECH_PATH = "/kaggle/input/musan-data/musan/speech"
COMPRESSED_SPEECH_PATH = "/kaggle/input/compressed-data/compressed_musan/speech"
BITRATES = ["3kbps", "6kbps", "12kbps"]
SUBSETS = ["librivox", "us-gov"]
SAMPLE_RATE = 16000
DURATION = 30  # 30 seconds
TARGET_SAMPLES = SAMPLE_RATE * DURATION

class SpeechDataset(Dataset):
    def __init__(self, original_path, compressed_path, bitrates, subsets, sr=SAMPLE_RATE):
        self.original_path = original_path
        self.compressed_path = compressed_path
        self.bitrates = bitrates
        self.subsets = subsets
        self.sr = sr
        self.file_pairs = self._get_file_pairs()
        
        if len(self.file_pairs) == 0:
            raise ValueError("❌ No matching file pairs found! Check your paths.")

    def _get_file_pairs(self):
        file_pairs = []
        for subset in self.subsets:
            subset_path = os.path.join(self.original_path, subset)
            file_list = sorted([f for f in os.listdir(subset_path) if f.endswith(".wav")])

            for file in file_list:
                original_file = os.path.join(subset_path, file)

                for bitrate in self.bitrates:
                    compressed_file = os.path.join(
                        self.compressed_path, subset, f"compressed_{bitrate}", subset, file.replace(".wav", ".opus")
                    )

                    if os.path.exists(compressed_file):
                        file_pairs.append((original_file, compressed_file))
                
        return file_pairs

    def _load_audio(self, path):
        audio, _ = librosa.load(path, sr=self.sr, mono=True)
        if len(audio) > TARGET_SAMPLES:
            audio = audio[:TARGET_SAMPLES]
        else:
            audio = F.pad(torch.tensor(audio), (0, TARGET_SAMPLES - len(audio)))
        return audio

    def __len__(self):
        return len(self.file_pairs)

    def __getitem__(self, idx):
        orig_file, comp_file = self.file_pairs[idx]
        orig_audio = self._load_audio(orig_file)
        comp_audio = self._load_audio(comp_file)
        return orig_audio, comp_audio


In [18]:
import torch

def collate_fn(batch):
    orig_batch, comp_batch = zip(*batch)  # Unpack pairs

    def stereo(audio):
        audio = torch.tensor(audio, dtype=torch.float32)  # ✅ Convert NumPy to Tensor
        return torch.stack([audio, audio], dim=0)  # Duplicate mono signal to stereo

    orig_batch = torch.stack([stereo(x) for x in orig_batch])  # (B, 2, T)
    comp_batch = torch.stack([stereo(x) for x in comp_batch])  # (B, 2, T)

    return orig_batch, comp_batch



In [19]:
dataset = SpeechDataset(ORIGINAL_SPEECH_PATH, COMPRESSED_SPEECH_PATH, BITRATES, SUBSETS)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

print(f"✅ Loaded {len(dataset)} audio pairs")


✅ Loaded 519 audio pairs


In [23]:
model = HDemucs(sources=["speech"]).to(device)  # Load pretrained Demucs
optimizer = optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.MSELoss()


In [26]:
EPOCHS = 10  # Adjust as needed

for epoch in range(EPOCHS):
    total_loss = 0
    model.train()
    
    for orig_wave, comp_wave in dataloader:
        orig_wave, comp_wave = orig_wave.to(device), comp_wave.to(device)

        optimizer.zero_grad()
        enhanced_wave = model(comp_wave)  # Demucs predicts clean audio

        # 🔧 Fix: Match shape with target
        if enhanced_wave.ndim == 4:
            enhanced_wave = enhanced_wave.squeeze(1)  # From (B,1,2,T) → (B,2,T)

        # 🧪 Optional check
        assert enhanced_wave.shape == orig_wave.shape, \
            f"Shape mismatch: got {enhanced_wave.shape}, expected {orig_wave.shape}"

        loss = loss_fn(enhanced_wave, orig_wave)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"📢 Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss / len(dataloader):.6f}")


📢 Epoch 1/10, Loss: 0.002582
📢 Epoch 2/10, Loss: 0.002220
📢 Epoch 3/10, Loss: 0.002198
📢 Epoch 4/10, Loss: 0.002170
📢 Epoch 5/10, Loss: 0.002157
📢 Epoch 6/10, Loss: 0.002139
📢 Epoch 7/10, Loss: 0.002085
📢 Epoch 8/10, Loss: 0.002064


KeyboardInterrupt: 

> ## Training was stopped forcefully since it was taking longer than expected to complete 10 epochs  

In [27]:

torch.save(model.state_dict(), "demucs_epoch_8.pth")
print("✅ Model saved successfully!")


✅ Model saved successfully!


# Step 7: Reconstructing using the model on the unseen data

In [1]:
#compressing 150th - 180th file in us-gov folder which was not used for training earlier

import os
from tqdm import tqdm

def compress_files(input_folder, output_folder, bitrate_kbps=6):
    os.makedirs(output_folder, exist_ok=True)
    files = sorted([f for f in os.listdir(input_folder) if f.endswith(".wav")])[150:180]  # Take only 30

    for file in tqdm(files, desc=f"Compressing to {bitrate_kbps}kbps"):
        input_path = os.path.join(input_folder, file)
        output_path = os.path.join(output_folder, file.replace(".wav", ".opus"))

        os.system(f"ffmpeg -y -loglevel error -i '{input_path}' -c:a libopus -b:a {bitrate_kbps}k '{output_path}'")

    return [os.path.join(output_folder, f.replace(".wav", ".opus")) for f in files]

# Define paths
ORIG_UNSEEN_PATH = "/kaggle/input/musan-data/musan/speech/us-gov"
COMPRESSED_TEST_PATH = "./test_compressed/us-gov"

# Run compression
compressed_files = compress_files(ORIG_UNSEEN_PATH, COMPRESSED_TEST_PATH, bitrate_kbps=6)


Compressing to 6kbps: 100%|██████████| 30/30 [03:02<00:00,  6.08s/it]


In [13]:
#reconstructing using demucs model

import os
import torch
import torchaudio
import torch.nn.functional as F
from torchaudio.transforms import Resample
from tqdm import tqdm
from demucs.hdemucs import HDemucs
import soundfile as sf

# === CONSTANTS ===
SAMPLE_RATE = 16000
DURATION = 30
TARGET_SAMPLES = SAMPLE_RATE * DURATION
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === PATHS ===
COMPRESSED_FOLDER = "/kaggle/working/test_compressed/us-gov"
SAVE_RECONSTRUCTED = "/kaggle/working/reconstructed_usgov_test"
os.makedirs(SAVE_RECONSTRUCTED, exist_ok=True)

# === LOAD MODEL ===
model = HDemucs(sources=["speech"]).to(DEVICE)
model.load_state_dict(torch.load("/kaggle/input/demucs_epoch_8/pytorch/default/1/demucs_epoch_8.pth", map_location=DEVICE))
model.eval()
print("✅ Model loaded.")

# === HELPER FUNCTION ===
def prepare_input(file_path):
    waveform, sr = torchaudio.load(file_path)  # (1, T) for mono
    if sr != SAMPLE_RATE:
        resampler = Resample(sr, SAMPLE_RATE)
        waveform = resampler(waveform)

    waveform = waveform.mean(dim=0)  # Ensure mono
    if waveform.shape[0] > TARGET_SAMPLES:
        waveform = waveform[:TARGET_SAMPLES]
    else:
        waveform = F.pad(waveform, (0, TARGET_SAMPLES - waveform.shape[0]))

    # Convert to stereo (2, T)
    stereo_wave = torch.stack([waveform, waveform], dim=0)
    return stereo_wave.unsqueeze(0).to(DEVICE)  # (1, 2, T)

# === SELECT FILES ===
compressed_files = sorted([
    os.path.join(COMPRESSED_FOLDER, f)
    for f in os.listdir(COMPRESSED_FOLDER)
    if f.endswith(".opus")
])[:30]  # First 30

# === RECONSTRUCT LOOP ===
for comp_path in tqdm(compressed_files, desc="🔊 Reconstructing"):
    input_tensor = prepare_input(comp_path)

    with torch.no_grad():
        output = model(input_tensor)

        # Handle (B,1,2,T) if needed
        if output.ndim == 4:
            output = output.squeeze(1)  # (B, 2, T)

        output = output.squeeze(0).cpu()  # (2, T)

    # Save as (T, 2)
    out_path = os.path.join(
        SAVE_RECONSTRUCTED,
        os.path.basename(comp_path).replace(".opus", "_reconstructed.wav")
    )
    sf.write(out_path, output.permute(1, 0).numpy(), SAMPLE_RATE)

print("✅ Done reconstructing and saving 30 test files.")


  model.load_state_dict(torch.load("/kaggle/input/demucs_epoch_8/pytorch/default/1/demucs_epoch_8.pth", map_location=DEVICE))


✅ Model loaded.


🔊 Reconstructing: 100%|██████████| 30/30 [00:56<00:00,  1.88s/it]

✅ Done reconstructing and saving 30 test files.





In [15]:
#zipping the compressed data (prepared for testing ) and reconstructed
#samples from the working directory

import zipfile
import os

def zip_folder(folder_path, zip_path):
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(folder_path):
            for file in files:
                full_path = os.path.join(root, file)
                arcname = os.path.relpath(full_path, start=folder_path)
                zipf.write(full_path, arcname=arcname)

# Paths
reconstructed_folder = "/kaggle/working/reconstructed_usgov_test"
compressed_folder = "/kaggle/working/test_compressed"

# Output ZIP paths
reconstructed_zip = "/kaggle/working/reconstructed_usgov_test.zip"
compressed_zip = "/kaggle/working/test_compressed.zip"

# Zip both folders
zip_folder(reconstructed_folder, reconstructed_zip)
zip_folder(compressed_folder, compressed_zip)

print("✅ Both folders zipped and saved:")
print(f"📁 {reconstructed_zip}")
print(f"📁 {compressed_zip}")


✅ Both folders zipped and saved:
📁 /kaggle/working/reconstructed_usgov_test.zip
📁 /kaggle/working/test_compressed.zip


In [2]:
!pip install pystoi
!pip install pesq


Collecting pystoi
  Downloading pystoi-0.4.1-py2.py3-none-any.whl.metadata (4.0 kB)
Downloading pystoi-0.4.1-py2.py3-none-any.whl (8.2 kB)
Installing collected packages: pystoi
Successfully installed pystoi-0.4.1
Collecting pesq
  Downloading pesq-0.0.4.tar.gz (38 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pesq
  Building wheel for pesq (setup.py) ... [?25l[?25hdone
  Created wheel for pesq: filename=pesq-0.0.4-cp310-cp310-linux_x86_64.whl size=262945 sha256=f76cd0c1fcdd2e2a3a9dede1f3415f4e883e57fe93a79f5a89f136a7496e5e33
  Stored in directory: /root/.cache/pip/wheels/c5/4e/2c/251524370c0fdd659e99639a0fbd0ca5a782c3aafcd456b28d
Successfully built pesq
Installing collected packages: pesq
Successfully installed pesq-0.0.4


In [4]:
!pip install mir_eval


Collecting mir_eval
  Downloading mir_eval-0.8.2-py3-none-any.whl.metadata (3.0 kB)
Downloading mir_eval-0.8.2-py3-none-any.whl (102 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.8/102.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mir_eval
Successfully installed mir_eval-0.8.2


# Step 8: Evaluating the model on STOI, SDR and PESQ

In [19]:
import os
import torch
import torchaudio
from torchaudio.transforms import Resample
from tqdm import tqdm
from pesq import pesq
from pystoi import stoi
from mir_eval.separation import bss_eval_sources
import numpy as np

# === CONFIG ===
SAMPLE_RATE = 16000
ORIGINAL_PATH = "/kaggle/input/musan-data/musan/speech/us-gov"
RECONSTRUCTED_PATH = "/kaggle/working/reconstructed_usgov_test"

# === EVALUATION METRICS ===
sdr_scores = []
stoi_scores = []
pesq_scores = []

# === FILE LIST ===
reconstructed_files = sorted([
    f for f in os.listdir(RECONSTRUCTED_PATH) if f.endswith("_reconstructed.wav")
])

for rec_file in tqdm(reconstructed_files, desc="🎧 Evaluating"):
    # Match original
    original_name = rec_file.replace("_reconstructed.wav", ".wav")
    orig_path = os.path.join(ORIGINAL_PATH, original_name)
    rec_path = os.path.join(RECONSTRUCTED_PATH, rec_file)

    if not os.path.exists(orig_path):
        print(f"❌ Missing original: {original_name}")
        continue

    # Load both files
    orig_audio, sr1 = torchaudio.load(orig_path)
    rec_audio, sr2 = torchaudio.load(rec_path)

    # Convert to mono
    orig_audio = orig_audio.mean(dim=0)
    rec_audio = rec_audio.mean(dim=0)

    # Resample if needed
    if sr1 != SAMPLE_RATE:
        orig_audio = Resample(sr1, SAMPLE_RATE)(orig_audio)
    if sr2 != SAMPLE_RATE:
        rec_audio = Resample(sr2, SAMPLE_RATE)(rec_audio)

    # Truncate/pad to same length
    min_len = min(orig_audio.shape[-1], rec_audio.shape[-1])
    orig_audio = orig_audio[:min_len]
    rec_audio = rec_audio[:min_len]

    # Convert to numpy
    orig_np = orig_audio.numpy()
    rec_np = rec_audio.numpy()

    # === Metrics ===
    # SDR
    sdr, _, _, _ = bss_eval_sources(orig_np[None], rec_np[None])
    sdr_scores.append(sdr[0])

    # STOI
    stoi_val = stoi(orig_np, rec_np, SAMPLE_RATE, extended=False)
    stoi_scores.append(stoi_val)

    # PESQ
    pesq_val = pesq(SAMPLE_RATE, orig_np, rec_np, 'wb')
    pesq_scores.append(pesq_val)

# === AVERAGES ===
print("\n📊 Evaluation Results on 30 Files:")
print(f"✅ SDR  (Signal-to-Distortion Ratio): {np.mean(sdr_scores):.2f} dB")
print(f"✅ STOI (Speech Intelligibility):     {np.mean(stoi_scores):.3f}")
print(f"✅ PESQ (Perceptual Quality):        {np.mean(pesq_scores):.3f}")


	Deprecated as of mir_eval version 0.8.
	It will be removed in mir_eval version 0.9.
  sdr, _, _, _ = bss_eval_sources(orig_np[None], rec_np[None])
🎧 Evaluating: 100%|██████████| 30/30 [00:34<00:00,  1.16s/it]


📊 Evaluation Results on 30 Files:
✅ SDR  (Signal-to-Distortion Ratio): 6.97 dB
✅ STOI (Speech Intelligibility):     0.818
✅ PESQ (Perceptual Quality):        1.635



