In [311]:
!ls /kaggle/input/jan-2026-dl-gen-ai-project
!ls /kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup


messy_mashup
ESC-50-master  genres_stems  mashups  sample_submission.csv  test.csv


In [312]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
from tqdm import tqdm
from typing import Dict
import os
import glob
import random
import torch

import warnings
warnings.filterwarnings("ignore")

In [313]:
BASE_PATH = "/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup"

GENRES_PATH = os.path.join(BASE_PATH, "genres_stems")
MASHUPS_PATH = os.path.join(BASE_PATH, "mashups")
ESC_PATH = os.path.join(BASE_PATH, "ESC-50-master")
TEST_CSV = os.path.join(BASE_PATH, "test.csv")


In [314]:
#----------------------------- DON'T CHANGE THIS --------------------------
DATA_SEED = 67
TRAINING_SEED = 1234
SR = 22050
DURATION = 5.0
N_FFT = 2048
HOP_LENGTH = 512
N_MELS = 128
TOP_DB=20
TARGET_SNR_DB = 10

random.seed(DATA_SEED)
np.random.seed(DATA_SEED)
torch.manual_seed(DATA_SEED)
torch.cuda.manual_seed(DATA_SEED)

In [315]:
DATA_ROOT = "/kaggle/input/jan-2026-dl-gen-ai-project/messy_mashup/genres_stems"

GENRES = sorted(os.listdir(DATA_ROOT))

STEMS = ["drums.wav", "vocals.wav", "bass.wav", "other.wav"]

STEM_KEYS = ['drums', 'vocals', 'bass', 'other']


In [316]:
# Question - 1

# Constants
KB = 1024
MB = 1024 * 1024

CORRUPTED_THRESHOLD = 4 * KB
SMALL_FILE_THRESHOLD = 5.0491 * MB


def build_dataset(root_dir, val_split=0.17, seed=42):
    # Initialize empty dictionaries
    train_dataset = {g: {s.replace('.wav', ''): [] for s in STEMS} for g in GENRES}
    val_dataset   = {g: {s.replace('.wav', ''): [] for s in STEMS} for g in GENRES}

    rng = random.Random(seed)

    corrupted_count = 0
    small_file_count = 0

    for genre in GENRES:
        genre_path = os.path.join(root_dir, genre)

        # Check if genre folder exists
        if not os.path.exists(genre_path):
            continue

        valid_songs = []

        for song in os.listdir(genre_path):
            song_path = os.path.join(genre_path, song)

            if not os.path.isdir(song_path):
                continue

            stems_present = True
            stem_paths = []

            # Check completeness
            for stem in STEMS:
                stem_path = os.path.join(song_path, stem)

                if not os.path.exists(stem_path):
                    stems_present = False
                    break

                stem_paths.append(stem_path)

            if not stems_present:
                continue

            # Check corruption + size conditions
            for stem_path in stem_paths:
                size = os.path.getsize(stem_path)

                if size < CORRUPTED_THRESHOLD:
                    corrupted_count += 1

                if size < SMALL_FILE_THRESHOLD:
                    small_file_count += 1

            valid_songs.append(song)

        # Stratified shuffle split
        rng.shuffle(valid_songs)
        split_idx = int(len(valid_songs) * (1 - val_split))

        train_songs = valid_songs[:split_idx]
        val_songs   = valid_songs[split_idx:]

        # Helper function to populate dictionary
        def add_to_dict(target_dict, song_list):
            for song in song_list:
                song_path = os.path.join(genre_path, song)
                for stem in STEMS:
                    stem_key = stem.replace('.wav', '')
                    stem_path = os.path.join(song_path, stem)
                    target_dict[genre][stem_key].append(stem_path)

        add_to_dict(train_dataset, train_songs)
        add_to_dict(val_dataset, val_songs)

    total_required_answer = corrupted_count + small_file_count

    print("Corrupted (<4KB):", corrupted_count)
    print("Files < 5.0491MB:", small_file_count)
    print("Final Answer (Q1):", total_required_answer)

    return train_dataset, val_dataset


In [317]:
tr, val = build_dataset(DATA_ROOT)



Corrupted (<4KB): 0
Files < 5.0491MB: 1256
Final Answer (Q1): 1256


In [318]:
# Question 2

LOWER_THRESHOLD = 5.0491 * MB
UPPER_THRESHOLD = 5.0493 * MB

greater_than_upper = 0
less_than_lower = 0

for genre in GENRES:
    genre_path = os.path.join(DATA_ROOT, genre)

    if not os.path.exists(genre_path):
        continue

    for song in os.listdir(genre_path):
        song_path = os.path.join(genre_path, song)

        if not os.path.isdir(song_path):
            continue

        for stem in STEMS:
            stem_path = os.path.join(song_path, stem)

            if not os.path.exists(stem_path):
                continue

            size = os.path.getsize(stem_path)

            if size > UPPER_THRESHOLD:
                greater_than_upper += 1

            if size < LOWER_THRESHOLD:
                less_than_lower += 1


absolute_difference = abs(greater_than_upper - less_than_lower)

print("Sounds > 5.0493MB:", greater_than_upper)
print("Sounds < 5.0491MB:", less_than_lower)
print("Absolute Difference (Q2):", absolute_difference)


Sounds > 5.0493MB: 184
Sounds < 5.0491MB: 1256
Absolute Difference (Q2): 1072


In [319]:
# Question 3

train_reggae_drums = len(tr["reggae"]["drums"])
val_country_vocals = len(val["country"]["vocals"])

absolute_difference_q3 = abs(train_reggae_drums - val_country_vocals)

print("Training Reggae Drum Samples:", train_reggae_drums)
print("Validation Country Vocal Samples:", val_country_vocals)
print("Absolute Difference (Q3):", absolute_difference_q3)


Training Reggae Drum Samples: 83
Validation Country Vocal Samples: 17
Absolute Difference (Q3): 66


In [320]:
# Question 4
def find_long_silences(dataset_dict, sr=SR, threshold_sec=DURATION, top_db=TOP_DB):
    """
    Input:
        dataset_dict: {genre: {stem: [paths...]}}
    Output:
        df: DataFrame of files with silence >= threshold_sec
    """
    records = []
    total_files = 0

    for genre in dataset_dict:
        for stem_name in dataset_dict[genre]:
            for file_path in dataset_dict[genre][stem_name]:

                total_files += 1

                # ---- Load Audio ----
                y, sr = librosa.load(file_path, sr=sr)
                total_duration = librosa.get_duration(y=y, sr=sr)

                # ---- Find Non-Silent Intervals ----
                intervals = librosa.effects.split(y, top_db=top_db)

                silence_durations = []
                silence_type = []

                # ---- CASE A: Fully Silent ----
                if len(intervals) == 0:
                    max_silence = total_duration
                    silence_type.append("Full")
                else:
                    max_silence = 0

                    # ---- CASE B: Start Silence ----
                    if intervals[0][0] > 0:
                        start_silence = intervals[0][0] / sr
                        silence_durations.append(start_silence)
                        silence_type.append("Start")

                    # ---- CASE D: Middle Silence ----
                    for i in range(len(intervals) - 1):
                        end_current = intervals[i][1]
                        start_next = intervals[i+1][0]
                        gap = (start_next - end_current) / sr
                        silence_durations.append(gap)
                        silence_type.append("Middle")

                    # ---- CASE C: End Silence ----
                    if intervals[-1][1] < len(y):
                        end_silence = (len(y) - intervals[-1][1]) / sr
                        silence_durations.append(end_silence)
                        silence_type.append("End")

                    if silence_durations:
                        max_silence = max(silence_durations)
                    else:
                        max_silence = 0

                # ---- Store result ----
                if max_silence >= threshold_sec:
                    records.append({
                        "Genre": genre,
                        "Stem": stem_name,
                        "Duration": round(total_duration, 2),
                        "Max_Silence_Sec": round(max_silence, 2),
                        "Silence_Location": ", ".join(set(silence_type)),
                        "File_Path": file_path
                    })

    df = pd.DataFrame(records)

    print("Total Files Checked:", total_files)
    print("Files with Silence >= {} sec:".format(threshold_sec), len(df))

    return df


In [321]:
df_silence = find_long_silences(tr, threshold_sec=DURATION, top_db=TOP_DB)



Total Files Checked: 3320
Files with Silence >= 5.0 sec: 678


In [323]:
# Question 5
# Count only vocals from silence dataframe
total_vocal_long_silence = len(df_silence[df_silence["Stem"] == "vocals"])

print("Total number of vocal tracks with silence >= 5 seconds:",
      total_vocal_long_silence)




Total number of vocal tracks with silence >= 5 seconds: 315


In [324]:
# Question 6
# Filter only vocals
vocal_silence_df = df_silence[df_silence["Stem"] == "vocals"]

# Compute average silence length
average_silence_vocals = vocal_silence_df["Max_Silence_Sec"].mean()

print("Average Silence Length in Vocals (secs):",
      round(average_silence_vocals, 2))


Average Silence Length in Vocals (secs): 12.78


In [325]:
print("Total vocal files considered:", len(vocal_silence_df))
print("Min silence:", vocal_silence_df["Max_Silence_Sec"].min())
print("Max silence:", vocal_silence_df["Max_Silence_Sec"].max())


Total vocal files considered: 315
Min silence: 5.02
Max silence: 29.73


In [326]:
# Question 7
jazz_drum_silence_count = len(
    df_silence[
        (df_silence["Genre"] == "jazz") &
        (df_silence["Stem"] == "drums")
    ]
)

print("Total number of jazz drum tracks with silence >= 5 seconds:",
      jazz_drum_silence_count)


Total number of jazz drum tracks with silence >= 5 seconds: 20


In [327]:
# Question 8
jazz_drum_middle_only = len(
    df_silence[
        (df_silence["Genre"] == "jazz") &
        (df_silence["Stem"] == "drums") &
        (df_silence["Silence_Location"] == "Middle")
    ]
)

print("Jazz drum tracks with silence >=5s and only middle silence:",
      jazz_drum_middle_only)


Jazz drum tracks with silence >=5s and only middle silence: 0


In [328]:
# Question 9
jazz_drum_long_silence = len(
    df_silence[
        (df_silence["Genre"] == "jazz") &
        (df_silence["Stem"] == "drums") &
        (df_silence["Max_Silence_Sec"] >= 10)
    ]
)

print("Jazz drum tracks with silence >=5s and Max_Silence_Sec >= 10:",
      jazz_drum_long_silence)


Jazz drum tracks with silence >=5s and Max_Silence_Sec >= 10: 7


In [329]:
# Question 10
stems_audio = []

GENRE_TO_TEST = "rock"
SONG_INDEX = 0  # first song

try:
    for key in STEM_KEYS:
        # Get file path
        file_path = tr[GENRE_TO_TEST][key][SONG_INDEX]

        # Load exactly 5 seconds
        y, sr = librosa.load(file_path, sr=SR, duration=5.0)

        stems_audio.append(y)

    # Ensure equal length (important!)
    min_length = min(len(stem) for stem in stems_audio)
    stems_audio = [stem[:min_length] for stem in stems_audio]

    # Combine stems
    mix = np.sum(stems_audio, axis=0)

    print("Audio loaded successfully.")
    print("Length of mix sample:", len(mix))

except NameError:
    print("ERROR: 'tr' dictionary not found. Please run build_dataset() first.")
except IndexError:
    print(f"ERROR: Song index {SONG_INDEX} out of range.")
except Exception as e:
    print(f"ERROR: {e}")


Audio loaded successfully.
Length of mix sample: 110250


In [330]:
# Question 11 and 12

# Stack stems into numpy array (4 x Samples)
stems_stack = np.vstack(stems_audio)

# Mix stems by summing element-wise
mix_raw = np.sum(stems_stack, axis=0)

# ----- RMS Amplitude (MANUAL) -----
rms_val = np.sqrt(np.mean(mix_raw ** 2))

# ----- Peak Normalization -----
max_val = np.max(np.abs(mix_raw))

if max_val > 0:
    mix_norm = mix_raw / max_val
else:
    mix_norm = mix_raw

# VALIDATION
assert np.isclose(np.max(np.abs(mix_norm)), 1.0), "Normalization failed."

print("RMS Amplitude of mix sample:", rms_val)
print("Max value of peak normalized sample:", np.max(np.abs(mix_norm)))


RMS Amplitude of mix sample: 0.16697016
Max value of peak normalized sample: 1.0
