**Task 4, Challenge**

Question 1 (a)

In [1]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
import numpy as np

# Read the metadata table; every row names one audio file.
metadata_df = pd.read_csv("metadata.csv")

# Fail early if the column the split relies on is absent.
assert 'filename' in metadata_df.columns, "Expected 'filename' column in metadata.csv"

# Treat each filename (e.g., '1234.mp3') as its own group.
audio_files = metadata_df['filename'].tolist()

# One shuffled 80/20 split, seeded so it can be reproduced.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# The filenames serve as both the samples and the group labels.
train_idx, eval_idx = next(gss.split(audio_files, groups=audio_files))
train_files = [audio_files[pos] for pos in train_idx]
eval_files = [audio_files[pos] for pos in eval_idx]

# Sanity check: no filename may appear on both sides of the split.
assert set(train_files).isdisjoint(set(eval_files)), "Leakage: some files in both splits!"

# Persist both file lists so later steps can reuse the exact same split.
train_table = pd.DataFrame({'filename': train_files})
eval_table = pd.DataFrame({'filename': eval_files})
train_table.to_csv("train_files.csv", index=False)
eval_table.to_csv("eval_files.csv", index=False)

print(f"# Train files: {len(train_files)}")
print(f"# Eval files:  {len(eval_files)}")


# Train files: 6584
# Eval files:  1646


In [4]:
# Peek at one label archive: which class arrays it stores and the shape of one of them.
labels = np.load("labels/14_labels.npz")
class_names = labels.files
speech_shape = labels['Speech'].shape
print("Available classes:", class_names)
print("Shape of one class array:", speech_shape)


Available classes: ['Airplane', 'Alarm', 'Beep/Bleep', 'Bell', 'Bicycle', 'Bird Chirp', 'Bus', 'Car', 'Cat Meow', 'Chainsaw', 'Clapping', 'Cough', 'Cow Moo', 'Cowbell', 'Crying', 'Dog Bark', 'Doorbell', 'Drip', 'Drums', 'Fire', 'Footsteps', 'Guitar', 'Hammer', 'Helicopter', 'Hiccup', 'Horn Honk', 'Horse Neigh', 'Insect Buzz', 'Jackhammer', 'Laughter', 'Lawn Mower', 'Motorcycle', 'Piano', 'Pig Oink', 'Power Drill', 'Power Saw', 'Rain', 'Rooster Crow', 'Saxophone', 'Sewing Machine', 'Sheep/Goat Bleat', 'Ship/Boat', 'Shout', 'Singing', 'Siren', 'Sneeze', 'Snoring', 'Speech', 'Stream/River', 'Thunder', 'Train', 'Truck', 'Trumpet', 'Vacuum Cleaner', 'Violin', 'Washing Machine', 'Waves', 'Wind']
Shape of one class array: (166, 1)


In [1]:
#code with multi-label stratification to ensure balanced splits

import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

# Load metadata (one row per audio recording)
metadata_path = "metadata.csv"
metadata_df = pd.read_csv(metadata_path)
n_metadata = len(metadata_df)

# Define target classes
target_classes = ['Speech', 'Dog Bark', 'Rooster Crow', 'Shout',
                  'Lawn Mower', 'Chainsaw', 'Jackhammer',
                  'Power Drill', 'Horn Honk', 'Siren']

# Folder containing label files, one "<recording>_labels.npz" archive per recording
labels_folder = "labels"

# Gather all npz label files once so the per-recording lookup below is a set membership test
label_files = sorted([f for f in os.listdir(labels_folder) if f.endswith(".npz")])
available_label_files = set(label_files)

file_class_matrix = []
all_filenames = []
missing_label_files = []

# Build ONE presence row per recording from that recording's own label archive.
# The first axis of a class array is not a file index: a single archive such as
# labels/14_labels.npz holds arrays of shape (166, 1) (presumably frames x annotators
# -- TODO confirm). So a class counts as present if any entry of its array is non-zero.
for audio_name in metadata_df["filename"]:
    base = os.path.splitext(audio_name)[0]
    label_name = f"{base}_labels.npz"
    if label_name not in available_label_files:
        missing_label_files.append(label_name)
        continue

    # Context manager closes the archive; thousands of files are opened in this loop.
    with np.load(os.path.join(labels_folder, label_name)) as labels:
        row = [int(np.any(labels[cls])) for cls in target_classes]

    all_filenames.append(audio_name.replace(".mp3", ".npz"))
    file_class_matrix.append(row)

if missing_label_files:
    print(f"Skipped {len(missing_label_files)} recordings without a label file.")

# Create DataFrame for stratification
file_class_df = pd.DataFrame(file_class_matrix, columns=target_classes)
file_class_df["filename"] = all_filenames
file_class_df["strata"] = file_class_df[target_classes].astype(str).agg("".join, axis=1)

# train_test_split(stratify=...) raises ValueError if any stratum has a single member.
# Rare label combinations are therefore pooled into one "rare" stratum; a lone
# singleton is attached to the most frequent stratum instead.
strata_counts = file_class_df["strata"].value_counts()
is_singleton = file_class_df["strata"].map(strata_counts) < 2
split_strata = file_class_df["strata"].copy()
if is_singleton.sum() == 1:
    split_strata.loc[is_singleton] = strata_counts.index[0]
elif is_singleton.any():
    split_strata.loc[is_singleton] = "rare"
file_class_df["split_strata"] = split_strata

# Perform stratified train-test split
train_files, test_files = train_test_split(
    file_class_df["filename"],
    test_size=0.2,
    stratify=file_class_df["split_strata"],
    random_state=42
)

len(train_files), len(test_files)


(6584, 1646)

In [2]:
import pandas as pd
# Read the stored test split and swap the ".npz" suffix for ".wav",
# the filename format the later cells work with.
test_files = [
    npz_name.replace(".npz", ".wav")
    for npz_name in pd.read_csv("test_filenames.csv")["filename"].tolist()
]

In [3]:
# Generate random predictions for the test set

import numpy as np
from compute_cost import get_segment_prediction_df, get_ground_truth_df

# The ten sound classes that predictions are generated for, in column order.
target_classes = [
    'Speech',
    'Dog Bark',
    'Rooster Crow',
    'Shout',
    'Lawn Mower',
    'Chainsaw',
    'Jackhammer',
    'Power Drill',
    'Horn Honk',
    'Siren',
]

# Random prediction generator
def generate_random_predictions(test_files, dataset_path, classes, seed=None):
    """Create uniformly random 0/1 predictions for every file and class.

    Parameters
    ----------
    test_files : iterable of str
        Audio filenames ending in ".wav"; the suffix is replaced by ".npz" to
        find the feature archive under ``<dataset_path>/audio_features/``.
    dataset_path : str
        Root directory of the dataset.
    classes : iterable of str
        Class names to generate a prediction list for.
    seed : int or None, optional
        When given, predictions are drawn from a dedicated
        ``np.random.RandomState(seed)`` so the result is reproducible. When
        None (default), NumPy's global random state is used, as before.

    Returns
    -------
    dict
        ``{filename: {class_name: [0/1, ...]}}`` with one entry per row of the
        file's "mfcc" array. Files whose feature archive is missing are
        reported on stdout and left out of the result.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    predictions = {}
    for filename in test_files:
        feature_file = filename.replace(".wav", ".npz")
        path = f"{dataset_path}/audio_features/{feature_file}"
        try:
            # Context manager closes the archive handle; only the length of the
            # first "mfcc" axis is needed.
            with np.load(path) as features:
                n_frames = features["mfcc"].shape[0]
        except FileNotFoundError:
            print(f"Missing feature file: {path}")
            continue
        predictions[filename] = {
            cls: rng.randint(0, 2, size=n_frames).tolist() for cls in classes
        }
    return predictions


In [4]:
#create ground truth and prediction dataframes

import os

# Root directory of the dataset; the cells below look for "audio_features" and
# "labels" subfolders inside it. Set the MLPC_DATASET_PATH environment variable
# to run the notebook on another machine; the original location stays the default.
dataset_path = os.environ.get(
    "MLPC_DATASET_PATH",
    "C:/Users/kathr/Documents/AI-Bachelor/MLPC/MLPC2025_classification",
)

def filter_existing_files(file_list, dataset_path):
    """Return the entries of ``file_list`` that have both archives on disk.

    For each filename the extension is stripped, and the file is kept only if
    ``<dataset_path>/audio_features/<base>.npz`` and
    ``<dataset_path>/labels/<base>_labels.npz`` both exist. The input order is
    preserved.
    """
    def has_features_and_labels(audio_name):
        stem, _extension = os.path.splitext(audio_name)
        features_npz = os.path.join(dataset_path, 'audio_features', f"{stem}.npz")
        labels_npz = os.path.join(dataset_path, 'labels', f"{stem}_labels.npz")
        return os.path.exists(features_npz) and os.path.exists(labels_npz)

    return [audio_name for audio_name in file_list if has_features_and_labels(audio_name)]

# Keep only the test files whose feature and label archives are both on disk.
filtered_test_files = filter_existing_files(test_files, dataset_path)

# Random baseline predictions for the files that survived the filter.
random_preds = generate_random_predictions(filtered_test_files, dataset_path, target_classes)

# Write predictions first, then the matching ground truth, as CSV files.
random_pred_table = get_segment_prediction_df(random_preds, target_classes)
random_pred_table.to_csv("predictions.csv", index=False)

ground_truth_table = get_ground_truth_df(filtered_test_files, dataset_path)
ground_truth_table.to_csv("ground_truth.csv", index=False)

n_filtered_out = len(test_files) - len(filtered_test_files)
print(f"Filtered {n_filtered_out} test files due to missing .npz files.")



Filtered 0 test files due to missing .npz files.


In [5]:
import pandas as pd

# Reload the saved predictions and print the first rows followed by summary statistics.
pred_df = pd.read_csv("predictions.csv")
for preview in (pred_df.head(), pred_df.describe()):
    print(preview)


     filename  onset  Speech  Dog Bark  Rooster Crow  Shout  Lawn Mower  \
0  197321.wav    0.0       1         1             1      1           1   
1  197321.wav    1.2       1         1             1      1           1   
2  197321.wav    2.4       1         1             1      1           1   
3  197321.wav    3.6       1         1             1      1           1   
4  197321.wav    4.8       1         1             1      1           1   

   Chainsaw  Jackhammer  Power Drill  Horn Honk  Siren  
0         1           1            1          1      1  
1         1           1            1          1      1  
2         1           1            1          1      1  
3         1           1            1          1      1  
4         1           1            1          1      1  
            onset      Speech    Dog Bark  Rooster Crow       Shout  \
count  647.000000  647.000000  647.000000    647.000000  647.000000   
mean    11.261824    0.995363    0.996909      0.993818    0.9984

In [17]:
def generate_all_zero_predictions(test_files, dataset_path, target_classes):
    """Build an all-zeros baseline prediction for every file and class.

    For each filename the extension is stripped and
    ``<dataset_path>/audio_features/<base>.npz`` is opened; the length of the
    first axis of its "mfcc" array sets how many zeros each class gets. A
    missing feature archive is not handled here, so the error from ``np.load``
    propagates.

    Returns ``{filename: {class_name: [0, 0, ...]}}``.
    """
    def count_rows(audio_name):
        stem = os.path.splitext(audio_name)[0]
        npz_path = os.path.join(dataset_path, "audio_features", f"{stem}.npz")
        with np.load(npz_path) as archive:
            return archive["mfcc"].shape[0]

    predictions = {}
    for audio_name in test_files:
        length = count_rows(audio_name)
        # A fresh list per class, so the classes never share one list object.
        predictions[audio_name] = {
            label: [0 for _ in range(length)] for label in target_classes
        }
    return predictions


In [19]:
def generate_all_one_predictions(test_files, dataset_path, target_classes):
    """Build an all-ones baseline prediction for every file and class.

    For each filename the extension is stripped and
    ``<dataset_path>/audio_features/<base>.npz`` is opened; the length of the
    first axis of its "mfcc" array sets how many ones each class gets. A
    missing feature archive is not handled here, so the error from ``np.load``
    propagates.

    Returns ``{filename: {class_name: [1, 1, ...]}}``.
    """
    def count_rows(audio_name):
        stem = os.path.splitext(audio_name)[0]
        npz_path = os.path.join(dataset_path, "audio_features", f"{stem}.npz")
        with np.load(npz_path) as archive:
            return archive["mfcc"].shape[0]

    predictions = {}
    for audio_name in test_files:
        length = count_rows(audio_name)
        # A fresh list per class, so the classes never share one list object.
        predictions[audio_name] = {
            label: [1 for _ in range(length)] for label in target_classes
        }
    return predictions


In [20]:
# Constant baseline: the all-ones variant is active; swap in the commented line for all zeros.
#preds = generate_all_zero_predictions(test_files, dataset_path, target_classes)
preds = generate_all_one_predictions(test_files, dataset_path, target_classes)

# Write the predictions first, then the matching ground truth, as CSV files.
baseline_pred_table = get_segment_prediction_df(preds, target_classes)
baseline_pred_table.to_csv("predictions.csv", index=False)

baseline_truth_table = get_ground_truth_df(test_files, dataset_path)
baseline_truth_table.to_csv("ground_truth.csv", index=False)
