In [None]:
import os
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from ipynb.fs.defs.feature_generation import *

# Different functions for each dataset
def load_birdclef(audio_root, path, target, folds=5):
    """Load the BirdCLEF metadata CSV and build stratified K-fold splits.

    The CSV at ``path`` is read with pandas, and a ``path`` column is added
    by joining ``audio_root``, the row's ``primary_label`` and its
    ``filename`` with ``/``. A binary label vector (1 where
    ``primary_label`` equals ``target``, else 0) is used to stratify the
    folds; it is not stored on the returned frame.

    Args:
        audio_root: Directory prefix prepended to every audio file path.
        path: Location of the metadata CSV.
        target: Value of ``primary_label`` treated as the positive class.
        folds: Number of folds passed to ``StratifiedKFold``.

    Returns:
        A ``(df, splits)`` tuple. ``df`` is the metadata frame including the
        new ``path`` column. ``splits`` is a list with one dict per fold
        holding the ``train`` / ``val`` index arrays yielded by
        ``StratifiedKFold.split``, their sizes, and the fraction of
        positive labels in each part.
    """
    metadata = pd.read_csv(path)

    # Full location of each recording: <audio_root>/<primary_label>/<filename>
    metadata["path"] = audio_root + "/" + metadata["primary_label"] + "/" + metadata["filename"]

    # One-vs-rest labels: 1 for the target label, 0 for everything else
    labels = (metadata['primary_label'] == target).astype(int).values

    # Sanity check: report how many rows point at an already-seen file path
    duplicate_count = metadata['path'].duplicated().sum()  # or 'filename'
    print("Duplicate file rows:", duplicate_count)

    # Fixed seed so the fold assignment is reproducible between runs
    splitter = StratifiedKFold(folds, shuffle=True, random_state=42)
    fold_summaries = [
        {
            "train": train_idx,
            "val": val_idx,
            "train_size": len(train_idx),
            "val_size": len(val_idx),
            "train_pos_ratio": labels[train_idx].mean(),
            "val_pos_ratio": labels[val_idx].mean(),
        }
        for train_idx, val_idx in splitter.split(metadata, labels)
    ]

    return metadata, fold_summaries

# Switch to the project directory first; the dataset locations passed below
# are relative paths.
os.chdir("/home/joris/Thesis/new_attempt")

# Build the metadata frame and 5 stratified folds with "rucwar" as the
# positive class.
df, splits = load_birdclef(
    "datasets/birdclef_2021/train_short_audio",
    "datasets/birdclef_2021/train_metadata.csv",
    target="rucwar",
    folds=5,
)


Duplicate file rows: 0
[{'train': array([    0,     2,     3, ..., 62870, 62872, 62873], shape=(50299,)), 'val': array([    1,     5,    14, ..., 62864, 62866, 62871], shape=(12575,)), 'train_size': 50299, 'val_size': 12575, 'train_pos_ratio': np.float64(0.0024453766476470705), 'val_pos_ratio': np.float64(0.0024652087475149106)}, {'train': array([    0,     1,     3, ..., 62870, 62871, 62873], shape=(50299,)), 'val': array([    2,    19,    21, ..., 62860, 62863, 62872], shape=(12575,)), 'train_size': 50299, 'val_size': 12575, 'train_pos_ratio': np.float64(0.0024453766476470705), 'val_pos_ratio': np.float64(0.0024652087475149106)}, {'train': array([    0,     1,     2, ..., 62871, 62872, 62873], shape=(50299,)), 'val': array([    7,    10,    11, ..., 62839, 62842, 62861], shape=(12575,)), 'train_size': 50299, 'val_size': 12575, 'train_pos_ratio': np.float64(0.0024453766476470705), 'val_pos_ratio': np.float64(0.0024652087475149106)}, {'train': array([    1,     2,     4, ..., 62871, 62