In [29]:
import pandas as pd
import os
import numpy as np
import librosa
import matplotlib.pyplot as plt

# Dataset locations, all relative to the notebook's working directory.
data_path = os.path.join('..', 'Datasets')
audio_path = os.path.join('..', 'Datasets', 'groove')
spectrum_path = os.path.join('..', 'Datasets', 'spectrums_cropped')

# The balanced metadata table lives next to the notebook.
meta_data_file = "data_balanced.csv"
data = pd.read_csv(meta_data_file, encoding="latin-1")

data.head()

Unnamed: 0,drummer,session,id,style,simplified_style,bpm,beat_type,time_signature,midi_filename,audio_filename,duration,split,start,end,times_sampled,possible_samples,oversampling_ratio
0,drummer1,drummer1/eval_session,drummer1/eval_session/1,funk/groove1,funk,138,1,4-4,drummer1/eval_session/1_funk-groove1_138_beat_...,drummer1/eval_session/1_funk-groove1_138_beat_...,27.872308,test,11.692,14.692,2,2.645385,0.215267
1,drummer1,drummer1/eval_session,drummer1/eval_session/10,soul/groove10,funk,102,1,4-4,drummer1/eval_session/10_soul-groove10_102_bea...,drummer1/eval_session/10_soul-groove10_102_bea...,37.691158,test,25.264,28.264,2,4.28186,0.159189
2,drummer1,drummer1/eval_session,drummer1/eval_session/2,funk/groove2,funk,105,1,4-4,drummer1/eval_session/2_funk-groove2_105_beat_...,drummer1/eval_session/2_funk-groove2_105_beat_...,36.351218,test,10.13,13.13,3,3.058536,0.247585
3,drummer1,drummer1/eval_session,drummer1/eval_session/3,soul/groove3,funk,86,1,4-4,drummer1/eval_session/3_soul-groove3_86_beat_4...,drummer1/eval_session/3_soul-groove3_86_beat_4...,44.716543,test,37.015,40.015,3,4.452757,0.201268
4,drummer1,drummer1/eval_session,drummer1/eval_session/4,soul/groove4,funk,80,1,4-4,drummer1/eval_session/4_soul-groove4_80_beat_4...,drummer1/eval_session/4_soul-groove4_80_beat_4...,47.9875,test,18.449,21.449,2,5.997917,0.125033


# Spectrograms

In [30]:
def generate_spectogram(audio_filename, start, end, S_db, sr, cropped=True):
    """Render a spectrogram to a PNG file and return the image's relative name.

    The image is written below ``<data_path>/spectrums_cropped`` using the
    audio file's relative path with ``.wav`` removed and ``_<start>-<end>.png``
    appended. If that file already exists, nothing is rendered.

    NOTE: the target path does not depend on ``cropped``, so both modes share
    one file; an image already rendered in one mode suppresses rendering in
    the other.

    Args:
        audio_filename: Audio path relative to the dataset root; reused as the
            relative path of the image.
        start: Start of the excerpt; only used to build the file name.
        end: End of the excerpt; only used to build the file name.
        S_db: Spectrogram matrix handed to ``librosa.display.specshow``.
        sr: Sampling rate forwarded to ``specshow``.
        cropped: If True, save the bare image without axes or margins;
            otherwise add a colorbar, a title and axis labels.

    Returns:
        The image file name relative to the spectrogram directory.
    """
    spectrum_filename = audio_filename.replace('.wav', '') + f"_{start}-{end}.png"
    spectrum_path = os.path.join(data_path, 'spectrums_cropped', spectrum_filename)

    if os.path.exists(spectrum_path):
        print(f"File already: {spectrum_path}")
        return spectrum_filename

    fig, ax = plt.subplots(figsize=(10, 4))
    try:
        # Draw on the explicit axes instead of relying on pyplot's current-axes state.
        mesh = librosa.display.specshow(S_db, sr=sr, x_axis='time', y_axis='log', ax=ax)

        if cropped:
            ax.set_xticklabels([])
            ax.set_yticklabels([])
            ax.set_axis_off()
            # Let the axes span the whole canvas so the PNG contains only the spectrogram.
            fig.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=0, hspace=0)
            fig.tight_layout(pad=0)
        else:
            fig.colorbar(mesh, ax=ax, format='%+2.0f dB')
            ax.set_title('Spectrogram')
            ax.set_xlabel('Time')
            ax.set_ylabel('Frequency')
            fig.tight_layout()

        os.makedirs(os.path.dirname(spectrum_path), exist_ok=True)
        fig.savefig(spectrum_path, format='png')
    finally:
        # Close only this figure, and do so even if rendering or saving failed,
        # so figures neither pile up across rows nor close unrelated plots.
        plt.close(fig)

    return spectrum_filename

# Features

In [31]:
def audio_feature_extraction(y, sr, stft, n_fft, hop_length):
    """Summarise an audio excerpt as mean/std of several librosa features.

    Every feature array is reduced over all of its elements, so each feature
    contributes exactly two scalars: ``<name>_mean`` and ``<name>_std``.

    Args:
        y: Audio time series handed to the librosa feature functions.
        sr: Sampling rate of ``y``.
        stft: Spectrogram used for the spectral flux; it is differenced along
            axis 1 (the caller in this notebook passes ``np.abs(librosa.stft(...))``).
        n_fft: FFT window size forwarded to the spectral features and used as
            the RMS frame length.
        hop_length: Hop length forwarded to the frame-based features.

    Returns:
        dict of feature name -> scalar, in the order onset_env, spectral_flux,
        mfcc, spectral_contrast, tonnetz, rms, spectral_centroid,
        spectral_bandwidth, spectral_flatness, tempogram (mean before std).
    """
    features = {}

    def add_stats(name, values):
        # Collapse a feature array into its overall mean and standard deviation.
        features[f'{name}_mean'] = values.mean()
        features[f'{name}_std'] = values.std()

    add_stats('onset_env', librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length))

    # Spectral flux: root-mean-square (over axis 0) of the change between
    # neighbouring columns of the precomputed STFT.
    add_stats('spectral_flux', np.sqrt(np.mean(np.diff(stft, axis=1) ** 2, axis=0)))

    # Use a reduced number of mel bands and cap fmax at Nyquist to avoid empty mel filters.
    n_mels = 40
    fmax = sr / 2
    add_stats('mfcc', librosa.feature.mfcc(
        y=y, sr=sr, n_mfcc=13, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels, fmax=fmax))

    add_stats('spectral_contrast', librosa.feature.spectral_contrast(
        y=y, sr=sr, n_fft=n_fft, hop_length=hop_length))
    add_stats('tonnetz', librosa.feature.tonnetz(y=y, sr=sr, hop_length=hop_length))
    add_stats('rms', librosa.feature.rms(y=y, frame_length=n_fft, hop_length=hop_length))
    add_stats('spectral_centroid', librosa.feature.spectral_centroid(
        y=y, sr=sr, n_fft=n_fft, hop_length=hop_length))
    add_stats('spectral_bandwidth', librosa.feature.spectral_bandwidth(
        y=y, sr=sr, n_fft=n_fft, hop_length=hop_length))
    add_stats('spectral_flatness', librosa.feature.spectral_flatness(
        y=y, n_fft=n_fft, hop_length=hop_length))

    # Computed once and reused for both statistics (it was previously recomputed per statistic).
    add_stats('tempogram', librosa.feature.tempogram_ratio(y=y, sr=sr))

    return features

In [32]:
def process_data(audio_filename, start, end, cropped_spectograms=True):
    """Load one audio excerpt, extract its features and render its spectrogram.

    Args:
        audio_filename: Audio path relative to ``<data_path>/groove``.
        start: Offset passed to ``librosa.load`` as the excerpt start.
        end: Excerpt end; ``end - start`` is passed as the load duration.
        cropped_spectograms: Forwarded to ``generate_spectogram`` as ``cropped``.

    Returns:
        pandas.Series holding the values from ``audio_feature_extraction`` plus
        ``spectrum_filename``, or None when the audio cannot be loaded or the
        requested excerpt contains no samples (a message is printed in both cases).
    """
    excerpt_path = os.path.join(data_path, 'groove', audio_filename)
    duration = end - start

    try:
        y, sr = librosa.load(excerpt_path, sr=None, offset=start, duration=duration)
    except Exception as e:
        print(f"Error loading {excerpt_path}: {e}")
        return None

    # An empty excerpt would yield n_fft == 0 and hop_length == 0 below, which
    # makes the STFT raise; treat it like any other unreadable excerpt.
    if len(y) == 0:
        print(f"Error loading {excerpt_path}: no samples between {start} and {end}")
        return None

    # Shrink the FFT window for excerpts shorter than 1024 samples; use 50% overlap.
    n_fft = min(1024, len(y))
    hop_length = n_fft // 2
    stft = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length))
    S_db = librosa.amplitude_to_db(stft, ref=np.max)  # convert amplitude to decibels

    features = audio_feature_extraction(y, sr, stft, n_fft, hop_length)
    features['spectrum_filename'] = generate_spectogram(
        audio_filename, start, end, S_db, sr, cropped=cropped_spectograms)

    return pd.Series(features)

#### test code

# Processing

In [33]:
# Register tqdm with pandas so that DataFrame.progress_apply() is available
from tqdm.auto import tqdm
tqdm.pandas()

test_mask = data.audio_filename == 'drummer1/session1/145_latin-brazilian-baiao_95_beat_4-4.wav'

# Feature columns, listed in exactly the key order returned by
# audio_feature_extraction(), followed by the 'spectrum_filename' entry that
# process_data() appends. The order matters: assigning a DataFrame to a list of
# columns pairs the columns by position, not by name.
new_columns = [
    "onset_env_mean", "onset_env_std",
    "spectral_flux_mean", "spectral_flux_std",
    "mfcc_mean", "mfcc_std",
    "spectral_contrast_mean", "spectral_contrast_std",
    "tonnetz_mean", "tonnetz_std",
    "rms_mean", "rms_std",
    "spectral_centroid_mean", "spectral_centroid_std",
    "spectral_bandwidth_mean", "spectral_bandwidth_std",
    "spectral_flatness_mean", "spectral_flatness_std",
    "tempogram_mean", "tempogram_std",
    "spectrum_filename"
]

In [34]:
# Compute the features for every row (one Series per row -> one DataFrame).
features = data.progress_apply(lambda row: process_data(row.audio_filename, row.start, row.end), axis=1)

# Assigning a DataFrame to a list of columns matches columns by position, so
# select the computed columns by name first; otherwise values end up under
# the wrong feature names whenever the two orders differ.
data[new_columns] = features[new_columns]
data.head(5)

  0%|          | 0/1200 [00:00<?, ?it/s]

Unnamed: 0,drummer,session,id,style,simplified_style,bpm,beat_type,time_signature,midi_filename,audio_filename,...,rms_std,spectral_centroid_mean,spectral_centroid_std,spectral_bandwidth_mean,spectral_bandwidth_std,spectral_flatness_mean,spectral_flatness_std,tempogram_mean,tempogram_std,spectrum_filename
0,drummer1,drummer1/eval_session,drummer1/eval_session/1,funk/groove1,funk,138,1,4-4,drummer1/eval_session/1_funk-groove1_138_beat_...,drummer1/eval_session/1_funk-groove1_138_beat_...,...,0.056157,5608.665252,2501.480437,4647.658427,941.263597,0.090434,0.124045,0.294583,0.171183,drummer1/eval_session/1_funk-groove1_138_beat_...
1,drummer1,drummer1/eval_session,drummer1/eval_session/10,soul/groove10,funk,102,1,4-4,drummer1/eval_session/10_soul-groove10_102_bea...,drummer1/eval_session/10_soul-groove10_102_bea...,...,0.062846,7204.425221,3338.747216,5212.773742,1194.91465,0.148189,0.143111,0.267859,0.234713,drummer1/eval_session/10_soul-groove10_102_bea...
2,drummer1,drummer1/eval_session,drummer1/eval_session/2,funk/groove2,funk,105,1,4-4,drummer1/eval_session/2_funk-groove2_105_beat_...,drummer1/eval_session/2_funk-groove2_105_beat_...,...,0.061557,6172.80937,3874.975566,4711.894615,1483.392124,0.116454,0.132516,0.238676,0.153792,drummer1/eval_session/2_funk-groove2_105_beat_...
3,drummer1,drummer1/eval_session,drummer1/eval_session/3,soul/groove3,funk,86,1,4-4,drummer1/eval_session/3_soul-groove3_86_beat_4...,drummer1/eval_session/3_soul-groove3_86_beat_4...,...,0.053402,5704.099294,3309.100713,5098.568553,1251.293005,0.101191,0.128717,0.149444,0.144291,drummer1/eval_session/3_soul-groove3_86_beat_4...
4,drummer1,drummer1/eval_session,drummer1/eval_session/4,soul/groove4,funk,80,1,4-4,drummer1/eval_session/4_soul-groove4_80_beat_4...,drummer1/eval_session/4_soul-groove4_80_beat_4...,...,0.046688,8042.372855,3504.921958,5396.953176,999.442925,0.181278,0.149775,0.115379,0.12898,drummer1/eval_session/4_soul-groove4_80_beat_4...


In [35]:
# Persist the metadata together with the extracted features; the row index is not written.
data.to_csv(path_or_buf="data_balanced_processed.csv", index=False)