In [None]:
import os
import numpy as np
import librosa
from scipy.fftpack import dct
from tqdm import tqdm
import soundfile as sf
import opensmile

# --- 1. CONFIGURATION ---
# Module-level settings shared by the helper functions and the main script below.

# --- Paths ---
# Root folder; the sub-directory names passed to process_data_aligned are joined onto it.
TEAMMATE_DATA_PATH = '/mount/studenten/arbeitsdaten-studenten1/team-lab-phonetics/2025/student_directories/AuFa/'
# Destination for the saved .npy arrays; created immediately (side effect at import/run time).
OUTPUT_DIR = os.path.join(TEAMMATE_DATA_PATH, "processed_data_aligned_lld")
os.makedirs(OUTPUT_DIR, exist_ok=True)
TEMP_AUDIO_PATH = os.path.join(OUTPUT_DIR, "temp_5s_audio.wav") # Scratch WAV handed to openSMILE; rewritten for every clip

# --- Feature Parameters ---
# Every per-file feature matrix is padded/truncated to these (features, time) shapes.
# NOTE(review): 157 presumably equals 1 + (5 s * 16000 Hz) // 512, i.e. the frame count
# of librosa's default CQT hop length — confirm before changing SAMPLE_RATE or DURATION.
TARGET_SHAPE_CQCC = (128, 157)
TARGET_SHAPE_LLD = (23, 157) # eGeMAPS LLDs have 23 features
SAMPLE_RATE = 16000 # Passed as `sr` to librosa.load, so audio is resampled to this rate (Hz)
DURATION = 5.0 # Passed as `duration` to librosa.load: only the first 5 seconds are read

# --- 2. HELPER FUNCTIONS ---

def extract_cqcc(y, sr, n_bins=90, n_cqcc=128):
    """Compute constant-Q cepstral coefficients (CQCC) for an audio signal.

    The magnitude constant-Q transform is log-compressed and then passed
    through an orthonormal type-II DCT along the frequency axis.

    Args:
        y: Audio time series handed to ``librosa.cqt``.
        sr: Sampling rate of ``y``.
        n_bins: Number of CQT frequency bins, counted upwards from note C1.
        n_cqcc: Maximum number of leading coefficients to keep. The result
            cannot have more rows than ``n_bins``, so with the defaults
            (90 bins, 128 requested) only 90 rows come back.

    Returns:
        Array with at most ``n_cqcc`` rows (one column per CQT frame), or
        ``None`` if extraction failed for any reason.
    """
    try:
        cqt = np.abs(librosa.cqt(y=y, sr=sr, n_bins=n_bins, fmin=librosa.note_to_hz('C1')))
        # Small offset keeps the logarithm finite for bins with zero magnitude.
        log_cqt = np.log(cqt + 1e-6)
        cqcc = dct(log_cqt, type=2, axis=0, norm='ortho')
        return cqcc[:n_cqcc, :]
    except Exception as exc:
        # Best-effort contract is preserved (callers skip the file on None),
        # but the cause is reported instead of being discarded silently.
        print(f"\nCQCC extraction failed: {exc}")
        return None

def pad_or_truncate(array, target_shape):
    """Fit a 2D array into ``target_shape`` by cropping and zero-filling.

    The top-left region shared by ``array`` and the target is copied; rows or
    columns beyond the target are dropped and cells not covered by ``array``
    stay 0. The result is always float32.
    """
    result = np.zeros(target_shape, dtype=np.float32)
    # Extent of the overlap between the input and the target along each axis.
    n_rows = min(array.shape[0], target_shape[0])
    n_cols = min(array.shape[1], target_shape[1])
    result[:n_rows, :n_cols] = array[:n_rows, :n_cols]
    return result

def _stack_or_empty(matrices, matrix_shape):
    """Stack equally-shaped matrices; keep the trailing shape when there are none."""
    if not matrices:
        return np.empty((0, *matrix_shape), dtype=np.float32)
    return np.stack(matrices)


def process_data_aligned(directories, label, smile_instance):
    """Extract CQCC and eGeMAPS LLD features from the same audio clips.

    For every ``.flac``/``.wav`` file in each directory, the first ``DURATION``
    seconds are loaded once at ``SAMPLE_RATE``; CQCCs are computed from that
    clip and the identical clip is written to ``TEMP_AUDIO_PATH`` so openSMILE
    can compute LLDs from it. A file contributes to the output only if both
    extractions succeed, so the three returned arrays always line up.

    Args:
        directories: Sub-directory names, relative to ``TEAMMATE_DATA_PATH``.
            Missing directories are reported and skipped.
        label: Class label stored once per successfully processed file.
        smile_instance: Object whose ``process_file(path)`` returns a
            DataFrame with one row per frame and one column per LLD.

    Returns:
        Tuple ``(cqcc, lld, labels)`` with shapes
        ``(n, *TARGET_SHAPE_CQCC)``, ``(n, *TARGET_SHAPE_LLD)`` and ``(n,)``.
        The trailing dimensions are kept even when ``n == 0`` so the results
        can always be concatenated with those of other calls.
    """
    # NOTE(review): openSMILE LLD frames are presumably produced at a different
    # hop (10 ms?) than librosa's default CQT hop (512 samples), so truncating
    # both to the same number of columns may not cover the same time span —
    # confirm before relying on frame-level alignment.
    cqcc_list, lld_list, labels_list = [], [], []

    try:
        for directory in directories:
            full_dir_path = os.path.join(TEAMMATE_DATA_PATH, directory)
            if not os.path.isdir(full_dir_path):
                print(f"\nSkipping missing directory: {full_dir_path}")
                continue
            print(f"\nProcessing directory: {full_dir_path}")

            # os.listdir order is arbitrary; sorting makes the sample order reproducible.
            files = sorted(
                f for f in os.listdir(full_dir_path) if f.endswith(('.flac', '.wav'))
            )
            for filename in tqdm(files, desc=f"Extracting from {directory}"):
                filepath = os.path.join(full_dir_path, filename)
                try:
                    # 1. Load the 5-second audio clip once
                    audio_5s, sr = librosa.load(filepath, sr=SAMPLE_RATE, duration=DURATION)

                    # 2. Extract CQCC from the 5s clip
                    cqcc_feats = extract_cqcc(audio_5s, sr, n_cqcc=TARGET_SHAPE_CQCC[0])
                    if cqcc_feats is None:
                        print(f"\nSkipping {filepath}: CQCC extraction failed")
                        continue

                    # 3. Extract LLDs from the same 5s clip via a temporary file
                    # NOTE(review): smile.process_signal might avoid this round
                    # trip — confirm it yields numerically equivalent features.
                    sf.write(TEMP_AUDIO_PATH, audio_5s, sr)
                    lld_df = smile_instance.process_file(TEMP_AUDIO_PATH)
                    lld_feats = lld_df.values.T  # Transpose to get (features, time)

                    # 4. Pad both feature sets to the target shape
                    padded_cqcc = pad_or_truncate(cqcc_feats, TARGET_SHAPE_CQCC)
                    padded_lld = pad_or_truncate(lld_feats, TARGET_SHAPE_LLD)

                    # 5. Append only after everything succeeded to keep the lists aligned
                    cqcc_list.append(padded_cqcc)
                    lld_list.append(padded_lld)
                    labels_list.append(label)

                except Exception as e:
                    print(f"\nError processing {filepath}: {e}")
    finally:
        # Remove the scratch file even if the run is interrupted.
        if os.path.exists(TEMP_AUDIO_PATH):
            os.remove(TEMP_AUDIO_PATH)

    return (
        _stack_or_empty(cqcc_list, TARGET_SHAPE_CQCC),
        _stack_or_empty(lld_list, TARGET_SHAPE_LLD),
        # Derive the dtype from the label so an empty result is not float64.
        np.array(labels_list, dtype=np.asarray(label).dtype),
    )


# --- 3. MAIN EXECUTION SCRIPT ---

if __name__ == '__main__':
    # openSMILE extractor configured for frame-level (low-level descriptor) eGeMAPS output
    smile = opensmile.Smile(
        feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
        feature_set=opensmile.FeatureSet.eGeMAPS,
    )

    # Training split: bona fide directories are labelled 1, the spoof directory 0
    print("--- Processing Training Set ---")
    cqcc_bf_train, lld_bf_train, labels_bf_train = process_data_aligned(
        ['bonafide_audio_train', 'augmented_bonafide'], 1, smile
    )
    cqcc_spf_train, lld_spf_train, labels_spf_train = process_data_aligned(
        ['spoof_audio_train'], 0, smile
    )

    # Validation split, same labelling scheme
    print("\n--- Processing Validation Set ---")
    cqcc_bf_val, lld_bf_val, labels_bf_val = process_data_aligned(
        ['bonafide_audio_val'], 1, smile
    )
    cqcc_spf_val, lld_spf_val, labels_spf_val = process_data_aligned(
        ['spoof_audio_val'], 0, smile
    )

    # Merge both classes of the training split (bona fide first) and write to disk
    X_cqcc_train = np.concatenate([cqcc_bf_train, cqcc_spf_train])
    X_lld_train = np.concatenate([lld_bf_train, lld_spf_train])
    y_train = np.concatenate([labels_bf_train, labels_spf_train])
    for file_name, data in (
        ("cqcc_features_train.npy", X_cqcc_train),
        ("egmaps_lld_features_train.npy", X_lld_train),
        ("labels_train.npy", y_train),
    ):
        np.save(os.path.join(OUTPUT_DIR, file_name), data)
    print(f"\n✅ Training data saved. Shapes: CQCC={X_cqcc_train.shape}, LLD={X_lld_train.shape}")

    # Merge both classes of the validation split and write to disk
    X_cqcc_val = np.concatenate([cqcc_bf_val, cqcc_spf_val])
    X_lld_val = np.concatenate([lld_bf_val, lld_spf_val])
    y_val = np.concatenate([labels_bf_val, labels_spf_val])
    for file_name, data in (
        ("cqcc_features_val.npy", X_cqcc_val),
        ("egmaps_lld_features_val.npy", X_lld_val),
        ("labels_dev.npy", y_val),
    ):
        np.save(os.path.join(OUTPUT_DIR, file_name), data)
    print(f"✅ Validation data saved. Shapes: CQCC={X_cqcc_val.shape}, LLD={X_lld_val.shape}")



--- Processing Training Set ---

Processing directory: /mount/studenten/arbeitsdaten-studenten1/team-lab-phonetics/2025/student_directories/AuFa/bonafide_audio_train


Extracting from bonafide_audio_train:  22%|██▏       | 579/2580 [01:26<04:21,  7.65it/s]