In [20]:
import os
import pandas as pd
from utils.feature_extraction import create_dataset


In [24]:
import os
import numpy as np
import pandas as pd
import librosa

def extract_features(file_path):
    """Extract MFCC features plus additional spectral features from one audio file.

    The audio is loaded with ``librosa.load(sr=None, mono=True)`` and passed
    through ``librosa.util.normalize``. Frame-wise features are averaged
    across frames and concatenated in this order: 13 MFCCs, chroma,
    spectral contrast, then five scalars (zero-crossing rate, RMS, spectral
    centroid, spectral bandwidth, spectral roll-off). NaNs in the final
    vector are replaced via ``np.nan_to_num``.

    Args:
        file_path: Path of the audio file to process.

    Returns:
        A dict mapping ``"mfcc<i>"`` to a Python float for every position
        ``i`` of the concatenated vector (the ``mfcc`` prefix is used for
        all positions, not only the MFCC ones), or ``None`` if any step
        raised; in that case an error line is printed.
    """

    def _frame_mean(matrix):
        # Transpose so each row is one frame, then average over the frames.
        return np.mean(matrix.T, axis=0)

    try:
        signal, rate = librosa.load(file_path, sr=None, mono=True)
        signal = librosa.util.normalize(signal)

        # Vector-valued features: one averaged value per coefficient/band.
        vector_parts = [
            _frame_mean(librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=13)),
            _frame_mean(librosa.feature.chroma_stft(y=signal, sr=rate)),
            _frame_mean(librosa.feature.spectral_contrast(y=signal, sr=rate)),
        ]

        # Scalar features: a single global mean each.
        scalar_parts = [
            np.mean(librosa.feature.zero_crossing_rate(signal)),
            np.mean(librosa.feature.rms(y=signal)),
            np.mean(librosa.feature.spectral_centroid(y=signal, sr=rate)),
            np.mean(librosa.feature.spectral_bandwidth(y=signal, sr=rate)),
            np.mean(librosa.feature.spectral_rolloff(y=signal, sr=rate)),
        ]

        vector = np.nan_to_num(np.hstack(vector_parts + [scalar_parts]))

        named = {}
        for position, value in enumerate(vector):
            named[f"mfcc{position}"] = float(value)
        return named

    except Exception as err:
        # Best-effort: report the failure and let the caller skip this file.
        print(f"[ERROR] Gagal memproses {file_path}: {err}")
        return None


def create_dataset(data_dir="data", output_csv="data/voice_dataset.csv"):
    """Build the voice dataset from a ``<data_dir>/<user>/<status>/<audio>`` tree.

    Every sub-directory of ``data_dir`` is treated as one user. Inside each
    user directory only the ``buka`` and ``tutup`` sub-directories are
    scanned, and only files ending in ``.wav``, ``.m4a`` or ``.mp3``
    (case-insensitive) are passed to ``extract_features``. Files for which
    ``extract_features`` returns ``None`` are counted as bad and skipped.

    Args:
        data_dir: Root directory containing one sub-directory per user.
        output_csv: Destination CSV path. Its parent directory is created
            when the path has one.

    Returns:
        A ``pandas.DataFrame`` with the feature columns plus ``user``
        (integer label assigned by sorted user name), ``status``
        (``buka`` -> 0, ``tutup`` -> 1) and ``filename``. When no file was
        processed successfully an empty DataFrame is returned and no CSV
        is written.
    """
    rows = []
    supported_ext = (".wav", ".m4a", ".mp3")

    print("[INFO] Mulai membaca dataset...")
    total_files = 0
    good = 0
    bad = 0

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order of the resulting dataset reproducible across runs and machines.
    for user in sorted(os.listdir(data_dir)):
        user_path = os.path.join(data_dir, user)
        if not os.path.isdir(user_path):
            continue

        for status in ["buka", "tutup"]:
            status_path = os.path.join(user_path, status)
            if not os.path.isdir(status_path):
                continue

            for file in sorted(os.listdir(status_path)):
                if not file.lower().endswith(supported_ext):
                    continue

                total_files += 1
                file_path = os.path.join(status_path, file)

                feats = extract_features(file_path)
                if feats is None:
                    bad += 1
                    print(f"[SKIP] File gagal diproses: {file_path}")
                    continue

                good += 1
                feats["user"] = user
                feats["status"] = status
                feats["filename"] = file
                rows.append(feats)

    df = pd.DataFrame(rows)

    print(f"\nTotal file audio: {total_files}")
    print(f"Good: {good}")
    print(f"Bad : {bad}\n")

    # With no rows the frame has no "user"/"status" columns, so the labelling
    # below would raise KeyError. Bail out without touching any existing CSV.
    if df.empty:
        print("[WARN] Tidak ada file audio yang berhasil diproses; CSV tidak ditulis.")
        return df

    # ============================
    # AUTO LABEL USER & STATUS
    # ============================
    df["user"] = df["user"].astype(str).str.strip()
    df["status"] = df["status"].astype(str).str.strip()

    # Automatic mapping: users are numbered by sorted name.
    user_map = {name: idx for idx, name in enumerate(sorted(df["user"].unique()))}
    status_map = {"buka": 0, "tutup": 1}

    df["user"] = df["user"].map(user_map)
    df["status"] = df["status"].map(status_map)

    # Make sure every feature column is numeric; unparsable values become 0.
    for col in df.columns:
        if col not in ["user", "status", "filename"]:
            df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)

    # Save the CSV. os.makedirs("") raises FileNotFoundError, so only create
    # the parent directory when output_csv actually has one.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_csv, index=False, encoding="utf-8")

    print(f"[INFO] Dataset berhasil dibuat: {output_csv} ({len(df)} sampel)")
    print("[INFO] user_map:", user_map)
    print("[INFO] status_map:", status_map)

    return df
# Build the dataset with the default paths when this code runs as the main
# module. The DataFrame returned by create_dataset() is discarded here; the
# CSV written to disk is the only result kept.
if __name__ == "__main__":
    create_dataset()


[INFO] Mulai membaca dataset...

Total file audio: 600
Good: 600
Bad : 0

[INFO] Dataset berhasil dibuat: data/voice_dataset.csv (600 sampel)
[INFO] user_map: {'user1': 0, 'user2': 1, 'user3': 2}
[INFO] status_map: {'buka': 0, 'tutup': 1}


In [23]:
# NOTE(review): `main` is not defined in any cell visible in this notebook, and
# this cell's execution count (23) precedes the cell above (24), so it depends
# on kernel state from an earlier session. Presumably `main` returns the dataset
# DataFrame and the list of feature column names; confirm and define or import
# it explicitly so Restart & Run All works.
df, feature_cols = main()


[INFO] Dataset sudah ada: data/voice_dataset_baru.csv (560 sampel)
[INFO] Contoh 5 baris dataset:
        mfcc0      mfcc1      mfcc2      mfcc3     mfcc4     mfcc5     mfcc6  \
0 -493.793915  54.684177  10.899656   0.599356  0.892953 -2.432448 -1.405340   
1 -410.737488  77.005905  16.918690   7.285315  0.847548 -8.428810 -4.446471   
2 -498.079376  47.904259  12.686226   3.156374  3.864322  1.550516  0.679076   
3 -411.690216  52.576534  -1.328665   2.500839  2.465005 -7.701874 -3.539030   
4 -409.559906  70.350960  10.662391  11.399956  6.045907 -6.577566 -2.859235   

      mfcc7     mfcc8     mfcc9  ...     mfcc30     mfcc31    mfcc32  \
0  1.641003  1.199440 -3.698258  ...  16.727251  17.430390  0.012001   
1 -2.462752 -1.674619 -3.003543  ...  17.604445  18.730287  0.022926   
2  2.481086  2.445888 -2.372571  ...  16.789003  17.103317  0.024730   
3  0.286026  3.740849  4.236199  ...  16.034323  18.852358  0.012389   
4 -1.914279 -0.658268 -0.607756  ...  17.611368  19.187455  0