# Notebook Pelatihan Utama - Pengenalan Ucapan Disartria (KAGGLE VERSION)
**Version:** 20260115_2356
**Tujuan:** Analisis Perbandingan **Lightweight CNN-STFT** (Diusulkan) vs **Model Transfer Learning**.
**Platform:** Kaggle Kernels (GPU T4 x2).
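**Strategy:** Stratified Random Split 80/10/10 (Paper 2 style). Note: the training cell in this version does not apply a Subject-Independent (speaker-grouped) split.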

## 🆕 Log Perubahan (Changelog)

*   **Fix**: Resolved `axis 1 out of bounds` error for NASNetMobile (Robust 1D/2D output handling).
*   **Fix**: Added `%load_ext tensorboard` magic command to ensure TensorBoard works in Kaggle.
*   **Feature**: Added **Thesis-Ready Visualization** suite (CM, ROC, PRC, Learning Curves saved as PNG).
*   **Feature**: Added **Comparison Plots** (Accuracy & Time Bar Charts) at the end of the notebook.
*   **Feature**: Added **Extended CSV Logs** (Predictions with file names for Error Analysis).
*   **Optimization**: Fully implemented **Paper 2 Alignment** (16kHz, Librosa STFT).

## 📋 Panduan Setup Kaggle
1. **Add Data**: Upload folder `backend` anda sebagai Dataset (beri nama `thesis-backend` misalnya).
2. **Add Data**: Cari dataset `UASpeech` dan `TORGO` (atau upload zip-nya jika punya privasi).
3. **Internet**: Aktifkan Internet di menu Settings (kanan) jika perlu download via `gdown`.

In [None]:
# 1. Setup Environment & Path (Kaggle Symlink Fix)
%load_ext tensorboard
import os
import sys
import glob

print("🚀 Memulai Setup Kaggle Environment...")

# A. Locate the first 'config.py' anywhere under the Kaggle input mount.
config_path = None
for root, _dirs, files in os.walk('/kaggle/input'):
    if 'config.py' in files:
        config_path = os.path.join(root, 'config.py')
        break

if config_path:
    print(f"✅ Ditemukan Config di: {config_path}")
    source_dir = os.path.dirname(config_path)

    # B. Create a 'src' symlink in the working directory so that
    #    `from src import config` works regardless of how the uploaded
    #    dataset folder was named or flattened.
    target_link = '/kaggle/working/src'
    # Check islink() first: os.path.exists() follows links and returns False
    # for a dangling symlink, which would make os.symlink() below fail with
    # FileExistsError on a re-run.
    if os.path.islink(target_link):
        os.unlink(target_link)
    elif os.path.isdir(target_link):
        import shutil
        shutil.rmtree(target_link)
    elif os.path.lexists(target_link):
        # A plain file is in the way; rmtree() cannot remove it.
        os.remove(target_link)

    os.symlink(source_dir, target_link)
    print(f"🔗 Symlink dibuat: {target_link} -> {source_dir}")

    # C. Make the working directory importable.
    if '/kaggle/working' not in sys.path:
        sys.path.append('/kaggle/working')
else:
    # Best effort: report the problem and continue; the import cell will
    # report the failure again when `src` cannot be imported.
    print("❌ FATAL: File 'config.py' tidak ditemukan di Input manapun!")
    print("   Pastikan Anda sudah 'Add Data' folder backend.")

# D. Output locations used by the following cells.
OUTPUT_ROOT = '/kaggle/working'
LOCAL_DATA_ROOT = '/kaggle/working/data'
os.makedirs(LOCAL_DATA_ROOT, exist_ok=True)

print("Environment Siap!")

In [None]:
# 2. Install dependencies (IPython shell escapes; -q keeps pip output quiet)
!pip install -q tensorflow-io
# NOTE(review): the data-preparation cell imports `gdown`, which is not
# installed here — confirm the Kaggle image already provides it.
!pip install -q pandas matplotlib seaborn scikit-learn librosa

In [None]:
# 3. Import the project modules and redirect their output folders.
try:
    from src import config, data_loader, models, trainer
except ImportError as e:
    # Best effort: report and continue; later cells will fail on the
    # missing names if the backend folder was not attached.
    print(f"❌ Gagal import modul: {e}")
    print("Pastikan 'backend' terdeteksi dengan benar.")
else:
    print("✅ Modul berhasil diimport: config, data_loader, models, trainer")

    # Point the project config at the writable Kaggle working directory.
    config.MODELS_DIR = os.path.join(OUTPUT_ROOT, 'models')
    config.OUTPUTS_DIR = os.path.join(OUTPUT_ROOT, 'outputs')
    for out_dir in (config.MODELS_DIR, config.OUTPUTS_DIR):
        os.makedirs(out_dir, exist_ok=True)
    print(f"📂 Output Directory set to: {config.OUTPUTS_DIR}")

In [None]:
# 3.5 Visualization Helpers (From Paper 2 - CLONED)
# NOTE: this cell was stored with literal "\n" / "\"" escape sequences, which
# turned the whole cell into a comment plus a syntax error; it is restored
# here as real source so the helpers are actually defined.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, classification_report
import numpy as np
import pandas as pd
import os


def smooth_curve(points, factor=0.8):
    """Return an exponentially smoothed copy of *points*.

    The first value is kept as-is; each following value is
    ``previous_smoothed * factor + point * (1 - factor)``.
    """
    smoothed = []
    for point in points:
        if smoothed:
            # Basic EMA
            smoothed.append(smoothed[-1] * factor + point * (1 - factor))
        else:
            smoothed.append(point)
    return smoothed


def plot_learning_curve(history, model_name, run_name):
    """Plot and save raw and smoothed accuracy/loss curves.

    *history* is indexed with the keys 'accuracy', 'val_accuracy', 'loss'
    and 'val_loss'. Two PNG files are written to ``config.OUTPUTS_DIR``:
    ``<run_name>_learning_asli.png`` and ``<run_name>_learning_smooth.png``.
    """
    # 1. Learning curves (raw / without smoothing)
    sns.set_style("whitegrid")
    fig_learning_asli, axs_learning_asli = plt.subplots(1, 2, figsize=(15, 5))
    fig_learning_asli.suptitle(f'Kurva Pembelajaran (Asli): {model_name}', fontsize=16)

    axs_learning_asli[0].plot(history['accuracy'], '-', label='Akurasi Training', linewidth=2)
    axs_learning_asli[0].plot(history['val_accuracy'], '-', label='Akurasi Validasi', linewidth=2)
    axs_learning_asli[0].set_title('Grafik Akurasi (Asli)')
    axs_learning_asli[0].set_xlabel('Epoch')
    axs_learning_asli[0].set_ylabel('Akurasi')
    axs_learning_asli[0].legend(loc='lower right')

    axs_learning_asli[1].plot(history['loss'], '-', label='Loss Training', linewidth=2)
    axs_learning_asli[1].plot(history['val_loss'], '-', label='Loss Validasi', linewidth=2)
    axs_learning_asli[1].set_title('Grafik Loss (Asli)')
    axs_learning_asli[1].set_xlabel('Epoch')
    axs_learning_asli[1].set_ylabel('Loss')
    axs_learning_asli[1].legend(loc='upper right')

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.savefig(os.path.join(config.OUTPUTS_DIR, f"{run_name}_learning_asli.png"))
    plt.show()
    # Release the figure so repeated calls in the training loop do not pile up.
    plt.close(fig_learning_asli)

    # 2. Learning curves (with smoothing, raw curves kept faint underneath)
    fig_learning_smooth, axs_learning_smooth = plt.subplots(1, 2, figsize=(15, 5))
    fig_learning_smooth.suptitle(f'Kurva Pembelajaran (Smoothed): {model_name}', fontsize=16)

    smoothed_accuracy = smooth_curve(history['accuracy'])
    smoothed_val_accuracy = smooth_curve(history['val_accuracy'])
    smoothed_loss = smooth_curve(history['loss'])
    smoothed_val_loss = smooth_curve(history['val_loss'])

    axs_learning_smooth[0].plot(history['accuracy'], '-', label='Akurasi Training (Asli)', alpha=0.3)
    axs_learning_smooth[0].plot(history['val_accuracy'], '-', label='Akurasi Validasi (Asli)', alpha=0.3)
    axs_learning_smooth[0].plot(smoothed_accuracy, '-', label='Akurasi Training (Smoothed)', linewidth=2)
    axs_learning_smooth[0].plot(smoothed_val_accuracy, '-', label='Akurasi Validasi (Smoothed)', linewidth=2)
    axs_learning_smooth[0].set_title('Grafik Akurasi (Smoothed)')
    axs_learning_smooth[0].set_xlabel('Epoch')
    axs_learning_smooth[0].set_ylabel('Akurasi')
    axs_learning_smooth[0].legend(loc='lower right')

    axs_learning_smooth[1].plot(history['loss'], '-', label='Loss Training (Asli)', alpha=0.3)
    axs_learning_smooth[1].plot(history['val_loss'], '-', label='Loss Validasi (Asli)', alpha=0.3)
    axs_learning_smooth[1].plot(smoothed_loss, '-', label='Loss Training (Smoothed)', linewidth=2)
    axs_learning_smooth[1].plot(smoothed_val_loss, '-', label='Loss Validasi (Smoothed)', linewidth=2)
    axs_learning_smooth[1].set_title('Grafik Loss (Smoothed)')
    axs_learning_smooth[1].set_xlabel('Epoch')
    axs_learning_smooth[1].set_ylabel('Loss')
    axs_learning_smooth[1].legend(loc='upper right')

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.savefig(os.path.join(config.OUTPUTS_DIR, f"{run_name}_learning_smooth.png"))
    plt.show()
    plt.close(fig_learning_smooth)


def plot_confusion_matrix(y_true, y_pred, classes, model_name, run_name):
    """Plot and save a confusion-matrix heatmap annotated with count and row percentage.

    The figure is written to ``config.OUTPUTS_DIR`` as ``<run_name>_cm.png``.
    """
    # Paper 2 Style: Heatmap with Count + Percentage
    cm = confusion_matrix(y_true, y_pred)
    row_totals = cm.sum(axis=1, keepdims=True)
    # A class with no true samples has a zero row total; report 0% for that
    # row instead of dividing by zero and annotating the cells with NaN.
    cm_percent = np.divide(
        cm.astype('float'),
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )
    annot_labels = (np.asarray(["{0:d}\n({1:.1%})".format(value, P_value)
                                for value, P_value in zip(cm.flatten(), cm_percent.flatten())])
                    ).reshape(cm.shape)
    fig = plt.figure(figsize=(8, 6))
    sns.heatmap(cm_percent, annot=annot_labels, fmt='', cmap='Blues',
                xticklabels=classes, yticklabels=classes)
    plt.title(f'Confusion Matrix: {model_name}', fontsize=14)
    plt.ylabel('Label Aktual')
    plt.xlabel('Label Prediksi')
    plt.tight_layout()
    plt.savefig(os.path.join(config.OUTPUTS_DIR, f"{run_name}_cm.png"))
    plt.show()
    plt.close(fig)


def plot_roc_curve(y_true, y_pred_probs, model_name, run_name):
    """Save the ROC curve (with its AUC in the legend) as ``<run_name>_roc.png``."""
    fpr, tpr, _ = roc_curve(y_true, y_pred_probs)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'AUC = {roc_auc:.2f}')
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.legend(loc="lower right")
    plt.title(f'ROC: {model_name}')
    plt.grid(True)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.savefig(os.path.join(config.OUTPUTS_DIR, f"{run_name}_roc.png"))
    plt.close()


def plot_pr_curve(y_true, y_pred_probs, model_name, run_name):
    """Save the precision-recall curve as ``<run_name>_prc.png``."""
    precision, recall, _ = precision_recall_curve(y_true, y_pred_probs)
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, color='purple', lw=2)
    plt.title(f'PR Curve: {model_name}')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.grid(True)
    plt.savefig(os.path.join(config.OUTPUTS_DIR, f"{run_name}_prc.png"))
    plt.close()


def plot_class_report(y_true, y_pred, classes, model_name, run_name):
    """Save a grouped bar chart of per-class precision, recall and F1.

    The figure is written to ``config.OUTPUTS_DIR`` as
    ``<run_name>_report_bar.png``.
    """
    # Paper 2 Style: Detailed Bar Chart per Class
    report_dict = classification_report(y_true, y_pred, target_names=classes, output_dict=True)
    # Keep only the per-class rows; the aggregate rows are dropped.
    df = pd.DataFrame(report_dict).transpose().drop(['accuracy', 'macro avg', 'weighted avg'])
    df = df.reset_index().rename(columns={'index': 'class'}).melt(
        id_vars='class', value_vars=['precision', 'recall', 'f1-score'])

    plt.figure(figsize=(10, 6))
    ax = sns.barplot(x='class', y='value', hue='variable', data=df, palette='viridis')
    plt.title(f'Grafik Laporan Klasifikasi per Kelas: {model_name}', fontsize=14)
    plt.ylim(0, 1.1)
    plt.xlabel('Kelas')
    plt.ylabel('Skor')
    plt.legend(title='Metrik')
    # Write each bar's value just above it.
    for p in ax.patches:
        ax.annotate(f"{p.get_height():.2f}", (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha='center', va='center', xytext=(0, 9), textcoords='offset points')

    plt.tight_layout()
    plt.savefig(os.path.join(config.OUTPUTS_DIR, f"{run_name}_report_bar.png"))
    plt.close()

In [None]:
# 4. Persiapan Data (Kaggle Auto-Detect or Gdown)
import shutil
import subprocess
import gdown

# Google Drive file IDs (fallback download source when the datasets are not
# attached as Kaggle Datasets)
UASPEECH_ID = '1L17F0SAkRk3rEjHDUyToLUvNp99sNMvE'
TORGO_ID = '1YU7aCqa4qyn75XRdFPAWEqVv_1Qpl9cG'

def setup_dataset_kaggle(name, file_id, extract_path):
    """Locate a dataset folder, downloading and extracting it if necessary.

    Lookup order:
      1. a directory under /kaggle/input named ``<name>`` or
         ``<name>_smalldataset`` (case-insensitive);
      2. an already extracted copy under *extract_path* (either folder name);
      3. download ``<name>.zip`` from Google Drive via gdown and unzip it
         into *extract_path*.

    Args:
        name: Dataset folder name, e.g. 'UASpeech' or 'TORGO'.
        file_id: Google Drive file id used for the fallback download.
        extract_path: Directory that receives the zip file and its contents.

    Returns:
        Path of the dataset directory.

    Raises:
        RuntimeError: if the download did not produce a zip file.
        subprocess.CalledProcessError: if ``unzip`` fails.
    """
    print(f"\n--- Setup Dataset: {name} ---")
    accepted_names = {name.lower(), f"{name}_smalldataset".lower()}

    # 1. Look under /kaggle/input (the user may have attached the data).
    #    Pattern: /kaggle/input/<name> or /kaggle/input/<anything>/<name>
    candidates = glob.glob(f'/kaggle/input/**/*{name}*', recursive=True)
    for p in candidates:
        # Directories only: zip and metadata files can also match the pattern.
        if os.path.isdir(p) and os.path.basename(p).lower() in accepted_names:
            print(f"✅ Ditemukan Dataset di Input: {p}")
            return p

    # 2. Not attached as input: reuse a previous extraction if there is one.
    print(f"⚠️ {name} tidak ditemukan di ke Kaggle Input. Mencoba download via Gdown...")

    local_zip_path = os.path.join(extract_path, f"{name}.zip")
    target_extract = os.path.join(extract_path, name)
    # Some archives unpack to '<name>_smalldataset'; without this check a
    # re-run would download the archive again every time.
    alt_extract = os.path.join(extract_path, f"{name}_smalldataset")

    for existing in (target_extract, alt_extract):
        if os.path.exists(existing):
            print(f"✅ Dataset sudah ada di Working Dir: {existing}")
            return existing

    # 3. Download and extract.
    os.makedirs(extract_path, exist_ok=True)
    url = f'https://drive.google.com/uc?id={file_id}'
    downloaded = gdown.download(url, local_zip_path, quiet=False)
    if not downloaded or not os.path.exists(local_zip_path):
        # Fail here with a clear message rather than letting unzip complain
        # about a missing file.
        raise RuntimeError(
            f"Download of {name} failed (Google Drive id: {file_id}). "
            "Attach the dataset via 'Add Data' or check Internet access."
        )

    print(f"Mengekstrak {name}...")
    subprocess.check_call(['unzip', '-o', '-q', local_zip_path, '-d', extract_path])
    print(f"✅ {name} Selesai diekstrak.")

    # The extracted folder name sometimes differs from the expected one.
    if not os.path.exists(target_extract) and os.path.exists(alt_extract):
        return alt_extract

    return target_extract

# Resolve both dataset folders (attached input, cached copy, or download).
uaspeech_path = setup_dataset_kaggle(
    name='UASpeech', file_id=UASPEECH_ID, extract_path=LOCAL_DATA_ROOT
)
torgo_path = setup_dataset_kaggle(
    name='TORGO', file_id=TORGO_ID, extract_path=LOCAL_DATA_ROOT
)

# ---------------------------------------------------------
# LOADING DATA
# ---------------------------------------------------------
print("\nMemuat Path File...")

# Collect audio file paths together with their labels and speaker ids.
(uaspeech_files,
 uaspeech_labels,
 uaspeech_speakers) = data_loader.get_file_paths(uaspeech_path, 'UASpeech')
(torgo_files,
 torgo_labels,
 torgo_speakers) = data_loader.get_file_paths(torgo_path, 'TORGO')

# --- GENERATE DATASET STATS FOR DASHBOARD ---
import json
print("Generating Dataset Statistics...")

def get_stats(name, files, labels, speakers):
    """Build the dashboard statistics dictionary for one dataset.

    Args:
        name: Display name of the dataset.
        files: Sequence of audio file paths (only its length is used).
        labels: Per-file labels; a label equal to 1 is reported as
            "Dysarthric", anything else as "Control".
        speakers: Per-file speaker ids, aligned with *labels*.

    Returns:
        A dict with keys "name", "stats" (formatted sample count, class
        count, and a placeholder average length) and "summaryData" (one
        entry per distinct label, in first-seen order).
    """
    from collections import Counter

    # Counter keeps first-seen order, so the output order is deterministic
    # (iterating a set of labels is not guaranteed to be).
    counts = Counter(labels)

    # Speakers are counted per class; the previous version reported the
    # dataset-wide speaker total for every class.
    speakers_by_label = {}
    for label, speaker in zip(labels, speakers):
        speakers_by_label.setdefault(label, set()).add(speaker)

    summary = []
    for label, total in counts.items():
        # NOTE(review): assumes the dysarthric class is encoded as 1 — confirm
        # against data_loader.get_file_paths.
        cat = "Dysarthric" if label == 1 else "Control"
        # Nominal 80/20 estimate for the dashboard, not the real split sizes.
        train_estimate = int(total * 0.8)
        summary.append({
            "category": cat,
            "speakers": len(speakers_by_label.get(label, ())),
            "totalRaw": total,
            "trainRaw": train_estimate,
            "testRaw": total - train_estimate
        })

    return {
        "name": name,
        "stats": {
            "samples": f"{len(files):,}",
            "classes": str(len(counts)),
            "avgLen": "N/A"  # Skipped: computing it requires loading every file
        },
        "summaryData": summary
    }

# Collect the statistics of both datasets under the keys the dashboard reads.
stats_export = {
    key: get_stats(display_name, ds_files, ds_labels, ds_speakers)
    for key, display_name, ds_files, ds_labels, ds_speakers in (
        ("uaspeech", 'UASpeech', uaspeech_files, uaspeech_labels, uaspeech_speakers),
        ("torgo", 'TORGO', torgo_files, torgo_labels, torgo_speakers),
    )
}

# Serialize first, then write the finished text to the output folder.
with open(os.path.join(config.OUTPUTS_DIR, "dataset_stats.json"), 'w') as f:
    f.write(json.dumps(stats_export, indent=4))
print("✅ dataset_stats.json saved.")

# --- GENERATE REAL EDA SAMPLES (Audio + Signals) ---
# For each dataset, copy up to 5 random dysarthric and 5 random control
# recordings into <OUTPUTS_DIR>/samples, save a waveform PNG and a
# mel-spectrogram PNG per recording, and export a compact JSON description
# (80-bar waveform envelope + low-resolution spectrogram) for the dashboard.
print("Generating EDA Samples (Waveform & Spectrogram data)...")
import random
import shutil
import librosa
import librosa.display  # explicit: plain `import librosa` may not expose the submodule
import matplotlib.pyplot as plt  # this cell previously relied on another cell's import
import numpy as np

samples_out_dir = os.path.join(config.OUTPUTS_DIR, "samples")
os.makedirs(samples_out_dir, exist_ok=True)

eda_export = {}

# Iterate over both datasets
for ds_name, (ds_files, ds_labels, ds_speakers) in [('uaspeech', (uaspeech_files, uaspeech_labels, uaspeech_speakers)), ('torgo', (torgo_files, torgo_labels, torgo_speakers))]:
    eda_export[ds_name] = {'dysarthric': [], 'control': []}

    # Binary split of the file indices.
    # NOTE(review): assumes label 1 means dysarthric (same convention as
    # get_stats) — adjust if the loader encodes labels differently.
    indices_dys = [i for i, x in enumerate(ds_labels) if x == 1]
    indices_ctrl = [i for i, x in enumerate(ds_labels) if x != 1]

    # Pick up to 5 random recordings from each group
    picks_dys = random.sample(indices_dys, min(5, len(indices_dys)))
    picks_ctrl = random.sample(indices_ctrl, min(5, len(indices_ctrl)))

    for category, picks in [('dysarthric', picks_dys), ('control', picks_ctrl)]:
        for idx in picks:
            src = ds_files[idx]
            fname = f"{ds_name}_{os.path.basename(src)}"  # Dataset prefix to avoid cross-dataset collisions
            dst = os.path.join(samples_out_dir, fname)
            shutil.copy(src, dst)
            sample_id = os.path.splitext(fname)[0]

            # Analyze the signal. Best effort: one unreadable file must not
            # abort the whole export, so failures are reported and skipped.
            try:
                y, sr = librosa.load(src, sr=16000)
                duration = len(y) / sr

                # 1. Waveform envelope: at most 80 bars of peak absolute amplitude
                hop_len = max(1, len(y) // 80)
                waveform = [float(np.max(np.abs(y[i:i+hop_len]))) for i in range(0, len(y), hop_len)][:80]
                # Scale to 0-100 for CSS bar heights. `or 1` covers both an
                # empty list and an all-zero (silent) clip, which would
                # otherwise divide by zero and drop the sample.
                max_val = max(waveform, default=0) or 1
                waveform = [int((v / max_val) * 100) for v in waveform]

                # 2. Mel spectrogram (low resolution for JSON: 40 bands x up to 60 time steps)
                # Enough for visual "texture" without bloating the JSON
                n_mels = 40
                hop_spec = len(y) // 60
                if hop_spec < 512:
                    hop_spec = 512  # Minimum hop

                S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, hop_length=hop_spec)
                S_db = librosa.power_to_db(S, ref=np.max)

                # --- SAVE PNGs (Thesis Requirement - Matches Paper 2 Colors) ---
                # Paper 2: UASpeech (Blue/Orange), TORGO (Green/Tomato)
                if ds_name.lower() == 'uaspeech':
                    color = 'dodgerblue' if category == 'control' else 'orangered'
                else:  # TORGO
                    color = 'mediumseagreen' if category == 'control' else 'tomato'

                # 1. Waveform
                plt.figure(figsize=(4, 2))
                librosa.display.waveshow(y, sr=sr, alpha=0.7, color=color)
                plt.title(f'Wave: {os.path.basename(fname)}')
                plt.tight_layout()
                plt.savefig(os.path.join(samples_out_dir, f"{sample_id}_wave.png"))
                plt.close()

                # 2. Spectrogram
                plt.figure(figsize=(4, 2))
                librosa.display.specshow(S_db, sr=sr, hop_length=hop_spec, x_axis='time', y_axis='mel')
                plt.colorbar(format='%+2.0f dB')
                plt.title(f'MelSpec: {os.path.basename(fname)}')
                plt.tight_layout()
                plt.savefig(os.path.join(samples_out_dir, f"{sample_id}_spec.png"))
                plt.close()
                # --------------------------------------

                # Normalize to 0-1; a flat spectrogram (zero dB range) maps to
                # all zeros instead of dividing by zero.
                min_db, max_db = S_db.min(), S_db.max()
                db_range = max_db - min_db
                if db_range > 0:
                    S_norm = (S_db - min_db) / db_range
                else:
                    S_norm = np.zeros_like(S_db)

                # Ensure dimensions (cut if too long)
                if S_norm.shape[1] > 60:
                    S_norm = S_norm[:, :60]

                spectrogram = S_norm.tolist()  # List of lists

                eda_export[ds_name][category].append({
                    "id": sample_id,
                    "name": fname,
                    "duration": f"{duration:.1f}s",
                    "durationSec": duration,
                    "type": category,
                    "severity": "Unknown",
                    "waveform": waveform,
                    "spectrogram": spectrogram,
                    "url": f"/static/samples/{fname}"
                })
            except Exception as e:
                # Close any figure left open by a failure mid-plot.
                plt.close('all')
                print(f"⚠️ Error processing {fname}: {e}")

with open(os.path.join(config.OUTPUTS_DIR, "eda_samples.json"), 'w') as f:
    json.dump(eda_export, f)
print("✅ eda_samples.json saved (Audio & Visuals).")

print("Data terload. Siap training.")

In [None]:
# 5. ANALISIS MODEL & PERBANDINGAN STRUKTUR (WAJIB PAPER 2)
# Bagian ini dipisahkan agar analisa FLOPs, Parameter, dan Memory terlihat jelas sebelum Training dimulai.
import io
import pandas as pd
import tensorflow as tf
import os

import json  # used for the efficiency export; this cell previously relied on another cell's import

print("\n--- 2. Membangun dan Meringkas Semua Arsitektur Model ---")
summary_list = []

# Standard input shapes for the analysis: the transfer-learning models get a
# 3-channel MFCC input (RGB emulation for ImageNet backbones), the STFT CNN
# gets a 1-channel spectrogram.
input_shape_mfcc = (config.N_MFCC, config.MFCC_MAX_LEN, 3)
# Spectrogram height = N_FFT / 2 + 1 frequency bins.
n_stft_bins = (config.N_FFT // 2) + 1
input_shape_stft = (n_stft_bins, config.MFCC_MAX_LEN, 1)

for model_key, model_display_name in config.MODELS.items():
    print(f"Menganalisis arsitektur untuk: {model_display_name}...")

    # Pick the input shape that matches the model's feature type.
    current_input_shape = input_shape_stft if model_key == 'cnn_stft' else input_shape_mfcc

    # Build the model and collect its metrics.
    tf.keras.backend.clear_session()
    try:
        model = models.get_model(model_key, current_input_shape, num_classes=2)

        total_params = model.count_params()
        flops = trainer.get_flops(model)
        peak_mem_32bit, disk_size_32bit = trainer.get_model_memory_usage(model)
    except Exception as e:
        print(f"⚠️ Gagal build/metric {model_display_name}: {e}")
        # Reset every metric, including total_params: previously it was left
        # unset here, which raised NameError on a first-model failure or
        # silently reused the previous model's parameter count.
        total_params = 0
        flops = 0
        peak_mem_32bit = 0
        disk_size_32bit = 0
        # Placeholder summary for the failed model.
        architecture_summary = "Error building model"
    else:
        # Capture the Keras summary text instead of printing it right away.
        stream = io.StringIO()
        model.summary(print_fn=lambda x: stream.write(x + '\n'))
        architecture_summary = stream.getvalue()
        stream.close()

    summary_list.append({
        "Model": model_display_name,
        "Total Parameter": total_params,
        "FLOPs": flops,
        "Ukuran di Disk (32-bit)": disk_size_32bit,
        # 8-bit estimates are a quarter of the 32-bit figures.
        "Estimasi Ukuran 8-bit": disk_size_32bit / 4,
        "Estimasi Memori Aktivasi 8-bit": peak_mem_32bit / 4,
        "Architecture Summary": architecture_summary
    })

# --- SAVE EFFICIENCY METRICS (JSON) ---
# Written once after the loop; the previous version rebuilt and rewrote the
# same file on every iteration.
# NOTE(review): the "MB"/"KB" suffixes below depend on the units returned by
# trainer.get_model_memory_usage — confirm they match a single /1024 scaling.
efficiency_export = {
    item['Model']: {
        "params": str(item['Total Parameter']),
        "flops": str(item['FLOPs']),
        "size": f"{item['Estimasi Ukuran 8-bit'] / 1024:.2f} MB",
        "activation": f"{item['Estimasi Memori Aktivasi 8-bit'] / 1024:.2f} KB"
    }
    for item in summary_list
}

with open(os.path.join(config.OUTPUTS_DIR, "model_efficiency.json"), 'w') as f:
    json.dump(efficiency_export, f, indent=4)
print("✅ model_efficiency.json saved.")

# Print the section banner for the summary table
print(
    "\n" + "=" * 80,
    "--- 3. Tabel Ringkasan Metrik untuk Edge Device ---",
    "=" * 80,
    sep="\n",
)

# Columns shown in the table (the long architecture summary is left out).
columns_to_show = [
    "Model",
    "Total Parameter",
    "FLOPs",
    "Estimasi Ukuran 8-bit",
    "Estimasi Memori Aktivasi 8-bit",
]
param_summary_df = pd.DataFrame(summary_list).loc[:, columns_to_show]

def format_flops_str(f):
    """Format a FLOP count as 'x.xx GFLOPs' or 'x.xx MFLOPs'.

    Returns "N/A" for ``None`` or 0. Values above 1e9 are shown in GFLOPs,
    everything else in MFLOPs.
    """
    if f is None or f == 0:
        return "N/A"
    if f > 1e9:
        scale, unit = 1e9, "GFLOPs"
    else:
        scale, unit = 1e6, "MFLOPs"
    return f"{f / scale:.2f} {unit}"
def format_bytes_str(b):
    """Format a size as 'x.xx MB' or 'x.xx KB' using decimal units.

    Returns "N/A" for ``None`` or 0. Values above 1e6 are shown in MB,
    everything else in KB.
    """
    if b is None or b == 0:
        return "N/A"
    if b > 1e6:
        scale, unit = 1e6, "MB"
    else:
        scale, unit = 1e3, "KB"
    return f"{b / scale:.2f} {unit}"

# Turn each numeric column into its display string.
for column, formatter in (
    ('Total Parameter', '{:,}'.format),
    ('FLOPs', format_flops_str),
    ('Estimasi Ukuran 8-bit', format_bytes_str),
    ('Estimasi Memori Aktivasi 8-bit', format_bytes_str),
):
    param_summary_df[column] = param_summary_df[column].map(formatter)

print(param_summary_df.to_string(index=False))

# Print the captured architecture summary of every model
print(
    "\n\n" + "=" * 65,
    "--- 4. Rincian Arsitektur per Model ---",
    "=" * 65,
    sep="\n",
)
for model_data in summary_list:
    print(f"\n>>> {model_data['Model']}:")
    print(model_data['Architecture Summary'])

In [None]:
# 6. Loop Pelatihan (Sekarang Fokus Training Saja)
from sklearn.model_selection import GroupShuffleSplit
import numpy as np

import json
import time

import numpy as np
import pandas as pd
from sklearn.metrics import (
    auc,
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    roc_curve,
)
from sklearn.model_selection import train_test_split

datasets = {
    'UASpeech': (uaspeech_files, uaspeech_labels, uaspeech_speakers),
    'TORGO': (torgo_files, torgo_labels, torgo_speakers)
}

# One entry per (model, dataset) run. Initialised explicitly on every
# execution of this cell: the previous `'benchmark_results' not in locals()`
# check kept the list alive between re-runs and duplicated entries.
benchmark_results = []

for dataset_name, (data_files, data_labels, data_speakers) in datasets.items():
    print(f"\n{'#'*60}")
    print(f"MEMPROSES TRAINING DATASET: {dataset_name}")
    print(f"{'#'*60}\n")

    if len(data_files) == 0:
        continue

    # Class mapping: label -> integer index, in sorted label order.
    unique_classes = sorted(set(data_labels))
    class_mapping = {label: idx for idx, label in enumerate(unique_classes)}
    # String form of the class names for reports and plot tick labels.
    class_names = [str(c) for c in unique_classes]

    # Convert to NumPy for easy indexing
    X = np.array(data_files)
    y = np.array(data_labels)

    # 1. SPLIT (PAPER 2 METHOD: STANDARD STRATIFIED RANDOM SPLIT)
    # NOTE(review): this is NOT a speaker-grouped split — `data_speakers` is
    # not used, so recordings of one speaker can land in both train and test.
    # This follows the Paper 2 methodology on purpose; use a group-aware
    # splitter instead if a subject-independent evaluation is required.

    # Split 1: 80% train, 20% (test + val)
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Split 2: halve the remaining 20% -> 10% val, 10% test overall
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
    )

    # Print distribution
    print(f"--- Data Distribution ({dataset_name}) [Paper 2 Style - Random Split] ---")
    print(f"[Train] Samples: {len(X_train)}")
    print(f"[Val  ] Samples: {len(X_val)}")
    print(f"[Test ] Samples: {len(X_test)}")

    for model_key, model_display_name in config.MODELS.items():
        print(f"\n--- Training Pipeline: {model_display_name} @ {dataset_name} ---")

        # Pipeline: dataset -> build -> train -> evaluate. A failure in one
        # model is reported and the loop moves on to the next one.
        try:
            # Feature type
            feature_type = 'stft' if model_key == 'cnn_stft' else 'mfcc'

            # Create datasets
            train_ds = data_loader.create_tf_dataset(X_train, y_train, class_mapping, is_training=True, feature_type=feature_type)
            val_ds = data_loader.create_tf_dataset(X_val, y_val, class_mapping, is_training=False, feature_type=feature_type)
            test_ds = data_loader.create_tf_dataset(X_test, y_test, class_mapping, is_training=False, feature_type=feature_type)

            # Read the per-sample input shape from the first batch.
            input_shape = None
            for feature, label in train_ds.take(1):
                input_shape = feature.shape[1:]
                break

            tf.keras.backend.clear_session()
            model = models.get_model(model_key, input_shape, num_classes=len(unique_classes))

            # Training
            run_name = f"{model_key}_{dataset_name}"
            history, time_taken = trainer.train_model(model, train_ds, val_ds, model_name=run_name)
            print(f"-> Training Done ({time_taken:.2f}s)")

            # Evaluation & benchmark metrics
            print(f"-> Evaluating {run_name}...")

            # 1. Inference time (wall clock of predict over the whole test
            #    dataset, so it includes the input pipeline)
            start_eval = time.time()
            y_pred_probs = model.predict(test_ds)
            end_eval = time.time()

            # Count samples and collect true labels by iterating the dataset
            num_samples = 0
            y_true = []
            for features, labels in test_ds:
                num_samples += features.shape[0]
                y_true.extend(labels.numpy())

            # Guard against an empty test dataset.
            inference_time_ms = (
                ((end_eval - start_eval) / num_samples) * 1000 if num_samples else 0.0
            )

            # 2. Classification report
            # Handles both 1D / single-column (sigmoid) and 2D (softmax) outputs
            if y_pred_probs.ndim == 1 or y_pred_probs.shape[1] == 1:
                # Binary / sigmoid case
                y_pred = (y_pred_probs > 0.5).astype(int).flatten()
                prob_dysarthric = y_pred_probs.flatten()
            else:
                # Categorical / softmax case (standard for this project)
                y_pred = np.argmax(y_pred_probs, axis=1)
                # NOTE(review): treats class index 1 as the positive class.
                prob_dysarthric = y_pred_probs[:, 1]

            report_dict = classification_report(y_true, y_pred, target_names=class_names, output_dict=True)

            # Save report
            report_path = os.path.join(config.OUTPUTS_DIR, f"{run_name}_report.json")
            with open(report_path, 'w') as f:
                json.dump(report_dict, f, indent=4)
            print(f"-> Report saved: {report_path}")

            # 3. Append to benchmark summary
            benchmark_entry = {
                "model": model_key,
                "dataset": dataset_name,
                "accuracy": report_dict['accuracy'],
                "inference_time_ms": inference_time_ms,
                "training_time_sec": time_taken,
                "run_name": run_name
            }
            benchmark_results.append(benchmark_entry)

            # 4. EXTENDED EVALUATION (Thesis Edition) — best effort: a
            #    failure here must not discard the benchmark entry above.
            try:
                # A. Save model architecture
                arch_path = os.path.join(config.OUTPUTS_DIR, f"{run_name}_arch.txt")
                with open(arch_path, 'w') as f:
                    model.summary(print_fn=lambda x: f.write(x + '\n'))
                print(f"-> Architecture saved: {arch_path}")

                # B. Save full predictions (for error analysis)
                # The test files of the current split are X_test. The previous
                # code read undefined names (uaspeech_test_files /
                # torgo_test_files); the NameError was swallowed below and
                # skipped the CSV, all plots and the eval JSON.
                # NOTE(review): assumes create_tf_dataset(is_training=False)
                # keeps the input order — confirm in data_loader.
                curr_test_files = X_test

                if len(curr_test_files) == len(y_true):
                    file_col = [os.path.basename(f) for f in curr_test_files]
                else:
                    # Fallback when the dataset dropped or added samples
                    file_col = ['Unknown_File'] * len(y_true)

                pred_df = pd.DataFrame({
                    'file': file_col,
                    'true_label': y_true,
                    'pred_label': y_pred,
                    'prob_dysarthric': prob_dysarthric,
                    'is_correct': (np.array(y_true) == np.array(y_pred))
                })
                pred_csv_path = os.path.join(config.OUTPUTS_DIR, f"{run_name}_predictions.csv")
                pred_df.to_csv(pred_csv_path, index=False)
                print(f"-> Prediction Log saved: {pred_csv_path}")

                # C. Generate static plots (PNG for the thesis)
                print("-> Generating Thesis Plots...")
                plot_learning_curve(history.history, model_display_name, run_name)
                plot_confusion_matrix(y_true, y_pred, class_names, model_display_name, run_name)
                plot_roc_curve(y_true, prob_dysarthric, model_display_name, run_name)
                plot_pr_curve(y_true, prob_dysarthric, model_display_name, run_name)
                plot_class_report(y_true, y_pred, class_names, model_display_name, run_name)

                # D. Dashboard JSON export (kept for the React UI)
                cm = confusion_matrix(y_true, y_pred)
                fpr, tpr, _ = roc_curve(y_true, prob_dysarthric)
                roc_auc = auc(fpr, tpr)

                cm_list = cm.tolist()
                # Down-sample both curves to 50 points to keep the JSON small;
                # values are cast to plain floats for JSON serialization.
                indices = np.linspace(0, len(fpr)-1, 50).astype(int)
                roc_data = [{"x": float(fpr[i]), "y": float(tpr[i])} for i in indices]

                precision, recall, _ = precision_recall_curve(y_true, prob_dysarthric)
                indices_pr = np.linspace(0, len(precision)-1, 50).astype(int)
                pr_data = [{"x": float(recall[i]), "y": float(precision[i])} for i in indices_pr]

                eval_export = {
                    "cm": cm_list,
                    "roc": roc_data,
                    "pr": pr_data,
                    "auroc": float(roc_auc)
                }

                eval_path = os.path.join(config.OUTPUTS_DIR, f"{run_name}_eval.json")
                with open(eval_path, 'w') as f:
                    json.dump(eval_export, f)
                print(f"-> Extended Eval saved: {eval_path}")
            except Exception as e:
                print(f"⚠️ Failed to generate extended eval: {e}")

            # Save the summary after every run so partial results survive a
            # later failure.
            summary_path = os.path.join(config.OUTPUTS_DIR, "benchmark_summary.json")
            with open(summary_path, 'w') as f:
                json.dump(benchmark_results, f, indent=4)

            # Standard evaluation printout
            trainer.evaluate_model(model, test_ds, unique_classes, model_name=run_name)
        except Exception as e:
            print(f"ERROR Training {model_display_name}: {e}")

In [None]:
# 7. TensorBoard Visualization
# Launches TensorBoard inline in the notebook. Relies on the `%load_ext tensorboard`
# magic executed in the setup cell (cell 1).
# Log root is <config.OUTPUTS_DIR>/logs — presumably where the training cell writes
# its TensorBoard event files; confirm against the trainer's callback setup.
logs_base_dir = os.path.join(config.OUTPUTS_DIR, 'logs')
# IPython expands {logs_base_dir} from the notebook namespace before running the magic;
# the quotes keep a path containing spaces as a single --logdir argument.
%tensorboard --logdir "{logs_base_dir}"

In [None]:
# 8. Final Benchmarking Plots (Paper 2 Comparison - CLONED)
#
# Reloads the results saved under config.OUTPUTS_DIR and, per dataset, produces:
#   - a metric comparison table (printed),
#   - a grouped bar chart of accuracy / precision / recall / F1,
#   - combined ROC and PR curves (one line per model),
#   - a training-time table and bar chart.
# Every figure is also saved as a PNG in config.OUTPUTS_DIR.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import glob
import os
import numpy as np


def _autolabel(ax, bars):
    """Annotate each bar in `bars` with its height (3 decimals), rotated 90 degrees."""
    for bar in bars:
        height = bar.get_height()
        ax.annotate(f'{height:.3f}', xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3), textcoords="offset points",
                    ha='center', va='bottom', rotation=90)


# ------------------------------------------------------
# 1. LOAD ALL RESULTS
# ------------------------------------------------------
print("\nLoading all evaluation results...")
eval_files = glob.glob(os.path.join(config.OUTPUTS_DIR, '*_eval.json'))
all_results = {}

if not eval_files:
    print("❌ No evaluation files found. Skipping Comparison plots.")
else:
    summary_path = os.path.join(config.OUTPUTS_DIR, "benchmark_summary.json")

    # Bind these up front so the sections below degrade to "nothing to show"
    # instead of raising NameError when the summary file is missing.
    benchmark_data = []
    all_metrics = {}      # dataset -> model -> {metric name: value}
    training_times = {}   # dataset -> model -> seconds

    if os.path.exists(summary_path):
        with open(summary_path, 'r') as f:
            benchmark_data = json.load(f)

        # NOTE(review): assumes the summary is a list of per-run dicts with
        # 'dataset', 'model', 'run_name', 'accuracy', ... keys — confirm
        # against the training cell that writes benchmark_results.
        for entry in benchmark_data:
            ds = entry.get('dataset', 'unknown')
            mod = entry.get('model', 'unknown')

            # Metrics absent from the summary default to 0, so they show up
            # as zero-height bars rather than raising KeyError later.
            all_metrics.setdefault(ds, {})[mod] = {
                'accuracy': entry.get('accuracy', 0),
                'precision': entry.get('precision', 0),
                'recall': entry.get('recall', 0),
                'f1-score': entry.get('f1-score', 0)
            }
            training_times.setdefault(ds, {})[mod] = entry.get('training_time_sec', 0)
    else:
        print(f"⚠️ Benchmark summary not found: {summary_path}. Comparison tables and plots will be empty.")

    # ------------------------------------------------------
    # 2. METRIC COMPARISON TABLE
    # ------------------------------------------------------
    print(f"\n{'='*90}")
    print("📊📊📊 TABEL KOMPARASI METRIK ANTAR MODEL 📊📊📊")
    print(f"{'='*90}")
    for dataset_name, metrics in all_metrics.items():
        print(f"\n### KOMPARASI UNTUK DATASET: {dataset_name.upper()} ###")
        if metrics:
            # Transpose so that each row is a model and each column a metric.
            df_comparison = pd.DataFrame(metrics).T
            df_comparison.index.name = 'Model'
            print(df_comparison.round(3).to_string())
        else:
            print("Tidak ada metrik untuk ditampilkan.")
        print("-" * 50)

    # ------------------------------------------------------
    # 3. METRIC COMPARISON BAR CHART
    # ------------------------------------------------------
    print(f"\n{'='*90}")
    print("📈📈📈 GRAFIK BATANG PERBANDINGAN METRIK ANTAR MODEL 📈📈📈")
    print(f"{'='*90}")

    # (key in all_metrics, legend label), in left-to-right bar order.
    metric_bars = [
        ('accuracy', 'Accuracy'),
        ('precision', 'Precision'),
        ('recall', 'Recall'),
        ('f1-score', 'F1-score'),
    ]
    # Offsets (in bar widths) that centre the four bars on each model's tick.
    bar_offsets = (-1.5, -0.5, 0.5, 1.5)
    bar_width = 0.2

    for dataset_name, metrics_data in all_metrics.items():
        if not metrics_data:
            continue
        try:
            models = list(metrics_data.keys())
            index = np.arange(len(models))
            fig, ax = plt.subplots(figsize=(12, 7))
            colors = plt.cm.viridis(np.linspace(0.1, 0.9, len(metric_bars)))

            bar_groups = []
            for (metric_key, metric_label), offset, color in zip(metric_bars, bar_offsets, colors):
                values = [metrics_data[m].get(metric_key, 0) for m in models]
                bar_groups.append(
                    ax.bar(index + bar_width * offset, values, bar_width,
                           label=metric_label, color=color)
                )

            ax.set_xlabel('Model', fontweight='bold')
            ax.set_ylabel('Skor', fontweight='bold')
            ax.set_title(f'Perbandingan Metrik Antar Model: {dataset_name.upper()}',
                         fontsize=14, fontweight='bold')
            ax.set_xticks(index)
            ax.set_xticklabels(models, rotation=0)
            ax.legend()
            ax.set_ylim(0, 1.1)  # headroom above 1.0 for the value labels

            for bars in bar_groups:
                _autolabel(ax, bars)

            fig.tight_layout()
            plt.savefig(os.path.join(config.OUTPUTS_DIR, f"comparison_metrics_{dataset_name}.png"))
            plt.show()
        except Exception as e:
            print(f"Error plot metric bar for {dataset_name}: {e}")

    # ------------------------------------------------------
    # 4. COMBINED ROC CURVES
    # ------------------------------------------------------
    print(f"\n{'='*90}")
    print("📉📈📉 GRAFIK KURVA ROC (Combined) 📉📈📉")
    print(f"{'='*90}")

    # Group the summary entries by dataset so each dataset gets one figure.
    datasets_runs = {}
    for entry in benchmark_data:
        datasets_runs.setdefault(entry.get('dataset', 'unknown'), []).append(entry)

    for dataset_name, entries in datasets_runs.items():
        print(f"\n### KURVA ROC UNTUK DATASET: {dataset_name.upper()} ###")
        plt.figure(figsize=(10, 8))
        has_plot = False
        for entry in entries:
            run_name = entry.get('run_name')
            model_name = entry.get('model')
            eval_path = os.path.join(config.OUTPUTS_DIR, f"{run_name}_eval.json")
            try:
                with open(eval_path, 'r') as f:
                    data = json.load(f)
                roc_data = data.get('roc', [])
                if roc_data:
                    fpr = [p['x'] for p in roc_data]
                    tpr = [p['y'] for p in roc_data]
                    auroc = data.get('auroc', 0)
                    plt.plot(fpr, tpr, lw=2, label=f'{model_name} (AUC = {auroc:.2f})')
                    has_plot = True
            except Exception as e:
                # Best effort: one unreadable run must not block the others,
                # but report it instead of hiding it.
                print(f"⚠️ Skipping ROC curve for {run_name}: {e}")

        if has_plot:
            plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Acak (AUC = 0.50)')
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('False Positive Rate', fontsize=12)
            plt.ylabel('True Positive Rate', fontsize=12)
            plt.title(f'Kurva ROC Gabungan - {dataset_name.upper()}', fontsize=14, fontweight='bold')
            plt.legend(loc="lower right", fontsize=10)
            plt.grid(True)
            plt.savefig(os.path.join(config.OUTPUTS_DIR, f"combined_roc_{dataset_name}.png"))
            plt.show()
        else:
            # Don't show an empty plot
            plt.close()

    # ------------------------------------------------------
    # 5. COMBINED PRECISION-RECALL CURVES
    # ------------------------------------------------------
    print(f"\n{'='*90}")
    print("🎯🎯🎯 GRAFIK KURVA PRC (Combined) 🎯🎯🎯")
    print(f"{'='*90}")

    for dataset_name, entries in datasets_runs.items():
        print(f"\n### KURVA PRC UNTUK DATASET: {dataset_name.upper()} ###")
        plt.figure(figsize=(10, 8))
        has_plot = False

        for entry in entries:
            run_name = entry.get('run_name')
            model_name = entry.get('model')
            eval_path = os.path.join(config.OUTPUTS_DIR, f"{run_name}_eval.json")
            try:
                with open(eval_path, 'r') as f:
                    data = json.load(f)
                pr_data = data.get('pr', [])
                if pr_data:
                    rec = [p['x'] for p in pr_data]
                    prec = [p['y'] for p in pr_data]
                    # No AUPRC value is read from the eval JSON here, so the
                    # legend carries the model name only.
                    plt.plot(rec, prec, lw=2, label=f'{model_name}')
                    has_plot = True
            except Exception as e:
                print(f"⚠️ Skipping PR curve for {run_name}: {e}")

        if has_plot:
            plt.xlabel('Recall', fontsize=12)
            plt.ylabel('Precision', fontsize=12)
            plt.title(f'Kurva PRC Gabungan - {dataset_name.upper()}', fontsize=14, fontweight='bold')
            plt.legend(loc="best", fontsize=10)
            plt.grid(True)
            plt.savefig(os.path.join(config.OUTPUTS_DIR, f"combined_prc_{dataset_name}.png"))
            plt.show()
        else:
            plt.close()

    # ------------------------------------------------------
    # 6. TRAINING TIME COMPARISON
    # ------------------------------------------------------
    print(f"\n{'='*90}")
    print("⏱️⏱️⏱️ TABEL DAN GRAFIK WAKTU TRAINING ⏱️⏱️⏱️")
    print(f"{'='*90}")

    for dataset_name, times in training_times.items():
        if not times:
            continue
        print(f"\n### WAKTU TRAINING UNTUK DATASET: {dataset_name.upper()} ###")
        df_time = pd.DataFrame(list(times.items()), columns=['Model', 'Waktu (detik)'])
        print(df_time.to_string(index=False))

        plt.figure(figsize=(8, 5))
        plt.bar(df_time['Model'], df_time['Waktu (detik)'], color='#FF5722')
        plt.title(f'Waktu Training per Model: {dataset_name}', fontsize=14)
        plt.ylabel('Waktu (detik)')
        plt.xlabel('Model')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(os.path.join(config.OUTPUTS_DIR, f"comparison_time_{dataset_name}.png"))
        plt.show()