In [1]:
import os
import json
import glob
import numpy as np
import soundfile as sf
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from joblib import dump, load

# =========================
# CONFIG
# =========================
# Absolute, machine-specific locations of the pre-split dataset and of the
# directory that receives the trained model and its JSON config.
DATASET_TRAIN_DIR = r"C:\Users\Jaiganesh\SoundGaurd\data\processed_data\train"
DATASET_TEST_DIR  = r"C:\Users\Jaiganesh\SoundGaurd\data\processed_data\test"
MODEL_DIR         = r"C:\Users\Jaiganesh\SoundGaurd\models_stage1"
# NOTE: runs at import time, so merely importing this module creates MODEL_DIR.
os.makedirs(MODEL_DIR, exist_ok=True)

# Feature extraction parameters (consumed by extract_mfcc_features and copied
# into the saved config under "feature_params").
SR = 22050                     # target sample rate; audio at any other rate is resampled to this
N_MFCC = 40                    # forwarded to librosa.feature.mfcc as n_mfcc
N_FFT = 1024                   # forwarded to librosa.feature.mfcc as n_fft
WIN_LENGTH = 512               # forwarded to librosa.feature.mfcc as win_length
HOP_LENGTH = 256               # forwarded to librosa.feature.mfcc as hop_length
USE_DELTAS = True              # when True, 1st- and 2nd-order deltas are stacked under the MFCCs
POOL_STATS = ["mean", "std"]   # time-pooling statistics; only "mean" and "std" are recognised

# Folder structure expected under each dataset split directory:
#   <split>/non_threat/*.wav             -> label 0
#   <split>/threat/<subfolder>/*.wav     -> label 1
NON_THREAT_FOLDER = "non_threat"
THREAT_PARENT_FOLDER = "threat"
THREAT_SUBFOLDERS = ["glass_break", "scream", "gunshot"]

# =========================
# UTILITIES - FIXED VERSION
# =========================
def _collect_wav_files(folder):
    """Return the sorted, de-duplicated paths of ``*.wav`` / ``*.WAV`` files in ``folder``.

    Both spellings are globbed because on case-insensitive filesystems each
    pattern matches the same files; the set removes those duplicates. The
    result is sorted so the order does not depend on set iteration order,
    which varies between interpreter runs for strings.
    """
    # Escape the directory part so characters such as "[" in a folder name are
    # matched literally instead of being interpreted as glob syntax.
    escaped = glob.escape(folder)
    unique = set()
    for ext in ("*.wav", "*.WAV"):
        unique.update(glob.glob(os.path.join(escaped, ext)))
    return sorted(unique)


def list_audio_files(root_dir):
    """List all WAV files under ``root_dir`` with binary threat/non-threat labels.

    Expected layout (folder names come from the module-level constants):
        <root_dir>/<NON_THREAT_FOLDER>/*.wav                 -> label 0
        <root_dir>/<THREAT_PARENT_FOLDER>/<subfolder>/*.wav  -> label 1

    Args:
        root_dir: Dataset split directory to scan (not searched recursively).

    Returns:
        A list of ``(path, label)`` tuples: non-threat files first, then the
        threat files in ``THREAT_SUBFOLDERS`` order, each group sorted by path.
        The deterministic order keeps downstream seeded splits reproducible.
        Missing directories are reported on stdout and skipped; if
        ``root_dir`` itself is missing an empty list is returned.
    """
    wavs = []

    if not os.path.isdir(root_dir):
        print(f"[ERROR] Root directory doesn't exist: {root_dir}")
        return wavs

    # Non-threat files (label = 0)
    non_threat_path = os.path.join(root_dir, NON_THREAT_FOLDER)
    if os.path.isdir(non_threat_path):
        non_threat_files = _collect_wav_files(non_threat_path)
        wavs.extend((p, 0) for p in non_threat_files)
        print(f"[INFO] Found {len(non_threat_files)} non-threat files")
    else:
        print(f"[WARN] Non-threat directory missing: {non_threat_path}")

    # Threat files (label = 1), one subfolder per threat category
    threat_parent_path = os.path.join(root_dir, THREAT_PARENT_FOLDER)
    if os.path.isdir(threat_parent_path):
        for subfolder in THREAT_SUBFOLDERS:
            subfolder_path = os.path.join(threat_parent_path, subfolder)
            if not os.path.isdir(subfolder_path):
                print(f"[WARN] Threat subdirectory missing: {subfolder_path}")
                continue
            threat_files = _collect_wav_files(subfolder_path)
            wavs.extend((p, 1) for p in threat_files)
            print(f"[INFO] Found {len(threat_files)} {subfolder} files")
    else:
        print(f"[WARN] Threat parent directory missing: {threat_parent_path}")

    # Final summary
    total_threats = sum(1 for _, label in wavs if label == 1)
    total_non_threats = sum(1 for _, label in wavs if label == 0)
    print(f"[INFO] Summary - Threats: {total_threats}, Non-threats: {total_non_threats}")
    print(f"[INFO] Total files: {len(wavs)}")

    return wavs

def extract_mfcc_features(wav_path):
    """Turn one audio file into a fixed-length, time-pooled MFCC feature vector.

    The file is read with soundfile, averaged across channels if it has more
    than one, resampled to ``SR`` when its native rate differs, and converted
    to MFCCs. When ``USE_DELTAS`` is set, first- and second-order deltas are
    stacked beneath the MFCCs. Each statistic named in ``POOL_STATS`` ("mean"
    and/or "std"; other names are ignored) is then taken over the frame axis
    and the results are concatenated.

    Args:
        wav_path: Path of the audio file to read.

    Returns:
        A 1-D ``float32`` array, or ``None`` if anything in the pipeline
        raised (the error is printed, not propagated).
    """
    try:
        signal, native_sr = sf.read(wav_path)

        # Collapse multi-channel audio to mono by averaging the channels.
        if signal.ndim > 1:
            signal = signal.mean(axis=1)

        # Bring every clip to the common sample rate.
        if native_sr != SR:
            signal = librosa.resample(signal, orig_sr=native_sr, target_sr=SR)

        base = librosa.feature.mfcc(
            y=signal,
            sr=SR,
            n_mfcc=N_MFCC,
            n_fft=N_FFT,
            hop_length=HOP_LENGTH,
            win_length=WIN_LENGTH,
        )

        # Rows: MFCCs, then (optionally) delta and delta-delta coefficients.
        blocks = [base]
        if USE_DELTAS:
            blocks += [librosa.feature.delta(base, order=k) for k in (1, 2)]
        stacked = np.vstack(blocks)

        # Summarise each coefficient row over time, in POOL_STATS order.
        reducers = {"mean": np.mean, "std": np.std}
        summaries = [
            reducers[name](stacked, axis=1)
            for name in POOL_STATS
            if name in reducers
        ]

        return np.concatenate(summaries, axis=0).astype(np.float32)

    except Exception as e:
        print(f"[ERROR] Failed to process {wav_path}: {e}")
        return None

def build_dataset(root_dir):
    """Extract features for every audio file under ``root_dir``.

    Files are discovered with ``list_audio_files`` and converted with
    ``extract_mfcc_features``. A file is dropped (and counted as failed) when
    extraction returns ``None`` or the vector contains a non-finite value.

    Args:
        root_dir: Dataset split directory to process.

    Returns:
        ``(X, y, paths)`` where ``X`` is the stacked feature matrix, ``y`` the
        ``int64`` label array and ``paths`` the list of files that were kept,
        all in the same order.

    Raises:
        RuntimeError: If no audio files are found, or none yields valid features.
    """
    items = list_audio_files(root_dir)
    if not items:
        raise RuntimeError(f"No audio files found in {root_dir}")

    total = len(items)
    feature_rows, labels, kept_paths = [], [], []

    print(f"[INFO] Processing {total} files...")
    for done, (wav_path, label) in enumerate(items, start=1):
        # Progress indicator every 500 files
        if done % 500 == 0:
            print(f"[INFO] Processed {done}/{total} files...")

        vec = extract_mfcc_features(wav_path)
        if vec is None or not np.all(np.isfinite(vec)):
            continue

        feature_rows.append(vec)
        labels.append(label)
        kept_paths.append(wav_path)

    # Every item that was not kept counts as a failure.
    failed_count = total - len(feature_rows)
    if failed_count > 0:
        print(f"[WARN] Failed to process {failed_count} files")

    if not feature_rows:
        raise RuntimeError(f"No valid features extracted from {total} files")

    X = np.vstack(feature_rows)
    y = np.array(labels, dtype=np.int64)

    print(f"[INFO] Successfully processed {len(X)} files")
    print(f"[INFO] Feature shape: {X.shape}")

    return X, y, kept_paths

def evaluate_model(y_true, y_pred, title=""):
    """Print and return binary-classification metrics for one set of predictions.

    Args:
        y_true: Ground-truth labels (0 = non-threat, 1 = threat).
        y_pred: Predicted labels, same length as ``y_true``.
        title: Heading printed above the metrics.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall``, ``f1`` (these
        three use ``average="binary"``, i.e. the threat class) and ``macro_f1``.
    """
    acc = accuracy_score(y_true, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
    macro_f1 = precision_recall_fscore_support(y_true, y_pred, average="macro")[2]
    # Pin the label set so the matrix is always 2x2; without it, data that
    # contains a single class yields a 1x1 matrix and the indexing below fails.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    print(f"\n=== {title} ===")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"Macro F1: {macro_f1:.4f}")
    print("\nConfusion Matrix:")
    print("         Pred: Non-Threat  Threat")
    print(f"Non-Threat:      {cm[0][0]:4d}      {cm[0][1]:4d}")
    print(f"Threat:          {cm[1][0]:4d}      {cm[1][1]:4d}")

    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "macro_f1": macro_f1}

# =========================
# MAIN TRAINING
# =========================
# Script entry point: checks the dataset layout, extracts features, trains two
# linear classifiers, picks the better one on a validation split, refits it on
# all training data, evaluates it on the test split and saves model + config.
if __name__ == "__main__":
    print("🎯 STAGE 1: BINARY THREAT DETECTION TRAINING (FIXED VERSION)")
    print("=" * 70)
    
    # Pre-flight check of the folder structure. This only counts files and
    # prints the result; missing folders are reported but do not stop the run.
    print("📁 DATASET STRUCTURE CHECK:")
    total_train_files = 0
    total_test_files = 0
    
    for split, split_dir in [("TRAIN", DATASET_TRAIN_DIR), ("TEST", DATASET_TEST_DIR)]:
        print(f"\n{split} Directory: {split_dir}")
        split_total = 0
        
        if os.path.isdir(split_dir):
            # Check non-threat
            non_threat_path = os.path.join(split_dir, NON_THREAT_FOLDER)
            if os.path.isdir(non_threat_path):
                # Count unique files (both .wav and .WAV); the set removes
                # paths that are returned by both patterns.
                wav_files = set(glob.glob(os.path.join(non_threat_path, "*.wav")))
                wav_files.update(glob.glob(os.path.join(non_threat_path, "*.WAV")))
                count = len(wav_files)
                print(f"  ✅ {NON_THREAT_FOLDER}: {count} files")
                split_total += count
            else:
                print(f"  ❌ {NON_THREAT_FOLDER}: MISSING")
            
            # Check threat subfolders
            threat_path = os.path.join(split_dir, THREAT_PARENT_FOLDER)
            if os.path.isdir(threat_path):
                for subfolder in THREAT_SUBFOLDERS:
                    subfolder_path = os.path.join(threat_path, subfolder)
                    if os.path.isdir(subfolder_path):
                        # Count unique files (both .wav and .WAV)
                        wav_files = set(glob.glob(os.path.join(subfolder_path, "*.wav")))
                        wav_files.update(glob.glob(os.path.join(subfolder_path, "*.WAV")))
                        count = len(wav_files)
                        print(f"  ✅ threat/{subfolder}: {count} files")
                        split_total += count
                    else:
                        print(f"  ❌ threat/{subfolder}: MISSING")
            else:
                print(f"  ❌ {THREAT_PARENT_FOLDER}: MISSING")
        else:
            print(f"  ❌ Directory doesn't exist!")
        
        print(f"  📊 {split} Total: {split_total} files")
        if split == "TRAIN":
            total_train_files = split_total
        else:
            total_test_files = split_total
    
    print(f"\n📊 DATASET SUMMARY:")
    print(f"   Train: {total_train_files} files")
    print(f"   Test: {total_test_files} files")
    print(f"   Grand Total: {total_train_files + total_test_files} files")
    
    print("\n" + "=" * 70)
    print("🔊 BUILDING TRAINING DATASET...")
    # build_dataset raises RuntimeError if the split has no usable audio files.
    X_train_full, y_train_full, train_paths = build_dataset(DATASET_TRAIN_DIR)
    
    threat_count = np.sum(y_train_full == 1)
    non_threat_count = np.sum(y_train_full == 0)
    print(f"Training Balance - Threats: {threat_count}, Non-threats: {non_threat_count}")
    
    # Validation split: 15% of the training data, stratified on the label and
    # seeded so the split is repeatable for a given input order.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full, y_train_full, 
        test_size=0.15, 
        random_state=42, 
        stratify=y_train_full
    )
    
    print("\n" + "=" * 70)
    print("🤖 TRAINING AI MODELS...")
    
    # Models: both candidates standardise the features, then fit a linear classifier.
    logreg = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=1000, C=1.0, random_state=42))
    ])
    
    linsvm = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LinearSVC(C=1.0, random_state=42, max_iter=2000))
    ])
    
    # Train each candidate on the training part and score it on the validation part
    models = {"LogisticRegression": logreg, "LinearSVM": linsvm}
    results = {}
    
    for name, model in models.items():
        print(f"\n🔄 Training {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        results[name] = evaluate_model(y_val, y_pred, f"Validation - {name}")
    
    # Select best model by validation macro F1 (on a tie, max() keeps the
    # first entry, i.e. LogisticRegression).
    best_name = max(results, key=lambda x: results[x]["macro_f1"])
    best_model = models[best_name]
    
    print(f"\n🏆 BEST MODEL: {best_name}")
    print(f"   Validation Macro F1: {results[best_name]['macro_f1']:.4f}")
    
    # Retrain on full training set (training + validation rows). The
    # validation metrics printed above were measured before this refit.
    print(f"\n🔄 Retraining {best_name} on full training set...")
    best_model.fit(X_train_full, y_train_full)
    
    print("\n" + "=" * 70)
    print("🧪 FINAL TEST EVALUATION...")
    X_test, y_test, test_paths = build_dataset(DATASET_TEST_DIR)
    
    test_threat_count = np.sum(y_test == 1)
    test_non_threat_count = np.sum(y_test == 0)
    print(f"Test Balance - Threats: {test_threat_count}, Non-threats: {test_non_threat_count}")
    
    y_test_pred = best_model.predict(X_test)
    final_results = evaluate_model(y_test, y_test_pred, "🎯 FINAL TEST RESULTS")
    
    # Save model and config; the model file name is derived from the winning model's name.
    model_filename = f"stage1_binary_{best_name.lower()}.joblib"
    model_path = os.path.join(MODEL_DIR, model_filename)
    dump(best_model, model_path)
    
    # The config records the feature-extraction settings used for training
    # together with the test-set metrics ("results") and dataset sizes.
    config = {
        "model_type": best_name,
        "model_path": model_path,
        "feature_params": {
            "sr": SR,
            "n_mfcc": N_MFCC,
            "n_fft": N_FFT,
            "win_length": WIN_LENGTH,
            "hop_length": HOP_LENGTH,
            "use_deltas": USE_DELTAS,
            "pool_stats": POOL_STATS
        },
        "results": final_results,
        "label_mapping": {"non_threat": 0, "threat": 1},
        "dataset_info": {
            # Counts of files that produced valid features, not files on disk.
            "train_files": len(X_train_full),
            "test_files": len(X_test),
            "total_files": len(X_train_full) + len(X_test)
        }
    }
    
    config_path = os.path.join(MODEL_DIR, "stage1_config.json")
    with open(config_path, "w") as f:
        json.dump(config, f, indent=2)
    
    print("\n" + "=" * 70)
    print("✅ STAGE 1 COMPLETE!")
    print(f"📁 Model saved: {model_path}")
    print(f"📁 Config saved: {config_path}")
    print(f"🎯 Final Accuracy: {final_results['accuracy']:.1%}")
    print(f"📊 Dataset Size: {len(X_train_full)} train + {len(X_test)} test = {len(X_train_full) + len(X_test)} total files")
    print(f"🎯 This model can detect threats with {final_results['accuracy']:.1%} accuracy!")

# Inference function for new files
def predict_single_file(wav_path, model_path, config_path):
    """Predict threat/non-threat for a single WAV file.

    Args:
        wav_path: Audio file to classify.
        model_path: Path of the joblib-serialised pipeline saved by training.
        config_path: Path of the JSON config saved next to the model. It is
            parsed so that a missing or corrupt file fails fast, but its
            contents are not used: features are extracted with the
            module-level parameters.

    Returns:
        ``(label, probability)`` where ``label`` is ``"threat"`` or
        ``"non_threat"`` and ``probability`` is the predicted probability of
        the threat class, or ``None`` when the classifier has no
        ``predict_proba`` or computing it failed. Returns ``(None, None)``
        when no valid (finite) feature vector could be extracted.
    """
    # NOTE(review): joblib.load unpickles the file; only load trusted models.
    model = load(model_path)
    with open(config_path) as f:
        json.load(f)

    features = extract_mfcc_features(wav_path)
    # Reject non-finite vectors here, as build_dataset does during training;
    # the estimator would otherwise raise on NaN/inf input.
    if features is None or not np.all(np.isfinite(features)):
        return None, None

    features = features.reshape(1, -1)
    prediction = model.predict(features)[0]

    # Probability is best-effort: not every classifier provides predict_proba.
    probability = None
    try:
        clf = model.named_steps['clf']
        if hasattr(clf, 'predict_proba'):
            scaled = model.named_steps['scaler'].transform(features)
            # Look up the threat column instead of assuming it is index 1.
            threat_col = list(clf.classes_).index(1)
            probability = clf.predict_proba(scaled)[0][threat_col]
    except Exception as e:
        print(f"[WARN] Could not compute threat probability for {wav_path}: {e}")

    label = "threat" if prediction == 1 else "non_threat"
    return label, probability


🎯 STAGE 1: BINARY THREAT DETECTION TRAINING (FIXED VERSION)
📁 DATASET STRUCTURE CHECK:

TRAIN Directory: C:\Users\Jaiganesh\SoundGaurd\data\processed_data\train
  ✅ non_threat: 2100 files
  ✅ threat/glass_break: 700 files
  ✅ threat/scream: 700 files
  ✅ threat/gunshot: 700 files
  📊 TRAIN Total: 4200 files

TEST Directory: C:\Users\Jaiganesh\SoundGaurd\data\processed_data\test
  ✅ non_threat: 900 files
  ✅ threat/glass_break: 300 files
  ✅ threat/scream: 300 files
  ✅ threat/gunshot: 300 files
  📊 TEST Total: 1800 files

📊 DATASET SUMMARY:
   Train: 4200 files
   Test: 1800 files
   Grand Total: 6000 files

🔊 BUILDING TRAINING DATASET...
[INFO] Found 2100 non-threat files
[INFO] Found 700 glass_break files
[INFO] Found 700 scream files
[INFO] Found 700 gunshot files
[INFO] Summary - Threats: 2100, Non-threats: 2100
[INFO] Total files: 4200
[INFO] Processing 4200 files...
[INFO] Processed 500/4200 files...
[INFO] Processed 1000/4200 files...
[INFO] Processed 1500/4200 files...
[INFO] P