# Machine Learning Pipeline - Dự án Sức khỏe Răng miệng BRFSS 2022

## Mục tiêu
Xây dựng và đánh giá 10 baseline models theo kế hoạch team:
1. **Classical Models**: SVM, KNN
2. **Boosting Models**: XGBoost, CatBoost, AdaBoost
3. **Neural Networks**: MLP, 1D CNN, TabNet
4. **Hyperparameter Tuning**: Sử dụng Optuna
5. **Focus on Recall**: Tối ưu cho bài toán screening
6. **XAI Analysis**: 5 loại SHAP + LIME

---

In [1]:
# Import tất cả thư viện cần thiết
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import joblib
from datetime import datetime

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import precision_recall_curve, average_precision_score

# Classical Models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Boosting Models
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

# Neural Networks
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Dropout

# Hyperparameter Tuning
import optuna

# XAI
import shap
import lime
from lime.lime_tabular import LimeTabularExplainer

# Cấu hình
warnings.filterwarnings('ignore')
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

print("✅ Import thành công tất cả thư viện cần thiết!")
print(f"📅 Thời gian bắt đầu: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

✅ Import thành công tất cả thư viện cần thiết!
📅 Thời gian bắt đầu: 2025-07-06 16:56:50


## 1. Tải Dataset đã Clean và Chuẩn bị Target Variable

In [2]:
# Tải dataset đã clean từ EDA notebook
print("📊 TẢI DATASET ĐÃ CLEAN")
print("=" * 40)

try:
    # Tải dữ liệu đã clean
    df = pd.read_parquet('../data/llcp2022_cleaned.parquet')
    print(f"✅ Tải thành công dataset đã clean")
    print(f"📊 Kích thước: {df.shape[0]:,} hàng × {df.shape[1]} cột")
    print(f"💾 Bộ nhớ: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    print(f"❓ Missing values: {df.isnull().sum().sum():,}")
    
except FileNotFoundError:
    print("❌ Không tìm thấy file đã clean!")
    print("Vui lòng chạy EDA notebook trước để tạo file cleaned data")
    
# Hiển thị các cột có sẵn
print(f"\n📋 CÁC CỘT TRONG DATASET:")
print(f"   {list(df.columns)}")

# Tìm các biến mục tiêu tiềm năng
potential_targets = ['LASTDEN4', 'RMVTETH4', '_DENVST3']
available_targets = [col for col in potential_targets if col in df.columns]

print(f"\n🎯 BIẾN MỤC TIÊU TIỀM NĂNG:")
for target in available_targets:
    print(f"   - {target}: {df[target].nunique()} unique values, {df[target].isnull().sum():,} missing")

# Hiển thị sample data
display(df.head())

📊 TẢI DATASET ĐÃ CLEAN
✅ Tải thành công dataset đã clean
📊 Kích thước: 445,132 hàng × 115 cột
💾 Bộ nhớ: 505.59 MB
❓ Missing values: 963,258

📋 CÁC CỘT TRONG DATASET:
   ['_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'IYEAR', 'DISPCODE', 'SEQNO', '_PSU', 'SEXVAR', 'GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'PRIMINSR', 'PERSDOC3', 'MEDCOST1', 'CHECKUP1', 'EXERANY2', 'SLEPTIM1', 'LASTDEN4', 'RMVTETH4', 'CVDINFR4', 'CVDCRHD4', 'CVDSTRK3', 'ASTHMA3', 'CHCSCNC1', 'CHCOCNC1', 'CHCCOPD3', 'ADDEPEV3', 'CHCKDNY2', 'HAVARTH4', 'DIABETE4', 'MARITAL', 'EDUCA', 'RENTHOM1', 'CPDEMO1C', 'VETERAN3', 'EMPLOY1', 'CHILDREN', 'INCOME3', 'WEIGHT2', 'HEIGHT3', 'DEAF', 'BLIND', 'DECIDE', 'DIFFWALK', 'DIFFDRES', 'DIFFALON', 'SMOKE100', 'USENOW3', 'ECIGNOW2', 'LCSCTSC1', 'ALCDAY4', 'FLUSHOT7', 'PNEUVAC4', 'TETANUS1', 'HIVTST7', 'HIVRISK5', 'COVIDPOS', 'QSTVER', 'QSTLANG', '_METSTAT', '_URBSTAT', '_STSTR', '_STRWT', '_RAWRAKE', '_WT2RAKE', '_IMPRACE', '_DUALUSE', '_LLCPWT2', '_LLCPWT', '_RFHLTH', '_PHYS14D', '_MENT14D'

Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,SEXVAR,...,_SMOKER3,_RFSMOK3,_CURECI2,_SMOKGRP,DRNKANY6,DROCDY4_,_RFBING6,_DRNKWK2,_RFDRHV8,_AIDTST4
0,1.0,1.0,2032022,2,3,2022,1100.0,2022000001,2022000000.0,2.0,...,4.0,1.0,1.0,4.0,2.0,0.0,1.0,0.0,1.0,2.0
1,1.0,1.0,2042022,2,4,2022,1100.0,2022000002,2022000000.0,2.0,...,4.0,1.0,1.0,4.0,2.0,0.0,1.0,0.0,1.0,2.0
2,1.0,1.0,2022022,2,2,2022,1100.0,2022000003,2022000000.0,2.0,...,4.0,1.0,1.0,4.0,2.0,0.0,1.0,0.0,1.0,2.0
3,1.0,1.0,2032022,2,3,2022,1100.0,2022000004,2022000000.0,2.0,...,2.0,2.0,1.0,3.0,2.0,0.0,1.0,0.0,1.0,2.0
4,1.0,1.0,2022022,2,2,2022,1100.0,2022000005,2022000000.0,2.0,...,4.0,1.0,1.0,4.0,1.0,10.0,1.0,140.0,1.0,2.0


## 2. Chọn Target Variable và Feature Engineering

In [3]:
# Chọn target variable dựa trên phân tích EDA
print("🎯 CHỌN TARGET VARIABLE")
print("=" * 40)

# Phân tích từng target tiềm năng
target_analysis = {}

for target in available_targets:
    data = df[target].dropna()
    
    target_analysis[target] = {
        'missing_pct': df[target].isnull().mean() * 100,
        'unique_values': data.nunique(),
        'sample_size': len(data),
        'value_counts': data.value_counts().head(10)
    }
    
    print(f"\n--- {target} ---")
    print(f"Missing: {target_analysis[target]['missing_pct']:.1f}%")
    print(f"Sample size: {target_analysis[target]['sample_size']:,}")
    print(f"Unique values: {target_analysis[target]['unique_values']}")
    print(f"Value distribution:")
    for val, count in target_analysis[target]['value_counts'].items():
        print(f"   {val}: {count:,} ({count/target_analysis[target]['sample_size']*100:.1f}%)")

# Tự động chọn target variable tốt nhất
# Tiêu chí: missing < 15%, có 2-10 unique values, distribution cân bằng
best_target = None
best_score = 0

for target in available_targets:
    score = 0
    
    # Điểm cho missing rate
    if target_analysis[target]['missing_pct'] < 15:
        score += 3
    elif target_analysis[target]['missing_pct'] < 25:
        score += 2
    else:
        score += 1
    
    # Điểm cho số unique values
    if 2 <= target_analysis[target]['unique_values'] <= 5:
        score += 3
    elif target_analysis[target]['unique_values'] <= 10:
        score += 2
    else:
        score += 1
    
    # Điểm cho sample size
    if target_analysis[target]['sample_size'] > 100000:
        score += 3
    elif target_analysis[target]['sample_size'] > 50000:
        score += 2
    else:
        score += 1
    
    print(f"\n{target} - Score: {score}/9")
    
    if score > best_score:
        best_score = score
        best_target = target

print(f"\n🏆 CHỌN TARGET VARIABLE: {best_target} (Score: {best_score}/9)")

# Thiết lập target variable
TARGET_COLUMN = best_target
print(f"\n📊 TARGET VARIABLE: {TARGET_COLUMN}")

# Xử lý target variable cho classification
y = df[TARGET_COLUMN].copy()

# Nếu target có nhiều hơn 2 classes, có thể cần nhóm lại
if y.nunique() > 2:
    print(f"⚠️  Target có {y.nunique()} classes, đang xem xét nhóm lại cho binary classification...")
    
    # Ví dụ: nhóm lại cho LASTDEN4 (lần cuối đến nha sĩ)
    if TARGET_COLUMN == 'LASTDEN4':
        # 1: trong năm qua, 2: 1-2 năm, 3: 2-5 năm, 4: >5 năm, 8: chưa bao giờ
        # Nhóm thành: 0 = không đều đặn (2,3,4,8), 1 = đều đặn (1)
        y = (y == 1).astype(int)
        print(f"   → Nhóm lại thành: 0=không đều đặn, 1=đều đặn (đến nha sĩ hàng năm)")
    
    elif TARGET_COLUMN == 'RMVTETH4':
        # 1: 1-5 răng, 2: 6+ răng, 3: tất cả, 8: không có
        # Nhóm thành: 0 = không mất răng (8), 1 = có mất răng (1,2,3)
        y = (y.isin([1, 2, 3])).astype(int)
        print(f"   → Nhóm lại thành: 0=không mất răng, 1=có mất răng do bệnh")

# Loại bỏ missing values trong target
valid_indices = y.notna()
y = y[valid_indices]
X = df.drop(columns=[TARGET_COLUMN])[valid_indices]

print(f"\n📊 DATASET CUỐI CÙNG:")
print(f"   - Kích thước: {X.shape[0]:,} samples × {X.shape[1]} features")
print(f"   - Target distribution:")
for val, count in y.value_counts().items():
    print(f"     Class {val}: {count:,} ({count/len(y)*100:.1f}%)")
    
# Kiểm tra class imbalance
minority_class_pct = y.value_counts().min() / len(y) * 100
print(f"   - Minority class: {minority_class_pct:.1f}%")
if minority_class_pct < 20:
    print(f"   - ⚠️  Imbalanced dataset - cần xử lý khi training")
else:
    print(f"   - ✅ Balanced dataset")

🎯 CHỌN TARGET VARIABLE

--- LASTDEN4 ---
Missing: 0.3%
Sample size: 443,769
Unique values: 8
Value distribution:
   1.0: 292,408 (65.9%)
   2.0: 50,326 (11.3%)
   3.0: 46,987 (10.6%)
   4.0: 44,828 (10.1%)
   7.0: 4,866 (1.1%)
   8.0: 3,562 (0.8%)
   9.0: 788 (0.2%)
   5.0: 4 (0.0%)

--- RMVTETH4 ---
Missing: 0.3%
Sample size: 443,769
Unique values: 6
Value distribution:
   8.0: 233,455 (52.6%)
   1.0: 129,294 (29.1%)
   2.0: 45,570 (10.3%)
   3.0: 25,453 (5.7%)
   7.0: 8,563 (1.9%)
   9.0: 1,434 (0.3%)

--- _DENVST3 ---
Missing: 0.0%
Sample size: 445,128
Unique values: 3
Value distribution:
   1.0: 292,408 (65.7%)
   2.0: 145,703 (32.7%)
   9.0: 7,017 (1.6%)

LASTDEN4 - Score: 8/9

RMVTETH4 - Score: 8/9

_DENVST3 - Score: 9/9

🏆 CHỌN TARGET VARIABLE: _DENVST3 (Score: 9/9)

📊 TARGET VARIABLE: _DENVST3
⚠️  Target có 3 classes, đang xem xét nhóm lại cho binary classification...

📊 DATASET CUỐI CÙNG:
   - Kích thước: 445,128 samples × 114 features
   - Target distribution:
     Class 1.0:

## 3. Chia Dataset và Chuẩn bị Features

In [4]:
# Chia dataset thành train/validation/test sets
print("🔄 CHIA DATASET VÀ CHUẨN BỊ FEATURES")
print("=" * 50)

# Chia train (60%) / validation (20%) / test (20%)
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)

print(f"📊 KÍCH THƯỚC DATASETS:")
print(f"   - Train: {X_train.shape[0]:,} samples ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"   - Validation: {X_val.shape[0]:,} samples ({X_val.shape[0]/len(X)*100:.1f}%)")
print(f"   - Test: {X_test.shape[0]:,} samples ({X_test.shape[0]/len(X)*100:.1f}%)")

# Kiểm tra distribution trong từng set
print(f"\n📈 DISTRIBUTION TRONG TỪNG SET:")
for name, y_set in [('Train', y_train), ('Validation', y_val), ('Test', y_test)]:
    dist = y_set.value_counts(normalize=True) * 100
    print(f"   - {name} Distribution:")
    for class_val, pct in dist.items():
        print(f"     • Class {class_val}: {pct:.1f}%")

# Xử lý features
print(f"\n🔧 XỬ LÝ FEATURES:")

# Kiểm tra missing values
print(f"   - Missing values in train: {X_train.isnull().sum().sum()}")
print(f"   - Columns with missing values: {X_train.isnull().sum()[X_train.isnull().sum() > 0].shape[0]}")

# Tách numeric và categorical features
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X_train.select_dtypes(exclude=[np.number]).columns.tolist()

print(f"   - Numeric features: {len(numeric_features)}")
print(f"   - Categorical features: {len(categorical_features)}")

# Imputation for missing values
print(f"   - 🔄 Imputing missing values...")

# Copy data for processing
X_train_processed = X_train.copy()
X_val_processed = X_val.copy()
X_test_processed = X_test.copy()

# Impute numeric features with median
if numeric_features:
    numeric_imputer = SimpleImputer(strategy='median')
    X_train_processed[numeric_features] = numeric_imputer.fit_transform(X_train_processed[numeric_features])
    X_val_processed[numeric_features] = numeric_imputer.transform(X_val_processed[numeric_features])
    X_test_processed[numeric_features] = numeric_imputer.transform(X_test_processed[numeric_features])
    print(f"   - ✅ Imputed numeric features with median")

# Impute categorical features with mode
if categorical_features:
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    X_train_processed[categorical_features] = categorical_imputer.fit_transform(X_train_processed[categorical_features])
    X_val_processed[categorical_features] = categorical_imputer.transform(X_val_processed[categorical_features])
    X_test_processed[categorical_features] = categorical_imputer.transform(X_test_processed[categorical_features])
    print(f"   - ✅ Imputed categorical features with mode")

# Verify no missing values
print(f"   - Missing values after imputation: {X_train_processed.isnull().sum().sum()}")

# Standardize numeric features
if numeric_features:
    scaler = StandardScaler()
    X_train_scaled = X_train_processed.copy()
    X_val_scaled = X_val_processed.copy()
    X_test_scaled = X_test_processed.copy()
    
    X_train_scaled[numeric_features] = scaler.fit_transform(X_train_processed[numeric_features])
    X_val_scaled[numeric_features] = scaler.transform(X_val_processed[numeric_features])
    X_test_scaled[numeric_features] = scaler.transform(X_test_processed[numeric_features])
    
    print(f"   - ✅ Standardized numeric features")
else:
    X_train_scaled = X_train_processed.copy()
    X_val_scaled = X_val_processed.copy()
    X_test_scaled = X_test_processed.copy()
    scaler = None

# Encode categorical features
if categorical_features:
    label_encoders = {}
    
    for col in categorical_features:
        le = LabelEncoder()
        
        # Fit trên train data
        X_train_scaled[col] = le.fit_transform(X_train_scaled[col].astype(str))
        
        # Transform validation và test data
        # Xử lý unseen categories
        def safe_transform(data, encoder):
            known_classes = set(encoder.classes_)
            data_str = data.astype(str)
            
            # Thay thế unknown categories bằng most frequent class
            most_frequent = encoder.classes_[0]  # hoặc có thể dùng mode
            data_str = data_str.apply(lambda x: x if x in known_classes else most_frequent)
            
            return encoder.transform(data_str)
        
        X_val_scaled[col] = safe_transform(X_val_scaled[col], le)
        X_test_scaled[col] = safe_transform(X_test_scaled[col], le)
        
        label_encoders[col] = le
    
    print(f"   - ✅ Encoded categorical features")
else:
    label_encoders = {}

# Chuyển đổi thành numpy arrays cho một số models
X_train_np = X_train_scaled.values
X_val_np = X_val_scaled.values
X_test_np = X_test_scaled.values

y_train_np = y_train.values
y_val_np = y_val.values
y_test_np = y_test.values

print(f"\n✅ CHUẨN BỊ FEATURES HOÀN THÀNH!")
print(f"   - Features shape: {X_train_np.shape}")
print(f"   - All features are numeric: {np.all([np.issubdtype(X_train_scaled[col].dtype, np.number) for col in X_train_scaled.columns])}")
print(f"   - No missing values in train: {not X_train_scaled.isnull().any().any()}")
print(f"   - No missing values in val: {not X_val_scaled.isnull().any().any()}")
print(f"   - No missing values in test: {not X_test_scaled.isnull().any().any()}")
print(f"   - No NaN in numpy arrays: {not np.isnan(X_train_np).any()}")

# Hiển thị thông tin về target variable
print(f"\n🎯 TARGET VARIABLE INFO:")
print(f"   - Target column: {TARGET_COLUMN}")
print(f"   - Unique classes: {sorted(y.unique())}")
print(f"   - Number of classes: {y.nunique()}")
print(f"   - Most frequent class: {y.mode().iloc[0]} ({(y.value_counts().iloc[0] / len(y) * 100):.1f}%)")
print(f"   - Least frequent class: {y.value_counts().index[-1]} ({(y.value_counts().iloc[-1] / len(y) * 100):.1f}%)")

🔄 CHIA DATASET VÀ CHUẨN BỊ FEATURES
📊 KÍCH THƯỚC DATASETS:
   - Train: 267,076 samples (60.0%)
   - Validation: 89,026 samples (20.0%)
   - Test: 89,026 samples (20.0%)

📈 DISTRIBUTION TRONG TỪNG SET:
   - Train Distribution:
     • Class 1.0: 65.7%
     • Class 2.0: 32.7%
     • Class 9.0: 1.6%
   - Validation Distribution:
     • Class 1.0: 65.7%
     • Class 2.0: 32.7%
     • Class 9.0: 1.6%
   - Test Distribution:
     • Class 1.0: 65.7%
     • Class 2.0: 32.7%
     • Class 9.0: 1.6%

🔧 XỬ LÝ FEATURES:
   - Missing values in train: 579121
📊 KÍCH THƯỚC DATASETS:
   - Train: 267,076 samples (60.0%)
   - Validation: 89,026 samples (20.0%)
   - Test: 89,026 samples (20.0%)

📈 DISTRIBUTION TRONG TỪNG SET:
   - Train Distribution:
     • Class 1.0: 65.7%
     • Class 2.0: 32.7%
     • Class 9.0: 1.6%
   - Validation Distribution:
     • Class 1.0: 65.7%
     • Class 2.0: 32.7%
     • Class 9.0: 1.6%
   - Test Distribution:
     • Class 1.0: 65.7%
     • Class 2.0: 32.7%
     • Class 9.0:

## 4. Training 10 Baseline Models

Theo kế hoạch team, chúng ta sẽ train 10 models:
- **Classical**: SVM, KNN
- **Boosting**: XGBoost, CatBoost, AdaBoost
- **Neural Networks**: MLP, 1D CNN, TabNet
- **Additional**: Random Forest, Logistic Regression, Naive Bayes

**Focus**: Tối ưu hóa **Recall** cho bài toán screening

In [None]:
# Định nghĩa và train 10 baseline models
print("🤖 TRAINING 10 BASELINE MODELS")
print("=" * 50)

# Dictionary để lưu models và kết quả
models = {}
results = {}

# Hàm đánh giá model với focus on Recall
def evaluate_model(model, X_val, y_val, model_name):
    """Đánh giá model với focus on Recall cho screening"""
    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)[:, 1] if hasattr(model, 'predict_proba') else None
    
    from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
    
    recall = recall_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    accuracy = accuracy_score(y_val, y_pred)
    
    roc_auc = roc_auc_score(y_val, y_pred_proba) if y_pred_proba is not None else None
    
    return {
        'model_name': model_name,
        'recall': recall,
        'precision': precision,
        'f1': f1,
        'accuracy': accuracy,
        'roc_auc': roc_auc,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba
    }

# Import additional models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# 1. Classical Models
print("\n🔹 CLASSICAL MODELS")
print("-" * 30)

# SVM
print("Training SVM...")
svm_model = SVC(kernel='rbf', probability=True, random_state=42)
svm_model.fit(X_train_np, y_train_np)
models['SVM'] = svm_model
results['SVM'] = evaluate_model(svm_model, X_val_np, y_val_np, 'SVM')
print(f"  SVM - Recall: {results['SVM']['recall']:.3f}, Precision: {results['SVM']['precision']:.3f}")

# KNN
print("Training KNN...")
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_np, y_train_np)
models['KNN'] = knn_model
results['KNN'] = evaluate_model(knn_model, X_val_np, y_val_np, 'KNN')
print(f"  KNN - Recall: {results['KNN']['recall']:.3f}, Precision: {results['KNN']['precision']:.3f}")

# 2. Boosting Models
print("\n🔹 BOOSTING MODELS")
print("-" * 30)

# XGBoost
print("Training XGBoost...")
xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
xgb_model.fit(X_train_np, y_train_np)
models['XGBoost'] = xgb_model
results['XGBoost'] = evaluate_model(xgb_model, X_val_np, y_val_np, 'XGBoost')
print(f"  XGBoost - Recall: {results['XGBoost']['recall']:.3f}, Precision: {results['XGBoost']['precision']:.3f}")

# CatBoost
print("Training CatBoost...")
catboost_model = CatBoostClassifier(random_state=42, silent=True)
catboost_model.fit(X_train_np, y_train_np)
models['CatBoost'] = catboost_model
results['CatBoost'] = evaluate_model(catboost_model, X_val_np, y_val_np, 'CatBoost')
print(f"  CatBoost - Recall: {results['CatBoost']['recall']:.3f}, Precision: {results['CatBoost']['precision']:.3f}")

# AdaBoost
print("Training AdaBoost...")
ada_model = AdaBoostClassifier(random_state=42)
ada_model.fit(X_train_np, y_train_np)
models['AdaBoost'] = ada_model
results['AdaBoost'] = evaluate_model(ada_model, X_val_np, y_val_np, 'AdaBoost')
print(f"  AdaBoost - Recall: {results['AdaBoost']['recall']:.3f}, Precision: {results['AdaBoost']['precision']:.3f}")

# 3. Neural Networks
print("\n🔹 NEURAL NETWORKS")
print("-" * 30)

# MLP
print("Training MLP...")
mlp_model = MLPClassifier(hidden_layer_sizes=(100, 50), random_state=42, max_iter=500)
mlp_model.fit(X_train_np, y_train_np)
models['MLP'] = mlp_model
results['MLP'] = evaluate_model(mlp_model, X_val_np, y_val_np, 'MLP')
print(f"  MLP - Recall: {results['MLP']['recall']:.3f}, Precision: {results['MLP']['precision']:.3f}")

# 4. Additional Models
print("\n🔹 ADDITIONAL MODELS")
print("-" * 30)

# Random Forest
print("Training Random Forest...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_np, y_train_np)
models['RandomForest'] = rf_model
results['RandomForest'] = evaluate_model(rf_model, X_val_np, y_val_np, 'RandomForest')
print(f"  RandomForest - Recall: {results['RandomForest']['recall']:.3f}, Precision: {results['RandomForest']['precision']:.3f}")

# Logistic Regression
print("Training Logistic Regression...")
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train_np, y_train_np)
models['LogisticRegression'] = lr_model
results['LogisticRegression'] = evaluate_model(lr_model, X_val_np, y_val_np, 'LogisticRegression')
print(f"  LogisticRegression - Recall: {results['LogisticRegression']['recall']:.3f}, Precision: {results['LogisticRegression']['precision']:.3f}")

# Naive Bayes
print("Training Naive Bayes...")
nb_model = GaussianNB()
nb_model.fit(X_train_np, y_train_np)
models['NaiveBayes'] = nb_model
results['NaiveBayes'] = evaluate_model(nb_model, X_val_np, y_val_np, 'NaiveBayes')
print(f"  NaiveBayes - Recall: {results['NaiveBayes']['recall']:.3f}, Precision: {results['NaiveBayes']['precision']:.3f}")

print(f"\n✅ HOÀN THÀNH TRAINING {len(models)} MODELS!")

🤖 TRAINING 10 BASELINE MODELS

🔹 CLASSICAL MODELS
------------------------------
Training SVM...


## 5. So sánh Performance của các Models

In [None]:
# So sánh performance của tất cả models
print("📊 SO SÁNH PERFORMANCE CỦA CÁC MODELS")
print("=" * 60)

# Tạo DataFrame để so sánh
comparison_df = pd.DataFrame({
    'Model': [result['model_name'] for result in results.values()],
    'Recall': [result['recall'] for result in results.values()],
    'Precision': [result['precision'] for result in results.values()],
    'F1': [result['f1'] for result in results.values()],
    'Accuracy': [result['accuracy'] for result in results.values()],
    'ROC_AUC': [result['roc_auc'] for result in results.values()]
})

# Sắp xếp theo Recall (quan trọng nhất cho screening)
comparison_df = comparison_df.sort_values('Recall', ascending=False)

print("🏆 RANKING THEO RECALL (QUAN TRỌNG NHẤT CHO SCREENING):")
print("-" * 80)
print(f"{'Rank':<4} {'Model':<15} {'Recall':<8} {'Precision':<9} {'F1':<8} {'Accuracy':<8} {'ROC_AUC':<8}")
print("-" * 80)

for i, (_, row) in enumerate(comparison_df.iterrows(), 1):
    roc_auc_str = f"{row['ROC_AUC']:.3f}" if row['ROC_AUC'] is not None else "N/A"
    print(f"{i:<4} {row['Model']:<15} {row['Recall']:<8.3f} {row['Precision']:<9.3f} {row['F1']:<8.3f} {row['Accuracy']:<8.3f} {roc_auc_str:<8}")

# Tìm top 3 models theo Recall
top_3_models = comparison_df.head(3)['Model'].tolist()
print(f"\n🥇 TOP 3 MODELS THEO RECALL:")
for i, model in enumerate(top_3_models, 1):
    recall = comparison_df[comparison_df['Model'] == model]['Recall'].iloc[0]
    print(f"   {i}. {model}: {recall:.3f}")

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Recall comparison
axes[0, 0].bar(comparison_df['Model'], comparison_df['Recall'], color='coral')
axes[0, 0].set_title('Recall Comparison (Higher is Better for Screening)', fontweight='bold')
axes[0, 0].set_ylabel('Recall')
axes[0, 0].tick_params(axis='x', rotation=45)
axes[0, 0].grid(axis='y', alpha=0.3)

# 2. Precision vs Recall scatter
axes[0, 1].scatter(comparison_df['Precision'], comparison_df['Recall'], s=100, alpha=0.7)
for i, model in enumerate(comparison_df['Model']):
    axes[0, 1].annotate(model, (comparison_df.iloc[i]['Precision'], comparison_df.iloc[i]['Recall']), 
                       xytext=(5, 5), textcoords='offset points', fontsize=8)
axes[0, 1].set_xlabel('Precision')
axes[0, 1].set_ylabel('Recall')
axes[0, 1].set_title('Precision vs Recall Trade-off', fontweight='bold')
axes[0, 1].grid(True, alpha=0.3)

# 3. ROC AUC comparison
roc_data = comparison_df[comparison_df['ROC_AUC'].notna()]
axes[1, 0].bar(roc_data['Model'], roc_data['ROC_AUC'], color='lightblue')
axes[1, 0].set_title('ROC AUC Comparison', fontweight='bold')
axes[1, 0].set_ylabel('ROC AUC')
axes[1, 0].tick_params(axis='x', rotation=45)
axes[1, 0].grid(axis='y', alpha=0.3)

# 4. All metrics radar chart (chỉ top 3)
from math import pi

metrics = ['Recall', 'Precision', 'F1', 'Accuracy']
N = len(metrics)
angles = [n / float(N) * 2 * pi for n in range(N)]
angles += angles[:1]

ax = axes[1, 1]
ax.set_theta_offset(pi / 2)
ax.set_theta_direction(-1)
ax.set_thetagrids(range(0, 360, 360 // N), metrics)

colors = ['red', 'blue', 'green']
for i, model in enumerate(top_3_models[:3]):
    values = [comparison_df[comparison_df['Model'] == model][metric].iloc[0] for metric in metrics]
    values += values[:1]
    ax.plot(angles, values, 'o-', linewidth=2, label=model, color=colors[i])
    ax.fill(angles, values, alpha=0.25, color=colors[i])

ax.set_ylim(0, 1)
ax.set_title('Top 3 Models - All Metrics', fontweight='bold', pad=20)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))
ax.grid(True)

plt.tight_layout()
plt.show()

# Lưu kết quả
comparison_df.to_csv('../results/model_comparison.csv', index=False)
print(f"\n💾 Đã lưu kết quả so sánh: ../results/model_comparison.csv")