# Notebook 04: Data Modeling (Multi-Model Comparison)

**Mục tiêu:** Huấn luyện và so sánh các mô hình Binary Classification để tìm ra model tốt nhất.

**Các mô hình sử dụng:**
1. **XGBoost:** Gradient Boosting (Mạnh mẽ, chuẩn công nghiệp).
2. **Random Forest:** Bagging (Ổn định, ít bị overfitting).
3. **Logistic Regression:** Linear (Đơn giản, tốc độ cao, làm baseline).

**Input:** `train_features.pkl`
**Output:** `best_matcher.pkl` (Model tốt nhất được chọn tự động) và `feature_names.pkl` (danh sách feature dùng để huấn luyện).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os

# Import the models plus the evaluation and preprocessing utilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# --- CONFIGURATION ---
# Seed shared by the data split and every model so runs are reproducible.
RANDOM_SEED = 42

# Pickled feature table read by this notebook.
INPUT_FILE = '../../dataset_final/clean_data/train_features.pkl'

# Directory that receives the trained artifacts, and the two files written there.
OUTPUT_DIR = '../../dataset_final/models'
MODEL_FILE = os.path.join(OUTPUT_DIR, 'best_matcher.pkl')
FEATURE_LIST_FILE = os.path.join(OUTPUT_DIR, 'feature_names.pkl')

# Create the output directory up front; exist_ok makes re-running the cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

## 1. Load và Chuẩn bị Dữ liệu

In [None]:
# Load the feature table, validate it, and build a stratified train/validation split.
if not os.path.exists(INPUT_FILE):
    raise FileNotFoundError(f"‚ùå Kh√¥ng t√¨m th·∫•y file: {INPUT_FILE}")

# NOTE(security): unpickling can execute arbitrary code stored in the file, so
# INPUT_FILE must be an artifact produced by a trusted step of this pipeline.
df = pd.read_pickle(INPUT_FILE)
print(f"T·ªïng s·ªë m·∫´u: {len(df)}")

# Model inputs are the columns whose name starts with 'feat_'; the target is 'label'.
feature_cols = [c for c in df.columns if c.startswith('feat_')]
target_col = 'label'

# Fail early with an explicit message instead of an obscure error during fitting.
if not feature_cols:
    raise ValueError(f"No 'feat_*' columns found in {INPUT_FILE}")
if target_col not in df.columns:
    raise KeyError(f"Target column '{target_col}' not found in {INPUT_FILE}")

X = df[feature_cols]
y = df[target_col]

# Hold out 20% for validation; stratify so both splits keep the class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y
)

print(f"Train shape: {X_train.shape} | Val shape: {X_val.shape}")

# Negative/positive ratio of the training split (labels are compared against 0 and 1);
# it is passed to XGBoost as scale_pos_weight.
n_neg = int(np.sum(y_train == 0))
n_pos = int(np.sum(y_train == 1))
if n_neg == 0 or n_pos == 0:
    # Without this guard the division below yields inf/nan (or 0) with only a
    # NumPy warning, and that value would silently reach the model.
    raise ValueError(
        f"Training split needs both classes (negatives={n_neg}, positives={n_pos})"
    )
ratio = n_neg / n_pos
print(f"Imbalance Ratio (Neg/Pos): {ratio:.2f}")

## 2. Huấn luyện Đa Mô hình
Chúng ta sẽ lần lượt train 3 model.

In [None]:
# Train the three candidate classifiers and collect them by display name.
models = {}

# --- MODEL 1: XGBOOST ---
print("\nüöÄ Training XGBoost...")
# Early stopping must not monitor (X_val, y_val): that split is used afterwards to
# rank the models, so tuning the number of trees on it would bias the comparison
# in XGBoost's favour. Carve a dedicated holdout out of the training split instead.
X_fit, X_stop, y_fit, y_stop = train_test_split(
    X_train, y_train, test_size=0.1, random_state=RANDOM_SEED, stratify=y_train
)
# `use_label_encoder` is intentionally not passed: it is deprecated and ignored
# (with a warning) by the xgboost releases that accept `early_stopping_rounds`
# in the constructor.
xgb = XGBClassifier(
    n_estimators=200, learning_rate=0.05, max_depth=6,
    subsample=0.8, colsample_bytree=0.8,
    scale_pos_weight=ratio,  # re-weight positives; the stratified holdout keeps this ratio
    random_state=RANDOM_SEED,
    eval_metric='logloss',
    early_stopping_rounds=20  # stop once logloss on the holdout stalls for 20 rounds
)
xgb.fit(X_fit, y_fit, eval_set=[(X_stop, y_stop)], verbose=0)
models['XGBoost'] = xgb

# --- MODEL 2: RANDOM FOREST ---
print("üöÄ Training Random Forest...")
# class_weight='balanced' compensates for the class imbalance.
rf = RandomForestClassifier(
    n_estimators=200, max_depth=10,
    class_weight='balanced',
    random_state=RANDOM_SEED,
    n_jobs=-1  # use all available cores
)
rf.fit(X_train, y_train)
models['Random Forest'] = rf

# --- MODEL 3: LOGISTIC REGRESSION ---
print("üöÄ Training Logistic Regression...")
# Logistic regression works best on standardized inputs, so the scaler and the
# classifier are chained in one pipeline (the scaler is fitted on training rows only).
lr = make_pipeline(
    StandardScaler(),
    LogisticRegression(class_weight='balanced', max_iter=1000, random_state=RANDOM_SEED)
)
lr.fit(X_train, y_train)
models['Logistic Regression'] = lr

print("‚úÖ ƒê√£ hu·∫•n luy·ªán xong 3 m√¥ h√¨nh!")

## 3. So sánh & Đánh giá (Evaluation)
Vẽ ROC Curve để xem model nào "bá đạo" nhất.

In [None]:
# Compare the trained models by validation ROC-AUC and draw their ROC curves.
# The three `best_*` names are read by the saving step that follows.
best_model_name = ""
best_model_obj = None
best_score = 0

plt.figure(figsize=(10, 8))

print("=== K·∫æT QU·∫¢ ROC-AUC ===")

for candidate_name, candidate in models.items():
    # Score the validation rows: second predict_proba column (class 1 for 0/1
    # labels) when available, raw decision scores otherwise.
    if not hasattr(candidate, "predict_proba"):
        val_scores = candidate.decision_function(X_val)
    else:
        val_scores = candidate.predict_proba(X_val)[:, 1]

    # Area under the ROC curve on the validation split.
    val_auc = roc_auc_score(y_val, val_scores)
    print(f"{candidate_name}: {val_auc:.4f}")

    # Add this model's curve to the shared figure.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_val, val_scores)
    curve_label = f"{candidate_name} (AUC = {val_auc:.4f})"
    plt.plot(false_pos_rate, true_pos_rate, lw=2, label=curve_label)

    # Remember the highest AUC seen so far; on a tie the earlier model is kept.
    if val_auc > best_score:
        best_model_name, best_model_obj, best_score = candidate_name, candidate, val_auc

# The dashed diagonal marks chance-level performance.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.title('So s√°nh ROC Curve gi·ªØa c√°c m√¥ h√¨nh')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()

## 4. Lưu Model tốt nhất
Script sẽ tự động chọn model có AUC cao nhất để lưu.

In [None]:
# Announce the winner selected by the evaluation step.
print(f"\nüèÜ Model chi·∫øn th·∫Øng: {best_model_name} v·ªõi AUC = {best_score:.4f}")

# Persist the model first, then the feature list. The list of original feature
# names is stored as well, because it is still needed when the saved model is
# the logistic-regression pipeline.
for artifact, destination in (
    (best_model_obj, MODEL_FILE),
    (feature_cols, FEATURE_LIST_FILE),
):
    joblib.dump(artifact, destination)

# Report absolute paths so the artifacts are easy to locate.
model_path = os.path.abspath(MODEL_FILE)
feature_list_path = os.path.abspath(FEATURE_LIST_FILE)
print(f"‚úÖ ƒê√£ l∆∞u model t·ªët nh·∫•t t·∫°i: {model_path}")
print(f"‚úÖ ƒê√£ l∆∞u feature list t·∫°i: {feature_list_path}")
print("\nüëâ B∆Ø·ªöC TI·∫æP THEO: Ch·∫°y '05_evaluation.ipynb'.")