In [4]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgb
import joblib
from sklearn.metrics import accuracy_score

# ==== Định nghĩa các nhóm feature tốt nhất ====
basic_cols = ["area", "length", "width", "ratio", "major_axis_length", "minor_axis_length",
              "convex_hull_area", "convex_hull_perimeter", "equivalent_diameter", "aspect_ratio",
              "perimeter", "roundness", "compactness", "solidity",
              "shape_factor_1", "shape_factor_2", "shape_factor_3", "shape_factor_4"]

color_rgb_hsv_lab_cols = ["mean_r", "mean_g", "mean_B", "sqrt_r", "sqrt_g", "sqrt_B",
                          "std_r", "std_g", "std_B", "skew_r", "skew_g", "skew_B", "kur_r", "kur_g", "kur_B",
                          "mean_h", "mean_s", "mean_v", "std_h", "std_s", "std_v", "sqrt_h", "sqrt_s", "sqrt_v",
                          "skew_h", "skew_s", "skew_v", "kur_h", "kur_s", "kur_v",
                          "mean_l", "mean_a", "mean_b", "std_l", "std_a", "std_b", "sqrt_l", "sqrt_a", "sqrt_b",
                          "skew_l", "skew_a", "skew_b", "kur_l", "kur_a", "kur_b"]

lbp_cols = [f"LBP_{i}" for i in range(10)]
glcm_cols = [f"GLCM_{i}" for i in range(16)]
pf_cols = [f for f in pd.read_csv(r'C:\Users\Admin\Documents\Python Project\Res conn 2025\combined_features.csv').columns if f.startswith('pf_')]
var_ratio_cols = ['var_ratio_R_G', 'var_ratio_R_B', 'var_ratio_G_B', 'var_ratio_H_S', 'var_ratio_H_V',
                  'var_ratio_S_V', 'var_ratio_L_A', 'var_ratio_L_B', 'var_ratio_A_B']
range_cols = [f for f in pd.read_csv(r'C:\Users\Admin\Documents\Python Project\Res conn 2025\combined_features.csv').columns if f.startswith('range_') or f.startswith('iqr_') or f.startswith('mv_') or f.startswith('mc_')]

# ==== Kết hợp tất cả các feature tốt nhất ====
selected_features = basic_cols + color_rgb_hsv_lab_cols + lbp_cols + glcm_cols + pf_cols + var_ratio_cols + range_cols

# ==== Load dữ liệu ====
df = pd.read_csv(r'C:\Users\Admin\Documents\Python Project\Res conn 2025\combined_features.csv')
X = df[selected_features]
y = df['label']

# ==== Chuẩn hóa ====
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ==== Tách train/test ====
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# ==== Định nghĩa các mô hình ====
models = {
    "logistic_regression": LogisticRegression(max_iter=2000, random_state=42),

    "svm_rbf": SVC(
        kernel='rbf',
        C=10,                  # ưu tiên đúng nhiều hơn margin lớn
        gamma='scale',
        probability=True,
        random_state=42
    ),

    "random_forest": RandomForestClassifier(
        n_estimators=150,
        max_depth=25,
        max_features='sqrt',   # tránh overfit trong high-dim
        min_samples_split=5,
        random_state=42,
        n_jobs=-1
    ),

    "knn": KNeighborsClassifier(
        n_neighbors=7,
        weights='distance',    # gán trọng số gần
        metric='minkowski',
        n_jobs=-1
    ),

    "lightgbm": lgb.LGBMClassifier(
        n_estimators=200,
        learning_rate=0.05,
        num_leaves=63,
        max_depth=10,
        min_child_samples=20,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        verbose=-1
    )
}


# ==== Thư mục lưu ====
model_save_path = r'C:\Users\Admin\Documents\Python Project\Res conn 2025\saved_models'
os.makedirs(model_save_path, exist_ok=True)

# ==== Train và lưu từng mô hình ====
for name, model in models.items():
    print(f"🔧 Training model: {name}")
    model.fit(X_train, y_train)

    # Dự đoán và in accuracy
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"✅ {name} accuracy: {acc:.4f}")

    # Lưu mô hình
    model_file = os.path.join(model_save_path, f"{name}.pkl")
    joblib.dump(model, model_file)

# ==== Lưu scaler ====
scaler_file = os.path.join(model_save_path, "scaler.pkl")
joblib.dump(scaler, scaler_file)

print("🎉 All models and scaler saved successfully!")


🔧 Training model: logistic_regression
✅ logistic_regression accuracy: 0.8748
🔧 Training model: svm_rbf
✅ svm_rbf accuracy: 0.8751
🔧 Training model: random_forest
✅ random_forest accuracy: 0.8331
🔧 Training model: knn
✅ knn accuracy: 0.7932
🔧 Training model: lightgbm




✅ lightgbm accuracy: 0.8675
🎉 All models and scaler saved successfully!
