In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np

# Đọc dữ liệu đã gộp
df = pd.read_csv('sentinel_combined.csv')
# Chọn các đặc trưng và nhãn


features = ['brightness', 'min_rgb', 'ndi_rb', 'range_rgb', 'whiteness2']
X = df[features]
y = df['Classvalue']
y = y.map({1: 0, 2: 1})


# Chia tập train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)




In [2]:
# Train mô hình cơ bản
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Đánh giá
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.86      0.87        98
           1       0.89      0.90      0.90       124

    accuracy                           0.88       222
   macro avg       0.88      0.88      0.88       222
weighted avg       0.88      0.88      0.88       222



In [3]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2'],
    'class_weight': [None, 'balanced'],
    'bootstrap': [True]
}

grid = GridSearchCV(RandomForestClassifier(random_state=42),
                    param_grid,
                    scoring='f1',
                    cv=5,
                    n_jobs=-1,
                    verbose=1)

grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best F1 score:", grid.best_score_)


Fitting 5 folds for each of 96 candidates, totalling 480 fits
Best params: {'bootstrap': True, 'class_weight': None, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best F1 score: 0.8936779011952675


In [4]:
best_rf = RandomForestClassifier(
    bootstrap=True,
    class_weight='balanced',
    max_depth=10,
    max_features='sqrt',
    min_samples_leaf=2,
    min_samples_split=2,
    n_estimators=200,
    random_state=42
)
best_rf.fit(X_train, y_train)


In [5]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = best_rf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[ 88  10]
 [ 13 111]]
              precision    recall  f1-score   support

           0       0.87      0.90      0.88        98
           1       0.92      0.90      0.91       124

    accuracy                           0.90       222
   macro avg       0.89      0.90      0.90       222
weighted avg       0.90      0.90      0.90       222



In [6]:
import joblib
joblib.dump(best_rf, 'best_random_forest_model.pkl')


['best_random_forest_model.pkl']

In [2]:
import xgboost
import sklearn

print("XGBoost version:", xgboost.__version__)
print("Scikit-learn version:", sklearn.__version__)

XGBoost version: 2.1.3
Scikit-learn version: 1.3.2


In [47]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(eval_metric='logloss', random_state=42, n_estimators=100)
xgb_model.fit(
    X_train, 
    y_train
)

In [49]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = xgb_model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 86  12]
 [ 12 112]]
              precision    recall  f1-score   support

           0       0.88      0.88      0.88        98
           1       0.90      0.90      0.90       124

    accuracy                           0.89       222
   macro avg       0.89      0.89      0.89       222
weighted avg       0.89      0.89      0.89       222



In [50]:
from sklearn.model_selection import GridSearchCV

# K-fold giữ tỷ lệ phân phối class (rất quan trọng với dữ liệu Sentinel)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

param_dis = {
    'n_estimators': np.arange(100, 1000, 150),           # Số cây
    'max_depth': np.arange(8, 13),                       # Độ sâu tối đa
    'learning_rate': loguniform(0.009, 0.18),            # Tốc độ học
    'reg_lambda': loguniform(1e-5, 3),                   # Regularization L2
    'subsample': loguniform(0.5, 1),                     # Tỷ lệ mẫu mỗi cây
    'min_child_weight': loguniform(0.4, 5),              # Tránh overfitting
    'colsample_bytree': loguniform(0.5, 1),              # Số features dùng mỗi cây
    'max_bin': [256, 1024],                              # Số bin cho histogram
    'tree_method': ['hist'],                             # Dùng hist để tăng tốc (CPU)
    # 'device': ['cuda'],                                # Bỏ nếu không có GPU
    'nthread': [2],                                      # Số luồng CPU
    'eval_metric': ['logloss']                           # Hàm đánh giá phân loại
}

# Khởi tạo mô hình tìm kiếm
search = RandomizedSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, random_state=42),
    param_distributions=param_dis,
    n_iter=100,
    scoring='f1',
    refit=True,
    cv=cv,
    verbose=3,
    n_jobs=-1
)

In [51]:
search.fit(X_train, y_train)

print("Best params:", search.best_params_)
print("Best F1 score:", search.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


Parameters: { "use_label_encoder" } are not used.



Best params: {'colsample_bytree': 0.6489289803323258, 'eval_metric': 'logloss', 'learning_rate': 0.11685280836602645, 'max_bin': 256, 'max_depth': 11, 'min_child_weight': 2.0438629053161534, 'n_estimators': 850, 'nthread': 2, 'reg_lambda': 1.0215952267552846, 'subsample': 0.7757895016943877, 'tree_method': 'hist'}
Best F1 score: 0.8954343617461935


In [43]:
from sklearn.metrics import classification_report, confusion_matrix

xgb_model = search.best_estimator_
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)


print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[ 86  12]
 [ 11 113]]
              precision    recall  f1-score   support

           0       0.89      0.88      0.88        98
           1       0.90      0.91      0.91       124

    accuracy                           0.90       222
   macro avg       0.90      0.89      0.89       222
weighted avg       0.90      0.90      0.90       222



Parameters: { "use_label_encoder" } are not used.

