In [1]:
# Import necessary libraries
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
import os
import numpy as np
# from sklearn.model_selection import RandomizedSearchCV
import joblib
from dask.distributed import Client
from dask_ml.model_selection import RandomizedSearchCV
from dask_ml.model_selection import GridSearchCV
import xgboost as xgb

import warnings

# Hide UserWarning messages raised from the xgboost module for the rest of the session.
warnings.filterwarnings(action='ignore', category=UserWarning, module='xgboost')

# Working directory: the input pickles are read from here and the search
# results produced later in the notebook are written back to it.
OUTPUT_DIR = "../Output/textpreprocess/240926/sample_step2_10000"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Destination of the best hyper-parameters found by the randomized search.
BEST_PARAM = os.path.join(OUTPUT_DIR, 'XGBoost_BestParam.pkl')


def _load_artifact(filename):
    """Return the object stored in `filename` under OUTPUT_DIR.

    NOTE(review): joblib.load unpickles the file, so only point this at
    artifacts produced by a trusted pipeline.
    """
    return joblib.load(os.path.join(OUTPUT_DIR, filename))


# Train/test features and labels (presumably written by an earlier preprocessing step).
X_train_tfidf = _load_artifact('X_train_tfidf.pkl')
X_test_tfidf = _load_artifact('X_test_tfidf.pkl')
y_train = _load_artifact('y_train.pkl')
y_test = _load_artifact('y_test.pkl')

In [2]:
# Wrap the train/test features together with their labels in DMatrix objects,
# the data container used by the native xgb.train / Booster.predict API.
dtrain = xgb.DMatrix(data=X_train_tfidf, label=y_train)
dtest = xgb.DMatrix(data=X_test_tfidf, label=y_test)

In [3]:
# Baseline: train a booster with fixed, untuned hyper-parameters through the
# native XGBoost API and report its accuracy on the test split.
params = {
    'objective': 'binary:logistic',  # Binary classification; predict() yields probabilities
    'max_depth': 6,
    'eta': 0.3,
    'verbosity': 1,
    'eval_metric': 'logloss',
    # 'gpu_hist' is deprecated in XGBoost 2.x. Select the GPU the same way the
    # search spaces in this notebook do: histogram algorithm + CUDA device.
    'tree_method': 'hist',
    'device': 'cuda',
}

# Train for a fixed number of boosting rounds
model = xgb.train(params, dtrain, num_boost_round=100)

# Predicted probability per test row, thresholded at 0.5 in one vectorised
# step instead of a Python-level loop over every element.
y_pred_prob = model.predict(dtest)
y_pred = (y_pred_prob > 0.5).astype(int)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))

Model Accuracy: 80.25%
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.79      0.80       930
           1       0.79      0.82      0.80       893

    accuracy                           0.80      1823
   macro avg       0.80      0.80      0.80      1823
weighted avg       0.80      0.80      0.80      1823



In [4]:
# Connect to a Dask cluster started with 5 workers, 2 threads per worker and a
# 10GB memory limit; presumably the dask_ml searches later in the notebook
# schedule their work through this client.
client = Client(
    n_workers=5,
    threads_per_worker=2,
    memory_limit='10GB',
)

In [5]:
# Base estimator for the randomized search. The GPU is requested with
# tree_method='hist' + device='cuda' (the same spelling the search space below
# uses) instead of the deprecated 'gpu_hist', and the deprecated, ignored
# `use_label_encoder` flag is no longer passed.
model = xgb.XGBClassifier(tree_method='hist', device='cuda', eval_metric='logloss', verbosity=0)

# Search space sampled by the randomized search (kept deliberately small).
param_dist = {
    'learning_rate': np.linspace(0.05, 0.2, 5),  # range kept narrow
    'n_estimators': np.arange(50, 200, 50),  # reduced range
    'max_depth': np.arange(3, 8, 1),  # reduced tree depth
    'min_child_weight': np.arange(1, 4, 1),  # reduced
    'subsample': np.linspace(0.6, 0.9, 3),  # reduced row-sampling range
    'colsample_bytree': np.linspace(0.6, 0.9, 3),  # reduced
    'gamma': np.linspace(0, 0.3, 3),  # reduced
    'tree_method': ['hist'],  # single-valued: always the histogram algorithm
    'device': ['cuda'],  # single-valued: always run on the GPU
}

# RandomizedSearchCV (Dask): 50 sampled configurations, 3-fold CV, scored by accuracy
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=50,
    scoring='accuracy',
    cv=3,
    random_state=42,  # fixes which configurations are sampled
    n_jobs=-1,
)

# Train the model. The feature matrix is densified first via .toarray().
random_search.fit(X_train_tfidf.toarray(), y_train)

print(f"Best parameters found: {random_search.best_params_}")
best_model = random_search.best_estimator_

This may cause some slowdown.
Consider scattering data ahead of time and using futures.
  futures = scheduler(
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
  out = scheduler(dsk, keys, num_workers=n_jobs)


Best parameters found: {'tree_method': 'hist', 'subsample': 0.75, 'n_estimators': 150, 'min_child_weight': 2, 'max_depth': 6, 'learning_rate': 0.16250000000000003, 'gamma': 0.15, 'device': 'cuda', 'colsample_bytree': 0.75}


In [6]:
# Make predictions with the best estimator found by the randomized search.
# The search object's predict() returns hard class labels, not probabilities,
# so the labels are used directly and no 0.5 threshold is applied.
y_pred = random_search.predict(X_test_tfidf.toarray())

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))

Model Accuracy: 81.35%
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.80      0.81       930
           1       0.80      0.83      0.81       893

    accuracy                           0.81      1823
   macro avg       0.81      0.81      0.81      1823
weighted avg       0.81      0.81      0.81      1823



In [7]:
# Persist the best hyper-parameters found by the randomized search.
# Only the parameter dict is written here, not the fitted model, and the
# message below says so.
joblib.dump(random_search.best_params_, BEST_PARAM)

print(f"Best parameters saved at {BEST_PARAM}")

Best model saved at ../Output/textpreprocess/240926/sample_step2_10000/XGBoost_BestParam.pkl


In [8]:
# Load the saved best hyper-parameters (a parameter dict, not a fitted model)
best_param = joblib.load(BEST_PARAM)

print(f"Best parameters loaded from {BEST_PARAM}")

Best model loaded from ../Output/textpreprocess/240926/sample_step2_10000/XGBoost_BestParam.pkl


In [14]:
# Base estimator for the refinement search. The GPU is requested with
# tree_method='hist' + device='cuda' (matching the grid below) instead of the
# deprecated 'gpu_hist'; the deprecated, ignored `use_label_encoder` flag is
# no longer passed.
model = xgb.XGBClassifier(
    tree_method='hist',  # Histogram-based tree construction
    device='cuda',  # Run on the GPU
    eval_metric='logloss',  # Use log loss as the evaluation metric
    verbosity=0  # Suppress unnecessary warnings and logs
)

# Centre of the refinement grid: the best point found by the randomized search.
# Requires `random_search` to have been fitted in this kernel session.
best_params = random_search.best_params_

# Build a small neighbourhood around each best value. Candidates are clamped or
# filtered so the grid never leaves XGBoost's valid ranges when the best value
# sits on the edge of the search space:
#   * gamma must be >= 0            (best gamma may be 0)
#   * n_estimators must be >= 1     (best n_estimators may be 50)
#   * subsample / colsample_bytree must be <= 1
# When no bound is hit, the grid is identical to the unclamped one.
param_grid = {
    'learning_rate': np.linspace(best_params['learning_rate'] - 0.01, best_params['learning_rate'] + 0.01, 3),
    'n_estimators': [
        n
        for n in (
            best_params['n_estimators'] - 50,
            best_params['n_estimators'],
            best_params['n_estimators'] + 50,
        )
        if n >= 1
    ],
    'max_depth': [best_params['max_depth'] - 1, best_params['max_depth'], best_params['max_depth'] + 1],
    'min_child_weight': [best_params['min_child_weight'] - 1, best_params['min_child_weight']],
    'subsample': np.linspace(best_params['subsample'] - 0.05, min(best_params['subsample'] + 0.05, 1.0), 3),
    'colsample_bytree': np.linspace(
        best_params['colsample_bytree'] - 0.05, min(best_params['colsample_bytree'] + 0.05, 1.0), 3
    ),
    'gamma': np.linspace(max(best_params['gamma'] - 0.05, 0.0), best_params['gamma'] + 0.05, 3),
    'tree_method': ['hist'],  # Histogram algorithm
    'device': ['cuda'],  # GPU usage
}

# Exhaustive search over the refinement grid: 3-fold CV, scored by accuracy
grid_search = GridSearchCV(
    model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)

# Fit the model. The feature matrix is densified first via .toarray().
grid_search.fit(X_train_tfidf.toarray(), y_train)

# Print the best parameters and model
print(f"Best parameters found by GridSearchCV: {grid_search.best_params_}")
best_model = grid_search.best_estimator_

This may cause some slowdown.
Consider scattering data ahead of time and using futures.
  futures = scheduler(
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
  out = scheduler(dsk, keys, num_workers=n_jobs)


Best parameters found by GridSearchCV: {'colsample_bytree': 0.7, 'device': 'cuda', 'gamma': 0.2, 'learning_rate': 0.15250000000000002, 'max_depth': 6, 'min_child_weight': 2, 'n_estimators': 200, 'subsample': 0.8, 'tree_method': 'hist'}


In [15]:
# Make predictions with the best estimator found by the grid search.
# The search object's predict() returns hard class labels, not probabilities,
# so the labels are used directly and no 0.5 threshold is applied.
y_pred = grid_search.predict(X_test_tfidf.toarray())

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))

Model Accuracy: 81.35%
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.80      0.81       930
           1       0.80      0.83      0.81       893

    accuracy                           0.81      1823
   macro avg       0.81      0.81      0.81      1823
weighted avg       0.81      0.81      0.81      1823



In [16]:
# Destinations for the grid-search artifacts, both inside OUTPUT_DIR.
params_save_path = os.path.join(OUTPUT_DIR, "XGBoost_best_params_grid_search.pkl")
model_save_path = os.path.join(OUTPUT_DIR, "XGBoost_best_model_grid_search.pkl")

# Winning configuration and the corresponding estimator from the grid search.
best_model, best_params = grid_search.best_estimator_, grid_search.best_params_
print(f"Best parameters found by GridSearchCV: {best_params}")

# Pickle the estimator first, then the parameter dict, reporting each path.
joblib.dump(best_model, model_save_path)
print(f"Best model saved at {model_save_path}")

joblib.dump(best_params, params_save_path)
print(f"Best parameters saved at {params_save_path}")

Best parameters found by GridSearchCV: {'colsample_bytree': 0.7, 'device': 'cuda', 'gamma': 0.2, 'learning_rate': 0.15250000000000002, 'max_depth': 6, 'min_child_weight': 2, 'n_estimators': 200, 'subsample': 0.8, 'tree_method': 'hist'}
Best model saved at ../Output/textpreprocess/240926/sample_step2_10000/XGBoost_best_model_grid_search.pkl
Best parameters saved at ../Output/textpreprocess/240926/sample_step2_10000/XGBoost_best_params_grid_search.pkl
