<a href="https://colab.research.google.com/github/Hanna07111/news-popularity-project/blob/main/XGboost_Ensemble_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.1-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.1 colorlog-6.9.0 optuna-4.3.0


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the preprocessed train/test tables from the project folder on Drive.
train_df = pd.read_csv(
    "/content/drive/MyDrive/25-1 패턴인식 프로젝트/train_processed.csv"
)
test_df = pd.read_csv(
    "/content/drive/MyDrive/25-1 패턴인식 프로젝트/test_processed.csv"
)

# The target is the 'y' column; 'id', 'y' and 'shares' are kept out of the features.
y = train_df['y']
X = train_df.drop(columns=['id', 'y', 'shares'])

# Hold out 20% of the rows for validation, stratified on the target, fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### XGBoost with the best hyperparameters

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from xgboost import XGBClassifier

# Define the model with the best hyperparameters found during tuning.
best_params = {
    'n_estimators': 487,
    'max_depth': 3,
    'learning_rate': 0.022783546678795837,
    'subsample': 0.9865934283784953,
    'colsample_bytree': 0.6270846419992482,
    'gamma': 0.8172927176529761,
    'min_child_weight': 4,
    'eval_metric': 'auc',
    'random_state': 42,
    'tree_method': 'hist',
    # Keep only 'hist' to train on the CPU;
    # add 'device': 'cuda' to train on the GPU.
    'device': 'cuda'
}

# Train the final model on the training split.
final_model = XGBClassifier(**best_params)
final_model.fit(X_train, y_train)

# Score the validation split: class-1 probabilities and hard labels.
y_prob_valid = final_model.predict_proba(X_valid)[:, 1]
y_pred_valid = final_model.predict(X_valid)

# Accuracy and F1 use the hard labels; AUC uses the class-1 probabilities.
accuracy = accuracy_score(y_valid, y_pred_valid)
f1 = f1_score(y_valid, y_pred_valid)
auc = roc_auc_score(y_valid, y_prob_valid)

# Unweighted mean of the three metrics.
mean_metric = (accuracy + f1 + auc) / 3

# Report the results.
for label, value in (
    ("Final Tuned Model Accuracy:", accuracy),
    ("Final Tuned Model F1 Score:", f1),
    ("Final Tuned Model AUC:", auc),
    ("Mean Evaluation Metric (Accuracy + F1 + AUC) / 3:", mean_metric),
):
    print(label, value)


Final Tuned Model Accuracy: 0.6765765765765765
Final Tuned Model F1 Score: 0.6740807989105765
Final Tuned Model AUC: 0.7298996619142015
Mean Evaluation Metric (Accuracy + F1 + AUC) / 3: 0.6935190124671182


### Stacking: XGBoost with the best hyperparameters + default random forest

In [None]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Ensemble with a random forest: stack it with the tuned XGBoost model and
# let a logistic regression combine the two base predictions.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
base_estimators = [('xgb', final_model), ('rf', rf_model)]

stack = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression(),
)
stack.fit(X_train, y_train)

# Score the validation split: class-1 probabilities and hard labels.
y_prob_stack = stack.predict_proba(X_valid)[:, 1]
y_pred_stack = stack.predict(X_valid)

# Accuracy and F1 use the hard labels; AUC uses the class-1 probabilities.
accuracy = accuracy_score(y_valid, y_pred_stack)
f1 = f1_score(y_valid, y_pred_stack)
auc = roc_auc_score(y_valid, y_prob_stack)

# Unweighted mean of the three metrics.
mean_metric = (accuracy + f1 + auc) / 3

# Report the results.
for label, value in (
    ("Final Tuned Model Accuracy:", accuracy),
    ("Final Tuned Model F1 Score:", f1),
    ("Final Tuned Model AUC:", auc),
    ("Mean Evaluation Metric (Accuracy + F1 + AUC) / 3:", mean_metric),
):
    print(label, value)

Final Tuned Model Accuracy: 0.6722972972972973
Final Tuned Model F1 Score: 0.6716316858496953
Final Tuned Model AUC: 0.7323438389996508
Mean Evaluation Metric (Accuracy + F1 + AUC) / 3: 0.6920909407155479


### Tuning the random forest (GridSearchCV and Optuna)

In [None]:
# tuning with gridsearch
from sklearn.model_selection import GridSearchCV

# Candidate values for the exhaustive search over random-forest settings.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2'],
}

# 3-fold cross-validated grid search scored on accuracy, using all cores.
rf = RandomForestClassifier(random_state=42)
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy.
print("Best Params:", grid.best_params_)
print("Best Score:", grid.best_score_)

Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best Params: {'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Best Score: 0.6544481981981981


In [None]:
# tuning with optuna
import optuna
from sklearn.model_selection import cross_val_score

def rf_objective(trial):
    """Optuna objective: mean 3-fold CV accuracy of a random forest.

    Args:
        trial: Optuna trial used to sample the random-forest hyperparameters.

    Returns:
        Mean accuracy across the 3 cross-validation folds on the training
        split (X_train, y_train).
    """
    # Sample one hyperparameter configuration for this trial.
    rf_kwargs = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 300),
        max_depth=trial.suggest_int('max_depth', 5, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 5),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        bootstrap=trial.suggest_categorical('bootstrap', [True, False]),
    )

    forest = RandomForestClassifier(random_state=42, **rf_kwargs)
    fold_scores = cross_val_score(
        forest, X_train, y_train, cv=3, scoring='accuracy', n_jobs=-1
    )
    return fold_scores.mean()

# Seed the sampler so the 50-trial search proposes the same hyperparameters on
# every run; without a seed the best parameters change between executions.
study_rf = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_rf.optimize(rf_objective, n_trials=50)

# Report the best mean CV accuracy and the hyperparameters that achieved it.
print("Best Score:", study_rf.best_value)
print("Best Params:", study_rf.best_params)

[I 2025-05-23 09:49:50,639] A new study created in memory with name: no-name-5874b19d-ef1e-403b-b45f-3f6579e2c2e4
[I 2025-05-23 09:50:10,320] Trial 0 finished with value: 0.6529279279279279 and parameters: {'n_estimators': 166, 'max_depth': 12, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': True}. Best is trial 0 with value: 0.6529279279279279.
[I 2025-05-23 09:50:26,790] Trial 1 finished with value: 0.6488175675675675 and parameters: {'n_estimators': 195, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 0 with value: 0.6529279279279279.
[I 2025-05-23 09:50:36,039] Trial 2 finished with value: 0.6454391891891892 and parameters: {'n_estimators': 202, 'max_depth': 6, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_features': 'log2', 'bootstrap': True}. Best is trial 0 with value: 0.6529279279279279.
[I 2025-05-23 09:50:57,029] Trial 3 finished with value: 0.6480855855855855 a

Best Score: 0.6537162162162162
Best Params: {'n_estimators': 230, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 3, 'max_features': 'log2', 'bootstrap': False}


### Stacking: XGBoost with the best hyperparameters + random forest with the best hyperparameters

In [None]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Ensemble with the random forest tuned by the grid search.
# random_state is fixed, as for every other forest in this notebook, because
# grid.best_params_ only holds the searched parameters and would otherwise
# leave the forest unseeded and the reported metrics non-reproducible.
rf_model = RandomForestClassifier(random_state=42, **grid.best_params_)

stack = StackingClassifier(
    estimators=[('xgb', final_model), ('rf', rf_model)],
    final_estimator=LogisticRegression()
)

stack.fit(X_train, y_train)

# Predict on the validation split.
y_pred_stack = stack.predict(X_valid)
y_prob_stack = stack.predict_proba(X_valid)[:, 1]  # probability of class 1

# Evaluate: accuracy and F1 on hard labels, AUC on class-1 probabilities.
accuracy = accuracy_score(y_valid, y_pred_stack)
f1 = f1_score(y_valid, y_pred_stack)
auc = roc_auc_score(y_valid, y_prob_stack)

# Unweighted mean of the three metrics.
mean_metric = (accuracy + f1 + auc) / 3

# Report the results.
print("Final Tuned Model Accuracy:", accuracy)
print("Final Tuned Model F1 Score:", f1)
print("Final Tuned Model AUC:", auc)
print("Mean Evaluation Metric (Accuracy + F1 + AUC) / 3:", mean_metric)

Final Tuned Model Accuracy: 0.6711711711711712
Final Tuned Model F1 Score: 0.6722047597665021
Final Tuned Model AUC: 0.7314660050377035
Mean Evaluation Metric (Accuracy + F1 + AUC) / 3: 0.6916139786584589


### Stacking: XGBoost with the best hyperparameters + random forest + logistic regression

In [None]:
# Base models: tuned XGBoost, grid-search-tuned random forest, and a logistic
# regression. The forest is seeded like every other forest in this notebook so
# the ensemble's metrics are reproducible.
base_learners = [
    ('xgb', XGBClassifier(**best_params)),
    ('rf', RandomForestClassifier(random_state=42, **grid.best_params_)),
    ('lr', LogisticRegression(max_iter=1000))
]

# Meta model that combines the base learners' predictions.
meta_model = LogisticRegression()

# Stacking
stacking_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_model,
    cv=5,  # internal K-fold cross-validation
    n_jobs=-1
)

# Fit and evaluate the three-model ensemble defined above. Previously this
# cell fitted and scored `stack` (the two-model ensemble from the preceding
# cell), so `stacking_clf` was never trained or measured.
stacking_clf.fit(X_train, y_train)

# Predict on the validation split.
y_pred_stack = stacking_clf.predict(X_valid)
y_prob_stack = stacking_clf.predict_proba(X_valid)[:, 1]  # probability of class 1

# Evaluate: accuracy and F1 on hard labels, AUC on class-1 probabilities.
accuracy = accuracy_score(y_valid, y_pred_stack)
f1 = f1_score(y_valid, y_pred_stack)
auc = roc_auc_score(y_valid, y_prob_stack)

# Unweighted mean of the three metrics.
mean_metric = (accuracy + f1 + auc) / 3

# Report the results.
print("Final Tuned Model Accuracy:", accuracy)
print("Final Tuned Model F1 Score:", f1)
print("Final Tuned Model AUC:", auc)
print("Mean Evaluation Metric (Accuracy + F1 + AUC) / 3:", mean_metric)

Final Tuned Model Accuracy: 0.6702702702702703
Final Tuned Model F1 Score: 0.6702702702702703
Final Tuned Model AUC: 0.7313002190120654
Mean Evaluation Metric (Accuracy + F1 + AUC) / 3: 0.6906135865175353
