In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [7]:
# 필요한 라이브러리 불러오기
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# 데이터 불러오기
train = pd.read_csv('/content/drive/MyDrive/패턴인식/train_processed.csv')
test = pd.read_csv('/content/drive/MyDrive/패턴인식/test_processed.csv')

# Feature, Label 나누기 ('shares', 'id' 제거)
X = train.drop(['id', 'y', 'shares'], axis=1)
y = train['y']

# Train/Validation 분리
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# XGBoost 모델 정의 (튜닝된 파라미터)
xgb_clf = xgb.XGBClassifier(
    n_estimators=427,
    max_depth=3,
    learning_rate=0.021978188969319974,
    subsample=0.9760848989537714,
    colsample_bytree=0.7301003992053027,
    gamma=0.2749409209747699,
    min_child_weight=1,
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)

# LightGBM 모델 정의
lgb_clf = lgb.LGBMClassifier(
    n_estimators=300,
    max_depth=3,
    learning_rate=0.02,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# CatBoost 모델 정의 (silent 모드)
cb_clf = cb.CatBoostClassifier(
    iterations=300,
    depth=3,
    learning_rate=0.02,
    verbose=0,
    random_state=42
)

# 최종 메타 모델 (Logistic Regression)
meta_clf = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42)

# StackingClassifier 정의
stacking_clf = StackingClassifier(
    estimators=[
        ('xgb', xgb_clf),
        ('lgb', lgb_clf),
        ('cb', cb_clf)
    ],
    final_estimator=meta_clf,
    passthrough=True,
    n_jobs=-1
)

# 모델 학습
stacking_clf.fit(X_train, y_train)

# 검증 예측
y_pred_valid = stacking_clf.predict(X_valid)

# 평가 결과 출력
print("Stacking Model Accuracy:", accuracy_score(y_valid, y_pred_valid))
print("Stacking Model F1 Score:", f1_score(y_valid, y_pred_valid))
print("\nClassification Report:\n")
print(classification_report(y_valid, y_pred_valid))

# 테스트 데이터 예측 후 저장
X_test = test.drop(['id'], axis=1)
y_pred_test = stacking_clf.predict(X_test)
y_prob_test = stacking_clf.predict_proba(X_test)[:, 1]

submission = pd.DataFrame({
    'id': test['id'],
    'y_predict': y_pred_test,
    'y_prob': y_prob_test
})

submission.to_csv('prediction_stacking_catboost.csv', index=False)
print("Done! XGB + LGB + CatBoost Stacking 결과가 prediction_stacking_catboost.csv에 저장되었습니다.")

Stacking Model Accuracy: 0.6614864864864864
Stacking Model F1 Score: 0.6603389830508475

Classification Report:

              precision    recall  f1-score   support

           0       0.66      0.66      0.66      2227
           1       0.66      0.66      0.66      2213

    accuracy                           0.66      4440
   macro avg       0.66      0.66      0.66      4440
weighted avg       0.66      0.66      0.66      4440

Done! XGB + LGB + CatBoost Stacking 결과가 prediction_stacking_catboost.csv에 저장되었습니다.


In [8]:
# 필요한 라이브러리 불러오기
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# 데이터 불러오기
train = pd.read_csv('/content/drive/MyDrive/패턴인식/train_processed.csv')
test = pd.read_csv('/content/drive/MyDrive/패턴인식/test_processed.csv')

# Feature, Label 나누기 ('shares', 'id' 제거)
X = train.drop(['id', 'y', 'shares'], axis=1)
y = train['y']

# Train/Validation 분리
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 1. XGBoost 모델 정의 (튜닝된 최적 파라미터)
xgb_clf = xgb.XGBClassifier(
    n_estimators=427,
    max_depth=3,
    learning_rate=0.021978188969319974,
    subsample=0.9760848989537714,
    colsample_bytree=0.7301003992053027,
    gamma=0.2749409209747699,
    min_child_weight=1,
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)

xgb_clf.fit(X_train, y_train)
xgb_prob = xgb_clf.predict_proba(X_valid)[:, 1]

# 2. Stacking 모델 (XGB + LGB + CB)
lgb_clf = lgb.LGBMClassifier(
    n_estimators=300,
    max_depth=3,
    learning_rate=0.02,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

cb_clf = cb.CatBoostClassifier(
    iterations=300,
    depth=3,
    learning_rate=0.02,
    verbose=0,
    random_state=42
)

meta_clf = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42)

stacking_clf = StackingClassifier(
    estimators=[
        ('xgb', xgb_clf),
        ('lgb', lgb_clf),
        ('cb', cb_clf)
    ],
    final_estimator=meta_clf,
    passthrough=True,
    n_jobs=-1
)

stacking_clf.fit(X_train, y_train)
stacking_prob = stacking_clf.predict_proba(X_valid)[:, 1]

# 3. Soft Voting (XGB 0.7 + Stacking 0.3)
final_prob = (xgb_prob * 0.7) + (stacking_prob * 0.3)
threshold = 0.48
final_pred = (final_prob >= threshold).astype(int)

# 성능 평가
print("F1 Score:", f1_score(y_valid, final_pred))
print("Accuracy:", accuracy_score(y_valid, final_pred))
print("\nClassification Report:\n")
print(classification_report(y_valid, final_pred))

# 테스트 예측
X_test = test.drop(['id'], axis=1)
xgb_test_prob = xgb_clf.predict_proba(X_test)[:, 1]
stacking_test_prob = stacking_clf.predict_proba(X_test)[:, 1]
final_test_prob = (xgb_test_prob * 0.7) + (stacking_test_prob * 0.3)
final_test_pred = (final_test_prob >= threshold).astype(int)

submission = pd.DataFrame({
    'id': test['id'],
    'y_predict': final_test_pred,
    'y_prob': final_test_prob
})

submission.to_csv('prediction_final_softvote.csv', index=False)
print("Done! Soft Voting (XGB+Stacking) 결과가 prediction_final_softvote.csv에 저장되었습니다.")

Parameters: { "use_label_encoder" } are not used.



F1 Score: 0.67
Accuracy: 0.6581081081081082

Classification Report:

              precision    recall  f1-score   support

           0       0.67      0.62      0.65      2227
           1       0.65      0.70      0.67      2213

    accuracy                           0.66      4440
   macro avg       0.66      0.66      0.66      4440
weighted avg       0.66      0.66      0.66      4440

Done! Soft Voting (XGB+Stacking) 결과가 prediction_final_softvote.csv에 저장되었습니다.


In [10]:
# 필요한 라이브러리 불러오기
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# 데이터 불러오기
train = pd.read_csv('/content/drive/MyDrive/패턴인식/train_processed.csv')
test = pd.read_csv('/content/drive/MyDrive/패턴인식/test_processed.csv')

# Feature, Label 나누기 ('shares', 'id' 제거)
X = train.drop(['id', 'y', 'shares'], axis=1)
y = train['y']

# Train/Validation 분리
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# XGBoost 모델 정의 (튜닝된 최적 파라미터)
xgb_clf = xgb.XGBClassifier(
    n_estimators=427,
    max_depth=3,
    learning_rate=0.021978188969319974,
    subsample=0.9760848989537714,
    colsample_bytree=0.7301003992053027,
    gamma=0.2749409209747699,
    min_child_weight=1,
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)

xgb_clf.fit(X_train, y_train)
xgb_prob = xgb_clf.predict_proba(X_valid)[:, 1]

# Stacking 모델 정의 (XGB + LGB + CB), meta model: RandomForest
lgb_clf = lgb.LGBMClassifier(
    n_estimators=300,
    max_depth=3,
    learning_rate=0.02,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

cb_clf = cb.CatBoostClassifier(
    iterations=300,
    depth=3,
    learning_rate=0.02,
    verbose=0,
    random_state=42
)

meta_clf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42)

stacking_clf = StackingClassifier(
    estimators=[
        ('xgb', xgb_clf),
        ('lgb', lgb_clf),
        ('cb', cb_clf)
    ],
    final_estimator=meta_clf,
    passthrough=True,
    n_jobs=-1
)

stacking_clf.fit(X_train, y_train)
stacking_prob = stacking_clf.predict_proba(X_valid)[:, 1]

# Soft Voting 조합
final_prob = (xgb_prob * 0.7) + (stacking_prob * 0.3)

# F1 Score를 높이기 위한 최적 threshold 탐색
best_f1 = 0
best_t = 0
for t in np.arange(0.40, 0.51, 0.01):
    pred = (final_prob >= t).astype(int)
    f1 = f1_score(y_valid, pred)
    if f1 > best_f1:
        best_f1 = f1
        best_t = t
    print(f"threshold = {t:.2f} | F1 Score = {f1:.4f} | Accuracy = {accuracy_score(y_valid, pred):.4f}")

print(f"\n📌 F1 최적 threshold = {best_t:.2f} (F1 Score = {best_f1:.4f})")

# 최적 threshold로 최종 예측
final_pred = (final_prob >= best_t).astype(int)

# 성능 평가
print("\n최종 F1 기준 평가 결과")
print("F1 Score:", f1_score(y_valid, final_pred))
print("Accuracy:", accuracy_score(y_valid, final_pred))
print("\nClassification Report:\n")
print(classification_report(y_valid, final_pred))

# 테스트 예측
X_test = test.drop(['id'], axis=1)
xgb_test_prob = xgb_clf.predict_proba(X_test)[:, 1]
stacking_test_prob = stacking_clf.predict_proba(X_test)[:, 1]
final_test_prob = (xgb_test_prob * 0.7) + (stacking_test_prob * 0.3)
final_test_pred = (final_test_prob >= best_t).astype(int)

submission = pd.DataFrame({
    'id': test['id'],
    'y_predict': final_test_pred,
    'y_prob': final_test_prob
})

submission.to_csv('prediction_best_f1.csv', index=False)
print("\nDone! F1 최적화 결과가 prediction_best_f1.csv에 저장되었습니다.")


Parameters: { "use_label_encoder" } are not used.



threshold = 0.40 | F1 Score = 0.6940 | Accuracy = 0.6354
threshold = 0.41 | F1 Score = 0.6889 | Accuracy = 0.6358
threshold = 0.42 | F1 Score = 0.6886 | Accuracy = 0.6399
threshold = 0.43 | F1 Score = 0.6863 | Accuracy = 0.6432
threshold = 0.44 | F1 Score = 0.6860 | Accuracy = 0.6498
threshold = 0.45 | F1 Score = 0.6814 | Accuracy = 0.6525
threshold = 0.46 | F1 Score = 0.6760 | Accuracy = 0.6523
threshold = 0.47 | F1 Score = 0.6712 | Accuracy = 0.6541
threshold = 0.48 | F1 Score = 0.6648 | Accuracy = 0.6547
threshold = 0.49 | F1 Score = 0.6630 | Accuracy = 0.6592
threshold = 0.50 | F1 Score = 0.6603 | Accuracy = 0.6610

📌 F1 최적 threshold = 0.40 (F1 Score = 0.6940)

최종 F1 기준 평가 결과
F1 Score: 0.694008694008694
Accuracy: 0.6353603603603604

Classification Report:

              precision    recall  f1-score   support

           0       0.72      0.44      0.55      2227
           1       0.60      0.83      0.69      2213

    accuracy                           0.64      4440
   macro av