# 배깅 - Random Forest

- 기본 모델 학습

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import pandas as pd

# Load the dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('../mon.csv')

# 'Label' is the target column; every other column is used as a feature.
y = data['Label']
X = data.drop('Label', axis=1)

# Hold out 20% of the rows as a test set (fixed seed so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline random forest with default hyperparameters.
# fit() returns the estimator itself, so construction and training can be chained.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred = model.predict(X_test)

# Evaluation metrics on the held-out rows.
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')  # per-class F1 weighted by class support
accuracy = accuracy_score(y_test, y_pred)

# Report the metrics.
print(f"Accuracy: {accuracy}")
print(f"F1 Score (weighted): {f1}")
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", conf_matrix)

Accuracy: 0.7739473684210526
F1 Score (weighted): 0.7728213788787172

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.67      0.73        42
           1       0.84      0.64      0.73        42
           2       0.82      0.94      0.88        35
           3       0.66      0.86      0.75        29
           4       0.83      0.90      0.86        39
           5       0.89      0.93      0.91        45
           6       0.78      0.89      0.83        44
           7       0.78      0.89      0.83        36
           8       0.75      0.71      0.73        34
           9       0.56      0.61      0.58        31
          10       0.95      0.74      0.83        47
          11       0.67      0.80      0.73        35
          12       0.86      0.86      0.86        42
          13       0.66      0.53      0.58        40
          14       0.83      0.56      0.67        36
          15       0.78      0.80      0.

- 하이퍼파라미터 튜닝 (Hyperparameter Tuning)

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve
import matplotlib.pyplot as plt
import pandas as pd

# Hyperparameter tuning for the random forest, followed by evaluation on
# validation/test splits and micro-averaged ROC / PR curves.
# Relies on `data` (the DataFrame loaded in the baseline cell) being in scope.

# Separate features (X) from the target (y).
X = data.drop(columns=['Label'])
y = data['Label']

# Stratified 70% / 15% / 15% split into train, validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)  # 70% Train
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)  # 15% Validation, 15% Test

# Base estimator that the search clones for every candidate.
model = RandomForestClassifier(random_state=42)

# Hyperparameter space sampled by RandomizedSearchCV.
param_dist = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

# Randomized search: 50 sampled candidates x 5-fold CV, scored by weighted F1.
# NOTE(review): a recorded run of this cell reported 26 of 250 fits failing inside
# the parallel tree-building step; pass error_score='raise' to surface the cause.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
    scoring='f1_weighted'
)
random_search.fit(X_train, y_train)

# Report the best hyperparameters found.
print(f"Best parameters found: {random_search.best_params_}")

# With the default refit=True the search has already refit a clone of `model`
# (same random_state) with the best parameters on all of X_train, so reuse it
# instead of constructing and fitting an identical forest a second time.
best_model = random_search.best_estimator_

# Validation-set performance.
y_val_pred = best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred, average='weighted')
print(f"\nValidation Accuracy: {val_accuracy}")
print(f"Validation F1 Score: {val_f1}")

# Test-set performance.
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred, average='weighted')
classification_rep = classification_report(y_test, y_test_pred)
conf_matrix = confusion_matrix(y_test, y_test_pred)

print(f"\nTest Accuracy: {test_accuracy}")
print(f"Test F1 Score: {test_f1}")
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", conf_matrix)

# ROC / PR curves.
# roc_curve and precision_recall_curve only accept binary targets, and the label
# column here is multiclass, so passing y_test with a single probability column
# raises ValueError. Instead, compute micro-averaged one-vs-rest curves: one-hot
# encode the true labels in the same column order as predict_proba
# (best_model.classes_) and flatten both matrices into one binary problem.
y_test_prob = best_model.predict_proba(X_test)  # shape (n_samples, n_classes)
y_test_onehot = (y_test.to_numpy()[:, None] == best_model.classes_).astype(int)
y_true_flat = y_test_onehot.ravel()
y_score_flat = y_test_prob.ravel()

# Micro-averaged ROC curve and its AUC.
fpr, tpr, _ = roc_curve(y_true_flat, y_score_flat)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f"Micro-average ROC curve (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label="Random guess")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.grid()
plt.show()

# Micro-averaged precision-recall curve and the area under it.
precision, recall, _ = precision_recall_curve(y_true_flat, y_score_flat)
pr_auc = auc(recall, precision)

plt.figure(figsize=(10, 6))
plt.plot(recall, precision, color='blue', lw=2, label=f"Micro-average PR curve (AUC = {pr_auc:.2f})")
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall (PR) Curve')
plt.legend(loc="lower left")
plt.grid()
plt.show()


Train Set: 13300 samples
Validation Set: 2850 samples
Test Set: 2850 samples
Fitting 5 folds for each of 50 candidates, totalling 250 fits


26 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\yzooz\pythonenv\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\yzooz\pythonenv\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\yzooz\pythonenv\lib\site-packages\sklearn\ensemble\_forest.py", line 456, in fit
    trees = Parallel(
  File "c:\Users\yzooz\pythonenv\lib\site-packages\sklearn\utils\parallel.py", line 65, in __call__
    return super().__call__(iterable_with_config)
  Fil

Best parameters found: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': True}

Validation Accuracy: 0.7757894736842105
Validation F1 Score: 0.7728641970586617

Test Accuracy: 0.7929824561403509
Test F1 Score: 0.7916543757618631

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.57      0.67        30
           1       0.69      0.83      0.76        30
           2       0.93      0.90      0.92        30
           3       0.84      0.87      0.85        30
           4       0.85      0.77      0.81        30
           5       0.96      0.77      0.85        30
           6       0.77      0.90      0.83        30
           7       0.84      0.90      0.87        30
           8       0.90      0.87      0.88        30
           9       0.72      0.87      0.79        30
          10       0.96      0.80      0.87        30
          11       0.88 