# 배깅 - Random Forest

- 기본 모델 학습

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

# Train a RandomForestClassifier with a grid search over a small
# hyperparameter grid, then report accuracy / weighted F1 on a held-out
# validation split and on the test split.
data = pd.read_csv('../../../../monunmon.csv')

# Separate the features (X) from the target column (y).
X = data.drop(columns=['Label'])
y = data['Label']

# Hold out 20% of the data as the test set. Stratify on the label so the
# test set keeps the class proportions of the full data set, consistent
# with the stratified validation split below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Carve a validation set out of the remaining training data:
# 20% of the remaining 80%, i.e. 16% of the whole data set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

# Base estimator; the seed is fixed so runs are reproducible.
model = RandomForestClassifier(random_state=42)

# Hyperparameter grid explored by GridSearchCV (2 * 2 * 2 * 2 * 1 = 16 candidates).
# NOTE(review): with bootstrap=False every tree is fit on the whole training
# set, so no bootstrap resampling (bagging) takes place -- confirm this is
# intended, or add True to the list to compare.
param_grid = {
    'n_estimators': [200, 300],
    'max_depth': [20, 30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [False],
}

# 3-fold cross-validated search scored by weighted F1, using all CPU cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=3,
    verbose=2,
    n_jobs=-1,
)

# Run the search on the training split only; the validation and test
# splits are never seen during fitting.
grid_search.fit(X_train, y_train)

# Keep the best estimator found by the search and print its hyperparameters.
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Evaluate on the validation split.
y_val_pred = best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
# Weighted averaging: per-class F1 weighted by class support.
val_f1 = f1_score(y_val, y_val_pred, average='weighted')
print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Validation F1 Score: {val_f1:.2f}")

# Evaluate on the test split.
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred, average='weighted')
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")

# Per-class precision / recall / F1 on the test split.
print("\n테스트 데이터에 대한 분류 보고서:")
print(classification_report(y_test, y_test_pred))


Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best Parameters: {'bootstrap': False, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Validation Accuracy: 0.74
Validation F1 Score: 0.74
Test Accuracy: 0.7402
Test F1 Score: 0.7399

테스트 데이터에 대한 분류 보고서:
              precision    recall  f1-score   support

          -1       0.59      0.69      0.64       590
           0       0.76      0.63      0.69        30
           1       0.71      0.61      0.66        44
           2       0.82      0.78      0.80        41
           3       0.76      0.67      0.71        33
           4       0.64      0.78      0.70        32
           5       0.82      0.89      0.86        37
           6       0.81      0.92      0.86        38
           7       0.82      0.80      0.81        35
           8       0.69      0.76      0.72        33
           9       0.81      0.81      0.81        26
          10       0.86      0.72      0.78       