In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_auc_score
)
import pickle
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and fitting warnings
# raised during the grid search below; consider narrowing it by category.
warnings.filterwarnings('ignore')


In [None]:
# Notebook-wide settings.
# n_fold is handed to GridSearchCV as the number of cross-validation folds;
# rand_seed fixes the train/test split so it is repeatable across runs.
n_fold, rand_seed = 5, 5

In [None]:
# Read the feature table from disk.
df = pd.read_csv("data/selected_tree.csv", sep=",")

# Split the frame into the target column and the remaining predictor columns.
Y = df['label']
X = df.drop(columns='label')

# Hold out 25% of the rows for testing. Stratifying on Y keeps the label
# distribution comparable in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    stratify=Y,
    random_state=rand_seed,
)

print(f'Train: {X_train.shape} | Test: {X_test.shape}')

# Restore the pickled model object that the grid search will tune.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load model files from a trusted source.
with open("./model_rf.pkl", "rb") as f:
    base_model = pickle.load(f)


- GridSearchCV: 머신러닝 모델의 하이퍼파라미터를 최적화하기 위한 방법.
- 주어진 파라미터 공간의 모든 조합을 교차 검증으로 평가하여 최적의 파라미터 조합을 찾는다.

In [None]:
# Candidate hyperparameter values; GridSearchCV evaluates every combination
# (3 * 2 * 5 * 2 = 60 candidates, each cross-validated on n_fold folds).
param_grid = dict(
    n_estimators=[100, 200, 500],    # number of trees in the ensemble
    max_features=['sqrt', 'log2'],   # cap on the features each tree may consider
    max_depth=[4, 5, 6, 7, 8],       # maximum depth of each tree
    criterion=['gini', 'entropy'],   # measure of split quality
)

# Exhaustive search over the grid using n_fold-fold cross-validation on the
# training split only; the test split stays untouched for the final evaluation.
grid_clf = GridSearchCV(base_model, param_grid, cv=n_fold)
grid_clf.fit(X_train, y_train)

In [None]:
# Show the hyperparameter combination that won the grid search.
print(f'Best parameters: {grid_clf.best_params_}')

# best_score_ is the mean cross-validated score of the best candidate, expressed
# as a fraction; scale it to a percentage for display.
accuracy = 100 * grid_clf.best_score_
print("Accuracy with tuning is : {:.2f}%".format(accuracy))

In [None]:
# Evaluate the best estimator found by the search on the held-out test split.
print('Performance in test set')
y_pred = grid_clf.best_estimator_.predict(X_test)
print(classification_report(y_test, y_pred))
