#### [2025_12_11]_독버섯 감지 & 유방암 감지
- 필수
   * 교차검증
   * 데이터 누수 안됨!
   * 모델 : 앙상블 계열 => Voting, 배깅은 RandomForest
- 기한 : 주말(12월 14일 일요일까지)

In [57]:
## ==================================================
## [1-1] 모듈 로딩
## ==================================================
import pandas as pd
import numpy as np

## ML학습 관련
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

## ML 데이터셋 및 전처리 관련
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, LabelEncoder, OneHotEncoder

## ML CV, Pipeline 관련 => 모델 일반화/최적 하이퍼파라미터 조사 및 데이터 누수 해결
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC

## ML 성능지표 관련
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import classification_report

## 시각화 관련
import matplotlib.pyplot as plt
import graphviz

##### 독버섯 감지 <hr>
1. X / y 분리
2. 범주형 인코딩 파이프라인 구성
3. RandomForest + VotingClassifier
4. GridSearchCV + 교차검증
5. Feature Importance 분석

In [58]:
## 데이터 가져오기
data_file = '../Data/mushrooms.csv'
mDF = pd.read_csv(data_file)

display(mDF)

mDF.info()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [59]:
## ==================================================
## [2-1] 피쳐/타겟 분리
## ==================================================
targetSR = mDF['class']
featureDF = mDF.drop("class", axis=1)
print(f'featureDF:{featureDF.shape},  targetSR:{targetSR.shape}')


featureDF:(8124, 22),  targetSR:(8124,)


In [60]:
## ==================================================
## [2-2] 학습용/테스트용 분리
## ==================================================
x_train, x_test, y_train, y_test = train_test_split(featureDF,
                                                    targetSR,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=targetSR)

print(f'[TRAIN] x_train:{x_train.shape},  y_train:{y_train.shape}')
print(f'[TEST] x_test:{x_test.shape},  y_test:{y_test.shape}')

[TRAIN] x_train:(6499, 22),  y_train:(6499,)
[TEST] x_test:(1625, 22),  y_test:(1625,)


In [61]:
## ==================================================
## [2-3] 타켓 컬럼 인코딩 처리
## ==================================================
lbEncoder  = LabelEncoder()

en_y_train = lbEncoder.fit_transform(y_train)  ## 학습용 타겟으로 인코더 생성 후 변환까지 진행
en_y_test  = lbEncoder.transform(y_test)

[3] 학습 진행 <hr>

[3-1] 배깅 랜덤포레스트

In [62]:
pipe_simple = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ("model", RandomForestClassifier(random_state=42,  n_jobs=-1))
])

In [63]:
param_grid_rf = {
    "model__n_estimators": [200, 500],
    "model__max_depth": [None, 10, 20],
    "model__min_samples_split": [2, 5, 10]
}

In [64]:
grid_rf = GridSearchCV(
    estimator=pipe_simple,
    param_grid=param_grid_rf,
    n_jobs=-1,
    verbose=1
)

In [65]:
grid_rf.fit(x_train, en_y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__max_depth': [None, 10, ...], 'model__min_samples_split': [2, 5, ...], 'model__n_estimators': [200, 500]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [66]:
print("Best CV Score:", grid_rf.best_score_)
print("Best Params:", grid_rf.best_params_)

best_rf = grid_rf.best_estimator_
print("TEST Accuracy:", best_rf.score(x_test, en_y_test))

Best CV Score: 1.0
Best Params: {'model__max_depth': None, 'model__min_samples_split': 2, 'model__n_estimators': 200}
TEST Accuracy: 1.0


[3-2] voting 방법(knn, svc, dt)

In [67]:
# 각 모델을 파이프라인으로 감싸기
pipe_knn = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ("knn", KNeighborsClassifier())
])

pipe_svc = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ("svc", SVC(probability=True, random_state=42))
])

pipe_dt = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ("dt", DecisionTreeClassifier(random_state=42))
])

# Voting
vtModel = VotingClassifier(
    estimators=[("knn", pipe_knn), ("svc", pipe_svc), ("dt", pipe_dt)]
)


In [68]:
param_grid_voting = {
    "knn__knn__n_neighbors": [3, 5, 11],

    "svc__svc__C": [1, 3, 10],

    "dt__dt__max_depth": [None, 5, 10]

}

In [69]:
# -------------------------------------------------
# 4) 학습
# -------------------------------------------------
grid_voting = GridSearchCV(
    estimator=vtModel,
    param_grid=param_grid_voting,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1
)
grid_voting.fit(x_train, en_y_train)


Fitting 5 folds for each of 27 candidates, totalling 135 fits


0,1,2
,estimator,VotingClassif...tate=42))]))])
,param_grid,"{'dt__dt__max_depth': [None, 5, ...], 'knn__knn__n_neighbors': [3, 5, ...], 'svc__svc__C': [1, 3, ...]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_neighbors,3
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,C,3
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [70]:
## 성능평가

print("Best Voting CV Score:", grid_voting.best_score_)
print("Best Voting Params:", grid_voting.best_params_)

best_voting = grid_voting.best_estimator_
print("Voting TEST Accuracy:", best_voting.score(x_test, en_y_test))

Best Voting CV Score: 0.9996923076923077
Best Voting Params: {'dt__dt__max_depth': None, 'knn__knn__n_neighbors': 3, 'svc__svc__C': 3}
Voting TEST Accuracy: 1.0


##### 유방암 검사

In [71]:
## =========================
## 유방암 데이터 가져오기
## =========================
data_file = "../Data/wdbc.csv"
bDF = pd.read_csv(data_file)

display(bDF.head())
bDF.info()


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave_points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [None]:
## =========================
## [2-1] 피쳐/타겟 분리
## =========================
targetSR = bDF["diagnosis"]
featureDF = bDF.drop(["diagnosis"], axis=1)

# id 컬럼이 있다면 제거 - 의미없음
if "id" in featureDF.columns:
    featureDF = featureDF.drop(["id"], axis=1)

print(f"featureDF:{featureDF.shape}, targetSR:{targetSR.shape}")

featureDF:(569, 30), targetSR:(569,)


In [73]:
## =========================
## [2-2] 학습/테스트 분리
## =========================
x_train, x_test, y_train, y_test = train_test_split(
    featureDF, targetSR,
    test_size=0.2,
    random_state=42,
    stratify=targetSR
)

print(f"[TRAIN] x_train:{x_train.shape}, y_train:{y_train.shape}")
print(f"[TEST]  x_test:{x_test.shape},  y_test:{y_test.shape}")


[TRAIN] x_train:(455, 30), y_train:(455,)
[TEST]  x_test:(114, 30),  y_test:(114,)


In [74]:
## =========================
## [2-3] 타깃 인코딩
## =========================
lbEncoder = LabelEncoder()
en_y_train = lbEncoder.fit_transform(y_train)   # train 기준 fit
en_y_test  = lbEncoder.transform(y_test)


[랜덤 포레스트]

In [None]:

pipe_rf = Pipeline([
    ("scaler", RobustScaler()),
    ("model", RandomForestClassifier(random_state=42, n_jobs=-1))
])

param_grid_rf = {
    "model__n_estimators": [200, 500],
    "model__max_depth": [None, 5, 10, 20],
    "model__min_samples_split": [2, 5, 10]
}

grid_rf = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid_rf,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1
)

grid_rf.fit(x_train, en_y_train)

print("Best CV Score:", grid_rf.best_score_)
print("Best Params:", grid_rf.best_params_)

best_rf = grid_rf.best_estimator_
print("TEST Accuracy:", best_rf.score(x_test, en_y_test))
print(classification_report(en_y_test, best_rf.predict(x_test)))


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best CV Score: 0.9582417582417584
Best Params: {'model__max_depth': None, 'model__min_samples_split': 2, 'model__n_estimators': 200}
TEST Accuracy: 0.9649122807017544
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        72
           1       1.00      0.90      0.95        42

    accuracy                           0.96       114
   macro avg       0.97      0.95      0.96       114
weighted avg       0.97      0.96      0.96       114



[voting]

In [77]:
pipe_knn = Pipeline([
    ("scaler", RobustScaler()),
    ("knn", KNeighborsClassifier())
])

pipe_svc = Pipeline([
    ("scaler", RobustScaler()),
    ("svc", SVC(probability=True, random_state=42))
])

pipe_dt = Pipeline([
    ("scaler", RobustScaler()),
    ("dt", DecisionTreeClassifier(random_state=42))
])

vtModel = VotingClassifier(
    estimators=[("knn", pipe_knn), ("svc", pipe_svc), ("dt", pipe_dt)],
    voting="soft"
)

param_grid_voting = {
    "knn__knn__n_neighbors": [3, 5, 7, 11],

    "svc__svc__C": [0.5, 1, 3, 10],

    "dt__dt__max_depth": [None, 3, 5, 10],
}

grid_voting = GridSearchCV(
    estimator=vtModel,
    param_grid=param_grid_voting,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1
)

grid_voting.fit(x_train, en_y_train)

print("Best Voting CV Score:", grid_voting.best_score_)
print("Best Voting Params:", grid_voting.best_params_)

best_voting = grid_voting.best_estimator_
print("Voting TEST Accuracy:", best_voting.score(x_test, en_y_test))
print(classification_report(en_y_test, best_voting.predict(x_test)))


Fitting 5 folds for each of 64 candidates, totalling 320 fits
Best Voting CV Score: 0.9736263736263737
Best Voting Params: {'dt__dt__max_depth': None, 'knn__knn__n_neighbors': 7, 'svc__svc__C': 10}


Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



Voting TEST Accuracy: 0.9473684210526315
              precision    recall  f1-score   support

           0       0.95      0.97      0.96        72
           1       0.95      0.90      0.93        42

    accuracy                           0.95       114
   macro avg       0.95      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114

