In [1]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, make_scorer
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
from xgboost import XGBClassifier

# 2. Version2 (even NC2)

In [2]:
data2_dir = '/Users/imdohyeon/Documents/PythonWorkspace/4n/preprocessSeg/extracted/ver2(even NC)/features_ver2.csv'
data2 = pd.read_csv(data2_dir)
data2 = data2.sample(frac=1).reset_index(drop=True)

# 데이터 나누기
X2 = data2.drop(['label'], axis=1)
y2 = data2['label']
# 데이터 분할
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3, random_state=42)

In [3]:
lasso = Lasso(alpha=0.01, max_iter=10000)
# SelectFromModel 학습
feature_selector = SelectFromModel(estimator=lasso)
feature_selector.fit(X2_train, y2_train)  # Lasso 모델 학습

SelectFromModel(estimator=Lasso(alpha=0.01, max_iter=10000))

## 2-1. L1 Regularization + RF (Grid)

In [4]:
# Random Forest 모델
rf_model = RandomForestClassifier(random_state=42)

# 파이프라인 구축
pipeline = Pipeline([
    ('feature_selection', SelectFromModel(estimator=lasso)),
    ('random_forest', rf_model)
])

# 하이퍼파라미터 그리드 정의
param_grid = {
    'feature_selection__estimator__alpha': [0.001, 0.01, 0.1, 1],  # Lasso의 alpha 값
    'random_forest__n_estimators': [50, 100, 200],  # Random Forest의 트리 개수
    'random_forest__max_depth': [None, 10, 20, 30]  # Random Forest의 최대 깊이
}
# Weighted F1 스코어 사용
f1_scorer = make_scorer(f1_score, average='weighted')

grid_search_rf_2 = GridSearchCV(pipeline, param_grid, cv=5, scoring=f1_scorer, n_jobs=-1, verbose=1)

grid_search_rf_2.fit(X2_train, y2_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


60 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/imdohyeon/miniconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/imdohyeon/miniconda3/lib/python3.7/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/imdohyeon/miniconda3/lib/python3.7/site-packages/sklearn/ensemble/_forest.py", line 328, in fit
    X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE
  File "/Users/imdohyeon/miniconda3/lib/python3.7/site-packages/s

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('feature_selection',
                                        SelectFromModel(estimator=Lasso(alpha=0.01,
                                                                        max_iter=10000))),
                                       ('random_forest',
                                        RandomForestClassifier(random_state=42))]),
             n_jobs=-1,
             param_grid={'feature_selection__estimator__alpha': [0.001, 0.01,
                                                                 0.1, 1],
                         'random_forest__max_depth': [None, 10, 20, 30],
                         'random_forest__n_estimators': [50, 100, 200]},
             scoring=make_scorer(f1_score, average=weighted), verbose=1)

In [5]:
# 최적 파라미터 및 성능 출력
print("Best Parameters:", grid_search_rf_2.best_params_)
print("Best CV Score:", grid_search_rf_2.best_score_)

# 최적 모델 로드
best_model_rf_2 = grid_search_rf_2.best_estimator_

# 테스트 세트 평가
y_pred_2 = best_model_rf_2.predict(X2_test)
accuracy = accuracy_score(y2_test, y_pred_2)
f1 = f1_score(y2_test, y_pred_2, average='weighted')
print(f"Test Accuracy: {accuracy:.2f}", f"Test F1: {f1:.2f}")

Best Parameters: {'feature_selection__estimator__alpha': 0.001, 'random_forest__max_depth': None, 'random_forest__n_estimators': 200}
Best CV Score: 0.8719161560400901
Test Accuracy: 0.88 Test F1: 0.88


In [6]:
# 최적 모델 저장
joblib.dump(best_model_rf_2, "/Users/imdohyeon/Documents/PythonWorkspace/4n/model/best_l1rf_model_ver2.pkl")
print("Model saved to 'best_l1rf_model_ver2.pkl'")

Model saved to 'best_l1rf_model_ver2.pkl'


## 2-2. L1 Regularization + XGB (Grid)

In [7]:
# XGBoost 모델
xgb_model = XGBClassifier(
    objective='logloss',
    use_label_encoder=False,
    eval_metric='logloss'
)

# 파이프라인
pipeline = Pipeline([
    ('feature_selection', SelectFromModel(estimator=lasso)),
    ('xgboost', xgb_model)
])

# Grid Search 설정
param_grid = {
    'feature_selection__estimator__alpha': [0.001, 0.01, 0.1],
    'xgboost__n_estimators': [50, 100, 200],
    'xgboost__max_depth': [3, 5, 7],
    'xgboost__learning_rate': [0.01, 0.1, 0.3]
}

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1_score': make_scorer(f1_score, average='weighted')
}

grid_search_xgb_2 = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring=scoring,
    refit='accuracy',
    verbose=1,
    n_jobs=-1
)

grid_search_xgb_2.fit(X2_train, y2_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('feature_selection',
                                        SelectFromModel(estimator=Lasso(alpha=0.01,
                                                                        max_iter=10000))),
                                       ('xgboost',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      callbacks=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      early_stopping_rounds=None,
                                                      enable_categorical=False,
                                                      eval_metric='logloss',
              

In [8]:
# 최적 파라미터 출력
print("Best Parameters:", grid_search_xgb_2.best_params_)

# 테스트 데이터 평가
y_pred_xgb_2 = grid_search_xgb_2.best_estimator_.predict(X2_test)
print(f"Test Accuracy: {accuracy_score(y2_test, y_pred_xgb_2):.2f}")
print(f"Test F1-Score: {f1_score(y2_test, y_pred_xgb_2, average='weighted'):.2f}")

Best Parameters: {'feature_selection__estimator__alpha': 0.001, 'xgboost__learning_rate': 0.3, 'xgboost__max_depth': 7, 'xgboost__n_estimators': 100}
Test Accuracy: 0.90
Test F1-Score: 0.90


In [9]:
# 최적 모델 저장
joblib.dump(grid_search_xgb_2.best_estimator_, "/Users/imdohyeon/Documents/PythonWorkspace/4n/model/best_l1xgb_model_ver2.pkl")
print("Model saved to 'best_l1xgb_model_ver2.pkl'")

Model saved to 'best_l1xgb_model_ver2.pkl'


# 3. L1 Regularization 결과 확인

In [10]:
# SelectFromModel 학습
feature_selector = SelectFromModel(estimator=lasso)
feature_selector.fit(X2_train, y2_train)  # Lasso 모델 학습

# 선택된 특징 확인
selected_features = feature_selector.get_support()  # 선택된 특징에 대한 Boolean Mask
selected_feature_count = sum(selected_features)  # 선택된 특징 개수
total_features = len(selected_features)  # 전체 특징 개수

print(f"선택된 특징 개수: {selected_feature_count}")
print(f"전체 특징 개수: {total_features}")
print(f"선택된 특징 비율: {selected_feature_count / total_features:.2%}")

선택된 특징 개수: 19
전체 특징 개수: 133
선택된 특징 비율: 14.29%


In [11]:
selected_features

array([False,  True,  True,  True, False,  True,  True,  True, False,
       False, False,  True,  True, False, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True,  True,
        True,  True, False, False, False, False, False,  True,  True,
        True,  True,

- Kurtosis, Spectral entropy만 사용됨

# 나머지 절반 NC 데이터 테스트

## Version 2

In [12]:
new_data_dir = '/Users/imdohyeon/Documents/PythonWorkspace/4n/preprocessSeg/extracted/ver2(even NC)/features_nc2_ver2.csv'
new_data = pd.read_csv(new_data_dir)
new_data_X = new_data.drop(['label'], axis=1)
new_data_y = new_data['label']

In [13]:
y_pred_rf_nc2 = grid_search_rf_2.best_estimator_.predict(new_data_X)
y_pred_xgb_nc2 = grid_search_xgb_2.best_estimator_.predict(new_data_X)

In [14]:
acc_rf_nc2 = accuracy_score(new_data_y, y_pred_rf_nc2)
f1_rf_nc2 = f1_score(new_data_y, y_pred_rf_nc2, average='weighted')

acc_xgb_nc2 = accuracy_score(new_data_y, y_pred_xgb_nc2)
f1_xgb_nc2 = f1_score(new_data_y, y_pred_xgb_nc2, average='weighted')

print(f"rf. Test Accuracy: {acc_rf_nc2:.2f}", f"Test F1: {f1_rf_nc2:.2f}")
print(f"xgb. Test Accuracy: {acc_xgb_nc2:.2f}", f"Test F1: {f1_xgb_nc2:.2f}")

rf. Test Accuracy: 0.56 Test F1: 0.72
xgb. Test Accuracy: 0.61 Test F1: 0.76
