In [2]:
import sys
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier
from scipy.fft import fft, fftfreq
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
import os
os.environ['OMP_NUM_THREADS'] = '1'

# Suppress all Python warnings and apply seaborn's default plot theme.
warnings.filterwarnings('ignore')
sns.set()

# On Google Colab only: upgrade xgboost and download the zipped dataset.
if 'google.colab' in sys.modules:
    !pip install -q --upgrade xgboost
    !wget -q https://raw.githubusercontent.com/rickiepark/handson-gb/main/Chapter07/exoplanets.csv.zip

# Unpack the archive, overwriting existing files. This runs in every environment,
# so outside Colab the zip file must already be in the working directory.
!unzip -o exoplanets.csv.zip

# Turn off xgboost's own log output.
xgb.set_config(verbosity=0)

# Only the first 400 rows of the CSV are loaded.
df = pd.read_csv('exoplanets.csv', nrows=400)

# Load and prepare the data
X = df.filter(like="FLUX")  # feature columns: every column whose name contains "FLUX"
y = df["LABEL"] - 1  # target variable, shifted down by one (presumably 1/2 -> 0/1; confirm against the CSV)

# 1. Train/test split: hold out 20% of the rows (stratified on the label) for the final evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# 2. FFT magnitude features
timestep = 1  # sample spacing; not used by the magnitude computation in this section


def fft_magnitude(flux_frame):
    """Return the FFT magnitude spectrum of every row of *flux_frame*.

    The transform is taken along each row (axis=1) in a single vectorised
    call instead of one Python-level FFT per sample.

    Args:
        flux_frame: DataFrame with one sample per row and one flux reading
            per column.

    Returns:
        2-D ndarray of shape (n_rows, n_columns // 2) holding the absolute
        value of the first ``n_columns // 2`` FFT bins of each row, i.e. the
        DC term and the positive-frequency components.
    """
    flux = flux_frame.to_numpy()
    half = flux.shape[1] // 2  # keep only the non-redundant half of the spectrum
    return np.abs(fft(flux, axis=1)[:, :half])


# The same transform is applied to both splits so their features stay aligned.
fft_train_data = fft_magnitude(X_train)
fft_test_data = fft_magnitude(X_test)

# Wrap the FFT results in DataFrames (default integer row and column labels)
fft_train_df = pd.DataFrame(fft_train_data)
fft_test_df = pd.DataFrame(fft_test_data)

# 3. Resampling: SMOTE + Tomek links, applied to the training features only
smote_tomek = SMOTETomek(random_state=42)
X_train_resampled, y_train_resampled = smote_tomek.fit_resample(fft_train_df, y_train)

# 4. Scaling: the scaler is fitted on the resampled training data only and then
#    reused, unchanged, to transform the test features
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(fft_test_df)

# PCA configuration, defined once so that the train and test frames cannot drift apart
n_components = 10
pc_columns = [f'PC{k}' for k in range(1, n_components + 1)]  # 'PC1' ... 'PC10'
# Hand-picked subset of principal components used as model features.
# Alternative subset kept for reference: ['PC1', 'PC2', 'PC5', 'PC9', 'PC10']
selected_components = ['PC1', 'PC2', 'PC4', 'PC8', 'PC5']

# Fit PCA on the scaled training data and keep the selected components
pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(X_train_scaled)
pca_df = pd.DataFrame(principal_components, columns=pc_columns)
X_train_pca_selected = pca_df[selected_components]

# Project the test data with the already-fitted PCA and keep the same components
X_test_pca = pca.transform(X_test_scaled)
X_test_pca_df = pd.DataFrame(X_test_pca, columns=pc_columns)
X_test_pca_selected = X_test_pca_df[selected_components]

# 5. Stratified 5-fold cross validation on the (resampled) training data only
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Model definition
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# NOTE(review): X_train_pca_selected was resampled, scaled and PCA-projected
# before being split into folds here, so each validation fold has already
# influenced those steps; the fold accuracies are likely optimistic.
validation_scores = []

for train_index, val_index in skf.split(X_train_pca_selected, y_train_resampled):
    # Fold-local names are used on purpose: rebinding X_train / y_train here
    # would overwrite the original train split created by train_test_split.
    X_fold_train, X_val = X_train_pca_selected.iloc[train_index], X_train_pca_selected.iloc[val_index]
    y_fold_train, y_val = y_train_resampled.iloc[train_index], y_train_resampled.iloc[val_index]

    # Fit on this fold's training part
    model.fit(X_fold_train, y_fold_train)

    # Score on this fold's validation part
    y_val_pred = model.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    validation_scores.append(val_accuracy)

    print(f"Validation Accuracy: {val_accuracy:.4f}")

# Mean and spread of the per-fold accuracies
print("\n=== Cross Validation Results ===")
print(f"Mean Validation Accuracy: {np.mean(validation_scores):.4f}")
print(f"Validation Accuracy Standard Deviation: {np.std(validation_scores):.4f}")

# 6. Final model: refit on the complete resampled training set, then score
#    the held-out test set
model.fit(X_train_pca_selected, y_train_resampled)

# Predictions and accuracy on the test set
y_test_pred = model.predict(X_test_pca_selected)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("\n=== Final Test Set Evaluation ===")
print("Accuracy:", test_accuracy)
# Per-class precision / recall / f1
test_report = classification_report(y_test, y_test_pred)
print("Classification Report:\n", test_report)

# Confusion matrix (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(y_test, y_test_pred)
print("\n=== Confusion Matrix ===")
print(conf_matrix)

# 7. Hyperparameter tuning with a randomized search
# Candidate values sampled by RandomizedSearchCV
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'max_depth': [3, 5, 7, 9],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.4],
    'min_child_weight': [1, 3, 5],
    'reg_alpha': [0, 0.1, 1],  # L1 regularization
    'reg_lambda': [0.5, 1, 5]  # L2 regularization
}

# Use the shuffled, seeded StratifiedKFold defined for the manual CV loop rather
# than a bare `cv=5`: an integer gives unshuffled stratified folds, and the
# resampled training set is not guaranteed to be in random order.
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=50,
    scoring='accuracy',
    n_jobs=-1,
    cv=skf,
    verbose=2,
    random_state=42
)

# Run the search on the resampled, PCA-reduced training data
random_search.fit(X_train_pca_selected, y_train_resampled)

# Report the best parameter combination found
print("\n=== Best Parameters from Randomized Search ===")
print(random_search.best_params_)

# Evaluate the refitted best estimator on the held-out test set
best_model = random_search.best_estimator_
y_test_pred = best_model.predict(X_test_pca_selected)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("\n=== Final Test Set Evaluation with Best Parameters ===")
print("Accuracy:", test_accuracy)
print("Classification Report:\n", classification_report(y_test, y_test_pred))

# Confusion matrix (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(y_test, y_test_pred)
print("\n=== Confusion Matrix ===")
print(conf_matrix)

Archive:  exoplanets.csv.zip
  inflating: exoplanets.csv          
  inflating: __MACOSX/._exoplanets.csv  
Validation Accuracy: 0.9052
Validation Accuracy: 0.8276
Validation Accuracy: 0.8534
Validation Accuracy: 0.8534
Validation Accuracy: 0.8534

=== Cross Validation Results ===
Mean Validation Accuracy: 0.8586
Validation Accuracy Standard Deviation: 0.0253

=== Final Test Set Evaluation ===
Accuracy: 0.875
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.89      0.93        73
           1       0.38      0.71      0.50         7

    accuracy                           0.88        80
   macro avg       0.68      0.80      0.71        80
weighted avg       0.92      0.88      0.89        80


=== Confusion Matrix ===
[[65  8]
 [ 2  5]]
Fitting 5 folds for each of 50 candidates, totalling 250 fits

=== Best Parameters from Randomized Search ===
{'subsample': 0.8, 'reg_lambda': 0.5, 'reg_alpha': 1, 'n_estimators': 200, 'min_c