In [1]:
import pandas as pd

df_train = pd.read_csv('combined_train_data.csv')
df_valid = pd.read_csv('combined_val_data.csv')

In [2]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [3]:
# 필요 라이브러리 불러오기

import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

## input, output 데이터 지정하기

In [5]:
# Separate features and target
x_train = df_train.iloc[:, [3, 4] + list(range(6, df_train.shape[1]))]
y_train = df_train.iloc[:, 5]

x_valid = df_valid.iloc[:, [3, 4] + list(range(6, df_train.shape[1]))]
y_valid = df_valid.iloc[:, 5]


## 레이블 인코딩 작업

In [23]:
from sklearn.preprocessing import LabelEncoder

# Encode target labels with value between 0 and n_classes-1
# XGBClassifier를 사용하기 위해 대상 레이블을 정수로 인코딩해야 합니다.
# 현재 대상 레이블은 범주형 문자열이므로, LabelEncoder를 사용하여 숫자로 변환해야 합니다.

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_valid = label_encoder.transform(y_valid)

# got ['in' 'noise' 'normal' 'other' 'out']---> 클래스를 정수로 인코딩: [0 1 2 3 4]

## 데이터 설명자료 모범 답안 XGBClassifier

In [13]:
# XGBClassifier 초기화 및 훈련
model = XGBClassifier(
    base_score=0.5, booster='gbtree', colsample_bylevel=1,
    colsample_bynode=1, colsample_bytree=1, eval_metric='mlogloss',
    gamma=0, device='cpu', importance_type='gain',
    interaction_constraints='', learning_rate=0.5, max_delta_step=0,
    max_depth=6, min_child_weight=1,
    monotone_constraints='()', n_estimators=100, n_jobs=0,
    num_class=5, num_parallel_tree=1, objective='multi:softprob',
    random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
    seed=42, subsample=1, tree_method='exact', validate_parameters=1,
    verbosity=None
)

model.fit(x_train, y_train)


In [14]:
# Predict on the validation set
y_pred = model.predict(x_valid)
y_pred_prob = model.predict_proba(x_valid)

# Evaluate the model
accuracy = accuracy_score(y_valid, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")


Validation Accuracy: 0.9246


In [None]:
from sklearn.metrics import confusion_matrix

# 혼동 행렬 생성 및 평가 지표 계산
print("\nDEFAULT_XGBClassifier 혼동 행렬:")
confusion_matrix(y_valid, y_pred)

## 기본설정 XGBClassifier

In [28]:
# 기본 모델일 때
default_model = XGBClassifier()
default_model.fit(x_train, y_train)

In [19]:
# Predict on the validation set
y_pred = default_model.predict(x_valid)
y_pred_prob = default_model.predict_proba(x_valid)

# Evaluate the model
accuracy_default = accuracy_score(y_valid, y_pred)
print(f"Validation Accuracy: {accuracy_default:.4f}")

Validation Accuracy: 0.8988


In [16]:
# 모델의 하이퍼파라미터 출력
print("Default Hyperparameters:")
print(default_model.get_params())
print("\nCustom Hyperparameters:")
print(model.get_params())

Default Hyperparameters:
{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}

Custom Hyperparameters:
{'objective': 'multi:softprob', 'base_score': 0.5,

## DNN 기본적인 틀에서

In [22]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models

# DNN 모델 정의
model = models.Sequential([
    layers.Input(shape=(x_train.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(len(np.unique(y_train)), activation='softmax')
])

# 모델 컴파일
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# 모델 훈련
history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_valid, y_valid))

# 훈련된 모델로 예측
y_pred_prob_dnn = model.predict(x_valid)
y_pred_dnn = np.argmax(y_pred_prob_dnn, axis=1)

# 정확도 평가
accuracy_dnn = accuracy_score(y_valid, y_pred_dnn)
print(f"DNN Validation Accuracy: {accuracy_dnn:.4f}")

Epoch 1/10
[1m1956/1956[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 5ms/step - accuracy: 0.4970 - loss: 11.4070 - val_accuracy: 0.5841 - val_loss: 1.3887
Epoch 2/10
[1m1956/1956[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.5999 - loss: 1.2805 - val_accuracy: 0.6216 - val_loss: 0.9763
Epoch 3/10
[1m1956/1956[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.6180 - loss: 0.9863 - val_accuracy: 0.6024 - val_loss: 0.9236
Epoch 4/10
[1m1956/1956[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.6365 - loss: 0.8912 - val_accuracy: 0.6332 - val_loss: 0.9333
Epoch 5/10
[1m1956/1956[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 4ms/step - accuracy: 0.6492 - loss: 0.8502 - val_accuracy: 0.6377 - val_loss: 0.8483
Epoch 6/10
[1m1956/1956[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.6672 - loss: 0.7863 - val_accuracy: 0.6747 - val_loss: 0.7627
Epoch 7/1

In [26]:
# 다중분류 혼동행렬로 평가하기

from sklearn.metrics import confusion_matrix

# 혼동 행렬 생성 및 평가 지표 계산

print("\nDNN 혼동 행렬:")
confusion_matrix(y_valid, y_pred)



DNN 혼동 행렬:


array([[1402,   37,    0,   36,  184],
       [  44,  499,    0,   23,   63],
       [   0,    0, 2462,    0,    0],
       [  71,   37,    0,  640,  130],
       [ 106,   27,    0,   33, 2026]], dtype=int64)