ML 접근방식

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

# Load the dataset (replace the placeholder path with the real CSV file).
data = pd.read_csv('your_dataset.csv')

# Separate the target vector y from the feature matrix X.
y = data['target_column']
X = data.drop(columns='target_column')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Scale the features. The scaler is fitted on the training split only and the
# same fitted scaler is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Classification models (logistic regression and random forest)
def ml_classification(random_state=42):
    """Train and evaluate two baseline classifiers on the scaled split.

    Fits a logistic regression and a random forest on the module-level
    ``X_train_scaled`` / ``y_train``, prints each model's accuracy on
    ``X_test_scaled`` / ``y_test``, and returns the fitted estimators together
    with their test predictions so callers can reuse them (e.g. for plotting)
    instead of losing them when the function returns.

    Args:
        random_state: Seed handed to both estimators so repeated runs print
            the same scores. Pass ``None`` to leave the estimators unseeded.

    Returns:
        dict: Maps each display name (``"Logistic Regression"``,
        ``"Random Forest Classifier"``) to a dict with keys ``"model"``
        (fitted estimator), ``"pred"`` (test-set predictions) and
        ``"accuracy"`` (test-set accuracy).
    """
    estimators = {
        "Logistic Regression": LogisticRegression(random_state=random_state),
        "Random Forest Classifier": RandomForestClassifier(random_state=random_state),
    }

    results = {}
    for name, model in estimators.items():
        model.fit(X_train_scaled, y_train)
        pred = model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, pred)
        print(f"{name} Accuracy: {accuracy}")
        results[name] = {"model": model, "pred": pred, "accuracy": accuracy}
    return results

# Regression models (linear regression and random forest)
def ml_regression(random_state=42):
    """Train and evaluate two baseline regressors on the scaled split.

    Fits a linear regression and a random forest regressor on the
    module-level ``X_train_scaled`` / ``y_train``, prints each model's MSE and
    R2 on ``X_test_scaled`` / ``y_test``, and returns the fitted estimators
    together with their test predictions so callers can reuse them (e.g. for
    plotting) instead of losing them when the function returns.

    Args:
        random_state: Seed handed to the random forest so repeated runs print
            the same scores. Pass ``None`` to leave it unseeded.

    Returns:
        dict: Maps each display name (``"Linear Regression"``,
        ``"Random Forest Regressor"``) to a dict with keys ``"model"``
        (fitted estimator), ``"pred"`` (test-set predictions), ``"mse"`` and
        ``"r2"``.
    """
    estimators = {
        "Linear Regression": LinearRegression(),
        "Random Forest Regressor": RandomForestRegressor(random_state=random_state),
    }

    results = {}
    for name, model in estimators.items():
        model.fit(X_train_scaled, y_train)
        pred = model.predict(X_test_scaled)
        mse = mean_squared_error(y_test, pred)
        r2 = r2_score(y_test, pred)
        print(f"{name} MSE: {mse}, R2: {r2}")
        results[name] = {"model": model, "pred": pred, "mse": mse, "r2": r2}
    return results

# Run the ML models: print a heading, then train and evaluate each group in turn.
for heading, runner in (
    ("ML Classification Results:", ml_classification),
    ("\nML Regression Results:", ml_regression),
):
    print(heading)
    runner()


DL 접근방식

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

# Deep-learning model for classification
def dl_classification(input_shape, num_classes):
    """Build and compile a small fully connected classifier.

    Args:
        input_shape: Number of input features per sample (an int; it is
            wrapped into a 1-tuple for the first layer).
        num_classes: Width of the softmax output layer.

    Returns:
        A compiled ``Sequential`` model: Dense(64) -> Dropout(0.2) ->
        Dense(32) -> Dropout(0.2) -> Dense(num_classes, softmax), using Adam
        with sparse categorical cross-entropy and an accuracy metric.
    """
    model = Sequential()

    # Hidden stack: two ReLU layers, each followed by 20% dropout.
    model.add(Dense(64, activation='relu', input_shape=(input_shape,)))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))

    # Output layer: one softmax unit per class.
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(
        optimizer=Adam(),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Deep-learning model for regression
def dl_regression(input_shape):
    """Build and compile a small fully connected regressor.

    Args:
        input_shape: Number of input features per sample (an int; it is
            wrapped into a 1-tuple for the first layer).

    Returns:
        A compiled ``Sequential`` model: Dense(64) -> Dropout(0.2) ->
        Dense(32) -> Dropout(0.2) -> Dense(1), using Adam with mean squared
        error as the loss and MAE as the tracked metric.
    """
    model = Sequential()

    # Hidden stack: two ReLU layers, each followed by 20% dropout.
    model.add(Dense(64, activation='relu', input_shape=(input_shape,)))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))

    # Single output unit with no activation.
    model.add(Dense(1))

    model.compile(
        optimizer=Adam(),
        loss='mean_squared_error',
        metrics=['mae'],
    )
    return model

# Train and evaluate the DL classification model
def run_dl_classification():
    """Train the DL classifier on the scaled split and print test accuracy.

    Labels are encoded as positions in the sorted array of unique values of
    the module-level ``y`` before training, and predictions are decoded back
    through the same array. The softmax/argmax pipeline works on indices
    0..K-1, so this keeps it correct for labels that are strings or do not
    start at 0. For labels that already are 0..K-1 the encoding is the
    identity.

    Returns:
        The Keras ``History`` object returned by ``model.fit``.
    """
    # Sorted unique labels; a label's position in this array is its class index.
    classes = np.unique(y)
    y_train_idx = np.searchsorted(classes, np.asarray(y_train))

    model = dl_classification(X_train_scaled.shape[1], len(classes))
    history = model.fit(
        X_train_scaled,
        y_train_idx,
        epochs=100,
        batch_size=32,
        validation_split=0.2,
        verbose=0,
    )

    y_pred = model.predict(X_test_scaled)
    # argmax yields class indices; map them back to the original label values
    # so they are comparable with y_test.
    y_pred_classes = classes[np.argmax(y_pred, axis=1)]
    accuracy = accuracy_score(y_test, y_pred_classes)
    print(f"DL Classification Accuracy: {accuracy}")

    return history

# Train and evaluate the DL regression model
def run_dl_regression():
    """Train the DL regressor on the scaled split and print test MSE and R2.

    Returns:
        The Keras ``History`` object returned by ``model.fit``.
    """
    n_features = X_train_scaled.shape[1]
    regressor = dl_regression(n_features)

    # 100 silent epochs; the last 20% of the training rows are used for validation.
    fit_options = dict(epochs=100, batch_size=32, validation_split=0.2, verbose=0)
    history = regressor.fit(X_train_scaled, y_train, **fit_options)

    predictions = regressor.predict(X_test_scaled)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    print(f"DL Regression MSE: {mse}, R2: {r2}")

    return history

# Run the DL models. Each runner trains a fresh model, prints its test metrics
# and returns the Keras training history, which is kept in a module-level name
# so that later cells can plot the learning curves.
print("\nDL Classification Results:")
dl_clf_history = run_dl_classification()
print("\nDL Regression Results:")
dl_reg_history = run_dl_regression()

시각화

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc
import numpy as np

ML

In [None]:
def ml_visualization(y_true, y_pred, y_pred_proba=None, model_name="ML Model"):
    """Plot diagnostic panels for a fitted scikit-learn model.

    The panels are chosen from the data that is supplied, so the same function
    works for classifiers and regressors:

    * Confusion matrix: drawn only when neither ``y_true`` nor ``y_pred`` is
      continuous, because ``confusion_matrix`` rejects continuous targets.
    * Actual vs. predicted scatter with the identity line: always drawn.
    * ROC curve: drawn only when ``y_pred_proba`` is a 2-D array with exactly
      two columns (binary classification); column 1 is used as the score.

    Args:
        y_true: Ground-truth targets (any array-like).
        y_pred: Model predictions, same length as ``y_true``.
        y_pred_proba: Optional class-probability matrix, e.g. the output of
            ``predict_proba``.
        model_name: Label used as the prefix of every panel title.
    """
    # Imported here because the file's import cells do not provide this helper.
    from sklearn.utils.multiclass import type_of_target

    y_true_arr = np.asarray(y_true)
    y_pred_arr = np.asarray(y_pred)
    proba = None if y_pred_proba is None else np.asarray(y_pred_proba)

    # A confusion matrix only makes sense for discrete labels on both sides.
    show_confusion = not any(
        type_of_target(values).startswith('continuous')
        for values in (y_true_arr, y_pred_arr)
    )
    # roc_curve handles the binary case only; skip it for anything else.
    show_roc = proba is not None and proba.ndim == 2 and proba.shape[1] == 2

    n_panels = 1 + int(show_confusion) + int(show_roc)
    # 5 inches per panel, which gives the 15x5 layout when all three are drawn.
    _, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 5), squeeze=False)
    panels = iter(axes[0])

    # Confusion matrix (classification)
    if show_confusion:
        ax = next(panels)
        cm = confusion_matrix(y_true_arr, y_pred_arr)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_title(f'{model_name} Confusion Matrix')
        ax.set_ylabel('True label')
        ax.set_xlabel('Predicted label')

    # Actual vs. predicted values; the dashed red line marks perfect predictions.
    ax = next(panels)
    lo, hi = y_true_arr.min(), y_true_arr.max()
    ax.scatter(y_true_arr, y_pred_arr, alpha=0.5)
    ax.plot([lo, hi], [lo, hi], 'r--', lw=2)
    ax.set_title(f'{model_name} Actual vs Predicted')
    ax.set_xlabel('Actual Values')
    ax.set_ylabel('Predicted Values')

    # ROC curve (binary classification)
    if show_roc:
        ax = next(panels)
        fpr, tpr, _ = roc_curve(y_true_arr, proba[:, 1])
        roc_auc = auc(fpr, tpr)
        ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
        ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title(f'{model_name} ROC Curve')
        ax.legend(loc="lower right")

    plt.tight_layout()
    plt.show()

# Usage examples (classification)
# ml_classification() / ml_regression() do not leave their models or
# predictions in module scope, so the estimators are fitted here to obtain
# objects this cell can actually reference.
log_reg = LogisticRegression().fit(X_train_scaled, y_train)
log_reg_pred = log_reg.predict(X_test_scaled)
ml_visualization(y_test, log_reg_pred, log_reg.predict_proba(X_test_scaled), "Logistic Regression")

rf_clf = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)
rf_clf_pred = rf_clf.predict(X_test_scaled)
ml_visualization(y_test, rf_clf_pred, rf_clf.predict_proba(X_test_scaled), "Random Forest Classifier")

# Usage examples (regression)
# Regression predictions are continuous, so ml_visualization must skip the
# confusion matrix for them (sklearn's confusion_matrix rejects continuous targets).
lin_reg = LinearRegression().fit(X_train_scaled, y_train)
lin_reg_pred = lin_reg.predict(X_test_scaled)
ml_visualization(y_test, lin_reg_pred, model_name="Linear Regression")

rf_reg = RandomForestRegressor(random_state=42).fit(X_train_scaled, y_train)
rf_reg_pred = rf_reg.predict(X_test_scaled)
ml_visualization(y_test, rf_reg_pred, model_name="Random Forest Regressor")

DL

In [None]:
def dl_visualization(history, y_true, y_pred, model_name="DL Model"):
    """Plot learning curves and prediction diagnostics for a Keras model.

    Draws a 2x2 grid: loss curve, accuracy-or-MAE curve, actual vs. predicted
    scatter, and a histogram of the prediction error (``y_pred - y_true``).

    Args:
        history: Object exposing a ``history`` dict of per-epoch metric lists
            (as returned by Keras ``model.fit``). It must contain ``'loss'``
            and either ``'accuracy'`` or ``'mae'``; the matching ``'val_*'``
            series are plotted only when present.
        y_true: Ground-truth targets (any array-like).
        y_pred: Model predictions. Both inputs are flattened to 1-D, so raw
            ``(n, 1)`` network output is accepted.
        model_name: Label used as the prefix of every panel title.
    """
    logs = history.history
    # Flatten both sides so the subtraction below is element-wise; an (n, 1)
    # array minus an (n,) array would otherwise broadcast to (n, n).
    y_true_arr = np.asarray(y_true).ravel()
    y_pred_arr = np.asarray(y_pred).ravel()

    _, axes = plt.subplots(2, 2, figsize=(15, 10))
    loss_ax, metric_ax = axes[0]
    scatter_ax, error_ax = axes[1]

    # Learning curve: loss
    loss_ax.plot(logs['loss'], label='Training Loss')
    if 'val_loss' in logs:  # absent when training ran without validation data
        loss_ax.plot(logs['val_loss'], label='Validation Loss')
    loss_ax.set_title(f'{model_name} Learning Curve - Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()

    # Learning curve: accuracy for classifiers, MAE otherwise
    if 'accuracy' in logs:
        metric_key, metric_label = 'accuracy', 'Accuracy'
    else:
        metric_key, metric_label = 'mae', 'MAE'
    metric_ax.plot(logs[metric_key], label=f'Training {metric_label}')
    if f'val_{metric_key}' in logs:
        metric_ax.plot(logs[f'val_{metric_key}'], label=f'Validation {metric_label}')
    metric_ax.set_title(f'{model_name} Learning Curve - {metric_label}')
    metric_ax.set_ylabel(metric_label)
    metric_ax.set_xlabel('Epoch')
    metric_ax.legend()

    # Actual vs. predicted values; the dashed red line marks perfect predictions.
    lo, hi = y_true_arr.min(), y_true_arr.max()
    scatter_ax.scatter(y_true_arr, y_pred_arr, alpha=0.5)
    scatter_ax.plot([lo, hi], [lo, hi], 'r--', lw=2)
    scatter_ax.set_title(f'{model_name} Actual vs Predicted')
    scatter_ax.set_xlabel('Actual Values')
    scatter_ax.set_ylabel('Predicted Values')

    # Distribution of the prediction error
    error = y_pred_arr - y_true_arr
    error_ax.hist(error, bins=30)
    error_ax.set_title(f'{model_name} Prediction Error Distribution')
    error_ax.set_xlabel('Prediction Error')
    error_ax.set_ylabel('Count')

    plt.tight_layout()
    plt.show()

# Usage example (classification)
# The run_dl_* helpers return only the training History, so the trained model
# is recovered from it: Keras callbacks (History included) keep a reference to
# the model they were attached to in their `model` attribute.
dl_clf_model = dl_clf_history.model
dl_visualization(dl_clf_history, y_test, np.argmax(dl_clf_model.predict(X_test_scaled), axis=1), "DL Classification")

# Usage example (regression)
dl_reg_model = dl_reg_history.model
dl_visualization(dl_reg_history, y_test, dl_reg_model.predict(X_test_scaled).flatten(), "DL Regression")
