# 모델 최적화 성능 분석 함수

In [1]:
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

def compare_models(X_train, X_test, y_train, y_test):
    """
    여러 모델을 비교하고 최적의 하이퍼파라미터를 찾는 함수

    Parameters:
    X_train, X_test: 학습 및 테스트 특성 데이터
    y_train, y_test: 학습 및 테스트 레이블 데이터

    Returns:
    best_model: 최적의 모델
    results_df: 모델 비교 결과
    """

    # 1. 모델과 하이퍼파라미터 정의
    models = {
        'Logistic Regression': {
            'model': LogisticRegression(),
            'params': {
                'C': [0.001, 0.01, 0.1, 1, 10],
                'max_iter': [1000]
            }
        },
        'Decision Tree': {
            'model': DecisionTreeClassifier(),
            'params': {
                'max_depth': [3, 5, 7, 10],
                'min_samples_split': [2, 5, 10]
            }
        },
        'Random Forest': {
            'model': RandomForestClassifier(),
            'params': {
                'n_estimators': [100, 200],
                'max_depth': [3, 5, 7],
                'min_samples_split': [2, 5]
            }
        },
        'SVM': {
            'model': SVC(),
            'params': {
                'C': [0.1, 1, 10],
                'kernel': ['rbf', 'linear']
            }
        },
        'KNN': {
            'model': KNeighborsClassifier(),
            'params': {
                'n_neighbors': [3, 5, 7, 9],
                'weights': ['uniform', 'distance']
            }
        }
    }

    # 2. 각 모델별로 GridSearchCV 수행
    results = []
    best_score = 0
    best_model = None

    for name, model_info in models.items():
        grid_search = GridSearchCV(
            model_info['model'],
            model_info['params'],
            cv=5,
            scoring='accuracy',
            n_jobs=-1
        )

        # 모델 학습
        grid_search.fit(X_train, y_train)

        # 테스트 세트로 성능 평가
        y_pred = grid_search.predict(X_test)
        test_score = accuracy_score(y_test, y_pred)

        # 결과 저장
        results.append({
            'Model': name,
            'Best Parameters': grid_search.best_params_,
            'Cross-val Score': grid_search.best_score_,
            'Test Score': test_score
        })

        # 최고 성능 모델 저장
        if test_score > best_score:
            best_score = test_score
            best_model = grid_search.best_estimator_

    # 결과를 데이터프레임으로 변환
    results_df = pd.DataFrame(results)

    return best_model, results_df

# 사용 예시

# 데이터 준비
df = pd.read_csv('https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv')
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df['Family'] = df['SibSp'] + df['Parch']
from sklearn.preprocessing import LabelEncoder
df['Sex'] = LabelEncoder().fit_transform(df['Sex'])
df['Embarked'] = LabelEncoder().fit_transform(df['Embarked'])
X = df[['Pclass','Fare','Sex','Age','Embarked','Family']]
y = df['Survived']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 모델 비교 실행
best_model, results = compare_models(X_train, X_test, y_train, y_test)

# 결과 출력
print("\n모델 비교 결과:")
print(results)

print("\n최적 모델 성능 보고서:")
print(best_model)
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))


모델 비교 결과:
                 Model                                    Best Parameters  \
0  Logistic Regression                       {'C': 0.1, 'max_iter': 1000}   
1        Decision Tree           {'max_depth': 3, 'min_samples_split': 2}   
2        Random Forest  {'max_depth': 5, 'min_samples_split': 5, 'n_es...   
3                  SVM                     {'C': 0.1, 'kernel': 'linear'}   
4                  KNN          {'n_neighbors': 3, 'weights': 'distance'}   

   Cross-val Score  Test Score  
0         0.799114    0.815642  
1         0.821619    0.798883  
2         0.835625    0.804469  
3         0.787876    0.782123  
4         0.712085    0.692737  

최적 모델 성능 보고서:
LogisticRegression(C=0.1, max_iter=1000)
              precision    recall  f1-score   support

           0       0.82      0.88      0.85       105
           1       0.81      0.73      0.77        74

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.81       179
