## Task1_0725. 타이타닉 생존자 예측 데이터 세트 train.csv에 대하여 다음 사항을 수행하세요.
- 일괄 전처리 사용자 함수 transform_features(df) 작성
- 분류 모델 학습 및 평가 사용자 함수 작성
- dt, lr, rf 모델링 및 평가(roc auc 포함)
  
==========================================================
- GridSearchCV의 최적 하이퍼 파라미터로 학습된 Estimator로 예측 및 평가 수행.
  - Decision Tree, Random Forest, Logistic Regression 모델별 수행
  - 선택한 모델에 적합한 parameter grid 적용
  - cv=5 적용

In [None]:
import warnings
import pandas as pd

# Ignore FutureWarning messages.
warnings.simplefilter(action="ignore", category=FutureWarning)
# Then ignore every warning category; this filter has no category restriction,
# so it also covers the FutureWarning case configured on the line above.
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
import numpy as np

# Absolute path to the Titanic training CSV (machine-specific location).
file_path = r"D:\kdt_240424\workspace\M5_ML\data\train.csv"
# Load the raw data; this module-level `df` is later passed to transform_features.
df = pd.read_csv(file_path)

### 일괄 전처리 사용자 함수 transform_features(df) 작성


In [None]:
# 일괄 전처리 사용자 함수 transform_features(df) 작성
from sklearn.model_selection import train_test_split


def transform_features(df):
    """Engineer Titanic features and return a 70/30 train/test split.

    The caller's frame is left untouched: every step works on a new frame, so
    the function can safely be called more than once on the same input.

    Args:
        df: Raw Titanic training frame. Must contain ``PassengerId``,
            ``Ticket``, ``Cabin``, ``Name``, ``Age``, ``Embarked``, ``Fare``,
            ``SibSp``, ``Parch``, ``Sex`` and ``Survived``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as produced by
        ``train_test_split(test_size=0.3, random_state=42)``, where ``y`` is
        the ``Survived`` column and ``X`` is every other engineered column.
    """

    def get_title(name):
        """Map a passenger name to a coarse honorific bucket."""
        for marker, title in (("Mr.", "Mr"), ("Miss.", "Miss"), ("Mrs.", "Mrs")):
            if marker in name:
                return title
        return "Other"

    # Non-in-place drop returns a new frame, so the caller's data is not mutated.
    df = df.drop(columns=["PassengerId", "Ticket", "Cabin"])

    # Title column, plus missing-value imputation. The filled columns are
    # assigned back explicitly instead of relying on chained in-place fillna.
    df["Title"] = df["Name"].apply(get_title)
    df["Age"] = df["Age"].fillna(df["Age"].mean())
    df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])

    age_bins = [0, 5, 12, 18, 27, 33, 60, 100]
    # Label text is kept verbatim because it becomes part of the dummy column names.
    age_labels = ["Infant", "Child", "Teenager", "y y adult", " Young Adult", "Adul", "Senior"]
    df["Age_group"] = pd.cut(df["Age"], bins=age_bins, labels=age_labels, include_lowest=True)

    fare_bins = [0, 30, 100, 600]
    fare_labels = ["Low", "Medium", "High"]
    # include_lowest=True keeps a fare of exactly 0 inside the first bin; with the
    # default right-closed bins it would fall outside (0, 30] and become NaN.
    df["Fare_group"] = pd.cut(df["Fare"], bins=fare_bins, labels=fare_labels, include_lowest=True)

    # Family-based and title-based indicator features.
    df["Family_size"] = df["SibSp"] + df["Parch"] + 1
    df["family_male"] = ((df["Family_size"] > 6) & (df["Sex"] == "male")).astype(int)
    df["mr_male"] = (df["Title"] == "Mr").astype(int)
    df["others"] = (df["Title"] == "Other").astype(int)
    df["family_female"] = ((df["Family_size"] > 3) & (df["Sex"] == "female")).astype(int)

    # Raw columns that were only needed to derive the features above.
    df = df.drop(columns=["Age", "Fare", "SibSp", "Parch", "Name", "Title"])

    # One-hot encode the categorical columns.
    df = pd.get_dummies(df, columns=["Age_group", "Fare_group", "Sex", "Embarked"])

    X = df.drop("Survived", axis=1)
    y = df["Survived"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    return X_train, X_test, y_train, y_test

In [None]:
# Run the preprocessing function; it returns the train/test split used by the cells below.
X_train, X_test, y_train, y_test = transform_features(df)

### 분류 모델 학습 및 평가 사용자 함수 작성


In [None]:
# 분류 모델 학습 및 평가 사용자 함수 작성

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score

# 사용자 평가 함수 정의
def get_clf_eval(y_test, pred, pred_proba=None):
    """Print the confusion matrix and the standard binary-classification metrics.

    Args:
        y_test: True labels.
        pred: Predicted class labels.
        pred_proba: Positive-class scores used for ROC AUC. When omitted (or
            given as a bare number, such as the former default ``0``) ROC AUC
            is skipped instead of raising inside ``roc_auc_score``.

    Returns:
        None. The results are printed.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    print("오차 행렬")
    print(confusion)

    summary = (
        f"평가 함수 결과 :\n정확도 : {accuracy:.4f}, 정밀도 : {precision:.4f}, "
        f"재현율 : {recall:.4f}, F1 : {f1:.4f}"
    )
    # A scalar placeholder carries no per-sample scores, so ROC AUC cannot be computed.
    has_scores = pred_proba is not None and not isinstance(pred_proba, (int, float))
    if has_scores:
        roc_auc = roc_auc_score(y_test, pred_proba)
        summary += f", ROC AUC : {roc_auc:.4f}"
    print(summary)


# Train the baseline classifiers.
# Decision tree (gini, depth 3).
# NOTE(review): the output recorded under this cell has a confusion matrix
# totalling 6003 rows, while the other Titanic cells total 268 — it looks like
# stale output from a different dataset; re-run to confirm.

dt_clf = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=42)
dt_clf.fit(X_train, y_train)
pred = dt_clf.predict(X_test)
# Column 1 of predict_proba is the positive-class probability used for ROC AUC.
pred_proba = dt_clf.predict_proba(X_test)[:,1]
get_clf_eval(y_test, pred, pred_proba)

오차 행렬
[[4328  211]
 [ 737  727]]
평가 함수 결과 :
정확도 : 0.8421, 정밀도 : 0.7751, 재현율 : 0.4966, F1 : 0.6053, ROC AUC : 0.8449


In [None]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-nearest-neighbours classifier with k=7, scored with get_clf_eval.
knn_clf = KNeighborsClassifier(n_neighbors=7)
knn_clf.fit(X_train, y_train)
pred = knn_clf.predict(X_test)
# Column 1 of predict_proba is the positive-class probability used for ROC AUC.
pred_proba = knn_clf.predict_proba(X_test)[:,1]
get_clf_eval(y_test, pred, pred_proba)

오차 행렬
[[136  21]
 [ 32  79]]
평가 함수 결과 :
정확도 : 0.8022, 정밀도 : 0.7900, 재현율 : 0.7117, F1 : 0.7488, ROC AUC : 0.8736


In [None]:
# SVM

from sklearn.svm import SVC

# Baseline linear SVM. probability=True is required for predict_proba; without
# it SVC does not expose probability estimates and the call below fails.
svm_clf = SVC(kernel='linear', C=1.0, probability=True, random_state=42)
svm_clf.fit(X_train, y_train)
pred = svm_clf.predict(X_test)
# Column 1 of predict_proba is the positive-class probability used for ROC AUC.
pred_proba = svm_clf.predict_proba(X_test)[:, 1]
get_clf_eval(y_test, pred, pred_proba)

오차 행렬
[[139  18]
 [ 34  77]]
평가 함수 결과 :
정확도 : 0.8060, 정밀도 : 0.8105, 재현율 : 0.6937, F1 : 0.7476, ROC AUC : 0.8859


In [None]:
# random forest

from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with 100 trees.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
# Fit on the training split only. Fitting on X_test / y_test scores the model
# on rows it has already seen and inflates every metric.
rf_clf.fit(X_train, y_train)
pred = rf_clf.predict(X_test)
# Column 1 of predict_proba is the positive-class probability used for ROC AUC.
pred_proba = rf_clf.predict_proba(X_test)[:, 1]
get_clf_eval(y_test, pred, pred_proba)

오차 행렬
[[148   9]
 [ 10 101]]
평가 함수 결과 :
정확도 : 0.9291, 정밀도 : 0.9182, 재현율 : 0.9099, F1 : 0.9140, ROC AUC : 0.9678


In [None]:
# logistic regression

from sklearn.linear_model import LogisticRegression

# Baseline logistic regression.
lr_clf = LogisticRegression(max_iter=500, solver='lbfgs', random_state=42)
# Fit on the training split only; the test split must stay unseen until evaluation.
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)
# Column 1 of predict_proba is the positive-class probability used for ROC AUC.
pred_proba = lr_clf.predict_proba(X_test)[:, 1]
get_clf_eval(y_test, pred, pred_proba)

오차 행렬
[[136  21]
 [ 26  85]]
평가 함수 결과 :
정확도 : 0.8246, 정밀도 : 0.8019, 재현율 : 0.7658, F1 : 0.7834, ROC AUC : 0.8914


## GridSearchCV의 최적 하이퍼 파라미터로 학습된 Estimator로 예측 및 평가 수행.

### Decision Tree


In [None]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Search space for the decision tree: 2 * 3 * 3 * 3 * 3 * 3 = 486 candidates.
param_grids = {
    "criterion": ["gini", "entropy"],
    "max_depth": [3, 5, 7],
    "min_samples_split": [30, 50, 70],
    "min_samples_leaf": [3, 5, 10],
    "max_features": [3, 5, 10],
    "max_leaf_nodes": [3, 5, 10],
}

# 5-fold cross-validated grid search on the training split, using all CPU cores.
dt_clf = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(dt_clf, param_grid=param_grids, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
print(f"best parameters found : {grid_search.best_params_}")

# Predict on the held-out test split with the best estimator found.
best_dt = grid_search.best_estimator_
pred = best_dt.predict(X_test)
# Column 1 of predict_proba is the positive-class probability used for ROC AUC.
pred_proba = best_dt.predict_proba(X_test)[:,1]
get_clf_eval(y_test, pred, pred_proba)

Fitting 5 folds for each of 486 candidates, totalling 2430 fits
best parameters found : {'criterion': 'entropy', 'max_depth': 5, 'max_features': 10, 'max_leaf_nodes': 10, 'min_samples_leaf': 10, 'min_samples_split': 30}
오차 행렬
[[144  13]
 [ 39  72]]
평가 함수 결과 :
정확도 : 0.8060, 정밀도 : 0.8471, 재현율 : 0.6486, F1 : 0.7347, ROC AUC : 0.8888


### Random Forest

In [None]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Search space for the random forest: 3 * 3 * 3 * 3 * 5 = 405 candidates.
param_grids = {
    "n_estimators": [300, 400, 500],
    "max_depth": [3, 5, 7],
    "min_samples_split": [2, 3, 5],
    "min_samples_leaf": [1, 2, 3],
    "max_features": [10, 15, 20, "sqrt", "log2"],
}

rf_clf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_clf, param_grid=param_grids, cv=5, n_jobs=-1, verbose=2)
# Tune on the training split only. Searching on X_test / y_test leaks the
# evaluation data into model selection and makes the reported scores meaningless.
grid_search.fit(X_train, y_train)
print(f"best parameters found : {grid_search.best_params_}")

# Predict on the held-out test split with the best estimator found.
best_rf = grid_search.best_estimator_
pred = best_rf.predict(X_test)
pred_proba = best_rf.predict_proba(X_test)[:, 1]
get_clf_eval(y_test, pred, pred_proba)

Fitting 5 folds for each of 405 candidates, totalling 2025 fits
best parameters found : {'max_depth': 7, 'max_features': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
오차 행렬
[[144  13]
 [ 10 101]]
평가 함수 결과 :
정확도 : 0.9142, 정밀도 : 0.8860, 재현율 : 0.9099, F1 : 0.8978, ROC AUC : 0.9624


In [None]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Second random-forest search with smaller forests and one deeper max_depth:
# 3 * 4 * 3 * 3 * 5 = 540 candidates.
param_grids = {
    "n_estimators": [50, 100, 200],
    "max_depth": [3, 5, 7, 9],
    "min_samples_split": [2, 3, 4],
    "min_samples_leaf": [1, 2, 3],
    "max_features": [10, 15, 20, "sqrt", "log2"],
}

rf_clf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_clf, param_grid=param_grids, cv=5, n_jobs=-1, verbose=2)
# Tune on the training split only; the test split is reserved for the final evaluation.
grid_search.fit(X_train, y_train)
print(f"best parameters found : {grid_search.best_params_}")

# Predict on the held-out test split with the best estimator found.
best_rf = grid_search.best_estimator_
pred = best_rf.predict(X_test)
pred_proba = best_rf.predict_proba(X_test)[:, 1]
get_clf_eval(y_test, pred, pred_proba)

Fitting 5 folds for each of 540 candidates, totalling 2700 fits
best parameters found : {'max_depth': 7, 'max_features': 15, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 100}
오차 행렬
[[144  13]
 [ 12  99]]
평가 함수 결과 :
정확도 : 0.9067, 정밀도 : 0.8839, 재현율 : 0.8919, F1 : 0.8879, ROC AUC : 0.9541


### Logistic Regression

In [None]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Search space for logistic regression. Solvers only support certain penalties,
# so the grid is a list of sub-grids holding valid combinations only:
#   lbfgs / newton-cg -> l2
#   liblinear         -> l1, l2
#   saga              -> elasticnet (needs l1_ratio)
# A single cross-product of every solver with every penalty makes many fits fail.
max_iter_options = [3, 5, 10, 20, 30, 50, 100, 300, 500]
c_options = [3, 5, 7, 9]
param_grids = [
    {
        "solver": ["lbfgs", "newton-cg"],
        "penalty": ["l2"],
        "C": c_options,
        "max_iter": max_iter_options,
    },
    {
        "solver": ["liblinear"],
        "penalty": ["l1", "l2"],
        "C": c_options,
        "max_iter": max_iter_options,
    },
    {
        "solver": ["saga"],
        "penalty": ["elasticnet"],
        "l1_ratio": [0.5],
        "C": c_options,
        "max_iter": max_iter_options,
    },
]

lr_clr = LogisticRegression(random_state=42)
grid_search = GridSearchCV(lr_clr, param_grid=param_grids, cv=5, n_jobs=-1, verbose=2)
# Tune on the training split only; the test split is reserved for the final evaluation.
grid_search.fit(X_train, y_train)
print(f"best parameters found : {grid_search.best_params_}")

# Predict on the held-out test split with the best estimator found.
best_lr = grid_search.best_estimator_
pred = best_lr.predict(X_test)
pred_proba = best_lr.predict_proba(X_test)[:, 1]
get_clf_eval(y_test, pred, pred_proba)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
best parameters found : {'C': 5, 'max_iter': 5, 'penalty': 'l2', 'solver': 'liblinear'}
오차 행렬
[[137  20]
 [ 25  86]]
평가 함수 결과 :
정확도 : 0.8321, 정밀도 : 0.8113, 재현율 : 0.7748, F1 : 0.7926, ROC AUC : 0.8946


## ex


In [None]:
from sklearn.preprocessing import LabelEncoder


# Null 처리 함수
def fillna(df):
    """Fill missing values in the Titanic columns and return the same frame.

    ``Age`` gets the column mean, ``Cabin`` and ``Embarked`` get the placeholder
    ``"N"``, and ``Fare`` gets ``0``. The frame is modified in place.

    Args:
        df: Frame containing ``Age``, ``Cabin``, ``Embarked`` and ``Fare``.

    Returns:
        The same ``df`` object, with those four columns free of NaN.
    """
    # Assign the filled columns back explicitly: chained
    # ``df[col].fillna(..., inplace=True)`` acts on an intermediate object and
    # no longer updates ``df`` once pandas copy-on-write is active.
    df["Age"] = df["Age"].fillna(df["Age"].mean())
    df["Cabin"] = df["Cabin"].fillna("N")
    df["Embarked"] = df["Embarked"].fillna("N")
    df["Fare"] = df["Fare"].fillna(0)
    return df


# 머신러닝 알고리즘에 불필요한 속성 제거
def drop_features(df):
    """Remove the columns that are not fed to the models and return the same frame.

    ``PassengerId``, ``Name`` and ``Ticket`` are dropped in place.
    """
    unused_columns = ["PassengerId", "Name", "Ticket"]
    df.drop(columns=unused_columns, inplace=True)
    return df


# 레이블 인코딩 수행.
def format_features(df):
    """Label-encode ``Cabin``, ``Sex`` and ``Embarked`` in place and return the frame.

    ``Cabin`` is first reduced to its leading character, then each of the three
    columns is replaced by the integer codes from its own ``LabelEncoder``.
    """
    df["Cabin"] = df["Cabin"].str.slice(stop=1)
    for column in ("Cabin", "Sex", "Embarked"):
        # A fresh encoder per column, fitted and applied in one step.
        df[column] = LabelEncoder().fit_transform(df[column])
    return df


# 앞에서 설정한 Data Preprocessing 함수 호출
def transform_features(df):
    """Run the preprocessing steps in order: fillna, drop_features, format_features."""
    return format_features(drop_features(fillna(df)))

In [None]:
# Reload the raw data, then split it into the feature set and the label.
# The reload is needed because `titanic_df` is not assigned anywhere before this
# cell, and the earlier `df` was already consumed by the first preprocessing run.
titanic_df = pd.read_csv(file_path)

y_titanic_df = titanic_df["Survived"]
X_titanic_df = titanic_df.drop("Survived", axis=1)

X_titanic_df = transform_features(X_titanic_df)

In [None]:
from sklearn.model_selection import train_test_split

# Earlier unstratified variant, kept for reference:
# X_train, X_test, y_train, y_test=train_test_split(X_titanic_df, y_titanic_df, test_size=0.2, random_state=11)
# Current variant: 80/20 split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df, y_titanic_df, test_size=0.2, random_state=11, stratify=y_titanic_df
)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Create the scikit-learn classifiers: decision tree, random forest, logistic regression.
# NOTE(review): train_and_evaluate is not defined in the visible part of this
# file — confirm it is defined elsewhere before running this cell.
dt_clf = DecisionTreeClassifier(random_state=10)
rf_clf = RandomForestClassifier(random_state=10)
lr_clf = LogisticRegression(max_iter=2000, random_state=10)
print("dt_clf 학습")
print("=" * 12)
train_and_evaluate(dt_clf, X_train, X_test, y_train, y_test)
print("rf_clf 학습")
print("=" * 12)
train_and_evaluate(rf_clf, X_train, X_test, y_train, y_test)
print("lr_clf 학습")
print("=" * 12)
train_and_evaluate(lr_clf, X_train, X_test, y_train, y_test)

In [None]:
from sklearn.model_selection import GridSearchCV

# Decision-tree search space: 5 * 3 * 4 = 60 candidates.
parameters = {
    "max_depth": [2, 3, 5, 10, 12],
    "min_samples_split": [2, 3, 5],
    "min_samples_leaf": [1, 5, 8, 10],
}

# 5-fold grid search scored on accuracy.
grid_dclf = GridSearchCV(dt_clf, param_grid=parameters, scoring="accuracy", cv=5)
grid_dclf.fit(X_train, y_train)

print("GridSearchCV 최적 하이퍼 파라미터 :", grid_dclf.best_params_)
print("GridSearchCV 최고 정확도: {0:.4f}".format(grid_dclf.best_score_))
best_dclf = grid_dclf.best_estimator_

# NOTE(review): train_and_evaluate is not defined in the visible part of this file — confirm.
train_and_evaluate(best_dclf, X_train, X_test, y_train, y_test)

In [None]:
from sklearn.model_selection import GridSearchCV

# Logistic-regression search over the regularisation parameter C only.
param_grid = {"C": [0.1, 1, 10, 50, 100]}
grid_lrclf = GridSearchCV(lr_clf, param_grid=param_grid, cv=5, verbose=0)
grid_lrclf.fit(X_train, y_train)

print("GridSearchCV 최적 하이퍼 파라미터 :", grid_lrclf.best_params_)
print("GridSearchCV 최고 정확도: {0:.4f}".format(grid_lrclf.best_score_))
best_lrclf = grid_lrclf.best_estimator_

# NOTE(review): train_and_evaluate is not defined in the visible part of this file — confirm.
train_and_evaluate(best_lrclf, X_train, X_test, y_train, y_test)

Task3_0725. 데이터셋 개선, 오늘 배운 모델 적용, 탐색적분석을 통한 파생변수 적용하고 설명

In [None]:
# 데이터셋 개선
import pandas as pd

data = pd.read_csv(r"D:\kdt_240424\workspace\M5_ML\data\adult_incomes.csv")
# Drop rows that contain any missing value.
data = data.dropna()
# Outlier removal: drop the rows at the top of the capital-gain range (>= 99990).
# .copy() makes the filtered result an independent frame, so the column added
# below is written to `data` itself rather than to a slice of the old frame.
data = data[data["capital-gain"] < 99990].copy()

# Derived feature: net capital change (gain minus loss).
data["capital_diff"] = data["capital-gain"] - data["capital-loss"]

In [None]:
# Bucket age into four ordered categories.
ages = data.age.values
category = ["teenager", "young adult", "adult", "elderly"]
# Bins are right-closed: (17, 28], (28, 37], (37, 47], (47, 90]. include_lowest=True
# also closes the first bin on the left, so an age of exactly 17 lands in the
# first category instead of becoming NaN.
data["age_cat"] = pd.cut(ages, bins=[17, 28, 37, 47, 90], labels=category, include_lowest=True)

In [None]:
# 범주형 변수 인코딩
# Columns to one-hot encode. The target column "income" is included, so the
# encoding also produces the binary label column used in the next cell.
categorical_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
    "income",
    "age_cat",
]

# drop_first=True drops the first level of every encoded column.
data = pd.get_dummies(data, columns=categorical_features, drop_first=True)

In [None]:
# Separate the independent variables from the dependent variable.
X = data.drop("income_>50K", axis=1)
y = data["income_>50K"]

In [None]:
# 데이터 표준화
from sklearn.preprocessing import StandardScaler

# Standardise every feature column.
# NOTE(review): the scaler is fitted on the full X before the train/test split
# that happens later in the file, so test rows influence the fitted mean/std.
# Consider splitting first and fitting the scaler on the training portion only.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# 데이터셋 분리
from sklearn.model_selection import train_test_split

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Quick visual check of the test feature matrix.
print(X_test)

이전 결과가 가장 좋았던 랜덤 포레스트를 기준으로 하이퍼 파라미터 튜닝 진행

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with 100 trees, trained on the training split and
# evaluated on the test split with get_clf_eval.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)
pred = rf_clf.predict(X_test)
# Column 1 of predict_proba is the positive-class probability used for ROC AUC.
pred_proba = rf_clf.predict_proba(X_test)[:, 1]
get_clf_eval(y_test, pred, pred_proba)

오차 행렬
[[4218  321]
 [ 555  909]]
평가 함수 결과 :
정확도 : 0.8541, 정밀도 : 0.7390, 재현율 : 0.6209, F1 : 0.6748, ROC AUC : 0.9031


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Random-forest search space: 3 * 3 * 3 * 3 * 5 = 405 candidates.
param_grids = {
    "n_estimators": [
        300,
        400,
        500,
    ],
    "max_depth": [3, 5, 7],
    "min_samples_split": [2, 3, 5],
    "min_samples_leaf": [1, 2, 3],
    "max_features": [10, 15, 20, "sqrt", "log2"],
}

# 5-fold cross-validated grid search on the training split, using all CPU cores.
rf_clf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_clf, param_grid=param_grids, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
print(f"best parameters found : {grid_search.best_params_}")

# Predict on the held-out test split with the best estimator found.
best_rf = grid_search.best_estimator_
pred = best_rf.predict(X_test)
pred_proba = best_rf.predict_proba(X_test)[:, 1]
get_clf_eval(y_test, pred, pred_proba)

Task1_0725. 타이타닉 생존자 예측 데이터 세트 train.csv에 대하여 다음 사항을 수행하세요.
- 일괄 전처리 사용자 함수 transform_features(df) 작성
- 분류 모델 학습 및 평가 사용자 함수 작성
- dt, lr, rf 모델링 및 평가(정확도)

- GridSearchCV의 최적 하이퍼 파라미터로 학습된 Estimator로 예측 및 평가 수행.
  - Decision Tree, Random Forest, Logistic Regression 모델별 수행
  - 선택한 모델에 적합한 parameter grid 적용
  - cv=5 적용

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

def categorize_age(age):
    """Return the age-band label for ``age``.

    Bands: below 13 is 'Child', below 20 is 'Teenager', below 60 is 'Adult',
    and anything else (including values that compare False to every bound,
    such as NaN) is 'Senior'.
    """
    bands = ((13, 'Child'), (20, 'Teenager'), (60, 'Adult'))
    for upper_bound, label in bands:
        if age < upper_bound:
            return label
    return 'Senior'

# 일괄 전처리 사용자 함수 transform_features(df)
def transform_features(df):
    """Clean the raw Titanic frame and return a one-hot encoded feature frame.

    Steps, in order: drop Fare outliers (1.5 * IQR fences), impute missing
    values, add derived features, one-hot encode the categorical columns and
    drop the identifier/text columns. The input frame is not modified.

    Args:
        df: Raw Titanic frame with ``Fare``, ``Age``, ``Embarked``, ``SibSp``,
            ``Parch``, ``Pclass``, ``Ticket``, ``Name``, ``PassengerId`` and
            ``Cabin`` columns.

    Returns:
        A new DataFrame. Columns not named below (for example ``Survived``)
        are passed through unchanged.
    """
    # Outlier handling: remove rows whose Fare lies outside the IQR fences.
    Q1 = df['Fare'].quantile(0.25)
    Q3 = df['Fare'].quantile(0.75)
    IQR = Q3 - Q1
    fare_outliers = df[(df['Fare'] < (Q1 - 1.5 * IQR)) | (df['Fare'] > (Q3 + 1.5 * IQR))]
    df = df.drop(fare_outliers.index)

    # Missing values: most frequent value for Age and Fare, 'S' for Embarked.
    # ravel() flattens the (n, 1) imputer output to a plain 1-D column.
    imputer_most_frequent = SimpleImputer(strategy='most_frequent')
    df['Age'] = imputer_most_frequent.fit_transform(df[['Age']]).ravel()
    df['Fare'] = imputer_most_frequent.fit_transform(df[['Fare']]).ravel()
    df['Embarked'] = df['Embarked'].fillna('S')

    # Derived features.
    df['Family_size'] = df['SibSp'] + df['Parch']
    df['AgeGroup'] = df['Age'].apply(categorize_age)
    df['Pclass_Fare'] = df['Pclass'] * df['Fare']
    df['TicketCount'] = df.groupby('Ticket')['Ticket'].transform('count')

    # Raw string so that `\.` reaches the regex engine as an escaped dot instead
    # of being an invalid Python string escape.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    rare_titles = ['Don', 'Rev', 'Dr', 'Ms', 'Major', 'Lady', 'Sir', 'Col', 'Mlle', 'Jonkheer']
    df['Title'] = df['Title'].replace(rare_titles, 'Rare')

    # Original + derived variables. 'Title' must be encoded as well: left as raw
    # strings it makes the StandardScaler step that follows fail. To reproduce
    # the "no Title" experiment, drop the 'Title' column instead of encoding it.
    df = pd.get_dummies(
        df,
        columns=['Embarked', 'Sex', 'SibSp', 'Parch', 'Family_size', 'AgeGroup',
                 'TicketCount', 'Ticket', 'Title'],
    )
    df = df.drop(columns=['PassengerId', 'Name', 'Cabin'])

    # Derived-variables-only alternative:
    # df = pd.get_dummies(df, columns=['Embarked', 'Family_size', 'AgeGroup', 'TicketCount', 'Sex'])
    # df.drop(columns=['PassengerId', 'Name', 'Cabin', 'SibSp', 'Parch', 'Age', 'Pclass', 'Ticket', 'Fare' ], inplace=True)

    return df

# Load the data.
df = pd.read_csv('/content/drive/MyDrive/KDT_2404/dataset/train.csv')

df = transform_features(df)

# Select the variables and separate features from the label.
X = df.drop(columns=['Survived'])
y = df['Survived']
# X and y were taken above, so this only removes the label from `df` itself.
df.drop(columns=['Survived'], inplace=True)

# 8. Split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# 7. Standardise: fit the scaler on the training set only, then apply it to the test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Models and their hyper-parameter grids, as name -> (estimator, param_grid).
# Every estimator is seeded so repeated runs give the same scores; the result
# log below is organised by seed, which only makes sense if runs are repeatable.
models = {
    'Logistic Regression': (LogisticRegression(max_iter=1000, random_state=42), {
        'C': [0.1, 1, 10, 100],
        'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    }),
    'Decision Tree': (DecisionTreeClassifier(random_state=42), {
        'max_depth': [None, 10, 20, 30, 40],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }),
    'Random Forest': (RandomForestClassifier(random_state=42), {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }),
}

# Per-model results, keyed by model name.
results = {}

# Hyper-parameter tuning and model training.
for model_name, (model, params) in models.items():
    # 5-fold grid search on the training split, scored on accuracy.
    grid_search = GridSearchCV(model, params, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    # Evaluate the best estimator on the held-out test split.
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    # ROC AUC uses the positive-class probability (column 1).
    roc_auc = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])
    results[model_name] = {
        'Best Parameters': grid_search.best_params_,
        'Accuracy': accuracy,
        'ROC AUC': roc_auc
    }

# Print one summary line per model.
for model_name, result in results.items():
    print(f'{model_name} - Best Parameters: {result["Best Parameters"]}, Accuracy: {result["Accuracy"]}, ROC AUC: {result["ROC AUC"]}')

# 원본+파생 결과 (random_state=60)
# Logistic Regression - Best Parameters: {'C': 1, 'solver': 'newton-cg'}, Accuracy: 0.8774193548387097, ROC AUC: 0.8546666666666667
# Decision Tree - Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5}, Accuracy: 0.864516129032258, ROC AUC: 0.7828571428571429
# Random Forest - Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}, Accuracy: 0.8580645161290322, ROC AUC: 0.8727619047619047
# Title 파생변수 없을 때 결과 (random_state=12)
# Logistic Regression - Best Parameters: {'C': 10, 'solver': 'newton-cg'}, Accuracy: 0.8580645161290322, ROC AUC: 0.8845801246791346
# Decision Tree - Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5}, Accuracy: 0.8516129032258064, ROC AUC: 0.8221488815548222
# Random Forest - Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}, Accuracy: 0.8451612903225807, ROC AUC: 0.888980564723139

# Title파생변수 추가 결과 (random_state=60)
# Logistic Regression - Best Parameters: {'C': 0.1, 'solver': 'newton-cg'}, Accuracy: 0.8774193548387097, ROC AUC: 0.8607619047619047
# Decision Tree - Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}, Accuracy: 0.8516129032258064, ROC AUC: 0.7456190476190476
# Random Forest - Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}, Accuracy: 0.8709677419354839, ROC AUC: 0.8708571428571429
# random_state=12
# Logistic Regression - Best Parameters: {'C': 1, 'solver': 'liblinear'}, Accuracy: 0.8580645161290322, ROC AUC: 0.8933810047671433
# Decision Tree - Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5}, Accuracy: 0.864516129032258, ROC AUC: 0.8344334433443344
# Random Forest - Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}, Accuracy: 0.8580645161290322, ROC AUC: 0.8716538320498717


# 파생만 사용했을 때 결과 (random_state=60)
# Logistic Regression - Best Parameters: {'C': 0.1, 'solver': 'liblinear'}, Accuracy: 0.864516129032258, ROC AUC: 0.8327619047619047
# Decision Tree - Best Parameters: {'max_depth': 40, 'min_samples_leaf': 1, 'min_samples_split': 10}, Accuracy: 0.8516129032258064, ROC AUC: 0.8338095238095238
# Random Forest - Best Parameters: {'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}, Accuracy: 0.864516129032258, ROC AUC: 0.8586666666666667

# 모델 학습 및 평가

# 가장 베스트 값으로 하이퍼파라메터 튜닝
# models = {
#   'Logistic Regression': LogisticRegression(C= 0.1, solver= 'saga'),
#   'Decision Tree': DecisionTreeClassifier(max_depth= 10, min_samples_leaf= 4, min_samples_split= 5),
#   'Random Forest': RandomForestClassifier(max_depth= 10, min_samples_leaf= 4, min_samples_split= 5)
# }

# # 10. 모델 학습 및 평가
# for name, model in models.items():
#   model.fit(X_train, y_train)
#   y_pred = model.predict(X_test)
#   accuracy = accuracy_score(y_test, y_pred)
#   conf_matrix = confusion_matrix(y_test, y_pred)
#   class_report = classification_report(y_test, y_pred)
#   roc_auc = roc_auc_score(y_test, y_pred)

#   print(f'Model: {name}')
#   print(f'Accuracy: {accuracy:.4f}')
#   print('Confusion Matrix:')
#   print(conf_matrix)
#   print('Classification Report:')
#   print(class_report)
#   print(f'ROC AUC: {roc_auc:.4f}')
#   print('\n' + '='*60 + '\n')

Logistic Regression - Best Parameters: {'C': 10, 'solver': 'newton-cg'}, Accuracy: 0.8580645161290322, ROC AUC: 0.8845801246791346
Decision Tree - Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5}, Accuracy: 0.8516129032258064, ROC AUC: 0.8221488815548222
Random Forest - Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}, Accuracy: 0.8451612903225807, ROC AUC: 0.888980564723139


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

def categorize_age(age):
    """Map a numeric age to 'Child' (<13), 'Teenager' (<20), 'Adult' (<60) or 'Senior'.

    Values for which every comparison is False (e.g. NaN) fall through to 'Senior'.
    """
    return (
        'Child' if age < 13
        else 'Teenager' if age < 20
        else 'Adult' if age < 60
        else 'Senior'
    )

# 일괄 전처리 사용자 함수 transform_features(df)
def transform_features(df):
    """Clean the raw Titanic frame and return a one-hot encoded feature frame.

    Steps, in order:
      1. Drop rows whose ``Fare`` lies outside the 1.5 * IQR fences.
      2. Impute ``Age`` and ``Fare`` with their most frequent value and fill
         missing ``Embarked`` with ``'S'``.
      3. Derive ``Family_size``, ``AgeGroup``, ``Pclass_Fare``,
         ``TicketCount`` and ``Title``.
      4. One-hot encode the categorical columns and drop ``PassengerId``,
         ``Name`` and ``Cabin``.

    Args:
        df: DataFrame providing the columns read below (``Fare``, ``Age``,
            ``Embarked``, ``SibSp``, ``Parch``, ``Pclass``, ``Ticket``,
            ``Name``, ``Sex``, ``PassengerId``, ``Cabin``).

    Returns:
        A new DataFrame. ``DataFrame.drop`` in step 1 returns a new object,
        so the column assignments that follow operate on that new frame.
    """
    # Outlier handling: remove rows whose Fare falls outside the IQR fences.
    q1 = df['Fare'].quantile(0.25)
    q3 = df['Fare'].quantile(0.75)
    iqr = q3 - q1
    fare_outliers = df[(df['Fare'] < (q1 - 1.5 * iqr)) | (df['Fare'] > (q3 + 1.5 * iqr))]

    df = df.drop(fare_outliers.index)

    # Missing values: most frequent value for Age/Fare, 'S' for Embarked.
    imputer_most_frequent = SimpleImputer(strategy='most_frequent')
    df['Age'] = imputer_most_frequent.fit_transform(df[['Age']])
    df['Fare'] = imputer_most_frequent.fit_transform(df[['Fare']])
    df['Embarked'] = df['Embarked'].fillna('S')

    # Derived features.
    df['Family_size'] = df['SibSp'] + df['Parch']

    df['AgeGroup'] = df['Age'].apply(categorize_age)

    df['Pclass_Fare'] = df['Pclass'] * df['Fare']

    # Number of remaining (post-outlier-removal) rows sharing the same ticket.
    df['TicketCount'] = df.groupby('Ticket')['Ticket'].transform('count')

    # Raw string so that the escaped dot reaches the regex engine without
    # triggering Python's invalid-escape-sequence warning.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    rare_titles = ['Don', 'Rev', 'Dr', 'Ms', 'Major', 'Lady', 'Sir', 'Col', 'Mlle', 'Jonkheer']
    df['Title'] = df['Title'].replace(rare_titles, 'Rare')

    # Variant in use: keep both the original and the derived variables.
    df = pd.get_dummies(df, columns=['Embarked', 'Sex', 'SibSp', 'Parch', 'Family_size', 'AgeGroup', 'TicketCount', 'Ticket', 'Title'])
    df.drop(columns=['PassengerId', 'Name', 'Cabin'], inplace=True)

    # Alternative variant: use the derived variables only.
    # df = pd.get_dummies(df, columns=['Embarked', 'Family_size', 'AgeGroup', 'TicketCount', 'Sex'])
    # df.drop(columns=['PassengerId', 'Name', 'Cabin', 'SibSp', 'Parch', 'Age', 'Pclass', 'Ticket', 'Fare' ], inplace=True)

    return df

# Load the raw training data and run the shared preprocessing on it.
df = transform_features(pd.read_csv('/content/drive/MyDrive/KDT_2404/dataset/train.csv'))

# Separate the target from the features.
y = df['Survived']
X = df.drop(columns=['Survived'])
df.drop(columns=['Survived'], inplace=True)

# Split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

# Standardize: the scaler is fitted on the training split only and then
# applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Models configured with hyperparameters taken from earlier grid-search runs.
lr_clf = LogisticRegression(solver='newton-cg', C=0.1)
rf_clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=1,
)
dt_clf = DecisionTreeClassifier(
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=1,
)

# Fit and evaluate in the order: logistic regression, random forest,
# decision tree. Output is one Accuracy line and one ROC AUC line per model.
for estimator in (lr_clf, rf_clf, dt_clf):
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    # Probability of the positive class (second column) for ROC AUC.
    y_proba = estimator.predict_proba(X_test)[:, 1]

    print(f'Accuracy: {accuracy_score(y_test, y_pred):.4f}')
    print(f'ROC AUC: {roc_auc_score(y_test, y_proba):.4f}')

# Model: Decision Tree, Random State: 30, Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 10}, Accuracy: 0.896774193548387, ROC AUC: 0.8560163551401869
# Model: Logistic Regression, Random State: 37, Best Parameters: {'C': 0.1, 'solver': 'newton-cg'}, Accuracy: 0.8838709677419355, ROC AUC: 0.8819758672699849
# Model: Logistic Regression, Random State: 78, Best Parameters: {'C': 0.1, 'solver': 'newton-cg'}, Accuracy: 0.8903225806451613, ROC AUC: 0.9101008215085885

In [None]:
Model: Logistic Regression, Random State: 31, Best Parameters: {'C': 0.1, 'solver': 'newton-cg'}, Accuracy: 0.832258064516129, ROC AUC: 0.8842383107088989
Model: Decision Tree, Random State: 31, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5}, Accuracy: 0.8258064516129032, ROC AUC: 0.8329562594268477
Model: Random Forest, Random State: 31, Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}, Accuracy: 0.8, ROC AUC: 0.8522812971342383
Model: Logistic Regression, Random State: 32, Best Parameters: {'C': 1, 'solver': 'newton-cg'}, Accuracy: 0.8129032258064516, ROC AUC: 0.8295625942684767
Model: Decision Tree, Random State: 32, Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}, Accuracy: 0.7870967741935484, ROC AUC: 0.7658371040723981
Model: Random Forest, Random State: 32, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}, Accuracy: 0.8387096774193549, ROC AUC: 0.7540535444947211
Model: Logistic Regression, Random State: 33, Best Parameters: {'C': 1, 'solver': 'liblinear'}, Accuracy: 0.8387096774193549, ROC AUC: 0.8854285714285713
Model: Decision Tree, Random State: 33, Best Parameters: {'max_depth': 40, 'min_samples_leaf': 1, 'min_samples_split': 2}, Accuracy: 0.8387096774193549, ROC AUC: 0.7971428571428572
Model: Random Forest, Random State: 33, Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}, Accuracy: 0.8193548387096774, ROC AUC: 0.8532380952380952
Model: Logistic Regression, Random State: 34, Best Parameters: {'C': 1, 'solver': 'newton-cg'}, Accuracy: 0.864516129032258, ROC AUC: 0.8804153240243465
Model: Decision Tree, Random State: 34, Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}, Accuracy: 0.8516129032258064, ROC AUC: 0.8490870032223417
Model: Random Forest, Random State: 34, Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}, Accuracy: 0.8451612903225807, ROC AUC: 0.8709273182957394
Model: Logistic Regression, Random State: 35, Best Parameters: {'C': 1, 'solver': 'newton-cg'}, Accuracy: 0.8258064516129032, ROC AUC: 0.870716211012707
Model: Decision Tree, Random State: 35, Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2}, Accuracy: 0.8387096774193549, ROC AUC: 0.7841740469772814
Model: Random Forest, Random State: 35, Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}, Accuracy: 0.8129032258064516, ROC AUC: 0.8824605313823644
Model: Logistic Regression, Random State: 36, Best Parameters: {'C': 1, 'solver': 'newton-cg'}, Accuracy: 0.8258064516129032, ROC AUC: 0.8470847084708472
Model: Decision Tree, Random State: 36, Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5}, Accuracy: 0.8193548387096774, ROC AUC: 0.7871287128712872
Model: Random Forest, Random State: 36, Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}, Accuracy: 0.8064516129032258, ROC AUC: 0.8217821782178217
Model: Logistic Regression, Random State: 37, Best Parameters: {'C': 0.1, 'solver': 'newton-cg'}, Accuracy: 0.8838709677419355, ROC AUC: 0.8819758672699849
Model: Decision Tree, Random State: 37, Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 10}, Accuracy: 0.8451612903225807, ROC AUC: 0.827205882352941
Model: Random Forest, Random State: 37, Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}, Accuracy: 0.8709677419354839, ROC AUC: 0.8346530920060332
Model: Logistic Regression, Random State: 38, Best Parameters: {'C': 0.1, 'solver': 'newton-cg'}, Accuracy: 0.8064516129032258, ROC AUC: 0.851008215085885
Model: Decision Tree, Random State: 38, Best Parameters: {'max_depth': 40, 'min_samples_leaf': 1, 'min_samples_split': 2}, Accuracy: 0.8064516129032258, ROC AUC: 0.7686706497386109
Model: Random Forest, Random State: 38, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}, Accuracy: 0.7806451612903226, ROC AUC: 0.8258028379387602
Model: Logistic Regression, Random State: 39, Best Parameters: {'C': 0.1, 'solver': 'newton-cg'}, Accuracy: 0.8129032258064516, ROC AUC: 0.8600110011001101
Model: Decision Tree, Random State: 39, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5}, Accuracy: 0.8258064516129032, ROC AUC: 0.7969380271360469
Model: Random Forest, Random State: 39, Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}, Accuracy: 0.8258064516129032, ROC AUC: 0.8545104510451045
Model: Logistic Regression, Random State: 40, Best Parameters: {'C': 10, 'solver': 'liblinear'}, Accuracy: 0.8129032258064516, ROC AUC: 0.8218599033816425
Model: Decision Tree, Random State: 40, Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2}, Accuracy: 0.7806451612903226, ROC AUC: 0.7348171152518979
Model: Random Forest, Random State: 40, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}, Accuracy: 0.8064516129032258, ROC AUC: 0.8218599033816424
Model: Logistic Regression, Random State: 41, Best Parameters: {'C': 0.1, 'solver': 'liblinear'}, Accuracy: 0.8, ROC AUC: 0.8218085106382979
Model: Decision Tree, Random State: 41, Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2}, Accuracy: 0.7870967741935484, ROC AUC: 0.692080378250591
Model: Random Forest, Random State: 41, Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}, Accuracy: 0.7935483870967742, ROC AUC: 0.8127462568951931
Model: Logistic Regression, Random State: 42, Best Parameters: {'C': 0.1, 'solver': 'newton-cg'}, Accuracy: 0.8129032258064516, ROC AUC: 0.858421052631579
Model: Decision Tree, Random State: 42, Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5}, Accuracy: 0.7612903225806451, ROC AUC: 0.740877192982456
Model: Random Forest, Random State: 42, Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}, Accuracy: 0.8129032258064516, ROC AUC: 0.832280701754386
Model: Logistic Regression, Random State: 43, Best Parameters: {'C': 1, 'solver': 'newton-cg'}, Accuracy: 0.832258064516129, ROC AUC: 0.8400909090909092
Model: Decision Tree, Random State: 43, Best Parameters: {'max_depth': 40, 'min_samples_leaf': 1, 'min_samples_split': 5}, Accuracy: 0.8193548387096774, ROC AUC: 0.7972727272727274
Model: Random Forest, Random State: 43, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}, Accuracy: 0.832258064516129, ROC AUC: 0.8464545454545453
Model: Logistic Regression, Random State: 44, Best Parameters: {'C': 0.1, 'solver': 'newton-cg'}, Accuracy: 0.8774193548387097, ROC AUC: 0.8682033096926713
Model: Decision Tree, Random State: 44, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5}, Accuracy: 0.8258064516129032, ROC AUC: 0.7908786446020488
Model: Random Forest, Random State: 44, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}, Accuracy: 0.8580645161290322, ROC AUC: 0.8635736800630417
Model: Logistic Regression, Random State: 45, Best Parameters: {'C': 0.1, 'solver': 'newton-cg'}, Accuracy: 0.864516129032258, ROC AUC: 0.9183256309989335
Model: Decision Tree, Random State: 45, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10}, Accuracy: 0.864516129032258, ROC AUC: 0.8591361535726982
Model: Random Forest, Random State: 45, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}, Accuracy: 0.8451612903225807, ROC AUC: 0.908016352648418
Model: Logistic Regression, Random State: 46, Best Parameters: {'C': 1, 'solver': 'newton-cg'}, Accuracy: 0.8258064516129032, ROC AUC: 0.8393333333333334
Model: Decision Tree, Random State: 46, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}, Accuracy: 0.7935483870967742, ROC AUC: 0.7638095238095239
Model: Random Forest, Random State: 46, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}, Accuracy: 0.8193548387096774, ROC AUC: 0.8425714285714286
Model: Logistic Regression, Random State: 47, Best Parameters: {'C': 10, 'solver': 'newton-cg'}, Accuracy: 0.864516129032258, ROC AUC: 0.8685567010309277
Model: Decision Tree, Random State: 47, Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2}, Accuracy: 0.864516129032258, ROC AUC: 0.8428723782438678
Model: Random Forest, Random State: 47, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}, Accuracy: 0.864516129032258, ROC AUC: 0.8744223249200141
Model: Logistic Regression, Random State: 48, Best Parameters: {'C': 0.1, 'solver': 'newton-cg'}, Accuracy: 0.8580645161290322, ROC AUC: 0.8996919522525991
Model: Decision Tree, Random State: 48, Best Parameters: {'max_depth': 40, 'min_samples_leaf': 1, 'min_samples_split': 5}, Accuracy: 0.8838709677419355, ROC AUC: 0.8687909125914517
Model: Random Forest, Random State: 48, Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}, Accuracy: 0.8709677419354839, ROC AUC: 0.8920870234886408
Model: Logistic Regression, Random State: 49, Best Parameters: {'C': 0.1, 'solver': 'newton-cg'}, Accuracy: 0.8129032258064516, ROC AUC: 0.780909090909091
Model: Decision Tree, Random State: 49, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}, Accuracy: 0.7935483870967742, ROC AUC: 0.7581818181818182
Model: Random Forest, Random State: 49, Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}, Accuracy: 0.8, ROC AUC: 0.7669090909090909
Model: Logistic Regression, Random State: 50, Best Parameters: {'C': 0.1, 'solver': 'newton-cg'}, Accuracy: 0.832258064516129, ROC AUC: 0.867694805194805
Model: Decision Tree, Random State: 50, Best Parameters: {'max_depth': 40, 'min_samples_leaf': 1, 'min_samples_split': 2}, Accuracy: 0.8387096774193549, ROC AUC: 0.8039321789321789
Model: Random Forest, Random State: 50, Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}, Accuracy: 0.8516129032258064, ROC AUC: 0.8880772005772005
Model: Logistic Regression, Random State: 51, Best Parameters: {'C': 0.1, 'solver': 'newton-cg'}, Accuracy: 0.8, ROC AUC: 0.8723516949152542
Model: Decision Tree, Random State: 51, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10}, Accuracy: 0.8516129032258064, ROC AUC: 0.8489583333333334
Model: Random Forest, Random State: 51, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}, Accuracy: 0.8258064516129032, ROC AUC: 0.8243290960451977
Model: Logistic Regression, Random State: 52, Best Parameters: {'C': 0.1, 'solver': 'newton-cg'}, Accuracy: 0.8064516129032258, ROC AUC: 0.8195488721804511
Model: Decision Tree, Random State: 52, Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5}, Accuracy: 0.8258064516129032, ROC AUC: 0.7821339061940565
Model: Random Forest, Random State: 52, Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}, Accuracy: 0.7935483870967742, ROC AUC: 0.7998567848191908
Model: Logistic Regression, Random State: 53, Best Parameters: {'C': 0.1, 'solver': 'newton-cg'}, Accuracy: 0.8193548387096774, ROC AUC: 0.8909736308316429
Model: Decision Tree, Random State: 53, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5}, Accuracy: 0.8064516129032258, ROC AUC: 0.8029073698444896
Model: Random Forest, Random State: 53, Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}, Accuracy: 0.8451612903225807, ROC AUC: 0.862660581473969
Model: Logistic Regression, Random State: 54, Best Parameters: {'C': 1, 'solver': 'newton-cg'}, Accuracy: 0.7677419354838709, ROC AUC: 0.8386842105263158
Model: Decision Tree, Random State: 54, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10}, Accuracy: 0.7741935483870968, ROC AUC: 0.7919298245614036
Model: Random Forest, Random State: 54, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}, Accuracy: 0.8129032258064516, ROC AUC: 0.848859649122807
Model: Logistic Regression, Random State: 55, Best Parameters: {'C': 10, 'solver': 'newton-cg'}, Accuracy: 0.832258064516129, ROC AUC: 0.8614529914529915
Model: Decision Tree, Random State: 55, Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5}, Accuracy: 0.832258064516129, ROC AUC: 0.8247008547008546
Model: Random Forest, Random State: 55, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}, Accuracy: 0.832258064516129, ROC AUC: 0.8452136752136752
Model: Logistic Regression, Random State: 56, Best Parameters: {'C': 1, 'solver': 'newton-cg'}, Accuracy: 0.8516129032258064, ROC AUC: 0.9014972419227738
Model: Decision Tree, Random State: 56, Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}, Accuracy: 0.8451612903225807, ROC AUC: 0.8204294720252165
Model: Random Forest, Random State: 56, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}, Accuracy: 0.8451612903225807, ROC AUC: 0.8912529550827424
Model: Logistic Regression, Random State: 57, Best Parameters: {'C': 1, 'solver': 'newton-cg'}, Accuracy: 0.8580645161290322, ROC AUC: 0.9064002959674435
Model: Decision Tree, Random State: 57, Best Parameters: {'max_depth': 40, 'min_samples_leaf': 1, 'min_samples_split': 10}, Accuracy: 0.8774193548387097, ROC AUC: 0.8791157972623012
Model: Random Forest, Random State: 57, Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}, Accuracy: 0.864516129032258, ROC AUC: 0.8886422493525713
Model: Logistic Regression, Random State: 58, Best Parameters: {'C': 1, 'solver': 'newton-cg'}, Accuracy: 0.8451612903225807, ROC AUC: 0.857768691588785
Model: Decision Tree, Random State: 58, Best Parameters: {'max_depth': 40, 'min_samples_leaf': 1, 'min_samples_split': 5}, Accuracy: 0.832258064516129, ROC AUC: 0.8115264797507789
Model: Random Forest, Random State: 58, Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}, Accuracy: 0.8451612903225807, ROC AUC: 0.8303154205607477
Model: Logistic Regression, Random State: 59, Best Parameters: {'C': 10, 'solver': 'newton-cg'}, Accuracy: 0.832258064516129, ROC AUC: 0.8792255027813436
Model: Decision Tree, Random State: 59, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5}, Accuracy: 0.8516129032258064, ROC AUC: 0.8215661103979461
Model: Random Forest, Random State: 59, Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}, Accuracy: 0.8387096774193549, ROC AUC: 0.8770860077021824
Model: Logistic Regression, Random State: 60, Best Parameters: {'C': 0.1, 'solver': 'newton-cg'}, Accuracy: 0.8774193548387097, ROC AUC: 0.8607619047619047
Model: Decision Tree, Random State: 60, Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5}, Accuracy: 0.8451612903225807, ROC AUC: 0.7451428571428571
Model: Random Forest, Random State: 60, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}, Accuracy: 0.8709677419354839, ROC AUC: 0.8805714285714286
Model: Logistic Regression, Random State: 61, Best Parameters: {'C': 1, 'solver': 'liblinear'}, Accuracy: 0.8774193548387097, ROC AUC: 0.9247542997542998
Model: Decision Tree, Random State: 61, Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10}, Accuracy: 0.8580645161290322, ROC AUC: 0.8447993447993449
Model: Random Forest, Random State: 61, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}, Accuracy: 0.8838709677419355, ROC AUC: 0.8890253890253891
Model: Logistic Regression, Random State: 62, Best Parameters: {'C': 10, 'solver': 'liblinear'}, Accuracy: 0.8709677419354839, ROC AUC: 0.8937077852826165
Model: Decision Tree, Random State: 62, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10}, Accuracy: 0.8516129032258064, ROC AUC: 0.8401173124777818
Model: Random Forest, Random State: 62, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}, Accuracy: 0.8709677419354839, ROC AUC: 0.8740668325630998
Model: Logistic Regression, Random State: 63, Best Parameters: {'C': 0.1, 'solver': 'newton-cg'}, Accuracy: 0.8193548387096774, ROC AUC: 0.8721904761904762
Model: Decision Tree, Random State: 63, Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}, Accuracy: 0.8064516129032258, ROC AUC: 0.7998095238095239
Model: Random Forest, Random State: 63, Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}, Accuracy: 0.8258064516129032, ROC AUC: 0.8436190476190476
Model: Logistic Regression, Random State: 64, Best Parameters: {'C': 0.1, 'solver': 'newton-cg'}, Accuracy: 0.8258064516129032, ROC AUC: 0.911818181818182
Model: Decision Tree, Random State: 64, Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}, Accuracy: 0.8193548387096774, ROC AUC: 0.8208181818181818
Model: Random Forest, Random State: 64, Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}, Accuracy: 0.8193548387096774, ROC AUC: 0.8541818181818182
Model: Logistic Regression, Random State: 65, Best Parameters: {'C': 0.1, 'solver': 'newton-cg'}, Accuracy: 0.832258064516129, ROC AUC: 0.8896190476190475
Model: Decision Tree, Random State: 65, Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}, Accuracy: 0.7935483870967742, ROC AUC: 0.8317142857142857
Model: Random Forest, Random State: 65, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}, Accuracy: 0.7806451612903226, ROC AUC: 0.8611428571428572
Model: Logistic Regression, Random State: 66, Best Parameters: {'C': 0.1, 'solver': 'newton-cg'}, Accuracy: 0.8387096774193549, ROC AUC: 0.8418045705279749
Model: Decision Tree, Random State: 66, Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5}, Accuracy: 0.8064516129032258, ROC AUC: 0.7120764381402679
Model: Random Forest, Random State: 66, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}, Accuracy: 0.8129032258064516, ROC AUC: 0.8529353821907013
Model: Logistic Regression, Random State: 67, Best Parameters: {'C': 1, 'solver': 'newton-cg'}, Accuracy: 0.8129032258064516, ROC AUC: 0.8265815760266372
Model: Decision Tree, Random State: 67, Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5}, Accuracy: 0.8258064516129032, ROC AUC: 0.7794117647058822
Model: Random Forest, Random State: 67, Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}, Accuracy: 0.8258064516129032, ROC AUC: 0.8288938216796152
Model: Logistic Regression, Random State: 68, Best Parameters: {'C': 1, 'solver': 'newton-cg'}, Accuracy: 0.8580645161290322, ROC AUC: 0.8815238095238096
Model: Decision Tree, Random State: 68, Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}, Accuracy: 0.8387096774193549, ROC AUC: 0.852
Model: Random Forest, Random State: 68, Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}, Accuracy: 0.8516129032258064, ROC AUC: 0.8634285714285714
Model: Logistic Regression, Random State: 69, Best Parameters: {'C': 0.1, 'solver': 'newton-cg'}, Accuracy: 0.8451612903225807, ROC AUC: 0.8738797610156834
Model: Decision Tree, Random State: 69, Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10}, Accuracy: 0.8387096774193549, ROC AUC: 0.8378454070201643
Model: Random Forest, Random State: 69, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}, Accuracy: 0.8451612903225807, ROC AUC: 0.8459671396564601
Model: Logistic Regression, Random State: 70, Best Parameters: {'C': 1, 'solver': 'newton-cg'}, Accuracy: 0.8129032258064516, ROC AUC: 0.7992905153099328
Model: Decision Tree, Random State: 70, Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}, Accuracy: 0.8, ROC AUC: 0.7588685586258402
Model: Random Forest, Random State: 70, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}, Accuracy: 0.7935483870967742, ROC AUC: 0.7967699775952202

Best Model Configuration:
Model: Random Forest, Random State: 61, Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}, Accuracy: 0.8838709677419355, ROC AUC: 0.8890253890253891

Best Model Configuration:
Model: Logistic Regression, Random State: 37, Best Parameters: {'C': 0.1, 'solver': 'newton-cg'}, Accuracy: 0.8838709677419355, ROC AUC: 0.8819758672699849