In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import gaussian_kde
import missingno as msno
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings(action='ignore')

plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

In [None]:
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')
gender_submission = pd.read_csv('./data/gender_submission.csv')
gender_submission

# Step 1.
Predict all passengers as deceased (Survived = 0) <br>
<img src="./images/step1submission.png" />

In [None]:
submission = pd.DataFrame({
    "PassengerId": test_data["PassengerId"],
    "Survived": 0
    })
submission.to_csv('./submission/submission.csv', index=False)

# Step 2.
Predict all females as survived and all males as deceased.
<img src="./images/step2submission.png" />

In [None]:
submission = pd.DataFrame({
    "PassengerId": test_data["PassengerId"],
    "Survived": test_data["Sex"].map({"male": 0, "female": 1})
    })
submission.to_csv('./submission/step2submission.csv', index=False)

# Step 3. **EDA**

Explore survival rates by __gender__, __age__, __passenger class (Pclass)__, and __family status
(SibSp/Parch)__

Tracks
- PassengerId - 탑승객 Id
- Survived - 생존유무 (0 = Deseased, 1 = Survived)
- Pclass - 티켓 클래스 (1 = 1st, 2 = 2nd, 3 = 3rd)
- Name - 탑승객 성명
- Sex - 성별
- Age - 나이(세)
- SibSp - 함께 탑승한 형제자매, 배우자 수 총합
- Parch - 함께 탑승한 부모, 자녀 수 총합
- Ticket - 티켓 넘버
- Fare - 탑승 요금
- Cabin - 객실 넘버
- Embarked - 탑승 항구

In [None]:
# number of train data
print('Number of train data: ', len(train_data))
# number of test data
print('Number of test data: ', len(test_data))

In [None]:
train_data.head()

In [None]:
test_data.head()

In [None]:
print(train_data.shape)
print(test_data.shape)

gender_submission.head()

In [None]:
# train_data, test_data 결측치 확인
train_null = train_data.isnull().sum()
test_null = test_data.isnull().sum()

In [None]:
train_null

In [None]:
test_null

train_data missing values: Age, Cabin, Embarked

test_data missing values: Age, Fare, Cabin

In [None]:
# Missing values visualization
msno.matrix(train_data, figsize=(12,5))
msno.matrix(test_data, figsize=(12,5))

Overall Survival Rate

In [None]:
# Overall Survival Rate
train_data['Survived'].value_counts()

plt.figure(figsize=(6,4))
labels = ['Deseased', 'Survived']
colors = ['#1B4F72', '#AED6F1']

train_data['Survived'].value_counts().plot.pie(explode=[0, 0.08],
                                              autopct='%1.1f%%',
                                              labels=labels,
                                              textprops={'fontsize': 15},
                                              colors=colors)
plt.ylabel('Survival Rate')

Survival Count / Rates by Gender

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

axes[0].set_title("Survival Count by Gender", size=15)
sns.countplot(x="Sex", hue="Survived", data=train_data, ax=axes[0], palette=['#1B4F72', '#AED6F1'], order=['male', 'female'])
axes[0].legend(labels=['Deceased', 'Survived'])

# Right - Survival Rates
gender_survival_rate = train_data.groupby('Sex')['Survived'].value_counts(normalize=True).reset_index(name='proportion')

sns.barplot(x='Sex', y='proportion', hue='Survived', data=gender_survival_rate, ax=axes[1], order=['male', 'female'], palette=['#1B4F72', '#AED6F1'])

for p in axes[1].patches:
    height = p.get_height()
    if height > 0:
        axes[1].text(p.get_x() + p.get_width() / 2.,
                     height + 0.01,
                     f'{height:.1%}',
                     ha="center",
                     fontsize=12)

axes[1].set_title('Survival Rate by Gender', fontsize=15)
axes[1].set_ylabel('Rate', fontsize=12)
axes[1].set_xlabel('Sex', fontsize=12)
axes[1].set_ylim(0, 1.0)
handles, labels = axes[1].get_legend_handles_labels()
axes[1].legend(handles, ['Deceased', 'Survived'])

plt.tight_layout()
plt.show()

Survival Count / Rates by Age Group

In [None]:
train_data['Age'].describe()

In [None]:
plt.figure(figsize=(8, 5))
sns.histplot(train_data['Age'], bins=25, kde=True)

In [None]:
# Survival Count by Age Group
train_data['AgeGroup'] = pd.cut(train_data['Age'], bins=[0, 12, 20, 40, 60, 80], labels=['Child', 'Teenager', 'Adult', 'Middle Aged', 'Senior'])

plt.figure(figsize=(10, 6))
ax = sns.countplot(x='AgeGroup', hue='Survived', data=train_data, palette=['#1B4F72', '#AED6F1'])
ax.set_title('Survival Count by Age Group', fontsize=15)
ax.legend(labels=['Deceased', 'Survived'])
plt.show()

In [None]:
# Survival Rates by Age Group
train_data['AgeGroup'] = pd.cut(train_data['Age'], bins=[0, 12, 20, 40, 60, 80], labels=['Child', 'Teenager', 'Adult', 'Middle Aged', 'Senior'])

age_survival_rate = train_data.groupby('AgeGroup')['Survived'].value_counts(normalize=True).reset_index(name='proportion')

plt.figure(figsize=(12, 7))
ax = sns.barplot(x='AgeGroup', y='proportion', hue='Survived', data=age_survival_rate, palette=['#1B4F72', '#AED6F1'])

for p in ax.patches:
    height = p.get_height()
    if height > 0:
        ax.text(p.get_x() + p.get_width() / 2.,
                height + 0.01,
                f'{height:.1%}',
                ha="center",
                fontsize=12)

ax.set_title('Survival Rate by Age Group', fontsize=15)
ax.set_ylabel('Rate')
ax.set_ylim(0, 1.0)
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=['Deceased', 'Survived'])

plt.show()

Survival info. by PClass (Ticket Class)

In [None]:
train_data[['Pclass', 'Survived']].groupby('Pclass').mean()

In [None]:
# Total passengers of each PClass
plt.figure(figsize=(5, 3))
ax = sns.countplot(data=train_data, x='Pclass',
                   palette=['#1B4F72', '#5DADE2', '#AED6F1'])

ax.set_title("PClass Passengers Count", size=12)
plt.show()

In [None]:
# Survival Count / Rates by PClass

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Left - Survival Count
axes[0].set_title("Survival Count by PClass", size=15)
sns.countplot(x="Pclass", hue="Survived", data=train_data, ax=axes[0], palette=['#1B4F72', '#AED6F1'])
axes[0].legend(labels=['Deceased', 'Survived'])

# Right - Survival Rates
pclass_survival_rate = train_data.groupby('Pclass')['Survived'].value_counts(normalize=True).reset_index(name='proportion')
sns.barplot(x='Pclass', y='proportion', hue='Survived', data=pclass_survival_rate, ax=axes[1], palette=['#1B4F72', '#AED6F1'])

for p in axes[1].patches:
    height = p.get_height()
    if height > 0:
        axes[1].text(p.get_x() + p.get_width() / 2.,
                     height + 0.01,
                     f'{height:.1%}',
                     ha="center",
                     fontsize=13)

axes[1].set_title("Survival Rate by PClass", size=15)
axes[1].set_ylabel("Rate")
axes[1].set_ylim(0, 1.0)
handles, labels = axes[1].get_legend_handles_labels()
axes[1].legend(handles=handles, labels=['Deceased', 'Survived'])


plt.tight_layout()
plt.show()

Fare distribution by Survival Status

In [None]:
# Fare distribution by Survival Status
fig, ax = plt.subplots(figsize=(15,6))

sns.kdeplot(train_data[train_data['Survived']==0]['Fare'], ax=ax, color='#610085', label='Deceased')
sns.kdeplot(train_data[train_data['Survived']==1]['Fare'], ax=ax, color='#197EC2', label='Survived')

ax.set(xlim=(0, train_data['Fare'].max()))
ax.set_title('Fare Distribution by Survival Status', fontsize=15)
ax.legend()
plt.show()

Survival Count / Rates by FamilyStatus (SibSp/Parch)

 - SibSp - 함께 탑승한 형제자매, 배우자 수 총합
 - Parch - 함께 탑승한 부모, 자녀 수 총합  

In [None]:
train_data[['SibSp', 'Survived']].groupby(['SibSp']).mean()

In [None]:
train_data[['Parch', 'Survived']].groupby(['Parch']).mean()

In [None]:
# Survival Count by SibSp
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Left - Total Passengers by SibSp
axes[0].set_title("SibSp Passengers Count", size=15)
sns.countplot(x="SibSp", data=train_data, ax=axes[0], palette=["#A4E8F7", "#65ADDD", "#387FAF", "#045185", '#1B4F72', '#1B4F72', '#1B4F72'])

# Right - Survival Count by SibSp
axes[1].set_title("Survival Count by SibSp", size=15)
sns.countplot(x="SibSp", hue="Survived", data=train_data, ax=axes[1], palette=['#1B4F72', '#AED6F1'])
handles, labels = axes[1].get_legend_handles_labels()
axes[1].legend(handles=handles, labels=['Deceased', 'Survived'])

plt.tight_layout()
plt.show()

In [None]:
# Survival Count by Parch
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Left - Total Parch Passengers
axes[0].set_title("Parch Passengers Count", size=15)
sns.countplot(x="Parch", data=train_data, ax=axes[0], palette=["#A4E8F7", "#65ADDD", "#387FAF", "#045185", '#1B4F72', '#1B4F72', '#1B4F72'])

# Right - Survival Count by Parch
axes[1].set_title("Survival Count by Parch", size=15)
sns.countplot(x="Parch", hue="Survived", data=train_data, ax=axes[1], palette=['#1B4F72', '#AED6F1'])
handles, labels = axes[1].get_legend_handles_labels()
axes[1].legend(handles=handles, labels=['Deceased', 'Survived'])

plt.tight_layout()
plt.show()

# Step 4. Feature Engineering

In [None]:
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')

dataset = pd.concat([train_data, test_data], sort=False).reset_index(drop=True)
dataset

### Embarked null

In [None]:
dataset['Embarked'].value_counts()

In [None]:
dataset[dataset['Embarked'].isnull()]

In [None]:
dataset['Embarked'].fillna('S', inplace=True)

### Fare null

In [None]:
dataset[dataset['Fare'].isnull()]

In [None]:
dataset['Fare'].groupby(dataset['Pclass']).mean()

In [None]:
dataset.loc[1043, 'Fare'] = 13.3028

In [None]:
dataset['Age'] = dataset['Age'].groupby([dataset['Pclass'], dataset['Sex']]).transform(lambda x: x.fillna(x.mean()))

In [None]:
dataset['LastName'] = dataset['Name'].str.extract('([A-Za-z]+)\.', expand=False)
dataset['Survived'].groupby(dataset['LastName']).mean()

In [None]:
dataset['LastName'].value_counts()

In [None]:
dataset['LastName'] = dataset['LastName'].replace(['Capt', 'Col', 'Dr', 'Major', 'Rev', 'Don', 'Sir', 'Jonkheer'], 'Mr')
dataset['LastName'] = dataset['LastName'].replace(['Ms', 'Mlle'], 'Miss')
dataset['LastName'] = dataset['LastName'].replace(['Mme', 'Lady', 'Countess', 'Dona'], 'Mrs')
dataset['LastName'].value_counts()

In [None]:
dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch']
dataset['TicketFreq'] = dataset.groupby('Ticket')['Ticket'].transform('count')
dataset

In [None]:
dataset.loc[dataset['FamilySize'] == 0, 'Solo'] = 1
dataset.loc[dataset['TicketFreq'] == 1, 'Solo'] = 1
dataset['Solo'] = dataset['Solo'].fillna(0)

dataset['Fare'] = pd.qcut(dataset['Fare'], 9, labels=[1, 2, 3, 4, 5, 6, 7, 8, 9])
dataset['Age'] = pd.qcut(dataset['Age'], 10, labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

dataset = pd.concat([dataset, pd.get_dummies(dataset['Sex'])], axis=1)
dataset.rename(columns={'male': 'Male', 'female':'Female'}, inplace=True)

dataset = pd.concat([dataset, pd.get_dummies(dataset['Embarked'], prefix='Embarked')], axis=1)

dataset = pd.concat([dataset, pd.get_dummies(dataset['LastName'])], axis=1)

dataset = dataset.drop(columns=['Name', 'Sex', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'Embarked', 'LastName'])

In [None]:
dataset['FamilySize'] = pd.cut(dataset['FamilySize'], bins=[-1, 0, 1, 4, 10], labels=[0, 1, 2, 3])
dataset['Survived'].groupby(dataset['TicketFreq']).mean()

In [None]:
dataset['TicketFreq'] = pd.cut(dataset['TicketFreq'], bins=[0, 1, 2, 4, 20], labels=[0, 1, 2, 3])
dataset['TicketFreq'].value_counts()

In [None]:
dataset.isnull().sum()

In [None]:
scaler = MinMaxScaler()
# PassengerId 제외하고 정규화
dataset[list(dataset.columns.difference(['PassengerId']))] = scaler.fit_transform(dataset[list(dataset.columns.difference(['PassengerId']))])

In [None]:
labels = dataset.loc[:890, 'Survived']
dataset = dataset.drop(columns = 'Survived')
features = dataset.iloc[:891, :]
features.drop('PassengerId', axis=1, inplace=True)
test_dataset = dataset.iloc[891:, :]
feature_names = features.columns.values

# Step 5.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
print(len(features), len(labels))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

In [None]:
lr = LogisticRegression(solver='liblinear')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred))
print("Logistic Regression Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
test_dataset['PassengerId'] = test_dataset['PassengerId'].astype(int)

submission = pd.DataFrame({
    "PassengerId": test_dataset["PassengerId"],
    "Survived": lr.predict(test_dataset.drop('PassengerId', axis=1)).astype(int)
    })
submission.to_csv('./submission/lr_liblinear_submission.csv', index=False)

<img src="./images/lr_liblinear_submission.png" />

In [None]:
lr2 = LogisticRegression(solver='lbfgs')
lr2.fit(X_train, y_train)
y_pred = lr2.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred))
print("Logistic Regression Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
test_dataset['PassengerId'] = test_dataset['PassengerId'].astype(int)

submission = pd.DataFrame({
    "PassengerId": test_dataset["PassengerId"],
    "Survived": lr2.predict(test_dataset.drop('PassengerId', axis=1)).astype(int)
    })
submission.to_csv('./submission/lr_libfgs_submission.csv', index=False)

<img src="./images/lr_libfgs_submission.png" />

In [None]:
lr3 = LogisticRegression(solver='saga')
lr3.fit(X_train, y_train)
y_pred = lr3.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred))
print("Logistic Regression Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
test_dataset['PassengerId'] = test_dataset['PassengerId'].astype(int)

submission = pd.DataFrame({
    "PassengerId": test_dataset["PassengerId"],
    "Survived": lr3.predict(test_dataset.drop('PassengerId', axis=1)).astype(int)
    })
submission.to_csv('./submission/lr_saga_submission.csv', index=False)

<img src="./images/lr_saga_submission.png" />

In [None]:
# -*- coding: utf-8 -*-
"""
멀티코어 활용 + tqdm 오류 수정(단일 셀)
- tqdm 래퍼가 iterable=None도 지원: with TQDM(total=...) as bar: 형태와 TQDM(iterable, total=...) 둘 다 동작
- DT/SVM 탐색을 ProcessPoolExecutor로 병렬화, RF/BO는 n_jobs=CPU 활용, MLP는 GPU 단일 프로세스
- 검증 정확도는 predict() 기준(일관성), 앙상블 확률 수집 후 soft voting
"""

# ===== 멀티코어/스레드 환경 변수 =====
import os
CPU = os.cpu_count() or 1
for k in ["OMP_NUM_THREADS","OPENBLAS_NUM_THREADS","MKL_NUM_THREADS","VECLIB_MAXIMUM_THREADS","NUMEXPR_NUM_THREADS"]:
    os.environ[k] = str(CPU)

import sys, time, math, random, warnings, itertools, io
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# tqdm 고정: iterable=None도 허용 (수정 포인트)
from tqdm.std import tqdm as _tqdm
def TQDM(iterable=None, **kw):
    cfg = dict(file=sys.stdout, ncols=90, mininterval=0.3, leave=True, dynamic_ncols=False)
    if iterable is None:
        return _tqdm(**{**cfg, **kw})
    return _tqdm(iterable, **{**cfg, **kw})

from concurrent.futures import ProcessPoolExecutor, as_completed

# ===== 설정 =====
SEED = 42

# Decision Tree (Random Search, 병렬)
N_DT_SAMPLES = 120
DT_MAX_DEPTH_RANGE = (2, 40)
DT_MIN_SPLIT_RANGE = (2, 40)
DT_MIN_LEAF_RANGE = (1, 20)
DT_CRITERIA = ["gini", "entropy", "log_loss"]
DT_SPLITTERS = ["best", "random"]
DT_USE_NONE_DEPTH_PROB = 0.15

# Random Forest (Random Search, 내부 멀티코어 사용)
N_RF_SAMPLES = 160
RF_N_EST_RANGE = (100, 2000)
RF_MAX_DEPTH_RANGE = (2, 50)
RF_MIN_SPLIT_RANGE = (2, 30)
RF_MIN_LEAF_RANGE = (1, 20)
RF_MAX_FEATURES_CHOICES = ["sqrt", "log2", None, "float"]
RF_MAX_FEATURES_FLOAT_RANGE = (0.2, 1.0)
RF_BOOTSTRAP_CHOICES = [True, False]
RF_CLASS_WEIGHT_CHOICES = [None, "balanced"]

# SVM (Random Search, 병렬; 탐색 probability=False, 최종만 True)
N_SVM_SAMPLES_PER_KERNEL = 100
SVM_KERNELS = ["rbf", "linear", "poly", "sigmoid"]
SVM_C_LOG_RANGE = (-3, 3)
SVM_GAMMA_LOG_RANGE = (-4, 1)
SVM_DEGREE_RANGE = (2, 5)
SVM_COEF0_RANGE = (-1.0, 1.0)

# BO for RF (수동 GP+EI; 순차, 내부 RF는 n_jobs=CPU)
BO_INIT_SAMPLES = 12
BO_MAX_ITERS = 40
BO_EARLY_STOP_PATIENCE = 10
BO_IMPROVEMENT_TOL = 1e-4
BO_N_CANDIDATES = 800
BO_BOUNDS_LO = np.array([100,  2,  2, 0.2], dtype=float)  # (n_estimators, max_depth, min_samples_split, max_features_float)
BO_BOUNDS_HI = np.array([2000, 50, 30, 1.0], dtype=float)

# MLP (Random Search; GPU 단일 프로세스)
RUN_MLP = True
N_MLP_SAMPLES = 16
MLP_HIDDEN_RANGE = (64, 512)
MLP_DROPOUT_RANGE = (0.0, 0.5)
MLP_LR_LOG_RANGE = (-4, -2.5)
MLP_EPOCHS_RANGE = (120, 220)
MLP_BATCH_CHOICES = [32, 64, 128]
MLP_EARLY_STOP_PATIENCE = 20

# 앙상블 가중치 (None=균등)
ENSEMBLE_WEIGHTS = None

# ===== sklearn / torch =====
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, ConstantKernel as CK, WhiteKernel
from scipy.stats import norm

try:
    import torch
    import torch.nn as nn
    import torch.optim as optim
    TORCH_OK = True
except Exception:
    TORCH_OK = False

# ===== 재현성 =====
np.random.seed(SEED); random.seed(SEED)
if TORCH_OK:
    try:
        torch.manual_seed(SEED)
        if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)
    except Exception: pass

os.makedirs("./submission", exist_ok=True)

# ===== 데이터/피처 =====
def build_features_from_raw():
    train_data = pd.read_csv('./data/train.csv')
    test_data  = pd.read_csv('./data/test.csv')

    dataset = pd.concat([train_data, test_data], sort=False).reset_index(drop=True)

    dataset['Embarked'].fillna('S', inplace=True)
    if dataset['Fare'].isnull().any():
        if 1043 in dataset.index: dataset.loc[1043, 'Fare'] = 13.3028
        dataset['Fare'] = dataset['Fare'].fillna(dataset.groupby('Pclass')['Fare'].transform('mean'))

    dataset['Age'] = dataset['Age'].groupby([dataset['Pclass'], dataset['Sex']]).transform(lambda x: x.fillna(x.mean()))

    dataset['LastName'] = dataset['Name'].str.extract('([A-Za-z]+)\.', expand=False)
    dataset['LastName'] = dataset['LastName'].replace(['Capt','Col','Dr','Major','Rev','Don','Sir','Jonkheer'],'Mr')
    dataset['LastName'] = dataset['LastName'].replace(['Ms','Mlle'],'Miss')
    dataset['LastName'] = dataset['LastName'].replace(['Mme','Lady','Countess','Dona'],'Mrs')

    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch']
    dataset['TicketFreq'] = dataset.groupby('Ticket')['Ticket'].transform('count')
    dataset.loc[dataset['FamilySize'] == 0, 'Solo'] = 1
    dataset.loc[dataset['TicketFreq'] == 1, 'Solo'] = 1
    dataset['Solo'] = dataset['Solo'].fillna(0)

    dataset['Fare'] = pd.qcut(dataset['Fare'], 9, labels=[1,2,3,4,5,6,7,8,9])
    dataset['Age']  = pd.qcut(dataset['Age'], 10, labels=[1,2,3,4,5,6,7,8,9,10])

    dataset = pd.concat([dataset, pd.get_dummies(dataset['Sex'])], axis=1)
    dataset.rename(columns={'male':'Male','female':'Female'}, inplace=True)
    dataset = pd.concat([dataset, pd.get_dummies(dataset['Embarked'], prefix='Embarked')], axis=1)
    dataset = pd.concat([dataset, pd.get_dummies(dataset['LastName'])], axis=1)

    dataset = dataset.drop(columns=['Name','Sex','Ticket','Cabin','SibSp','Parch','Embarked','LastName'])

    dataset['FamilySize'] = pd.cut(dataset['FamilySize'], bins=[-1,0,1,4,10], labels=[0,1,2,3])
    dataset['TicketFreq'] = pd.cut(dataset['TicketFreq'], bins=[0,1,2,4,20], labels=[0,1,2,3])

    scaler = MinMaxScaler()
    scale_cols = list(dataset.columns.difference(['PassengerId','Survived']))
    dataset[scale_cols] = scaler.fit_transform(dataset[scale_cols])

    labels   = dataset.loc[:890, 'Survived']
    features = dataset.iloc[:891, :].drop(columns='Survived')
    test_ds  = dataset.iloc[891:, :].copy()
    test_ds  = test_ds.drop(columns='Survived', errors='ignore')
    test_ds['PassengerId'] = test_ds['PassengerId'].astype(int)

    X_tr, X_val, y_tr, y_val = train_test_split(
        features.drop(columns='PassengerId'), labels, test_size=0.2, random_state=SEED, stratify=labels
    )
    return features, labels, X_tr, X_val, y_tr, y_val, test_ds

features, labels, X_train, X_test, y_train, y_test, test_dataset = build_features_from_raw()

def _prep_test_X(df):
    cols_to_drop = [c for c in ['PassengerId','Survived'] if c in df.columns]
    return df.drop(columns=cols_to_drop)

def sigmoid(x): return 1.0/(1.0+np.exp(-x))
def prob_to_logit(p, eps=1e-9):
    p = np.clip(p, eps, 1-eps); return np.log(p/(1-p))
def rand_log_uniform(a, b):  # return 10**u, u~U(a,b)
    return float(10 ** np.random.uniform(a, b))

def evaluate_and_submit(model_name, model, Xtr, ytr, Xva, yva, test_df, submit_stem, want_proba=True):
    t0 = time.time(); model.fit(Xtr, ytr); train_secs = time.time()-t0
    val_pred_hard = model.predict(Xva)
    val_acc = accuracy_score(yva, val_pred_hard)
    if want_proba and hasattr(model, "predict_proba"):
        val_p = model.predict_proba(Xva)[:, list(model.classes_).index(1)]
        test_p = model.predict_proba(_prep_test_X(test_df))[:, list(model.classes_).index(1)]
    elif want_proba and hasattr(model, "decision_function"):
        val_p = sigmoid(model.decision_function(Xva))
        test_p = sigmoid(model.decision_function(_prep_test_X(test_df)))
        if np.ndim(val_p)>1: val_p = val_p.ravel()
        if np.ndim(test_p)>1: test_p = test_p.ravel()
    else:
        val_p = val_pred_hard.astype(float)
        test_p = model.predict(_prep_test_X(test_df)).astype(float)
    sub = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': (test_p>=0.5).astype(int)})
    sub_path = f'./submission/{submit_stem}.csv'; sub.to_csv(sub_path, index=False)
    return {
        'Model': model_name,
        'ValAccuracy': float(val_acc),
        'TrainSeconds': round(train_secs, 3),
        'BestParams': getattr(model, 'get_params', lambda: {})(),
        'SubmissionPath': sub_path,
        'val_prob': val_p, 'test_prob': test_p,
        'val_logit': prob_to_logit(val_p), 'test_logit': prob_to_logit(test_p)
    }

results = []
members = []

print(f"[PLAN] DT={N_DT_SAMPLES}, RF(rand)={N_RF_SAMPLES}, SVM(total)={N_SVM_SAMPLES_PER_KERNEL*len(SVM_KERNELS)}, RF(BO)={BO_INIT_SAMPLES+BO_MAX_ITERS}, MLP={(N_MLP_SAMPLES if RUN_MLP and TORCH_OK else 0)}")
os.makedirs("./submission", exist_ok=True)

# ===== 병렬 평가용 워커 =====
_GLOBAL = {}
def _init_worker(Xtr, ytr, Xva, yva):
    global _GLOBAL
    _GLOBAL['Xtr'] = Xtr; _GLOBAL['ytr'] = ytr; _GLOBAL['Xva'] = Xva; _GLOBAL['yva'] = yva

def _eval_dt(params):
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.metrics import accuracy_score
    Xtr=_GLOBAL['Xtr']; ytr=_GLOBAL['ytr']; Xva=_GLOBAL['Xva']; yva=_GLOBAL['yva']
    m = DecisionTreeClassifier(**params)
    m.fit(Xtr, ytr)
    acc = accuracy_score(yva, m.predict(Xva))
    return acc, params

def _eval_svm(params):
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score
    Xtr=_GLOBAL['Xtr']; ytr=_GLOBAL['ytr']; Xva=_GLOBAL['Xva']; yva=_GLOBAL['yva']
    m = SVC(**params)
    m.fit(Xtr, ytr)
    acc = accuracy_score(yva, m.predict(Xva))
    return acc, params

# ---------- Decision Tree (Random Search, 멀티프로세스) ----------
dt_param_list = []
rng = np.random.default_rng(SEED)
for _ in range(N_DT_SAMPLES):
    max_depth = None if rng.random()<DT_USE_NONE_DEPTH_PROB else int(rng.integers(DT_MAX_DEPTH_RANGE[0], DT_MAX_DEPTH_RANGE[1]+1))
    dt_param_list.append(dict(
        random_state=SEED,
        criterion=random.choice(DT_CRITERIA),
        splitter=random.choice(DT_SPLITTERS),
        max_depth=max_depth,
        min_samples_split=int(rng.integers(DT_MIN_SPLIT_RANGE[0], DT_MIN_SPLIT_RANGE[1]+1)),
        min_samples_leaf=int(rng.integers(DT_MIN_LEAF_RANGE[0], DT_MIN_LEAF_RANGE[1]+1))
    ))
best_dt_acc, best_dt_params = -1.0, None
with ProcessPoolExecutor(max_workers=CPU, initializer=_init_worker, initargs=(X_train, y_train, X_test, y_test)) as ex:
    futures = [ex.submit(_eval_dt, p) for p in dt_param_list]
    for fut in TQDM(as_completed(futures), total=len(futures), desc="DT random search"):
        acc, params = fut.result()
        if acc > best_dt_acc:
            best_dt_acc, best_dt_params = acc, params
if best_dt_params is not None:
    r = evaluate_and_submit("DecisionTree", DecisionTreeClassifier(**best_dt_params), X_train, y_train, X_test, y_test, test_dataset, "dt_submission")
    r['BestParams']=best_dt_params; results.append(r); members.append(r)

# ---------- Random Forest (Random Search, 내부 멀티코어 n_jobs=CPU) ----------
best_rf_acc, best_rf_params = -1.0, None
for _ in TQDM(range(N_RF_SAMPLES), desc="RF random search", total=N_RF_SAMPLES):
    max_features_choice = random.choice(RF_MAX_FEATURES_CHOICES)
    if max_features_choice == "float":
        max_features = float(np.random.uniform(*RF_MAX_FEATURES_FLOAT_RANGE))
    else:
        max_features = max_features_choice
    params = dict(
        random_state=SEED,
        n_estimators=int(np.random.randint(RF_N_EST_RANGE[0], RF_N_EST_RANGE[1]+1)),
        max_depth=int(np.random.randint(RF_MAX_DEPTH_RANGE[0], RF_MAX_DEPTH_RANGE[1]+1)),
        min_samples_split=int(np.random.randint(RF_MIN_SPLIT_RANGE[0], RF_MIN_SPLIT_RANGE[1]+1)),
        min_samples_leaf=int(np.random.randint(RF_MIN_LEAF_RANGE[0], RF_MIN_LEAF_RANGE[1]+1)),
        max_features=max_features,
        bootstrap=random.choice(RF_BOOTSTRAP_CHOICES),
        class_weight=random.choice(RF_CLASS_WEIGHT_CHOICES),
        n_jobs=CPU
    )
    m = RandomForestClassifier(**params)
    m.fit(X_train, y_train)
    acc = accuracy_score(y_test, m.predict(X_test))
    if acc > best_rf_acc:
        best_rf_acc, best_rf_params = acc, params
if best_rf_params is not None:
    r = evaluate_and_submit("RandomForest(Rand)", RandomForestClassifier(**best_rf_params), X_train, y_train, X_test, y_test, test_dataset, "rf_rand_submission")
    r['BestParams']=best_rf_params; results.append(r); members.append(r)

# ---------- SVM (Random Search, 멀티프로세스; 탐색 probability=False, 최종만 True) ----------
svm_param_list = []
for kernel in SVM_KERNELS:
    for _ in range(N_SVM_SAMPLES_PER_KERNEL):
        C = rand_log_uniform(*SVM_C_LOG_RANGE)
        if kernel == "linear":
            params = dict(C=C, kernel='linear', probability=False, random_state=SEED)
        elif kernel == "rbf":
            gamma = rand_log_uniform(*SVM_GAMMA_LOG_RANGE)
            params = dict(C=C, kernel='rbf', gamma=gamma, probability=False, random_state=SEED)
        elif kernel == "poly":
            gamma = rand_log_uniform(*SVM_GAMMA_LOG_RANGE)
            degree = int(np.random.randint(SVM_DEGREE_RANGE[0], SVM_DEGREE_RANGE[1]+1))
            coef0 = float(np.random.uniform(*SVM_COEF0_RANGE))
            params = dict(C=C, kernel='poly', gamma=gamma, degree=degree, coef0=coef0, probability=False, random_state=SEED)
        else:  # sigmoid
            gamma = rand_log_uniform(*SVM_GAMMA_LOG_RANGE)
            coef0 = float(np.random.uniform(*SVM_COEF0_RANGE))
            params = dict(C=C, kernel='sigmoid', gamma=gamma, coef0=coef0, probability=False, random_state=SEED)
        svm_param_list.append(params)

best_svm_acc, best_svm_params = -1.0, None
with ProcessPoolExecutor(max_workers=CPU, initializer=_init_worker, initargs=(X_train, y_train, X_test, y_test)) as ex:
    futures = [ex.submit(_eval_svm, p) for p in svm_param_list]
    for fut in TQDM(as_completed(futures), total=len(futures), desc="SVM random search"):
        acc, params = fut.result()
        if acc > best_svm_acc:
            best_svm_acc, best_svm_params = acc, params
# 최종 베스트만 probability=True로 재학습하여 확률 획득
if best_svm_params is not None:
    best_svm_params_prob = dict(best_svm_params); best_svm_params_prob['probability'] = True
    svc_best = SVC(**best_svm_params_prob)
    r = evaluate_and_submit("SVM", svc_best, X_train, y_train, X_test, y_test, test_dataset, "svm_submission")
    r['BestParams']=best_svm_params_prob; results.append(r); members.append(r)

# ---------- Random Forest (Bayesian Optimization; 순차, 내부 멀티코어) ----------
def expected_improvement(mu, sigma, y_best, xi=0.01):
    Z = (mu - y_best - xi) / (sigma + 1e-12)
    return (mu - y_best - xi) * norm.cdf(Z) + sigma * norm.pdf(Z)

def rf_make_from_vec(x):
    ne  = int(np.clip(round(x[0]), BO_BOUNDS_LO[0], BO_BOUNDS_HI[0]))
    md  = int(np.clip(round(x[1]), BO_BOUNDS_LO[1], BO_BOUNDS_HI[1]))
    mss = int(np.clip(round(x[2]), BO_BOUNDS_LO[2], BO_BOUNDS_HI[2]))
    mxf = float(np.clip(x[3],    BO_BOUNDS_LO[3], BO_BOUNDS_HI[3]))
    return RandomForestClassifier(random_state=SEED, n_estimators=ne, max_depth=md, min_samples_split=mss,
                                  max_features=mxf, n_jobs=CPU), {'n_estimators':ne,'max_depth':md,'min_samples_split':mss,'max_features':mxf}

def rf_objective(x):
    m, pars = rf_make_from_vec(x)
    m.fit(X_train, y_train)
    acc = accuracy_score(y_test, m.predict(X_test))
    return acc, pars

X_obs=[]; y_obs=[]; P_obs=[]
for _ in TQDM(range(BO_INIT_SAMPLES), desc="RF(BO) init", total=BO_INIT_SAMPLES):
    x = BO_BOUNDS_LO + np.random.rand(4)*(BO_BOUNDS_HI-BO_BOUNDS_LO)
    acc, pars = rf_objective(x)
    X_obs.append(x); y_obs.append(acc); P_obs.append(pars)
X_obs=np.array(X_obs); y_obs=np.array(y_obs)

gp = GaussianProcessRegressor(
    kernel=CK(1.0,(1e-3,1e3))*Matern(length_scale=[400,10,5,0.2], length_scale_bounds=(1e-2,1e3), nu=2.5) + WhiteKernel(noise_level=1e-5, noise_level_bounds=(1e-8,1e-1)),
    alpha=1e-6, normalize_y=True, random_state=SEED
)

best=float(np.max(y_obs)); no_imp=0
for _ in TQDM(range(BO_MAX_ITERS), desc="RF(BO) iter", total=BO_MAX_ITERS):
    try: gp.fit(X_obs, y_obs)
    except Exception: gp.fit(X_obs + 1e-6*np.random.randn(*X_obs.shape), y_obs)
    cand = BO_BOUNDS_LO + np.random.rand(BO_N_CANDIDATES,4)*(BO_BOUNDS_HI-BO_BOUNDS_LO)
    mu, sig = gp.predict(cand, return_std=True)
    x_next = cand[int(np.argmax(expected_improvement(mu, sig, best, xi=0.01)))]
    acc, pars = rf_objective(x_next)
    X_obs=np.vstack([X_obs,x_next]); y_obs=np.append(y_obs, acc); P_obs.append(pars)
    if acc > best + BO_IMPROVEMENT_TOL:
        best=acc; no_imp=0
    else:
        no_imp+=1
        if no_imp>=BO_EARLY_STOP_PATIENCE: break

bp = P_obs[int(np.argmax(y_obs))]
rf_bo = RandomForestClassifier(random_state=SEED, **bp, n_jobs=CPU)
r = evaluate_and_submit("RandomForest(BO)", rf_bo, X_train, y_train, X_test, y_test, test_dataset, "rf_bo_submission")
r['BestParams']=bp; results.append(r); members.append(r)

# ---------- MLP (Random Search; GPU 단일 프로세스) ----------
if RUN_MLP and TORCH_OK:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    class MLP(nn.Module):
        def __init__(self, in_dim, hidden=128, dropout=0.2):
            super().__init__()
            self.f = nn.Sequential(
                nn.Linear(in_dim, hidden), nn.ReLU(), nn.Dropout(dropout),
                nn.Linear(hidden, hidden), nn.ReLU(), nn.Dropout(dropout),
                nn.Linear(hidden, 1)  # logit
            )
        def forward(self, x): return self.f(x)
    def train_mlp_collect(Xtr, ytr, Xva, yva, Xte, hidden, dropout, lr, epochs, batch, es_patience):
        Xtr_t=torch.tensor(Xtr.values, dtype=torch.float32).to(device)
        ytr_t=torch.tensor(ytr.values.reshape(-1,1), dtype=torch.float32).to(device)
        Xva_t=torch.tensor(Xva.values, dtype=torch.float32).to(device)
        Xte_t=torch.tensor(Xte.values, dtype=torch.float32).to(device)
        m=MLP(Xtr.shape[1], hidden, dropout).to(device); opt=optim.Adam(m.parameters(), lr=lr); crit=nn.BCEWithLogitsLoss()
        best_acc=-1; best_state=None; n=Xtr_t.size(0); no_imp=0
        for ep in range(int(epochs)):
            m.train(); perm=torch.randperm(n, device=device)
            for i in range(0,n,int(batch)):
                idx=perm[i:i+int(batch)]; xb=Xtr_t[idx]; yb=ytr_t[idx]
                opt.zero_grad(); loss=crit(m(xb), yb); loss.backward(); opt.step()
            m.eval()
            with torch.no_grad():
                val_logits=m(Xva_t).cpu().numpy().ravel(); val_pred=(val_logits>0).astype(int)
                acc=accuracy_score(yva.values, val_pred)
                if acc>best_acc: best_acc=acc; best_state={k:v.detach().cpu().clone() for k,v in m.state_dict().items()}; no_imp=0
                else: no_imp+=1
            if no_imp>=es_patience: break
        if best_state is not None: m.load_state_dict({k:v.to(device) for k,v in best_state.items()})
        m.eval()
        with torch.no_grad():
            val_logits=m(Xva_t).cpu().numpy().ravel()
            test_logits=m(Xte_t).cpu().numpy().ravel()
        return m, val_logits, test_logits, best_acc
    best_mlp, best_mlp_acc, best_cfg, best_val_p, best_test_p, best_val_logit, best_test_logit = None, -1.0, None, None, None, None, None
    XteX=_prep_test_X(test_dataset)
    for _ in TQDM(range(N_MLP_SAMPLES), desc="MLP random search", total=N_MLP_SAMPLES):
        hidden = int(np.random.randint(MLP_HIDDEN_RANGE[0], MLP_HIDDEN_RANGE[1]+1))
        dropout = float(np.random.uniform(*MLP_DROPOUT_RANGE))
        lr = float(10 ** np.random.uniform(*MLP_LR_LOG_RANGE))
        epochs = int(np.random.randint(MLP_EPOCHS_RANGE[0], MLP_EPOCHS_RANGE[1]+1))
        batch = random.choice(MLP_BATCH_CHOICES)
        cfg={'hidden':hidden,'dropout':dropout,'lr':lr,'epochs':epochs,'batch':batch,'es_patience':MLP_EARLY_STOP_PATIENCE}
        m, val_logits, test_logits, acc = train_mlp_collect(X_train, y_train, X_test, y_test, XteX, **cfg)
        if acc > best_mlp_acc:
            best_mlp_acc, best_mlp, best_cfg = acc, m, cfg
            best_val_logit, best_test_logit = val_logits, test_logits
            best_val_p = 1/(1+np.exp(-val_logits)); best_test_p = 1/(1+np.exp(-test_logits))
    if best_mlp is not None:
        mlp_sub='./submission/mlp_submission.csv'
        pd.DataFrame({'PassengerId': test_dataset['PassengerId'],'Survived':(best_test_p>=0.5).astype(int)}).to_csv(mlp_sub, index=False)
        r={'Model':'MLP(FC, PyTorch, GPU)' if torch.cuda.is_available() else 'MLP(FC, PyTorch, CPU)',
           'ValAccuracy':float(best_mlp_acc),'TrainSeconds':None,'BestParams':best_cfg,'SubmissionPath':mlp_sub,
           'val_prob':best_val_p,'test_prob':best_test_p,'val_logit':best_val_logit,'test_logit':best_test_logit}
        results.append(r); members.append(r)

# ---------- Soft voting ensemble ----------
names=[m['Model'] for m in members]
w = np.array([1.0 if not ENSEMBLE_WEIGHTS or n not in ENSEMBLE_WEIGHTS else float(ENSEMBLE_WEIGHTS[n]) for n in names], float)
w = w / (w.sum() if w.sum()!=0 else 1.0)
val_stack = np.stack([m['val_prob'] for m in members], axis=1)
test_stack= np.stack([m['test_prob'] for m in members], axis=1)
val_ens = (val_stack * w.reshape(1,-1)).sum(axis=1)
test_ens= (test_stack* w.reshape(1,-1)).sum(axis=1)
val_ens_acc = accuracy_score(y_test, (val_ens>=0.5).astype(int))
ens_path='./submission/ensemble_softvote_submission.csv'
pd.DataFrame({'PassengerId': test_dataset['PassengerId'], 'Survived': (test_ens>=0.5).astype(int)}).to_csv(ens_path, index=False)

# ---------- 결과 출력 ----------
rows=[[r['Model'], f"{r['ValAccuracy']:.4f}", r['BestParams'], r['SubmissionPath']] for r in results]
summary=pd.DataFrame(rows, columns=['Model','ValAcc','BestParams','CSV']).sort_values('ValAcc', ascending=False).reset_index(drop=True)
print("\n[Per-model results]")
print(summary.to_string(index=False))
print(f"\n[Ensemble] members={names}\nWeights={w.tolist()}\nValAccuracy(soft vote)={val_ens_acc:.4f}\nCSV={ens_path}")
summary.to_csv('./submission/model_eval_summary.csv', index=False)
