In [2]:
#1. 데이터 로드
import pandas as pd
from sklearn.linear_model import LinearRegression
import os

df = pd.read_csv("titanic.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
#2. 결측치 확인
dict(df.isnull().sum())

{'PassengerId': np.int64(0),
 'Survived': np.int64(0),
 'Pclass': np.int64(0),
 'Name': np.int64(0),
 'Sex': np.int64(0),
 'Age': np.int64(177),
 'SibSp': np.int64(0),
 'Parch': np.int64(0),
 'Ticket': np.int64(0),
 'Fare': np.int64(0),
 'Cabin': np.int64(687),
 'Embarked': np.int64(2)}

In [5]:
df["Age"] = df["Age"].fillna(df["Age"].mean()) # Age에 null값이 있으니 평균을 내서 채워줄꺼임
df["Embarked"] = df["Embarked"].fillna("S") # 결측치2 이므로 가장 많은 확률의 S항구로 적용해볼까

In [6]:
dict(df.isnull().sum())

{'PassengerId': np.int64(0),
 'Survived': np.int64(0),
 'Pclass': np.int64(0),
 'Name': np.int64(0),
 'Sex': np.int64(0),
 'Age': np.int64(0),
 'SibSp': np.int64(0),
 'Parch': np.int64(0),
 'Ticket': np.int64(0),
 'Fare': np.int64(0),
 'Cabin': np.int64(687),
 'Embarked': np.int64(0)}

In [7]:
#3. 컬럼 셀렉션
df.columns
Feature = df[['Pclass', 'Sex', 'Age', 'SibSp',
            'Parch', 'Fare', 'Embarked']].copy()
Target = df[['Survived']].copy()

In [8]:
# 성별 숫자화
Feature["Sex"] = Feature["Sex"].map(lambda x : 0 if x == 'male' else 1)

In [9]:
 display(Feature.head())

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,22.0,1,0,7.25,S
1,1,1,38.0,1,0,71.2833,C
2,3,1,26.0,0,0,7.925,S
3,1,1,35.0,1,0,53.1,S
4,3,0,35.0,0,0,8.05,S


In [10]:
# Embarked 숫자화
sitdict = {'S':0, 'C':1, 'Q':2}

Feature["Embarked"] = Feature["Embarked"].map(lambda x : sitdict[x])

In [11]:
 display(Feature.head())

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,22.0,1,0,7.25,0
1,1,1,38.0,1,0,71.2833,1
2,3,1,26.0,0,0,7.925,0
3,1,1,35.0,1,0,53.1,0
4,3,0,35.0,0,0,8.05,0


In [12]:
#4. K-means 파생변수
from sklearn.cluster import KMeans
km = KMeans(n_clusters=3, init = "k-means++") # KMeans 3개의 클러스터로 초기화
# km.fit_predict(new_df.drop("Survived", axis=1)) # 생존컬럼을 제외한 나머지 컬럼만 군집 라벨링
Feature["cluster"] = km.fit_predict(Feature)

In [13]:
 display(Feature.head())

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,cluster
0,3,0,22.0,1,0,7.25,0,0
1,1,1,38.0,1,0,71.2833,1,1
2,3,1,26.0,0,0,7.925,0,0
3,1,1,35.0,1,0,53.1,0,0
4,3,0,35.0,0,0,8.05,0,0


In [14]:
#5. training-test-split
from sklearn.preprocessing import RobustScaler # 데이터의 각 특성(feature)들이 갖는 값의 범위를 일정하게 맞춰주는 과정
from sklearn.model_selection import train_test_split

# 타겟 변수를 1차원 배열로 변환하여 DataConversionWarning 해결
X_train, X_valid, y_train, y_valid = train_test_split(Feature, Target.values.ravel(), test_size = 0.2, shuffle = True)

#6. scaling (학습, 검증데이터에 RobustScaler를 적용하여 특성 값의 범위를 조정해 줌)
# RobustScaler 객체 생성 및 학습 데이터(X_train)에 fit
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)

# 학습된 scaler 객체를 사용하여 검증 데이터(X_valid) transform
X_valid_scaled = scaler.transform(X_valid)

# 변수명 업데이트 (원래 코드와 동일하게 X_train, X_valid 사용함)
X_train = X_train_scaled
X_valid = X_valid_scaled

# Task
타이타닉 데이터셋을 사용하여 머신러닝 풀 프로세스를 진행하고, AutoML을 활용하여 상위 3개의 모델을 찾아 적용하고 평가합니다.

## Automl 라이브러리 선택 및 설치

### Subtask:
AutoML을 사용하기 위한 적절한 파이썬 라이브러리를 선택하고 설치합니다.


**Reasoning**:
Identify and install an appropriate AutoML library for the task. AutoGluon is a suitable choice for tabular data and is not currently imported in the notebook. I will install it using pip.



In [15]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.5-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.5-py3-none-any.whl (247 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.4/247.4 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.5 colorlog-6.9.0 optuna-4.5.0


In [16]:
# 8. optuna
import optuna
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [17]:
def DecisionTreeobjective(trial):
    max_depth = trial.suggest_int("max_depth", 2, 20)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    model = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split, random_state=4)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    return f1_score(y_valid, y_pred)

study = optuna.create_study(direction="maximize")
study.optimize(DecisionTreeobjective, n_trials=20)
print("Best prarmeters:", study.best_params)

[I 2025-08-31 07:47:54,925] A new study created in memory with name: no-name-fdc9a498-3a61-4ed7-854d-8532eba31427
[I 2025-08-31 07:47:54,961] Trial 0 finished with value: 0.6990291262135923 and parameters: {'max_depth': 11, 'min_samples_split': 14}. Best is trial 0 with value: 0.6990291262135923.
[I 2025-08-31 07:47:54,972] Trial 1 finished with value: 0.6857142857142857 and parameters: {'max_depth': 7, 'min_samples_split': 13}. Best is trial 0 with value: 0.6990291262135923.
[I 2025-08-31 07:47:54,983] Trial 2 finished with value: 0.6727272727272727 and parameters: {'max_depth': 13, 'min_samples_split': 3}. Best is trial 0 with value: 0.6990291262135923.
[I 2025-08-31 07:47:54,993] Trial 3 finished with value: 0.6909090909090909 and parameters: {'max_depth': 5, 'min_samples_split': 6}. Best is trial 0 with value: 0.6990291262135923.
[I 2025-08-31 07:47:55,004] Trial 4 finished with value: 0.6285714285714286 and parameters: {'max_depth': 2, 'min_samples_split': 4}. Best is trial 0 with

Best prarmeters: {'max_depth': 11, 'min_samples_split': 14}


In [18]:
def RandomForestobjective(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int("max_depth", 2, 20)
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    return f1_score(y_valid, y_pred)

study = optuna.create_study(direction="maximize")
study.optimize(RandomForestobjective, n_trials=20)
print("Best parameters:", study.best_params)

[I 2025-08-31 07:47:59,313] A new study created in memory with name: no-name-5db62bd7-3a6d-43f9-9721-93a8722ee883
[I 2025-08-31 07:47:59,922] Trial 0 finished with value: 0.7017543859649122 and parameters: {'n_estimators': 129, 'max_depth': 15}. Best is trial 0 with value: 0.7017543859649122.
[I 2025-08-31 07:48:00,680] Trial 1 finished with value: 0.7047619047619048 and parameters: {'n_estimators': 186, 'max_depth': 8}. Best is trial 1 with value: 0.7047619047619048.
[I 2025-08-31 07:48:01,469] Trial 2 finished with value: 0.7155963302752294 and parameters: {'n_estimators': 174, 'max_depth': 12}. Best is trial 2 with value: 0.7155963302752294.
[I 2025-08-31 07:48:02,410] Trial 3 finished with value: 0.7079646017699115 and parameters: {'n_estimators': 166, 'max_depth': 17}. Best is trial 2 with value: 0.7155963302752294.
[I 2025-08-31 07:48:02,889] Trial 4 finished with value: 0.7083333333333334 and parameters: {'n_estimators': 62, 'max_depth': 3}. Best is trial 2 with value: 0.7155963

Best parameters: {'n_estimators': 98, 'max_depth': 9}


In [19]:
def Logisticobjective(trial):
    C = trial.suggest_loguniform("C", 1e-3, 10)
    model = LogisticRegression(C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    return f1_score(y_valid, y_pred)

study = optuna.create_study(direction="maximize")
study.optimize(Logisticobjective, n_trials=20)
print("Best parameters:", study.best_params)

[I 2025-08-31 07:48:17,143] A new study created in memory with name: no-name-8ed921b9-fa0f-4e04-8942-60f5d9e2ca2c
  C = trial.suggest_loguniform("C", 1e-3, 10)
[I 2025-08-31 07:48:17,171] Trial 0 finished with value: 0.6071428571428571 and parameters: {'C': 5.468739360494162}. Best is trial 0 with value: 0.6071428571428571.
  C = trial.suggest_loguniform("C", 1e-3, 10)
[I 2025-08-31 07:48:17,191] Trial 1 finished with value: 0.6666666666666666 and parameters: {'C': 0.05148444170677253}. Best is trial 1 with value: 0.6666666666666666.
  C = trial.suggest_loguniform("C", 1e-3, 10)
[I 2025-08-31 07:48:17,213] Trial 2 finished with value: 0.6071428571428571 and parameters: {'C': 8.898901694118656}. Best is trial 1 with value: 0.6666666666666666.
  C = trial.suggest_loguniform("C", 1e-3, 10)
[I 2025-08-31 07:48:17,235] Trial 3 finished with value: 0.6071428571428571 and parameters: {'C': 1.1108922344888907}. Best is trial 1 with value: 0.6666666666666666.
  C = trial.suggest_loguniform("C",

Best parameters: {'C': 0.05148444170677253}


In [20]:
# 9. Stacking
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, f1_score

# 전방모델
estimators = [
    ('dt', DecisionTreeClassifier(max_depth = 6, min_samples_split = 10)),
    ('rf', RandomForestClassifier(n_estimators = 56, max_depth = 9))
]

# 후방모델
stack_model = StackingClassifier(
    estimators=estimators,
    final_estimator = LogisticRegression(C =  0.05148444170677253)
)

stack_model.fit(X_train, y_train)

y_pred = stack_model.predict(X_valid)
print("Accuracy:", accuracy_score(y_valid, y_pred))
print("F1 Score:", f1_score(y_valid, y_pred))

Accuracy: 0.8212290502793296
F1 Score: 0.6923076923076923
