In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# 데이터 불러오기
train = pd.read_csv('/home/hwaseop/ku_lhs2025/ml_dl_python/data/train.csv')
test = pd.read_csv('/home/hwaseop/ku_lhs2025/ml_dl_python/data/test.csv')
data = pd.concat([train, test], sort=False)

# 기본 전처리
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})
data['Embarked'] = data['Embarked'].fillna('S')
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
data['Age'].fillna(data['Age'].median(), inplace=True)
data['Fare'].fillna(data['Fare'].median(), inplace=True)

# Title 피처 엔지니어링
data['Title'] = data['Name'].str.extract(' ([A-Za-z]+)\\.', expand=False)
title_map = {
    'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
    'Dr': 'Rare', 'Rev': 'Rare', 'Col': 'Rare', 'Major': 'Rare',
    'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs', 'Countess': 'Rare',
    'Don': 'Rare', 'Jonkheer': 'Rare', 'Sir': 'Rare', 'Lady': 'Rare',
    'Capt': 'Rare'
}
data['Title'] = data['Title'].map(title_map).fillna('None')
data['Title'] = data['Title'].map({'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Rare': 4, 'None': 5})

# 가족 관련 피처
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
data['IsAlone'] = (data['FamilySize'] == 1).astype(int)

# 구간화 피처
data['AgeBand'] = pd.cut(data['Age'], 5, labels=False)
data['FareBand'] = pd.qcut(data['Fare'], 4, labels=False)

# 상호작용 피처
data['Pclass_Sex'] = data['Pclass'] * data['Sex']

# Cabin 유무 피처
data['HasCabin'] = data['Cabin'].notnull().astype(int)

# Ticket 접두사 피처
data['TicketPrefix'] = data['Ticket'].str.extract('([A-Za-z./]+)', expand=False).fillna('None')
data['TicketPrefix'] = data['TicketPrefix'].map(lambda x: x.replace('.', '').replace('/', ''))
ticket_map = {k: i for i, k in enumerate(data['TicketPrefix'].unique())}
data['TicketPrefix'] = data['TicketPrefix'].map(ticket_map)

# 사용할 피처 리스트
features = [
    'Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Title',
    'FamilySize', 'IsAlone', 'AgeBand', 'FareBand', 'Pclass_Sex',
    'HasCabin', 'TicketPrefix'
]

# 학습/테스트 분리
train_data = data.loc[data['Survived'].notnull()]
test_data = data.loc[data['Survived'].isnull()]
X = train_data[features]
y = train_data['Survived'].astype(int)
X_test_final = test_data[features]

# 학습/검증 분할
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# 모델 학습
model = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=1,
    random_state=42,
    tree_method='hist'
)
model.fit(X_train, y_train)

# 검증 정확도 출력
val_pred = model.predict(X_val)
val_acc = accuracy_score(y_val, val_pred)
print(f'📊 검증 정확도: {val_acc:.4f}')

# 교차검증
cv_scores = cross_val_score(model, X, y, cv=5)
print(f'📊 교차검증 평균 정확도: {cv_scores.mean():.4f}')

# 최종 예측 및 제출
test_pred = model.predict(X_test_final)
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': test_pred.astype(int)
})
submission.to_csv('/home/hwaseop/ku_lhs2025/ml_dl_python/data/xgb_titanic_final.csv', index=False)
print("✅ 제출 파일 저장 완료!")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Age'].fillna(data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Fare'].fillna(data['Fare'].median(), inplace=True)


📊 검증 정확도: 0.8156
📊 교차검증 평균 정확도: 0.8305
✅ 제출 파일 저장 완료!
