In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# 1. 데이터 불러오기
train = pd.read_csv('/home/hwaseop/ku_lhs2025/ml_dl_python/data/train.csv')
test = pd.read_csv('/home/hwaseop/ku_lhs2025/ml_dl_python/data/test.csv')

# 2. 데이터 결합 (Title 추출용 등 공통 처리)
data = pd.concat([train, test], sort=False)

# 3. 피처 엔지니어링
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})

# Title 추출
data['Title'] = data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
# Title 그룹 통일
title_map = {
    'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
    'Dr': 'Rare', 'Rev': 'Rare', 'Col': 'Rare', 'Major': 'Rare',
    'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs', 'Countess': 'Rare',
    'Don': 'Rare', 'Jonkheer': 'Rare', 'Sir': 'Rare', 'Lady': 'Rare',
    'Capt': 'Rare'
}
data['Title'] = data['Title'].map(title_map)
data['Title'] = data['Title'].fillna('None')
data['Title'] = data['Title'].map({'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Rare': 4, 'None': 5})

# Embarked 인코딩
data['Embarked'] = data['Embarked'].fillna('S')
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# 결측치 처리
data['Age'].fillna(data['Age'].median(), inplace=True)
data['Fare'].fillna(data['Fare'].median(), inplace=True)

# 가족 크기 관련 피처
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
data['IsAlone'] = (data['FamilySize'] == 1).astype(int)

# 4. 사용 피처 선택
features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Title', 'FamilySize', 'IsAlone']
train_data = data.loc[data['Survived'].notnull()]
test_data = data.loc[data['Survived'].isnull()]

X = train_data[features]
y = train_data['Survived'].astype(int)
X_test_final = test_data[features]

# 5. 학습/검증 분리
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 6. 모델 학습 (튜닝된 XGBoost 사용)
model = XGBClassifier(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    tree_method='hist'
)
model.fit(X_train, y_train)

# 7. 검증 정확도 출력
val_pred = model.predict(X_val)
val_acc = accuracy_score(y_val, val_pred)
print(f'Validation Accuracy: {val_acc:.4f}')

# 8. 교차검증 (선택사항)
cv_scores = cross_val_score(model, X, y, cv=5)
print(f'Cross-Validation Mean Accuracy: {cv_scores.mean():.4f}')

# 9. 최종 예측
test_pred = model.predict(X_test_final)

# 10. 제출 파일 생성
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': test_pred.astype(int)
})
submission.to_csv('/home/hwaseop/ku_lhs2025/ml_dl_python/data/xgb_titanic_improved.csv', index=False)
print("제출 파일 생성 완료 ✅")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Age'].fillna(data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Fare'].fillna(data['Fare'].median(), inplace=True)


Validation Accuracy: 0.8268
Cross-Validation Mean Accuracy: 0.8395
제출 파일 생성 완료 ✅


In [3]:
submission.info()
submission.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Survived     418 non-null    int64
dtypes: int64(2)
memory usage: 6.7 KB


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1
