In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

# 데이터 불러오기
train = pd.read_csv('../7월/data/titanic_train.csv')
test = pd.read_csv('../7월/data/titanic_test.csv')

# PassengerId 저장
test_passenger = test['PassengerId']

# 데이터 합치기 (동일 전처리)
train['is_train'] = 1
test['is_train'] = 0
test['Survived'] = np.nan
data = pd.concat([train, test], sort=False)

# 결측치 처리
# Age, Fare: 중앙값 대체
for col in ['Age', 'Fare']:
    data[col] = data[col].fillna(data[col].median())
# Embarked: 최빈값 대체
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

# Cabin: 결측치 'N'으로 대체, 첫 알파벳만 추출
data['Cabin'] = data['Cabin'].fillna('N')
data['Cabin'] = data['Cabin'].str[0]

# Name에서 Title 추출
data['Title'] = data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
# 희귀 Title 통합
rare_titles = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
data['Title'] = data['Title'].replace(rare_titles, 'Rare')
data['Title'] = data['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# 범주형 변수 인코딩
for col in ['Sex', 'Embarked', 'Cabin', 'Title']:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])

# 불필요 컬럼 제거
drop_cols = ['PassengerId', 'Name', 'Ticket', 'is_train']
data = data.drop(drop_cols, axis=1)

# train, test 분리
train_data = data[~data['Survived'].isnull()]
test_data = data[data['Survived'].isnull()].drop('Survived', axis=1)
X = train_data.drop('Survived', axis=1)
y = train_data['Survived'].astype(int)

# 데이터 분할
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_scaled = scaler.transform(test_data)

# XGBoost 분류기
xgb_clf = XGBClassifier(max_depth=3, random_state=42, use_label_encoder=False, eval_metric='logloss')

# 교차검증
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(xgb_clf, X_train_scaled, y_train, cv=kfold, scoring='accuracy')
print('CV scores:', cross_val_scores)
print('CV mean:', np.mean(cross_val_scores))

# 학습
xgb_clf.fit(X_train_scaled, y_train)

# 검증 예측 및 평가
val_pred = xgb_clf.predict(X_val_scaled)
print(classification_report(y_val, val_pred))

# 테스트 데이터 예측
pred_test = xgb_clf.predict(test_scaled)

# 제출 파일 생성
submission = pd.DataFrame({'PassengerId': test_passenger, 'Survived': pred_test.astype(int)})
submission.to_csv('titanic_submission.csv', index=False)
print('titanic_submission.csv 저장 완료')

  data['Title'] = data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


CV scores: [0.776      0.832      0.824      0.86290323 0.82258065]
CV mean: 0.8234967741935483
              precision    recall  f1-score   support

           0       0.81      0.88      0.84       165
           1       0.78      0.67      0.72       103

    accuracy                           0.80       268
   macro avg       0.79      0.77      0.78       268
weighted avg       0.80      0.80      0.80       268

titanic_submission.csv 저장 완료
