In [11]:
import seaborn as sns

# 1 데이터셋 불러오기.
# seaborn 라이브러리에 있는 titanic 데이터 로드
titanic = sns.load_dataset('titanic')

# 2-1 feature 분석.
# head 함수를 이용해 데이터의 feature를 파악
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [12]:
# 2-2 feature 분석.
# describe 함수를 통해 기본적인 통계 확인
titanic.describe()

# count : 각 컬럼 요소들의 총 개수  
# mean  : 각 컬럼 요소들의 평균  
# std   : 각 컬럼 요소들의 표준편차  
# min   : 각 컬럼 요소들의 최소값  
# 25%   : 각 컬럼 요소들의 값 중 하위 25%에 해당하는 값  
# 50%   : 각 컬럼 요소들의 중간값  
# max   : 각 컬럼 요소들의 최대값  

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [13]:
# 2-3 feature 분석.
# isnull() 함수와 sum() 함수를 이용해 각 열의 결측치 갯수 확인
titanic.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [14]:
# 3-1 결측치 처리.
# Age(나이)의 결측치는 중앙값으로
titanic['age'] = titanic['age'].fillna(titanic['age'].median())

# Embarked(승선 항구)의 결측치는 최빈값으로 대체
titanic['embarked'] = titanic['embarked'].fillna(titanic['embarked'].mode()[0])

# 결과를 isnull() 함수와 sum() 함수를 이용해 확인
print(titanic['age'].isnull().sum())
print(titanic['embarked'].isnull().sum())

0
0


In [15]:
# 3-2 수치형으로 인코딩.

# Sex(성별)는 남자:0, 여자:1.
titanic['sex'] = titanic['sex'].map({'male': 0, 'female': 1})
# alive(생존여부)는 생존:1, 사망:0.
titanic['alive'] = titanic['alive'].map({'yes': 1, 'no': 0})
# Embarked(승선 항구) ‘C’는 0, Q는 1, ‘S’는 2.
titanic['embarked'] = titanic['embarked'].map({'C': 0, 'Q': 1, 'S': 2,})

# 결과를 head 함수를 이용해 확인
print(titanic['sex'].head())
print(titanic['alive'].head())
print(titanic['embarked'].head())

0    0
1    1
2    1
3    1
4    0
Name: sex, dtype: int64
0    0
1    1
2    1
3    1
4    0
Name: alive, dtype: int64
0    2
1    0
2    2
3    2
4    2
Name: embarked, dtype: int64


In [16]:
# 3-3 새로운 feature 생성.
# sibsp(타이타닉호에 동승한 자매 및 배우자의 수), parch(타이타닉호에 동승한 부모 및 자식의 수)를 통해 family_size(가족크기) 생성
titanic['family_size'] = titanic['sibsp'] + titanic['parch'] + 1
print(titanic['family_size'].head())

0    2
1    2
2    1
3    2
4    1
Name: family_size, dtype: int64


In [17]:
#4-1 모델 학습 준비
import pandas as pd
import numpy as np

import sklearn                                          #파이썬 머신러닝 분석
import matplotlib.pyplot as plt                         #데이터 시각화

from sklearn.model_selection import train_test_split    #데이터 분할
from sklearn.preprocessing import StandardScaler        #데이터 칼럼 표준화
from sklearn.metrics import accuracy_score              #성능 평가 지표
from sklearn.metrics import classification_report       #평가 지표
from sklearn.metrics import mean_squared_error          # mse 손실함수

from sklearn.linear_model import LogisticRegression     # Logistic Regression (4-2)
from sklearn.tree import DecisionTreeClassifier         # Decision Tree (4-3)
import xgboost as xgb                                   # XGBOOST (4-4)


titanic = titanic[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'family_size']]
X = titanic.drop('survived', axis=1) # feature
y = titanic['survived'] # target

In [18]:
#4-2. Logistic Regression
# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 데이터 스케일링
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 모델 생성 및 학습
model = LogisticRegression()
model.fit(X_train, y_train)

# 예측
y_pred = model.predict(X_test)

# 평가
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")

Accuracy: 0.8044692737430168
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       105
           1       0.78      0.73      0.76        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.80      0.80      0.80       179



In [19]:
# 4-3. Decision Tree
# 데이터 분할(X라벨, Y라벨 설정)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 데이터 스케일링
scaler = StandardScaler() # 평균 = 0, 표준편차 = 1로 조정해서 모든 특성이 같은 크기를 갖게함
X_train = scaler.fit_transform(X_train) # train data에 scaler를 fit 하고
X_test = scaler.transform(X_test) # X_test에서 transform만 수행

# 모델 생성 및 학습
model = DecisionTreeClassifier(random_state=42) # 데이터 분할 시 셔플의 시드값 = 42
model.fit(X_train, y_train)

# 예측값을 예측
y_pred = model.predict(X_test)

# 정답지와 예측값
print(f"Accuracy: {accuracy_score(y_test, y_pred)}") 
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")

Accuracy: 0.770949720670391
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.76      0.80       105
           1       0.70      0.78      0.74        74

    accuracy                           0.77       179
   macro avg       0.77      0.77      0.77       179
weighted avg       0.78      0.77      0.77       179



In [23]:
# 4-4. XGBOOST

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 데이터 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# XGBoost 모델 생성
xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# 모델 학습
xgb_model.fit(X_train_scaled, y_train)

# 예측
y_pred_xgb = xgb_model.predict(X_test_scaled)

# 평가
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print(f'XGBoost 모델의 MSE: {mse_xgb}')

XGBoost 모델의 MSE: 0.12981004899201257
