# Titanic Data Binary Classification Accuracy | 타이타닉 데이터 이진 분류 정확도

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import tensorflow as tf
# One shared seed for the NumPy and TensorFlow global random generators;
# the same value is reused later as `random_state` for the train/test split.
seed = 2021
tf.random.set_seed(seed)
np.random.seed(seed)

## 데이터 전처리

In [2]:
# Load seaborn's 'titanic' example dataset and preview the first three rows.
df = sns.load_dataset(name='titanic')
df.head(n=3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


In [3]:
# Count the missing values in every column.
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [4]:
# Keep only the label and the candidate feature columns.
# `.copy()` makes the result an independent frame rather than a slice of the
# loaded dataset, so the column assignments in later cells modify `df` itself
# and do not trigger SettingWithCopyWarning.
df = df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'deck']].copy()
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.2500,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.9250,S,
3,1,1,female,35.0,1,0,53.1000,S,C
4,0,3,male,35.0,0,0,8.0500,S,
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,
887,1,1,female,19.0,0,0,30.0000,S,B
888,0,3,female,,1,2,23.4500,S,
889,1,1,male,26.0,0,0,30.0000,C,C


In [5]:
# Replace missing ages with the mean age.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `df.age`: that chained in-place call acts on
# an intermediate Series and leaves `df` unchanged under pandas Copy-on-Write.
df['age'] = df['age'].fillna(df['age'].mean())

In [6]:
# Fill the two missing embarkation ports with 'S'
# (presumably the most frequent port — verify against the data).
# Assigned back explicitly; a chained `inplace=True` fill on `df.embarked`
# does not reliably update `df` (no-op under pandas Copy-on-Write).
df['embarked'] = df['embarked'].fillna('S')
df['embarked'].isna().sum()

0

In [7]:
# Preview the first three rows after imputation.
df.iloc[:3]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,


In [8]:
# Drop 'deck': 688 of its 891 values are missing (see the null counts above).
# Rebinding `df` (instead of `inplace=True`) together with errors='ignore'
# keeps this cell safe to re-run after the column is already gone.
df = df.drop(columns=['deck'], errors='ignore')

In [9]:
# Convert the categorical text columns to integer category codes.
df['sex'] = df['sex'].astype('category').cat.codes
df['embarked'] = df['embarked'].astype('category').cat.codes

# Encode passenger class as a first-class indicator (1 = first class, 0 = other).
# The previous `df['pclass'] = pd.get_dummies(df['pclass'])` tried to store a
# three-column dummy frame in a single column: current pandas raises
# "Columns must be same length as key", and the recorded output shows that
# only the first-class column was ever kept. This line states that behaviour
# explicitly and keeps the feature count unchanged.
# NOTE(review): a full one-hot encoding (e.g. pd.get_dummies(df, columns=['pclass']))
# would keep second/third class apart but adds feature columns, so any
# downstream cell that assumes the current column count must change with it.
df['pclass'] = (df['pclass'] == 1).astype('uint8')

# Family group size: siblings/spouses + parents/children + the passenger.
df['group_num'] = df['sibsp'] + df['parch'] + 1

In [10]:
# Preview the first three rows after encoding.
df.head(n=3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,group_num
0,0,0,1,22.0,1,0,7.25,2,2
1,1,1,0,38.0,1,0,71.2833,0,2
2,1,0,0,26.0,0,0,7.925,2,1


In [11]:
# Class balance of the label (first column): distinct values and their counts.
np.unique(df.to_numpy()[:, 0], return_counts=True)

(array([0., 1.]), array([549, 342]))

In [12]:
# Every column except the first (the label), i.e. the model features.
df.loc[:, df.columns[1:]]

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,group_num
0,0,1,22.000000,1,0,7.2500,2,2
1,1,0,38.000000,1,0,71.2833,0,2
2,0,0,26.000000,0,0,7.9250,2,1
3,1,0,35.000000,1,0,53.1000,2,2
4,0,1,35.000000,0,0,8.0500,2,1
...,...,...,...,...,...,...,...,...
886,0,1,27.000000,0,0,13.0000,2,1
887,1,0,19.000000,0,0,30.0000,2,1
888,0,0,29.699118,1,2,23.4500,2,4
889,1,1,26.000000,0,0,30.0000,0,1


In [13]:
# The feature columns are on different scales, so standardise them.
# NOTE(review): the scaler is fitted on every row before the train/test split
# done in a later cell, so test rows influence the scaling statistics —
# confirm whether this leakage is acceptable.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df.values[:, 1:])
X_scaled

array([[-0.56568542,  0.73769513, -0.5924806 , ..., -0.50244517,
         0.58595414,  0.05915988],
       [ 1.76776695, -1.35557354,  0.63878901, ...,  0.78684529,
        -1.9423032 ,  0.05915988],
       [-0.56568542, -1.35557354, -0.2846632 , ..., -0.48885426,
         0.58595414, -0.56097483],
       ...,
       [-0.56568542, -1.35557354,  0.        , ..., -0.17626324,
         0.58595414,  1.29942929],
       [ 1.76776695,  0.73769513, -0.2846632 , ..., -0.04438104,
        -1.9423032 , -0.56097483],
       [-0.56568542,  0.73769513,  0.17706291, ..., -0.49237783,
        -0.67817453, -0.56097483]])

In [14]:
# Target vector: the 'survived' column.
y = df['survived']
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: survived, Length: 891, dtype: int64

## 모델 정의/설정

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, stratified on the label and seeded
# with the shared `seed` so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    stratify=y,
    random_state=seed,
)
# Show the shapes of the four resulting parts.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((712, 8), (179, 8), (712,), (179,))

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [17]:
# Fully connected binary classifier: six ReLU hidden layers followed by a
# single sigmoid output unit.
# The input width is read from the training matrix instead of being
# hard-coded, so the network stays in sync if feature columns are added or
# removed during preprocessing.
n_features = X_train.shape[1]
hidden_units = [62, 36, 28, 16, 8, 4]

model = Sequential(
    [Dense(hidden_units[0], input_dim=n_features, activation='relu')]
    + [Dense(units, activation='relu') for units in hidden_units[1:]]
    + [Dense(1, activation='sigmoid')]
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 62)                558       
                                                                 
 dense_1 (Dense)             (None, 36)                2268      
                                                                 
 dense_2 (Dense)             (None, 28)                1036      
                                                                 
 dense_3 (Dense)             (None, 16)                464       
                                                                 
 dense_4 (Dense)             (None, 8)                 136       
                                                                 
 dense_5 (Dense)             (None, 4)                 36        
                                                                 
 dense_6 (Dense)             (None, 1)                 5

In [18]:
# Binary cross-entropy loss with the Adam optimizer; report accuracy.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

## 모델 저장 관련 설정

In [19]:
import os

# Create the checkpoint directory. `exist_ok=True` makes this a no-op when the
# directory is already present, replacing the separate exists-then-mkdir check.
os.makedirs('model', exist_ok=True)

In [20]:
# File the checkpoint callback writes to and the evaluation cell loads from.
model_path = 'model/titanic.h5'

In [21]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Save the model to `model_path` only when the monitored validation loss
# improves; verbose=1 logs each save.
checkpoint = ModelCheckpoint(
    filepath=model_path,
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)

In [22]:
# Early stopping is currently disabled (left commented out).
# To enable it, uncomment the two lines below and add `early_stopping`
# to the `callbacks` list passed to `model.fit`.
# from tensorflow.keras.callbacks import EarlyStopping
# early_stopping = EarlyStopping(patience=100)

## 모델 학습 및 저장

In [23]:
# Train for 200 epochs with 20% of the training rows held out for validation;
# the checkpoint callback saves the model whenever val_loss improves.
hist = model.fit(
    X_train,
    y_train,
    epochs=200,
    batch_size=210,
    validation_split=0.2,
    callbacks=[checkpoint],
    verbose=1,
)

Epoch 1/200
Epoch 00001: val_loss improved from inf to 0.68903, saving model to model/titanic.h5
Epoch 2/200
Epoch 00002: val_loss improved from 0.68903 to 0.68296, saving model to model/titanic.h5
Epoch 3/200
Epoch 00003: val_loss improved from 0.68296 to 0.67707, saving model to model/titanic.h5
Epoch 4/200
Epoch 00004: val_loss improved from 0.67707 to 0.67029, saving model to model/titanic.h5
Epoch 5/200
Epoch 00005: val_loss improved from 0.67029 to 0.66284, saving model to model/titanic.h5
Epoch 6/200
Epoch 00006: val_loss improved from 0.66284 to 0.65389, saving model to model/titanic.h5
Epoch 7/200
Epoch 00007: val_loss improved from 0.65389 to 0.64309, saving model to model/titanic.h5
Epoch 8/200
Epoch 00008: val_loss improved from 0.64309 to 0.63127, saving model to model/titanic.h5
Epoch 9/200
Epoch 00009: val_loss improved from 0.63127 to 0.61858, saving model to model/titanic.h5
Epoch 10/200
Epoch 00010: val_loss improved from 0.61858 to 0.60374, saving model to model/tita

## 평가 - 베스트 모델

In [24]:
from tensorflow.keras.models import load_model

# Reload the checkpointed model from `model_path` and score it on the
# held-out test set; the result lists the loss, then the compiled accuracy metric.
best_model = load_model(model_path)
best_model.evaluate(X_test, y_test)



[0.46344244480133057, 0.8212290406227112]