# Titanic Data Binary Classification Accuracy | 타이타닉 데이터 이진 분류 정확도

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import tensorflow as tf
seed = 2021
np.random.seed(seed)
tf.random.set_seed(seed)

## 데이터 전처리

In [2]:
df = sns.load_dataset('titanic')
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


In [3]:
# 결측치 확인
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [4]:
# 결측치 제거
df = df.dropna()

In [5]:
# 이진분류 어려운 데이터 드랍
df.drop(['class','who','deck','embark_town','alive'], axis=1, inplace=True)

In [6]:
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,adult_male,alone
1,1,1,female,38.0,1,0,71.2833,C,False,False
3,1,1,female,35.0,1,0,53.1,S,False,False
6,0,1,male,54.0,0,0,51.8625,S,True,True


In [7]:
# 카테고리형 데이터 cat.codes로 변환
df['sex'] = df['sex'].astype('category').cat.codes
df['embarked'] = df['embarked'].astype('category').cat.codes
df['adult_male'] = df['adult_male'].astype('category').cat.codes
df['alone'] = df['alone'].astype('category').cat.codes

# one-hot 인코딩을 위한 get_dummies 사용
df['pclass'] = pd.get_dummies(df['pclass'])

# 가족 단위 수 group_num을 만들어 새로운 열로 추가
df['group_num'] = df['sibsp'] + df['parch'] + 1

In [8]:
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,adult_male,alone,group_num
1,1,1,0,38.0,1,0,71.2833,0,0,0,2
3,1,1,0,35.0,1,0,53.1,2,0,0,2
6,0,1,1,54.0,0,0,51.8625,2,1,1,1


In [9]:
np.unique(df.values[:,0], return_counts=True)  # 생존자 123, 사망자 59

(array([0., 1.]), array([ 59, 123]))

In [10]:
df.iloc[:, 1:]

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,adult_male,alone,group_num
1,1,0,38.0,1,0,71.2833,0,0,0,2
3,1,0,35.0,1,0,53.1000,2,0,0,2
6,1,1,54.0,0,0,51.8625,2,1,1,1
10,0,0,4.0,1,1,16.7000,2,0,0,3
11,1,0,58.0,0,0,26.5500,2,0,1,1
...,...,...,...,...,...,...,...,...,...,...
871,1,0,47.0,1,1,52.5542,2,0,0,3
872,1,1,33.0,0,0,5.0000,2,1,1,1
879,1,0,56.0,0,1,83.1583,0,0,0,2
887,1,0,19.0,0,0,30.0000,2,0,1,1


In [11]:
# 데이터 값의 단위가 상이하므로 정규화 수행
from sklearn.preprocessing import StandardScaler
X_scaled = StandardScaler().fit_transform(df.values[:,1:])
X_scaled

array([[ 0.39904344, -1.03352882,  0.15208196, ..., -0.9569689 ,
        -0.8660254 ,  0.04958766],
       [ 0.39904344, -1.03352882, -0.03987502, ..., -0.9569689 ,
        -0.8660254 ,  0.04958766],
       [ 0.39904344,  0.96755889,  1.17585249, ...,  1.04496604,
         1.15470054, -0.85290771],
       ...,
       [ 0.39904344, -1.03352882,  1.30382381, ..., -0.9569689 ,
        -0.8660254 ,  0.04958766],
       [ 0.39904344, -1.03352882, -1.06364555, ..., -0.9569689 ,
         1.15470054, -0.85290771],
       [ 0.39904344,  0.96755889, -0.61574594, ...,  1.04496604,
         1.15470054, -0.85290771]])

In [12]:
y = df.survived
y

1      1
3      1
6      0
10     1
11     1
      ..
871    1
872    0
879    1
887    1
889    1
Name: survived, Length: 182, dtype: int64

## 모델 정의/설정

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, stratify=y, random_state=seed, test_size=0.2
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((145, 10), (37, 10), (145,), (37,))

In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [19]:
model = Sequential([
    Dense(32, input_dim=10, activation='relu'),
    Dense(32, activation='relu'),
    Dense(4, activation='relu'),
    Dense(1, activation='sigmoid')
])
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10 (Dense)            (None, 32)                352       
                                                                 
 dense_11 (Dense)            (None, 32)                1056      
                                                                 
 dense_12 (Dense)            (None, 4)                 132       
                                                                 
 dense_13 (Dense)            (None, 1)                 5         
                                                                 
Total params: 1,545
Trainable params: 1,545
Non-trainable params: 0
_________________________________________________________________


In [20]:
model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']
)

## 모델 저장관련 설정

In [21]:
import os
if not os.path.exists('model'):
    os.mkdir('model')

In [22]:
model_path = 'model/titanic.h5'

In [23]:
from tensorflow.keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint(
    model_path, monitor='val_loss', verbose=1, save_best_only=True
)

In [24]:
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(patience=100)

## 모델 학습 및 저장

In [25]:
hist = model.fit(
    X_train, y_train, validation_split=0.2, verbose=0,
    epochs=100, batch_size=50, callbacks=[checkpoint, early_stopping]
)


Epoch 00001: val_loss improved from inf to 0.70219, saving model to model/titanic.h5

Epoch 00002: val_loss improved from 0.70219 to 0.69377, saving model to model/titanic.h5

Epoch 00003: val_loss improved from 0.69377 to 0.68611, saving model to model/titanic.h5

Epoch 00004: val_loss improved from 0.68611 to 0.67941, saving model to model/titanic.h5

Epoch 00005: val_loss improved from 0.67941 to 0.67221, saving model to model/titanic.h5

Epoch 00006: val_loss improved from 0.67221 to 0.66396, saving model to model/titanic.h5

Epoch 00007: val_loss improved from 0.66396 to 0.65484, saving model to model/titanic.h5

Epoch 00008: val_loss improved from 0.65484 to 0.64606, saving model to model/titanic.h5

Epoch 00009: val_loss improved from 0.64606 to 0.63694, saving model to model/titanic.h5

Epoch 00010: val_loss improved from 0.63694 to 0.62774, saving model to model/titanic.h5

Epoch 00011: val_loss improved from 0.62774 to 0.61831, saving model to model/titanic.h5

Epoch 00012: 

## 평가 - 베스트 모델

In [26]:
from tensorflow.keras.models import load_model
best_model=load_model(model_path)
best_model.evaluate(X_test, y_test)



[0.4617364704608917, 0.7567567825317383]