In [30]:
import numpy as np
import pandas as pd
import tensorflow as tf

# Category -> integer code tables used for the string-valued columns.
sex_mapping = {'male' : 0, 'female' : 1}
Embarked_mapping = {'S' : 0, 'C' : 1, 'Q' : 2}

# Load the raw data and discard the columns that are not used as features.
train = pd.read_csv('./data/titanic/train.csv')
train = train.drop(columns=['PassengerId','Name','Ticket','Fare','Cabin'])

# Encode Sex, build the Family feature (SibSp + Parch), and fill missing
# values: Embarked falls back to 'Q' before being encoded, Age falls back
# to the column mean.
train = train.assign(
    Sex=train['Sex'].map(sex_mapping),
    Family=train['SibSp'] + train['Parch'],
    Embarked=train['Embarked'].fillna('Q').map(Embarked_mapping),
    Age=train['Age'].fillna(train['Age'].mean()),
)

# SibSp and Parch are now represented by Family.
train = train.drop(columns=['SibSp','Parch'])

# Bin Age (numerical value => categorical code):
#   [.., 8) -> 0, [8, 20) -> 1, [20, 65) -> 2, [65, ..) -> 3
# The codes are kept as floats, matching the column's original dtype.
age_edges = [-np.inf, 8, 20, 65, np.inf]
train['Age'] = pd.cut(train['Age'], bins=age_edges, right=False, labels=False).astype('float64')

display(train)

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked,Family
0,0,3,0,2.0,0,1
1,1,1,1,2.0,1,1
2,1,3,1,2.0,0,0
3,1,1,1,2.0,0,1
4,0,3,0,2.0,0,0
...,...,...,...,...,...,...
886,0,2,0,2.0,0,0
887,1,1,1,1.0,0,0
888,0,3,1,2.0,0,3
889,1,1,0,2.0,1,0


In [31]:
# Positional 70/30 split: the first 70% of the rows are used for training,
# the remaining rows for evaluation. The rows are not shuffled.
split_at = int(train.shape[0] * 0.7)
train_data, test_data = train.iloc[:split_at], train.iloc[split_at:]

# Features are every column except the label; labels become (n, 1) column
# vectors so they line up with the model output.
train_x_data = train_data.drop(columns='Survived').values
train_t_data = train_data['Survived'].values.reshape(-1,1)

test_x_data = test_data.drop(columns='Survived').values
test_t_data = test_data['Survived'].values.reshape(-1,1)

In [34]:
# TensorFlow (1.x graph API) implementation of logistic regression

# Placeholders: one row per passenger, 5 feature columns in X and a single
# 0/1 label column in T. The leading None leaves the batch size open so the
# same graph can be fed both the training and the test arrays.
X = tf.placeholder(shape=[None,5], dtype=tf.float32)
T = tf.placeholder(shape=[None,1], dtype=tf.float32)

# Weight & bias (randomly initialised)
W = tf.Variable(tf.random.normal([5,1]), name='weight')
b = tf.Variable(tf.random.normal([1]), name='bias')  # was also named 'weight'

# Hypothesis: a linear model whose output (the logit) is squashed by a sigmoid
logit = tf.matmul(X,W) + b
H = tf.sigmoid(logit)

# Loss function: mean sigmoid cross-entropy, computed from the raw logits
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logit, labels=T))

# Training op. It is deliberately NOT called `train`: that name holds the
# preprocessed DataFrame, and overwriting it makes the data-splitting cell
# fail when it is re-run after this one.
train_op = tf.train.GradientDescentOptimizer(learning_rate=1e-1).minimize(loss)

# Session & variable initialisation.
# The session is left open so that it can still be used after this cell.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Training: full-batch gradient descent, logging every 3000 steps
for step in range(30000):
    _, W_val, b_val, loss_val = sess.run([train_op,W,b,loss], feed_dict={X : train_x_data, T : train_t_data})

    if step % 3000 == 0 :
        print('W : {}, b : {}, loss : {}'.format(W_val, b_val, loss_val))

# Accuracy on the held-out rows
predict = tf.cast(H >= 0.5, dtype=tf.float32)  # True => 1.0, False => 0
correct = tf.equal(predict,T)
accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))
accuracy_val = sess.run(accuracy, feed_dict={X : test_x_data, T : test_t_data})
print('Accuracy : {}'.format(accuracy_val))

W : [[ 1.3046112 ]
 [-0.97477955]
 [ 1.9386021 ]
 [ 0.21370609]
 [ 0.34390917]], b : [0.24029395], loss : 4.852078914642334
W : [[-0.8589562 ]
 [ 2.743469  ]
 [-0.71253204]
 [ 0.33421502]
 [-0.2027796 ]], b : [1.7038312], loss : 0.4511796534061432
W : [[-0.9250197 ]
 [ 2.7675176 ]
 [-0.8712522 ]
 [ 0.3311024 ]
 [-0.23346585]], b : [2.1679614], loss : 0.4502158463001251
W : [[-0.94073147]
 [ 2.774876  ]
 [-0.9090891 ]
 [ 0.33048385]
 [-0.24094374]], b : [2.2778008], loss : 0.45016220211982727
W : [[-0.9445485 ]
 [ 2.77675   ]
 [-0.9182877 ]
 [ 0.3303383 ]
 [-0.24276978]], b : [2.3044589], loss : 0.450158953666687
W : [[-0.94548005]
 [ 2.7772212 ]
 [-0.92053723]
 [ 0.3303059 ]
 [-0.24321729]], b : [2.3109677], loss : 0.4501588046550751
W : [[-0.9457049 ]
 [ 2.777316  ]
 [-0.921076  ]
 [ 0.33029854]
 [-0.243323  ]], b : [2.312537], loss : 0.4501586854457855
W : [[-0.9457474 ]
 [ 2.777335  ]
 [-0.9211838 ]
 [ 0.33029637]
 [-0.24334434]], b : [2.3128457], loss : 0.4501587748527527
W : [[-0.