In [30]:
import numpy as np
import pandas as pd
import tensorflow as tf

# Load the raw Titanic training data
train = pd.read_csv('./data/titanic/train.csv')

# Drop the columns this first model does not use
train = train.drop(['PassengerId','Name','Ticket','Fare','Cabin'], axis=1)

# Encode Sex as an integer code
sex_mapping = {'male' : 0, 'female' : 1}
train['Sex'] = train['Sex'].map(sex_mapping)

# Family: merge SibSp and Parch into a single count, then drop the two source columns
train['Family'] = train['SibSp'] + train['Parch']
train = train.drop(['SibSp','Parch'], axis=1)

# Embarked missing values: fill with 'S', the same fill value the later
# preprocessing cells of this notebook use (this cell previously used 'Q')
train['Embarked'] = train['Embarked'].fillna('S')

# Age missing values: fill with the column mean
train['Age'] = train['Age'].fillna(train['Age'].mean())

# Encode Embarked as an integer code
Embarked_mapping = {'S' : 0, 'C' : 1, 'Q' : 2}
train['Embarked'] = train['Embarked'].map(Embarked_mapping)

# Age binning (numerical value => categorical code).
# The rules run one after another on the same column; this is safe only because
# every code written (0..3) is below 8, so an already-binned row never matches a later rule.
train.loc[train['Age'] < 8, 'Age'] = 0
train.loc[(train['Age'] >= 8) & (train['Age'] < 20), 'Age'] = 1
train.loc[(train['Age'] >= 20) & (train['Age'] < 65), 'Age'] = 2
train.loc[train['Age'] >= 65, 'Age'] = 3
display(train)

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked,Family
0,0,3,0,2.0,0,1
1,1,1,1,2.0,1,1
2,1,3,1,2.0,0,0
3,1,1,1,2.0,0,1
4,0,3,0,2.0,0,0
...,...,...,...,...,...,...
886,0,2,0,2.0,0,0
887,1,1,1,1.0,0,0
888,0,3,1,2.0,0,3
889,1,1,0,2.0,1,0


In [31]:
# Sequential (unshuffled) split: the first 70% of the rows are used for training,
# the remaining rows for evaluation.
split_at = int(train.shape[0] * 0.7)
train_data, test_data = train.iloc[:split_at], train.iloc[split_at:]

# Features are every column except the label; labels become an (n, 1) column vector.
train_x_data = train_data.drop(columns='Survived').values
train_t_data = train_data['Survived'].values.reshape(-1,1)

test_x_data = test_data.drop(columns='Survived').values
test_t_data = test_data['Survived'].values.reshape(-1,1)

In [34]:
# TensorFlow implementation: logistic regression on the 5 preprocessed features

# Placeholders: 5 feature columns per row (Pclass, Sex, Age, Embarked, Family) and 1 label column
X = tf.placeholder(shape=[None,5], dtype=tf.float32)
T = tf.placeholder(shape=[None,1], dtype=tf.float32)

# Weight & bias
W = tf.Variable(tf.random.normal([5,1]), name='weight')
b = tf.Variable(tf.random.normal([1]), name='bias')  # was mislabeled 'weight'

# Hypothesis
logit = tf.matmul(X,W) + b  # linear score fed to the sigmoid
H = tf.sigmoid(logit)

# Loss function: mean sigmoid cross-entropy computed from the logits
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logit, labels=T))

# Training op. It is deliberately NOT called `train`: that name holds the DataFrame
# the split cell slices, and overwriting it makes that cell fail when it is re-run.
train_op = tf.train.GradientDescentOptimizer(learning_rate=1e-1).minimize(loss)

# Session & variable initialization
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Training loop: full-batch gradient descent, progress printed every 3000 steps
for step in range(30000):
    _, W_val, b_val, loss_val = sess.run([train_op,W,b,loss], feed_dict={X : train_x_data, T : train_t_data})
    
    if step % 3000 == 0 :
        print('W : {}, b : {}, loss : {}'.format(W_val, b_val, loss_val))

# Accuracy on the held-out rows
predict = tf.cast(H >= 0.5, dtype=tf.float32)  # True => 1.0, False => 0
correct = tf.equal(predict,T)
accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))
accuracy_val = sess.run(accuracy, feed_dict={X : test_x_data, T : test_t_data})
print('Accuracy : {}'.format(accuracy_val))

W : [[ 1.3046112 ]
 [-0.97477955]
 [ 1.9386021 ]
 [ 0.21370609]
 [ 0.34390917]], b : [0.24029395], loss : 4.852078914642334
W : [[-0.8589562 ]
 [ 2.743469  ]
 [-0.71253204]
 [ 0.33421502]
 [-0.2027796 ]], b : [1.7038312], loss : 0.4511796534061432
W : [[-0.9250197 ]
 [ 2.7675176 ]
 [-0.8712522 ]
 [ 0.3311024 ]
 [-0.23346585]], b : [2.1679614], loss : 0.4502158463001251
W : [[-0.94073147]
 [ 2.774876  ]
 [-0.9090891 ]
 [ 0.33048385]
 [-0.24094374]], b : [2.2778008], loss : 0.45016220211982727
W : [[-0.9445485 ]
 [ 2.77675   ]
 [-0.9182877 ]
 [ 0.3303383 ]
 [-0.24276978]], b : [2.3044589], loss : 0.450158953666687
W : [[-0.94548005]
 [ 2.7772212 ]
 [-0.92053723]
 [ 0.3303059 ]
 [-0.24321729]], b : [2.3109677], loss : 0.4501588046550751
W : [[-0.9457049 ]
 [ 2.777316  ]
 [-0.921076  ]
 [ 0.33029854]
 [-0.243323  ]], b : [2.312537], loss : 0.4501586854457855
W : [[-0.9457474 ]
 [ 2.777335  ]
 [-0.9211838 ]
 [ 0.33029637]
 [-0.24334434]], b : [2.3128457], loss : 0.4501587748527527
W : [[-0.

In [10]:
import numpy as np
import pandas as pd
import tensorflow as tf

# Reload the raw training data for the second experiment
train = pd.read_csv('./data/titanic/train.csv')

# Drop unused columns (Fare is kept this time)
train = train.drop(columns=['PassengerId','Name','Ticket','Cabin'])

# Fare: fill missing values with the column mean
train['Fare'] = train['Fare'].fillna(train['Fare'].mean())

# Sex: encode as an integer code
sex_mapping = {'male' : 0, 'female' : 1}
train['Sex'] = train['Sex'].map(sex_mapping)

# Family: SibSp + Parch as one count; the two source columns are then removed
train['Family'] = train['SibSp'] + train['Parch']
train = train.drop(columns=['SibSp','Parch'])

# Embarked: fill missing values with 'S', then encode as an integer code.
# NOTE(review): 'S' is presumably the most frequent port - confirm with value_counts().
Embarked_mapping = {'S' : 0, 'C' : 1, 'Q' : 2}
train['Embarked'] = train['Embarked'].fillna('S').map(Embarked_mapping)

# Age: fill missing values with the column mean, then bin into codes 0..4
# (numerical value => categorical value). The first matching condition wins,
# so each later condition only sees ages that the earlier ones rejected.
age_filled = train['Age'].fillna(train['Age'].mean()).to_numpy()
train['Age'] = np.select(
    [age_filled < 16, age_filled < 32, age_filled < 48, age_filled < 60, age_filled >= 60],
    [0, 1, 2, 3, 4],
    default=age_filled,
)
display(train)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Family
0,0,3,0,1.0,7.2500,0,1
1,1,1,1,2.0,71.2833,1,1
2,1,3,1,1.0,7.9250,0,0
3,1,1,1,2.0,53.1000,0,1
4,0,3,0,2.0,8.0500,0,0
...,...,...,...,...,...,...,...
886,0,2,0,1.0,13.0000,0,0
887,1,1,1,1.0,30.0000,0,0
888,0,3,1,1.0,23.4500,0,3
889,1,1,0,1.0,30.0000,1,0


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Hold out 30% of the rows for evaluation (fixed random_state for a repeatable split)
features = train.drop(columns='Survived')
labels = train['Survived']
x_data_train, x_data_test, t_data_train, t_data_test = train_test_split(
    features, labels, test_size=0.3, random_state=0
)

# Convert to NumPy arrays; labels become (n, 1) column vectors
x_data_train, x_data_test = x_data_train.values, x_data_test.values
t_data_train = t_data_train.values.reshape(-1,1)
t_data_test = t_data_test.values.reshape(-1,1)

# Min-Max normalization: the scaler is fitted on the training features only
# and then applied unchanged to the held-out features
scaler = MinMaxScaler()
x_data_train_norm = scaler.fit_transform(x_data_train)
x_data_test_norm = scaler.transform(x_data_test)

In [20]:
# TensorFlow implementation: network with two sigmoid hidden layers (200 and 100 units)

# Placeholders: 6 feature columns per row (Pclass, Sex, Age, Fare, Embarked, Family) and 1 label column
X = tf.placeholder(shape=[None,6], dtype=tf.float32)
T = tf.placeholder(shape=[None,1], dtype=tf.float32)

# Hidden layer 1: 6 -> 200
W1 = tf.Variable(tf.random.normal([6,200]), name='weight1')
b1 = tf.Variable(tf.random.normal([200]), name='bias1')
hidden1 = tf.sigmoid(tf.matmul(X,W1) + b1)

# Hidden layer 2: 200 -> 100
W2 = tf.Variable(tf.random.normal([200,100]), name='weight2')
b2 = tf.Variable(tf.random.normal([100]), name='bias2')
hidden2 = tf.sigmoid(tf.matmul(hidden1,W2) + b2)

# Output layer: 100 -> 1
W3 = tf.Variable(tf.random.normal([100,1]), name='weight3')
b3 = tf.Variable(tf.random.normal([1]), name='bias3')

# Hypothesis
logit = tf.matmul(hidden2,W3) + b3
H = tf.sigmoid(logit)

# Loss function: mean sigmoid cross-entropy computed from the logits
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logit, labels=T))

# Training op. It is deliberately NOT called `train`: that name holds the DataFrame
# the split cell reads, and overwriting it makes that cell fail when it is re-run.
train_op = tf.train.GradientDescentOptimizer(learning_rate=1e-1).minimize(loss)

# Session & variable initialization
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Training loop: full-batch gradient descent, loss printed every 3000 steps
for step in range(30000):
    _, loss_val = sess.run([train_op,loss], feed_dict={X : x_data_train_norm, T : t_data_train})
    
    if step % 3000 == 0 :
        print('loss : {}'.format(loss_val))

# Accuracy on the held-out, normalized rows
predict = tf.cast(H >= 0.5, dtype=tf.float32)  # True => 1.0, False => 0
correct = tf.equal(predict,T)
accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))
accuracy_val = sess.run(accuracy, feed_dict={X : x_data_test_norm, T : t_data_test})
print('Accuracy : {}'.format(accuracy_val))

loss : 4.3507161140441895
loss : 0.388933926820755
loss : 0.3754255175590515
loss : 0.36563119292259216
loss : 0.3580297827720642
loss : 0.3513486087322235
loss : 0.34534186124801636
loss : 0.3400896191596985
loss : 0.3357399106025696
loss : 0.33214542269706726
Accuracy : 0.8171641826629639


In [22]:
# Submission: preprocess the test set with the same steps as the training data

# Load the raw test data
test = pd.read_csv('./data/titanic/test.csv')

# Drop unused columns
test = test.drop(columns=['PassengerId','Name','Ticket','Cabin'])

# Fare: fill missing values with the column mean
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

# Sex: encode as an integer code
sex_mapping = {'male' : 0, 'female' : 1}
test['Sex'] = test['Sex'].map(sex_mapping)

# Family: SibSp + Parch as one count; the two source columns are then removed
test['Family'] = test['SibSp'] + test['Parch']
test = test.drop(columns=['SibSp','Parch'])

# Embarked: fill missing values with 'S', then encode as an integer code.
# NOTE(review): 'S' is presumably the most frequent port - confirm with value_counts().
Embarked_mapping = {'S' : 0, 'C' : 1, 'Q' : 2}
test['Embarked'] = test['Embarked'].fillna('S').map(Embarked_mapping)

# Age: fill missing values with the column mean, then bin into codes 0..4
# (numerical value => categorical value). The first matching condition wins,
# so each later condition only sees ages that the earlier ones rejected.
age_filled = test['Age'].fillna(test['Age'].mean()).to_numpy()
test['Age'] = np.select(
    [age_filled < 16, age_filled < 32, age_filled < 48, age_filled < 60, age_filled >= 60],
    [0, 1, 2, 3, 4],
    default=age_filled,
)
display(test)

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Family
0,3,0,2.0,7.8292,2,0
1,3,1,2.0,7.0000,0,1
2,2,0,4.0,9.6875,2,0
3,3,0,1.0,8.6625,0,0
4,3,1,1.0,12.2875,0,2
...,...,...,...,...,...,...
413,3,0,1.0,8.0500,0,0
414,1,1,2.0,108.9000,1,0
415,3,0,2.0,7.2500,0,0
416,3,0,1.0,8.0500,0,0


In [26]:
# Scale the submission features with the scaler that was fitted on the training
# split. Fitting a fresh MinMaxScaler on the test set (as this cell used to do)
# maps the same raw value to a different input than the network was trained on,
# because the per-column min/max of the two sets differ.
test = scaler.transform(test.values)

array([[1.        , 0.        , 0.5       , 0.01528158, 1.        ,
        0.        ],
       [1.        , 1.        , 0.5       , 0.01366309, 0.        ,
        0.1       ],
       [0.5       , 0.        , 1.        , 0.01890874, 1.        ,
        0.        ],
       ...,
       [1.        , 0.        , 0.5       , 0.01415106, 0.        ,
        0.        ],
       [1.        , 0.        , 0.25      , 0.01571255, 0.        ,
        0.        ],
       [1.        , 0.        , 0.25      , 0.0436405 , 0.5       ,
        0.2       ]])

In [31]:
# Predict survival for the submission rows: threshold the sigmoid output at 0.5
# and return integer labels. The op has its own name so that `accuracy`, the
# metric tensor built in the training cell, is not replaced by a tensor of predictions.
survived_op = tf.cast(H >= 0.5, dtype=tf.int32)
result = sess.run(survived_op, feed_dict={X:test})
result = result.ravel()  # flatten the (n, 1) predictions to shape (n,)

[0 0 0 0 0 1 1 0 0 0 0 0 1 0 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 0 0 0 0 0 1 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 1 0
 1 0 0 1 0 1 1 0 1 0 0 0 1 1 1 1 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 1 1 0 0 1 1 1 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 1 1 0 1
 0 1 0 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 1 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 1 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 1 0 1 0 0 0 1 1 0 1 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 0 0 0 1 1 0 0 1 0 1 1 0 1 0 0 1 1 0
 0 1 1 0 1 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 1 0 1 0 0
 0 1 1 1 1 1 0 1 0 0 0]


In [33]:
# Build and save the submission file.
# PassengerId is read back from the raw test CSV instead of being generated from a
# hard-coded start value (892): preprocessing dropped the column but did not drop or
# reorder any rows, so the ids line up with `result` row for row. The DataFrame
# constructor raises if the two lengths differ.
passenger_ids = pd.read_csv('./data/titanic/test.csv', usecols=['PassengerId'])['PassengerId'].values
submission = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': result})
display(submission)
submission.to_csv('./data/titanic/submission.csv', index=False)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
