In [1]:
import pandas as pd
import seaborn as sns
import re
import numpy as np

In [2]:
# Load the training and test CSVs from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
def get_title(name):
    """Extract the title token from a passenger name with a regular expression.

    Args:
        name: Full passenger name, e.g. "Braund, Mr. Owen Harris".

    Returns:
        The matched text including its leading space and trailing period
        (e.g. " Mr."), or NaN when the name contains no such token.
    """
    # Raw string so that `\.` is a regex escape, not an invalid string escape.
    match = re.search(r' ([A-Za-z]+)\.', name)
    if match is None:
        return np.nan
    # Return the whole match (space and period included), not just the
    # captured letters: the title lookup tables are keyed in that form.
    return match.group()

In [4]:
def guess_age(cols):
    """Return a passenger's age, filling a missing one with a per-title mean.

    Args:
        cols: Two-element row whose first item is the age and whose second
            item is the numeric title code. Accepts a pandas Series (as
            passed by ``DataFrame.apply(..., axis=1)``) or a plain sequence.

    Returns:
        The original age when it is present. Otherwise the mean age
        hard-coded for the title code, or NaN when the code has no entry.
    """
    # Read the two fields by position explicitly; integer keys on a
    # label-indexed Series are a deprecated form of positional access.
    if hasattr(cols, 'iloc'):
        age, title = cols.iloc[0], cols.iloc[1]
    else:
        age, title = cols[0], cols[1]

    if not pd.isnull(age):
        return age

    # Mean age per title code, used to fill in missing ages.
    mean_age_by_title = {
        1: 32.376543,
        3: 35.898148,
        4: 21.773973,
        5: 4.574167,
        6: 53.250000,
        7: 42.000000,
        10: 45.222222,
    }
    # Unknown codes yield NaN explicitly instead of an implicit None.
    return mean_age_by_title.get(title, float('nan'))

In [5]:
# Lookup tables that turn each categorical feature into a number.
sex_map = {"male":0,"female":0.5}
embarked_map = {'S':0,'Q':0.5,'C':1}
boolean_map = {False:0,True:1}
# Coarse title codes; used only to pick the mean age in guess_age.
# NOTE(review): ' Dona.' is present in special_title_map but absent here, so
# such rows receive the fallback code 5 below - confirm this is intended.
title_map = {' Mr.':1, ' Mrs.':3,' Miss.':4, ' Master.':5, ' Don.':10, ' Rev.':10, ' Dr.':7, ' Mme.':1, ' Ms.':1,
 ' Major.':6, ' Lady.':1, ' Sir.':1, ' Mlle.':1, ' Col.':6, ' Capt.':10, ' Countess.':1,' Jonkheer.':10}

# Fine-grained title codes; this is the title feature the model trains on.
special_title_map = {' Don.':1, ' Rev.':2.0, ' Major.':3, ' Col.':4,' Capt.':5.0, ' Mr.':6, ' Mrs.':7, ' Miss.':8, ' Master.':9,  ' Dr.':10,
                     ' Mme.':11, ' Ms.':12,' Lady.':13, ' Sir.':14, ' Mlle.':15, ' Countess.':16,' Jonkheer.':17, ' Dona.':18}

# Fixed divisor applied to each numeric column for scaling (the original note
# describes these as roughly the median of each column).
scale_divisors = {'Age': 30, 'Fare': 100, 'Pclass': 3, 'SibSp': 5, 'Parch': 5}

# Columns that are not used as model inputs and are removed at the end.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Title']


def _preprocess_passengers(frame, fare_fill=None):
    """Return a preprocessed copy of a raw passenger frame (train or test).

    Args:
        frame: Raw frame with at least the columns Sex, Embarked, Cabin,
            Name, Age, Fare, Pclass, SibSp, Parch, PassengerId and Ticket.
        fare_fill: Value used to fill missing Fare entries before scaling;
            None leaves missing fares untouched.

    Returns:
        A new DataFrame with categorical columns encoded, missing ages
        filled, numeric columns scaled, a Specialtitle column added and the
        unused columns dropped. The input frame is not modified.
    """
    out = frame.copy()

    out['Sex'] = out['Sex'].map(sex_map)
    # Missing ports are filled with 0, the code for 'S' (the original note
    # says only two training rows are missing).
    out['Embarked'] = out['Embarked'].map(embarked_map).fillna(0)
    # 1 when a cabin value is present, 0 when it is missing.
    out['Cabin'] = out['Cabin'].notnull().map(boolean_map)

    titles = out['Name'].map(get_title)
    out['Specialtitle'] = titles.map(special_title_map)
    # Titles without a code in title_map fall back to 5.
    out['Title'] = titles.map(title_map).fillna(5)
    # Fill missing ages with the mean age of the passenger's title.
    out['Age'] = out[['Age', 'Title']].apply(guess_age, axis=1)

    if fare_fill is not None:
        out['Fare'] = out['Fare'].fillna(fare_fill)

    for column, divisor in scale_divisors.items():
        out[column] = out[column] / divisor

    return out.drop(columns=unused_columns)


# Mean age and survival rate per raw title, computed before any imputation.
# A feature that binned titles by survival rate was tried earlier, but it
# lowered accuracy and was removed.
print(
    train.assign(Title=train['Name'].map(get_title))[['Title', 'Age', 'Survived']]
    .groupby(['Title'])
    .mean()
)

# Apply the same preprocessing to both sets; only the test set has its
# missing Fare filled (with 8), as before.
train = _preprocess_passengers(train)
test = _preprocess_passengers(test, fare_fill=8)


                  Age  Survived
Title                          
 Capt.      70.000000  0.000000
 Col.       58.000000  0.500000
 Countess.  33.000000  1.000000
 Don.       40.000000  0.000000
 Dr.        42.000000  0.428571
 Jonkheer.  38.000000  0.000000
 Lady.      48.000000  1.000000
 Major.     48.500000  0.500000
 Master.     4.574167  0.575000
 Miss.      21.773973  0.697802
 Mlle.      24.000000  1.000000
 Mme.       24.000000  1.000000
 Mr.        32.368090  0.156673
 Mrs.       35.898148  0.792000
 Ms.        28.000000  1.000000
 Rev.       43.166667  0.000000
 Sir.       49.000000  1.000000


In [6]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score 
from tensorflow.contrib import learn


In [7]:
# Model inputs, in the column order the network is trained on.
x_data = train[[
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
    'Fare', 'Cabin', 'Embarked', 'Specialtitle',
]]
# Target label.
y_data = train['Survived']
# Hold-out split used when validating the model; disabled for the final fit:
# x_train, x_test, y_train, y_test = train_test_split(x_data,y_data,test_size=0.2,random_state=42)
# Train on the full training set.
x_train, y_train = x_data, y_data

In [8]:
# Network with two hidden layers of 20 units each and a dropout rate of 0.1;
# the activation function is ReLU.
feature_columns = learn.infer_real_valued_columns_from_input(x_train)
classifier = learn.DNNClassifier(
    feature_columns=feature_columns,
    hidden_units=[20, 20],
    n_classes=2,
    dropout=0.1,
)
classifier.fit(x_train, y_train, steps=5000)

Instructions for updating:
Please specify feature columns explicitly.
Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please feed input to tf.data to support dask.
Instructions for updating:
Please access pandas data directly.
Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please convert numpy dtypes explicitly.
Instructions for updating:
Please specify feature columns explicitly.
Instructions for updating:
Please switch to tf.contrib.estimator.*_head.
Instructions for updating:
Please replace uses of any Estimator from tf.contrib.learn with an Estimator from tf.estimator.*
Instructions for updating:
When switching to tf.estimator.Estimator, use tf.estimator.RunConfig instead.
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x00000248CD0F65C0>, '_master':

INFO:tensorflow:global_step/sec: 74.0169
INFO:tensorflow:loss = 0.41683474, step = 601 (1.350 sec)
INFO:tensorflow:global_step/sec: 70.2381
INFO:tensorflow:loss = 0.42631176, step = 701 (1.431 sec)
INFO:tensorflow:global_step/sec: 55.7962
INFO:tensorflow:loss = 0.41933635, step = 801 (1.785 sec)
INFO:tensorflow:global_step/sec: 76.2798
INFO:tensorflow:loss = 0.4227364, step = 901 (1.311 sec)
INFO:tensorflow:global_step/sec: 65.8539
INFO:tensorflow:loss = 0.4159038, step = 1001 (1.520 sec)
INFO:tensorflow:global_step/sec: 82.8403
INFO:tensorflow:loss = 0.4121674, step = 1101 (1.205 sec)
INFO:tensorflow:global_step/sec: 79.1991
INFO:tensorflow:loss = 0.4156713, step = 1201 (1.264 sec)
INFO:tensorflow:global_step/sec: 72.1404
INFO:tensorflow:loss = 0.3982657, step = 1301 (1.388 sec)
INFO:tensorflow:global_step/sec: 76.3641
INFO:tensorflow:loss = 0.41236994, step = 1401 (1.308 sec)
INFO:tensorflow:global_step/sec: 77.0677
INFO:tensorflow:loss = 0.40057722, step = 1501 (1.296 sec)
INFO:tens

DNNClassifier(params={'head': <tensorflow.contrib.learn.python.learn.estimators.head._BinaryLogisticHead object at 0x00000248CD0F6FD0>, 'hidden_units': [20, 20], 'feature_columns': (_RealValuedColumn(column_name='', dimension=9, default_value=None, dtype=tf.float64, normalizer=None),), 'optimizer': None, 'activation_fn': <function relu at 0x00000248C6A601E0>, 'dropout': 0.1, 'gradient_clip_norm': None, 'embedding_lr_multipliers': None, 'input_layer_min_slice_size': None})

In [9]:
#predicted = classifier.predict(x_test)
#print(accuracy_score(y_test,list(predicted)))
#The code above was used when checking the validation (hold-out split) accuracy.

In [10]:
# Produce predictions for the preprocessed test set with the trained model.
result = classifier.predict(test)

Instructions for updating:
Please switch to predict_classes, or set `outputs` argument.
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\ljh_9\AppData\Local\Temp\tmpzg4hszin\model.ckpt-5000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [11]:
# Take the ids straight from test.csv instead of assuming they equal the row
# position plus 892 (PassengerId was dropped from `test` during preprocessing).
passenger_ids = pd.read_csv('test.csv', usecols=['PassengerId'])['PassengerId']
predictions = list(result)

# pd.DataFrame raises if the number of ids and predictions differ.
submission = pd.DataFrame({
    "PassengerId" : passenger_ids,
    "Survived" : predictions
})
# Save the predictions in the format expected for a Kaggle submission.
submission.to_csv('submission.csv',index=False)
print(submission)

# Final model: validation accuracy about 0.81, Kaggle score 0.79904
# (roughly rank 1300 out of 10000).

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         0
5            897         0
6            898         1
7            899         0
8            900         1
9            901         0
10           902         0
11           903         0
12           904         1
13           905         0
14           906         1
15           907         1
16           908         0
17           909         0
18           910         0
19           911         0
20           912         0
21           913         1
22           914         1
23           915         0
24           916         1
25           917         0
26           918         1
27           919         0
28           920         0
29           921         0
..           ...       ...
388         1280         0
389         1281         0
390         1282         0
391         1283         1
392         1284         1
3