In [131]:
import numpy
import pandas as pd
import datetime

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers.core import Dropout
from keras.layers.normalization import BatchNormalization

In [144]:
############################################################
# SibSp -> one-hot encoding
# One-hot encode the SibSp column
############################################################
def get_dummies_sibSp(df_all, df, df_test):
    """One-hot encode the ``SibSp`` column of the train and test frames.

    The category list is taken from ``df_all`` so that both returned frames
    get the same ``SibSp_<value>`` columns, even for values that occur in
    only one of them.

    Parameters
    ----------
    df_all : DataFrame with a ``SibSp`` column; only used to enumerate categories.
    df : training DataFrame with a ``SibSp`` column.
    df_test : test DataFrame with a ``SibSp`` column.

    Returns
    -------
    (df, df_test) : new frames with ``SibSp`` replaced by 0/1 dummy columns.
    The input frames are not modified.
    """
    # A sorted list (not a set) keeps the dummy-column order deterministic;
    # NaN is dropped because a Categorical cannot have a null category.
    categories = sorted(df_all['SibSp'].dropna().unique())

    def encode(frame):
        # assign() builds a new frame, so the caller's data is left untouched.
        as_category = pd.Categorical(frame['SibSp'], categories=categories)
        # dtype is pinned: the get_dummies default differs across pandas versions.
        return pd.get_dummies(frame.assign(SibSp=as_category),
                              columns=['SibSp'], dtype='uint8')

    return encode(df), encode(df_test)

############################################################
# Parch -> one-hot encoding
# One-hot encode the Parch column
############################################################
def get_dummies_parch(df_all, df, df_test):
    """One-hot encode the ``Parch`` column of the train and test frames.

    The category list is taken from ``df_all`` so that both returned frames
    get the same ``Parch_<value>`` columns, even for values that occur in
    only one of them.

    Parameters
    ----------
    df_all : DataFrame with a ``Parch`` column; only used to enumerate categories.
    df : training DataFrame with a ``Parch`` column.
    df_test : test DataFrame with a ``Parch`` column.

    Returns
    -------
    (df, df_test) : new frames with ``Parch`` replaced by 0/1 dummy columns.
    The input frames are not modified.
    """
    # A sorted list (not a set) keeps the dummy-column order deterministic;
    # NaN is dropped because a Categorical cannot have a null category.
    categories = sorted(df_all['Parch'].dropna().unique())

    def encode(frame):
        # assign() builds a new frame, so the caller's data is left untouched.
        as_category = pd.Categorical(frame['Parch'], categories=categories)
        # dtype is pinned: the get_dummies default differs across pandas versions.
        return pd.get_dummies(frame.assign(Parch=as_category),
                              columns=['Parch'], dtype='uint8')

    return encode(df), encode(df_test)

############################################################
# Ticket -> one-hot encoding
# One-hot encode the Ticket column (only tickets shared by more than one passenger)
############################################################
def get_dummies_ticket(df_all, df, df_test):
    """One-hot encode ``Ticket`` for ticket values that occur more than once.

    Only ticket values appearing at least twice in ``df_all`` become
    categories; every other ticket is encoded as an all-zero row.

    Parameters
    ----------
    df_all : DataFrame with a ``Ticket`` column; only used to count ticket values.
    df : training DataFrame with a ``Ticket`` column.
    df_test : test DataFrame with a ``Ticket`` column.

    Returns
    -------
    (df, df_test) : new frames with ``Ticket`` replaced by 0/1 dummy columns.
    The input frames are not modified.
    """
    ticket_counts = df_all['Ticket'].value_counts()
    # Sorted so the dummy-column order does not depend on string hash order,
    # which changes between interpreter sessions.
    categories = sorted(ticket_counts[ticket_counts > 1].index)

    def encode(frame):
        # assign() builds a new frame, so the caller's data is left untouched.
        as_category = pd.Categorical(frame['Ticket'], categories=categories)
        # dtype is pinned: the get_dummies default differs across pandas versions.
        return pd.get_dummies(frame.assign(Ticket=as_category),
                              columns=['Ticket'], dtype='uint8')

    return encode(df), encode(df_test)

############################################################
# Standardization
############################################################
def standardization(df, df_test):
    """Standardize ``Pclass`` and ``Fare`` using statistics of the training frame.

    A ``StandardScaler`` is fitted on the two training columns and the same
    fitted scaler is applied to the test columns.

    Parameters
    ----------
    df : training DataFrame containing ``Pclass`` and ``Fare``.
    df_test : test DataFrame containing ``Pclass`` and ``Fare``.

    Returns
    -------
    (df, df_test) : copies of the inputs with both columns replaced by their
    scaled values. The input frames are not modified.
    """
    scaled_columns = ['Pclass', 'Fare']
    scaler = StandardScaler()

    df = df.copy()
    df_test = df_test.copy()

    train_scaled = scaler.fit_transform(df[scaled_columns].values)
    test_scaled = scaler.transform(df_test[scaled_columns].values)

    # Write the raw arrays back column by column: the values are placed by
    # position, so a frame whose index is not 0..n-1 cannot be filled with
    # NaN through index alignment, and the integer Pclass column is replaced
    # by a float column instead of being upcast in place.
    for position, column in enumerate(scaled_columns):
        df[column] = train_scaled[:, position]
        df_test[column] = test_scaled[:, position]

    return df, df_test

In [153]:


############################################################
# prepare Data
############################################################
def prepareData(train_path='train.csv', test_path='test.csv'):
    """Load the train/test CSVs and build the model-ready feature frames.

    Steps, in order: select the used columns, drop training rows whose fare
    is exactly 5 or 0, label-encode ``Sex``, fill missing ``Age`` with the
    training mean and bucket it into ``Age_Category``, derive an integer
    ``Title`` code from ``Name``, one-hot encode ``SibSp`` and ``Parch``,
    standardize ``Pclass``/``Fare``, fill remaining missing ``Fare`` with 0,
    and finally drop ``Age``, ``Name`` and ``Ticket``.

    Parameters
    ----------
    train_path : path of the training CSV (default ``'train.csv'``).
    test_path : path of the test CSV (default ``'test.csv'``).

    Returns
    -------
    x : training feature DataFrame.
    y : single-column DataFrame ``Survived``.
    df_test : test feature DataFrame with the same columns as ``x``.
    df_test_index : single-column DataFrame ``PassengerId`` of the test rows.
    """
    ##############################
    # Load train.csv / test.csv
    ##############################
    df = pd.read_csv(train_path)
    df_test = pd.read_csv(test_path)

    # Union of both files; only used to enumerate the one-hot categories.
    df_all = pd.concat([df, df_test], sort=False)

    df_test_index = df_test[['PassengerId']]

    feature_columns = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Age', 'Name']
    # .copy() so the column assignments below act on independent frames
    # rather than on selections of the raw CSV frames.
    df = df[['Survived'] + feature_columns].copy()
    df_test = df_test[feature_columns].copy()

    ##############################
    # Remove training rows whose fare is exactly 5 or 0
    ##############################
    df = df[~df['Fare'].isin([5, 0])].reset_index(drop=True)

    ##############################
    # Gender -> integer label (encoder fitted on the training data)
    ##############################
    encoder_sex = LabelEncoder()
    df['Sex'] = encoder_sex.fit_transform(df['Sex'].values)
    df_test['Sex'] = encoder_sex.transform(df_test['Sex'].values)

    ##############################
    # Age -> fill with the training mean, then bucket
    ##############################
    # Fill only the Age column. A frame-wide fillna would also overwrite other
    # missing values (e.g. a missing Fare) with the mean age.
    age_mean = df['Age'].mean()
    df['Age'] = df['Age'].fillna(age_mean)
    df_test['Age'] = df_test['Age'].fillna(age_mean)

    df['Age_Category'] = _age_category(df['Age'])
    df_test['Age_Category'] = _age_category(df_test['Age'])

    ##############################
    # Title -> integer code
    ##############################
    title_train = _extract_title(df['Name'])
    title_test = _extract_title(df_test['Name'])

    # Codes index into the sorted training titles, so a given title maps to
    # the same integer in both frames; a title unseen in training (or a name
    # without one) becomes -1.
    title_levels = sorted(title_train.dropna().unique())
    df['Title'] = pd.Categorical(title_train, categories=title_levels).codes.astype('int64')
    df_test['Title'] = pd.Categorical(title_test, categories=title_levels).codes.astype('int64')

    ##############################
    # One-hot encoding: SibSp, Parch
    ##############################
    df, df_test = get_dummies_sibSp(df_all, df, df_test)
    df, df_test = get_dummies_parch(df_all, df, df_test)
    # Ticket is not one-hot encoded here (get_dummies_ticket is not applied);
    # the raw Ticket column is dropped below.

    ##############################
    # Standardize Pclass / Fare
    ##############################
    df, df_test = standardization(df, df_test)

    ##############################
    # Fill missing Fare values that are still present after scaling with 0
    ##############################
    df = df.fillna({'Fare': 0})
    df_test = df_test.fillna({'Fare': 0})

    ##############################
    # Split into features / target
    ##############################
    # The same raw columns are removed from both frames so that the model
    # sees identical feature sets for training and prediction.
    raw_columns = ['Age', 'Name', 'Ticket']
    x = df.drop(columns=['Survived'] + raw_columns)
    y = df[['Survived']]
    df_test = df_test.drop(columns=raw_columns)

    assert list(x.columns) == list(df_test.columns), \
        "train and test feature columns differ"

    return x, y, df_test, df_test_index


def _age_category(age):
    """Bucket ages: 0 (<16), 1 (16-31), 2 (32-47), 3 (48-63), 4 (>=64), -1 if missing."""
    edges = [float('-inf'), 16, 32, 48, 64, float('inf')]
    # right=False gives half-open [low, high) bins, i.e. 16 falls in bucket 1.
    codes = pd.cut(age, bins=edges, right=False, labels=False)
    return codes.fillna(-1).astype('int64')


def _extract_title(names):
    """Extract the word preceding the first '.' in each name and merge title variants."""
    title = names.str.extract(r' ([A-Za-z]+)\.', expand=False)
    title = title.replace(['Lady', 'Countess', 'Capt', 'Col', 'Don',
                           'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
                          'Rare')
    return title.replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})



In [154]:
# Smoke-run the preprocessing pipeline. The returned tuple
# (x, y, df_test, df_test_index) is echoed as this cell's output.
prepareData()

(       Pclass  Sex      Fare  Age_Category  Title  SibSp_0  SibSp_1  SibSp_2  \
 0    0.818989    1 -0.511484             1      2        0        1        0   
 1   -1.577718    0  0.771029             2      3        0        1        0   
 2    0.818989    0 -0.497964             1      1        1        0        0   
 3   -1.577718    0  0.406838             2      3        0        1        0   
 4    0.818989    1 -0.495461             2      2        1        0        0   
 ..        ...  ...       ...           ...    ...      ...      ...      ...   
 870 -0.379365    1 -0.396318             1      4        1        0        0   
 871 -1.577718    0 -0.055828             1      1        1        0        0   
 872  0.818989    0 -0.187017             1      1        0        1        0   
 873 -1.577718    1 -0.055828             1      2        1        0        0   
 874  0.818989    1 -0.501470             2      2        1        0        0   
 
      SibSp_3  SibSp_4  Si

In [155]:
##############################
# Model -> multilayer perceptron (three hidden layers + sigmoid output)
##############################
def create_model_5dim_layer_perceptron(input_dim,
                                       activation="relu",
                                       optimizer="adam",
                                       out_dim=100,
                                       dropout=0.5):
    """Build and compile a fully connected binary classifier.

    The network is three identical hidden stacks
    (Dense -> BatchNormalization -> Activation -> Dropout) followed by a
    single-unit Dense layer with a sigmoid activation.

    Parameters
    ----------
    input_dim : number of input features, passed to the first Dense layer.
    activation : activation used in every hidden stack.
    optimizer : optimizer handed to ``compile``.
    out_dim : number of units in each hidden Dense layer.
    dropout : rate handed to every Dropout layer.

    Returns
    -------
    The model, compiled with binary cross-entropy loss and the accuracy metric.
    """
    hidden_stack_count = 3
    model = Sequential()

    for depth in range(hidden_stack_count):
        # Only the first Dense layer declares the input size.
        if depth == 0:
            model.add(Dense(input_dim=input_dim, units=out_dim))
        else:
            model.add(Dense(units=out_dim))
        model.add(BatchNormalization())
        model.add(Activation(activation))
        model.add(Dropout(dropout))

    # Output head: one sigmoid unit.
    model.add(Dense(units=1))
    model.add(Activation("sigmoid"))

    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [156]:
# Build the train/test feature frames and the PassengerId frame.
x_train, y_train, x_test, y_test_index = prepareData()

# One input per feature column; 702 units in each hidden layer.
n_features = x_train.shape[1]
model = create_model_5dim_layer_perceptron(
    n_features,
    activation="relu",
    optimizer="adam",
    out_dim=702,
    dropout=0.5,
)
model.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_49 (Dense)             (None, 702)               14742     
_________________________________________________________________
batch_normalization_37 (Batc (None, 702)               2808      
_________________________________________________________________
activation_49 (Activation)   (None, 702)               0         
_________________________________________________________________
dropout_37 (Dropout)         (None, 702)               0         
_________________________________________________________________
dense_50 (Dense)             (None, 702)               493506    
_________________________________________________________________
batch_normalization_38 (Batc (None, 702)               2808      
_________________________________________________________________
activation_50 (Activation)   (None, 702)             

In [157]:
# Training
fit = model.fit(x_train, y_train, epochs=25, batch_size=16, verbose=2)

# Predict: round the sigmoid outputs to 0/1 class labels
y_test_proba = model.predict(x_test)
y_test = numpy.round(y_test_proba).astype(int)

# Attach the predictions to PassengerId. assign() places the values by
# position, so the pairing does not depend on index alignment.
df_output = y_test_index.assign(Survived=y_test.ravel())

# Write result.csv to the current directory
df_output.to_csv('result.csv', index=False)

Epoch 1/25
 - 1s - loss: 0.7022 - accuracy: 0.7006
Epoch 2/25
 - 1s - loss: 0.6269 - accuracy: 0.7520
Epoch 3/25
 - 1s - loss: 0.5734 - accuracy: 0.7726
Epoch 4/25
 - 1s - loss: 0.5706 - accuracy: 0.7623
Epoch 5/25
 - 1s - loss: 0.5186 - accuracy: 0.7783
Epoch 6/25
 - 1s - loss: 0.5070 - accuracy: 0.7897
Epoch 7/25
 - 1s - loss: 0.5050 - accuracy: 0.7931
Epoch 8/25
 - 2s - loss: 0.4787 - accuracy: 0.7989
Epoch 9/25
 - 1s - loss: 0.4865 - accuracy: 0.7977
Epoch 10/25
 - 1s - loss: 0.4512 - accuracy: 0.8091
Epoch 11/25
 - 1s - loss: 0.4688 - accuracy: 0.8000
Epoch 12/25
 - 1s - loss: 0.4343 - accuracy: 0.8103
Epoch 13/25
 - 2s - loss: 0.4378 - accuracy: 0.8171
Epoch 14/25
 - 1s - loss: 0.4853 - accuracy: 0.8103
Epoch 15/25
 - 1s - loss: 0.4600 - accuracy: 0.8183
Epoch 16/25
 - 2s - loss: 0.4423 - accuracy: 0.8023
Epoch 17/25
 - 1s - loss: 0.4353 - accuracy: 0.8171
Epoch 18/25
 - 1s - loss: 0.4306 - accuracy: 0.8126
Epoch 19/25
 - 1s - loss: 0.4322 - accuracy: 0.8080
Epoch 20/25
 - 2s - l