In [1]:
import numpy
import pandas
import datetime

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers.core import Dropout
from keras.layers.normalization import BatchNormalization

from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

Using Theano backend.


In [2]:
############################################################
# SibSp　-> one hot enconding
# One hot encoding SibSp
############################################################
def get_dummies_sibSp(df_all, df, df_test) :

    categories = set(df_all['SibSp'].unique())
    df['SibSp'] = pandas.Categorical(df['SibSp'], categories=categories)
    df_test['SibSp'] = pandas.Categorical(df_test['SibSp'], categories=categories)

    df = pandas.get_dummies(df, columns=['SibSp'])
    df_test = pandas.get_dummies(df_test, columns=['SibSp'])

    return df, df_test

############################################################
# Parch　-> one hot enconding
# One hot encoding SibSp
############################################################
def get_dummies_parch(df_all, df, df_test) :

    categories = set(df_all['Parch'].unique())
    df['Parch'] = pandas.Categorical(df['Parch'], categories=categories)
    df_test['Parch'] = pandas.Categorical(df_test['Parch'], categories=categories)

    df = pandas.get_dummies(df, columns=['Parch'])
    df_test = pandas.get_dummies(df_test, columns=['Parch'])

    return df, df_test

############################################################
# Ticket　-> one hot enconding
# One hot encoding Ticket
############################################################
def get_dummies_ticket(df_all, df, df_test) :

    ticket_values = df_all['Ticket'].value_counts()
    ticket_values = ticket_values[ticket_values > 1]
    ticket_values = pandas.Series(ticket_values.index, name='Ticket')
    categories = set(ticket_values.tolist())
    df['Ticket'] = pandas.Categorical(df['Ticket'], categories=categories)
    df_test['Ticket'] = pandas.Categorical(df_test['Ticket'], categories=categories)

    df = pandas.get_dummies(df, columns=['Ticket'])
    df_test = pandas.get_dummies(df_test, columns=['Ticket'])

    return df, df_test

############################################################
# Cabin
## 
############################################################
def get_dummies_Cabin(df_all, df, df_test) :

    Cabin_values = df_all['Cabin'].value_counts()
    Cabin_values = Cabin_values[Cabin_values > 1]
    Cabin_values = pandas.Series(Cabin_values.index, name='Cabin')
    categories = set(Cabin_values.tolist())
    df['Cabin'] = pandas.Categorical(df['Cabin'], categories=categories)
    df_test['Cabin'] = pandas.Categorical(df_test['Cabin'], categories=categories)

    df = pandas.get_dummies(df, columns=['Cabin'])
    df_test = pandas.get_dummies(df_test, columns=['Cabin'])

    return df, df_test

############################################################
# Standardization
############################################################
def standardization(df, df_test) :

    standard = StandardScaler()
    df_std = pandas.DataFrame(standard.fit_transform(df[['Pclass', 'Fare']].values), columns=['Pclass', 'Fare'])
    df.loc[:,'Pclass'] = df_std['Pclass']
    df.loc[:,'Fare'] = df_std['Fare']

    df_test_std = pandas.DataFrame(standard.transform(df_test[['Pclass', 'Fare']].values), columns=['Pclass', 'Fare'])
    df_test.loc[:,'Pclass'] = df_test_std['Pclass']
    df_test.loc[:,'Fare'] = df_test_std['Fare']

    return df, df_test

############################################################
# prepare Data
############################################################
def prepareData() :

    ##############################
    # Data preprocessing
    # Extract necessary items
    ##############################
    # Load gender_submission.csv
    df = pandas.read_csv('train.csv')
    df_test = pandas.read_csv('test.csv')

    df_all = pandas.concat([df, df_test], sort=False)

    df_test_index = df_test[['PassengerId']]

    df = df[['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Fare', 'Age', 'Name']]
    df_test = df_test[['Pclass', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Fare', 'Age', 'Name']]

    ##############################
    # Data preprocessing
    # Fill or remove missing values
    ##############################
    df = df[df['Fare'] != 5].reset_index(drop=True)
    df = df[df['Fare'] != 0].reset_index(drop=True)

    ##############################
    # Data preprocessing
    # Digitize labels
    ##############################
    # Gender
    ##############################
    encoder_sex = LabelEncoder()
    df['Sex'] = encoder_sex.fit_transform(df['Sex'].values)
    df_test['Sex'] = encoder_sex.transform(df_test['Sex'].values)

    ##############################
    # Data preprocessing
    # One-Hot Encoding
    ##############################
    ##############################
    # SibSp
    ##############################
    df, df_test = get_dummies_sibSp(df_all, df, df_test)

    ##############################
    # Parch
    ##############################
    df, df_test = get_dummies_parch(df_all, df, df_test)

    ##############################
    # Ticket
    ##############################
    df, df_test = get_dummies_ticket(df_all, df, df_test)

    ##############################
    # Cabin
    ##############################
    df, df_test = get_dummies_Cabin(df_all, df, df_test)
    
    ##############################
    df, df_test = standardization(df, df_test)

    ##############################
    # Data preprocessing
    # Fill or remove missing values
    ##############################
    df.fillna({'Fare':0}, inplace=True)
    df_test.fillna({'Fare':0}, inplace=True)
    df.fillna({'Cabin':0}, inplace=True)
    df_test.fillna({'Cabin':0}, inplace=True) 
    ##############################
    # Age 
    ## class별로 Fare을 가지고 나이를 추정한 다음, 추정 그룹의 평균으로 결측치를 채움
    ##############################
    chlid_Age_mean = df[(df["Age"] < 16)]["Age"].mean()
    class3_Age_mean = df[(df["Pclass"]==3)]["Age"].mean()
    class2_Age_mean = df[(df["Pclass"]==2)]["Age"].mean()
    class1_Age_mean = df[(df["Pclass"]==1)]["Age"].mean()

    df.loc[df["Age"].isna() & (df["Pclass"]==3) & (df["Fare"] > 10)]["Age"] = class3_Age_mean
    df.loc[df["Age"].isna() & (df["Pclass"]==3) & (df["Fare"] < 10)]["Age"] = chlid_Age_mean
    df.loc[df["Age"].isna() & (df["Pclass"]==2) & (df["Fare"] == 0)]["Age"] = chlid_Age_mean
    df.loc[df["Age"].isna() & (df["Pclass"]==2) & (df["Fare"] != 0)]["Age"] = class2_Age_mean
    df.loc[df["Age"].isna() & (df["Pclass"]==1) & (df["Fare"] == 0)]["Age"] = chlid_Age_mean
    df.loc[df["Age"].isna() & (df["Pclass"]==1) & (df["Fare"] != 0)]["Age"] = class1_Age_mean
    
    df["Age_Category"] = df["Age"].apply(lambda x: 1 if (x >= 16) & (x < 32)\
                                        else 2 if (x >= 32) & (x < 48)\
                                        else 3 if (x >= 48) & (x < 64)\
                                        else 4 if (x >= 64)\
                                        else 0 if (x < 16)\
                                        else -1)
    
    df["Age"] = df["Age_Category"]
    df.drop(["Age_Category"], axis=1)
    
    chlid_Age_mean = df_test[(df_test["Age"] < 16)]["Age"].mean()
    class3_Age_mean = df_test[(df_test["Pclass"]==3)]["Age"].mean()
    class2_Age_mean = df_test[(df_test["Pclass"]==2)]["Age"].mean()
    class1_Age_mean = df_test[(df_test["Pclass"]==1)]["Age"].mean()

    df_test.loc[df_test["Age"].isna() & (df_test["Pclass"]==3) & (df_test["Fare"] > 10)]["Age"] = class3_Age_mean
    df_test.loc[df_test["Age"].isna() & (df_test["Pclass"]==3) & (df_test["Fare"] < 10)]["Age"] = chlid_Age_mean
    df_test.loc[df_test["Age"].isna() & (df_test["Pclass"]==2) & (df_test["Fare"] == 0)]["Age"] = chlid_Age_mean
    df_test.loc[df_test["Age"].isna() & (df_test["Pclass"]==2) & (df_test["Fare"] != 0)]["Age"] = class2_Age_mean
    df_test.loc[df_test["Age"].isna() & (df_test["Pclass"]==1) & (df_test["Fare"] == 0)]["Age"] = chlid_Age_mean
    df_test.loc[df_test["Age"].isna() & (df_test["Pclass"]==1) & (df_test["Fare"] != 0)]["Age"] = class1_Age_mean
    
    df_test["Age_Category"] = df_test["Age"].apply(lambda x: 1 if (x >= 16) & (x < 32)\
                                        else 2 if (x >= 32) & (x < 48)\
                                        else 3 if (x >= 48) & (x < 64)\
                                        else 4 if (x >= 64)\
                                        else 0 if (x < 16)\
                                        else -1)
    
    df_test["Age"] = df_test["Age_Category"]
    df_test.drop(["Age_Category"], axis=1)
    
    ##############################
    # Title
    ##############################
    df['Title'] = df.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

    df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don',\
                                             'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],\
                                             'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    
    _, df['Title'] = numpy.unique(df['Title'], return_inverse=True)
    
    df_test['Title'] = df_test.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

    df_test['Title'] = df_test['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don',\
                                             'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],\
                                             'Rare')
    df_test['Title'] = df_test['Title'].replace('Mlle', 'Miss')
    df_test['Title'] = df_test['Title'].replace('Ms', 'Miss')
    df_test['Title'] = df_test['Title'].replace('Mme', 'Mrs')
    
    _, df_test['Title'] = numpy.unique(df_test['Title'], return_inverse=True)
    
    ##############################
    # Split training data and test data
    ##############################
    x = df.drop(columns='Survived')
    x = x.drop(columns='Name')
    df_test = df_test.drop(columns = "Name")
    y = df[['Survived']]

    return x, y, df_test, df_test_index

##############################
# Model -> 5perceptron
##############################
def create_model_5dim_layer_perceptron(input_dim, \
                                       activation="relu", \
                                       optimizer="adam", \
                                       out_dim=100, \
                                       dropout=0.5):

    model = Sequential()

    # Input - Hidden1
    model.add(Dense(input_dim=input_dim, units=out_dim))
    model.add(BatchNormalization())
    model.add(Activation(activation))
    model.add(Dropout(dropout))

    # Hidden1 - Hidden2
    model.add(Dense(units=out_dim))
    model.add(BatchNormalization())
    model.add(Activation(activation))
    model.add(Dropout(dropout))

    # Hidden2 - Hidden3
    model.add(Dense(units=out_dim))
    model.add(BatchNormalization())
    model.add(Activation(activation))
    model.add(Dropout(dropout))
    
    model.add(Dense(units=out_dim))
    model.add(BatchNormalization())
    model.add(Activation(activation))
    model.add(Dropout(dropout))

    # Hidden3 - Output
    model.add(Dense(units=1))
    model.add(Activation("sigmoid"))

    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    print(f"activation: {activation}, optimizer={optimizer}")
    return model

In [3]:
x_train, y_train, x_test, y_test_index = prepareData()

model = KerasClassifier(build_fn=create_model_5dim_layer_perceptron, verbose=1)
param_grid ={
    'input_dim':[len(x_train.columns)],
    'activation': ["relu", "sigmoid", "tanh"],
    'optimizer' :["adam", 'adagrad'],
    'out_dim':[300,500,700],
    'dropout':[0.2,0.5],
    
}
grid= GridSearchCV(estimator=model,
                   param_grid=param_grid)


In [None]:
# Training
# fit = model.fit(x_train, y_train, epochs=40, batch_size=16, verbose=2)
fit = grid.fit(x_train, y_train, epochs=25, batch_size=16, verbose=2)
print("="*30)
print(grid.best_score_)
print(grid.best_params_)


activation: relu, optimizer=adam
Epoch 1/25
 - 1s - loss: 0.5834 - accuracy: 0.7243
Epoch 2/25
 - 0s - loss: 0.4951 - accuracy: 0.7800
Epoch 3/25
 - 1s - loss: 0.4831 - accuracy: 0.8014
Epoch 4/25
 - 0s - loss: 0.4413 - accuracy: 0.8200
Epoch 5/25
 - 1s - loss: 0.3974 - accuracy: 0.8386
Epoch 6/25
 - 0s - loss: 0.4091 - accuracy: 0.8343
Epoch 7/25
 - 1s - loss: 0.3810 - accuracy: 0.8500
Epoch 8/25
 - 1s - loss: 0.3574 - accuracy: 0.8586
Epoch 9/25
 - 1s - loss: 0.3758 - accuracy: 0.8457
Epoch 10/25
 - 1s - loss: 0.3445 - accuracy: 0.8700
Epoch 11/25
 - 1s - loss: 0.3367 - accuracy: 0.8686
Epoch 12/25
 - 1s - loss: 0.3053 - accuracy: 0.8786
Epoch 13/25
 - 1s - loss: 0.2911 - accuracy: 0.8871
Epoch 14/25
 - 1s - loss: 0.2948 - accuracy: 0.8757
Epoch 15/25
 - 1s - loss: 0.3217 - accuracy: 0.8771
Epoch 16/25
 - 1s - loss: 0.3037 - accuracy: 0.8886
Epoch 17/25
 - 1s - loss: 0.2811 - accuracy: 0.8857
Epoch 18/25
 - 0s - loss: 0.3063 - accuracy: 0.8900
Epoch 19/25
 - 1s - loss: 0.2841 - accur

In [17]:
print(grid.best_score_)
print(grid.best_params_)

0.8502857208251953
{'activation': 'relu', 'dropout': 0.5, 'input_dim': 316, 'optimizer': 'adagrad', 'out_dim': 500}


best_params_로 하니깐 결과가 0.87이 나옴

In [27]:
model_1 = create_model_5dim_layer_perceptron(len(x_train.columns), \
                                           activation=grid.best_params_['activation'], \
                                           optimizer=grid.best_params_['optimizer'], \
                                           out_dim=grid.best_params_['out_dim'], \
                                           dropout=grid.best_params_['dropout'])
fit = model.fit(x_train, y_train, epochs=28, batch_size=16, verbose=2)

activation: relu, optimizer=adagrad
Epoch 1/28
 - 0s - loss: 0.2741 - accuracy: 0.8994
Epoch 2/28
 - 0s - loss: 0.2471 - accuracy: 0.8983
Epoch 3/28
 - 0s - loss: 0.2533 - accuracy: 0.8971
Epoch 4/28
 - 0s - loss: 0.2589 - accuracy: 0.8903
Epoch 5/28
 - 0s - loss: 0.2659 - accuracy: 0.9006
Epoch 6/28
 - 0s - loss: 0.2621 - accuracy: 0.9017
Epoch 7/28
 - 0s - loss: 0.2443 - accuracy: 0.9029
Epoch 8/28
 - 0s - loss: 0.2516 - accuracy: 0.8994
Epoch 9/28
 - 0s - loss: 0.2540 - accuracy: 0.8949
Epoch 10/28
 - 0s - loss: 0.2509 - accuracy: 0.8960
Epoch 11/28
 - 0s - loss: 0.2313 - accuracy: 0.9097
Epoch 12/28
 - 0s - loss: 0.2602 - accuracy: 0.8994
Epoch 13/28
 - 0s - loss: 0.2415 - accuracy: 0.9074
Epoch 14/28
 - 0s - loss: 0.2534 - accuracy: 0.8994
Epoch 15/28
 - 0s - loss: 0.2495 - accuracy: 0.8960
Epoch 16/28
 - 0s - loss: 0.2619 - accuracy: 0.9063
Epoch 17/28
 - 0s - loss: 0.2458 - accuracy: 0.8994
Epoch 18/28
 - 0s - loss: 0.2531 - accuracy: 0.8971
Epoch 19/28
 - 0s - loss: 0.2361 - ac

In [None]:
# Predict
y_test_proba = model_1.predict(x_test)
# y_test_proba = grid.predict(x_test)
y_test = numpy.round(y_test_proba).astype(int)

# Combine the data frame of PassengerId and the result
df_output = pandas.concat([y_test_index, pandas.DataFrame(y_test, columns=['Survived'])], axis=1)

# Write result.csv to the current directory
df_output.to_csv('result.csv', index=False)

# Training
# model = create_model_5dim_layer_perceptron(grid.best_params_)


In [30]:
loss, acc=model.evaluate(x_test,y_test)
print(loss, acc)

1.104421151311774 0.6866028904914856


In [31]:
model_2 = create_model_5dim_layer_perceptron(len(x_train.columns), \
                                           activation=grid.best_params_['activation'], \
                                           optimizer=grid.best_params_['optimizer'], \
                                           out_dim=grid.best_params_['out_dim'], \
                                           dropout=grid.best_params_['dropout'])
fit = model.fit(x_train, y_train, epochs=28, batch_size=16, verbose=2)

activation: relu, optimizer=adagrad
Epoch 1/28
 - 0s - loss: 0.2465 - accuracy: 0.9051
Epoch 2/28
 - 0s - loss: 0.2572 - accuracy: 0.8983
Epoch 3/28
 - 0s - loss: 0.2606 - accuracy: 0.9040
Epoch 4/28
 - 0s - loss: 0.2501 - accuracy: 0.8949
Epoch 5/28
 - 0s - loss: 0.2496 - accuracy: 0.8994
Epoch 6/28
 - 0s - loss: 0.2530 - accuracy: 0.8983
Epoch 7/28
 - 0s - loss: 0.2378 - accuracy: 0.9074
Epoch 8/28
 - 0s - loss: 0.2455 - accuracy: 0.8994
Epoch 9/28
 - 0s - loss: 0.2433 - accuracy: 0.9097
Epoch 10/28
 - 0s - loss: 0.2321 - accuracy: 0.9074
Epoch 11/28
 - 0s - loss: 0.2447 - accuracy: 0.9086
Epoch 12/28
 - 0s - loss: 0.2431 - accuracy: 0.8926
Epoch 13/28
 - 0s - loss: 0.2625 - accuracy: 0.8960
Epoch 14/28
 - 0s - loss: 0.2529 - accuracy: 0.9051
Epoch 15/28
 - 0s - loss: 0.2343 - accuracy: 0.9097
Epoch 16/28
 - 0s - loss: 0.2559 - accuracy: 0.9006
Epoch 17/28
 - 0s - loss: 0.2520 - accuracy: 0.8994
Epoch 18/28
 - 0s - loss: 0.2591 - accuracy: 0.8960
Epoch 19/28
 - 0s - loss: 0.2338 - ac

In [32]:
# Predict
y_test_proba = model_2.predict(x_test)
# y_test_proba = grid.predict(x_test)
y_test = numpy.round(y_test_proba).astype(int)

# Combine the data frame of PassengerId and the result
df_output = pandas.concat([y_test_index, pandas.DataFrame(y_test, columns=['Survived'])], axis=1)

# Write result.csv to the current directory
df_output.to_csv('result2.csv', index=False)

loss, acc=model.evaluate(x_test,y_test)
print(loss, acc)

1.6045313497479452 0.3349282443523407
