In [None]:
import numpy
import pandas
import datetime

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers.core import Dropout
from keras.layers.normalization import BatchNormalization

from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

In [None]:
############################################################
# SibSp　-> one hot enconding
# One hot encoding SibSp
############################################################
def get_dummies_sibSp(df_all, df, df_test) :

    categories = set(df_all['SibSp'].unique())
    df['SibSp'] = pandas.Categorical(df['SibSp'], categories=categories)
    df_test['SibSp'] = pandas.Categorical(df_test['SibSp'], categories=categories)

    df = pandas.get_dummies(df, columns=['SibSp'])
    df_test = pandas.get_dummies(df_test, columns=['SibSp'])

    return df, df_test

############################################################
# Parch　-> one hot enconding
# One hot encoding SibSp
############################################################
def get_dummies_parch(df_all, df, df_test) :

    categories = set(df_all['Parch'].unique())
    df['Parch'] = pandas.Categorical(df['Parch'], categories=categories)
    df_test['Parch'] = pandas.Categorical(df_test['Parch'], categories=categories)

    df = pandas.get_dummies(df, columns=['Parch'])
    df_test = pandas.get_dummies(df_test, columns=['Parch'])

    return df, df_test

############################################################
# Ticket　-> one hot enconding
# One hot encoding Ticket
############################################################
def get_dummies_ticket(df_all, df, df_test) :

    ticket_values = df_all['Ticket'].value_counts()
    ticket_values = ticket_values[ticket_values > 1]
    ticket_values = pandas.Series(ticket_values.index, name='Ticket')
    categories = set(ticket_values.tolist())
    df['Ticket'] = pandas.Categorical(df['Ticket'], categories=categories)
    df_test['Ticket'] = pandas.Categorical(df_test['Ticket'], categories=categories)

    df = pandas.get_dummies(df, columns=['Ticket'])
    df_test = pandas.get_dummies(df_test, columns=['Ticket'])

    return df, df_test

############################################################
# Cabin 
## 객실 위치. 2개 이상 있는 value 찾아서 one- hot- encoding
############################################################
def get_dummies_Cabin(df_all, df, df_test) :

    Cabin_values = df_all['Cabin'].value_counts()
    Cabin_values = Cabin_values[Cabin_values > 1]
    Cabin_values = pandas.Series(Cabin_values.index, name='Cabin')
    categories = set(Cabin_values.tolist())
    df['Cabin'] = pandas.Categorical(df['Cabin'], categories=categories)
    df_test['Cabin'] = pandas.Categorical(df_test['Cabin'], categories=categories)

    df = pandas.get_dummies(df, columns=['Cabin'])
    df_test = pandas.get_dummies(df_test, columns=['Cabin'])

    return df, df_test


############################################################
# Standardization
############################################################
def standardization(df, df_test) :

    standard = StandardScaler()
    df_std = pandas.DataFrame(standard.fit_transform(df[['Pclass', 'Fare']].values), columns=['Pclass', 'Fare'])
    df.loc[:,'Pclass'] = df_std['Pclass']
    df.loc[:,'Fare'] = df_std['Fare']

    df_test_std = pandas.DataFrame(standard.transform(df_test[['Pclass', 'Fare']].values), columns=['Pclass', 'Fare'])
    df_test.loc[:,'Pclass'] = df_test_std['Pclass']
    df_test.loc[:,'Fare'] = df_test_std['Fare']

    return df, df_test

############################################################
# prepare Data
############################################################
def prepareData() :

    ##############################
    # Data preprocessing
    # Extract necessary items
    ##############################
    # Load gender_submission.csv
    df = pandas.read_csv('train.csv')
    df_test = pandas.read_csv('test.csv')

    df_all = pandas.concat([df, df_test], sort=False)

    df_test_index = df_test[['PassengerId']]

    df = df[['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Fare', 'Age', 'Name']]
    df_test = df_test[['Pclass', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Fare', 'Age', 'Name']]

    ##############################
    # Data preprocessing
    # Fill or remove missing values
    ##############################
    df = df[df['Fare'] != 5].reset_index(drop=True)
    df = df[df['Fare'] != 0].reset_index(drop=True)

    ##############################
    # Data preprocessing
    # Digitize labels
    ##############################
    # Gender
    ##############################
    encoder_sex = LabelEncoder()
    df['Sex'] = encoder_sex.fit_transform(df['Sex'].values)
    df_test['Sex'] = encoder_sex.transform(df_test['Sex'].values)

    ##############################
    # Data preprocessing
    # One-Hot Encoding
    ##############################
    ##############################
    # SibSp
    ##############################
    df, df_test = get_dummies_sibSp(df_all, df, df_test)

    ##############################
    # Parch
    ##############################
    df, df_test = get_dummies_parch(df_all, df, df_test)

    ##############################
    # Ticket
    ##############################
    df, df_test = get_dummies_ticket(df_all, df, df_test)

    ##############################
    # Cabin
    ##############################
    df, df_test = get_dummies_Cabin(df_all, df, df_test)
    
    ##############################
    df, df_test = standardization(df, df_test)

    ##############################
    # Data preprocessing
    # Fill or remove missing values
    ##############################
    df.fillna({'Fare':0}, inplace=True)
    df_test.fillna({'Fare':0}, inplace=True)
    df.fillna({'Cabin':0}, inplace=True)
    df_test.fillna({'Cabin':0}, inplace=True) 
    ##############################
    # Age
    ## class 별로 Fare 를 가지고 나이를 추정한 다음, 추정 그룹의 평균으로 결측치를 채움 
    ##############################
    chlid_Age_mean = df[(df["Age"] < 16)]["Age"].mean()
    class3_Age_mean = df[(df["Pclass"]==3)]["Age"].mean()
    class2_Age_mean = df[(df["Pclass"]==2)]["Age"].mean()
    class1_Age_mean = df[(df["Pclass"]==1)]["Age"].mean()

    df.loc[df["Age"].isna() & (df["Pclass"]==3) & (df["Fare"] > 10)]["Age"] = class3_Age_mean
    df.loc[df["Age"].isna() & (df["Pclass"]==3) & (df["Fare"] < 10)]["Age"] = chlid_Age_mean
    df.loc[df["Age"].isna() & (df["Pclass"]==2) & (df["Fare"] == 0)]["Age"] = chlid_Age_mean
    df.loc[df["Age"].isna() & (df["Pclass"]==2) & (df["Fare"] != 0)]["Age"] = class2_Age_mean
    df.loc[df["Age"].isna() & (df["Pclass"]==1) & (df["Fare"] == 0)]["Age"] = chlid_Age_mean
    df.loc[df["Age"].isna() & (df["Pclass"]==1) & (df["Fare"] != 0)]["Age"] = class1_Age_mean
    
    df["Age_Category"] = df["Age"].apply(lambda x: 1 if (x >= 16) & (x < 32)\
                                        else 2 if (x >= 32) & (x < 48)\
                                        else 3 if (x >= 48) & (x < 64)\
                                        else 4 if (x >= 64)\
                                        else 0 if (x < 16)\
                                        else -1)
    
    df["Age"] = df["Age_Category"]
    df.drop(["Age_Category"], axis=1)
    
    chlid_Age_mean = df_test[(df_test["Age"] < 16)]["Age"].mean()
    class3_Age_mean = df_test[(df_test["Pclass"]==3)]["Age"].mean()
    class2_Age_mean = df_test[(df_test["Pclass"]==2)]["Age"].mean()
    class1_Age_mean = df_test[(df_test["Pclass"]==1)]["Age"].mean()

    df_test.loc[df_test["Age"].isna() & (df_test["Pclass"]==3) & (df_test["Fare"] > 10)]["Age"] = class3_Age_mean
    df_test.loc[df_test["Age"].isna() & (df_test["Pclass"]==3) & (df_test["Fare"] < 10)]["Age"] = chlid_Age_mean
    df_test.loc[df_test["Age"].isna() & (df_test["Pclass"]==2) & (df_test["Fare"] == 0)]["Age"] = chlid_Age_mean
    df_test.loc[df_test["Age"].isna() & (df_test["Pclass"]==2) & (df_test["Fare"] != 0)]["Age"] = class2_Age_mean
    df_test.loc[df_test["Age"].isna() & (df_test["Pclass"]==1) & (df_test["Fare"] == 0)]["Age"] = chlid_Age_mean
    df_test.loc[df_test["Age"].isna() & (df_test["Pclass"]==1) & (df_test["Fare"] != 0)]["Age"] = class1_Age_mean
    
    df_test["Age_Category"] = df_test["Age"].apply(lambda x: 1 if (x >= 16) & (x < 32)\
                                        else 2 if (x >= 32) & (x < 48)\
                                        else 3 if (x >= 48) & (x < 64)\
                                        else 4 if (x >= 64)\
                                        else 0 if (x < 16)\
                                        else -1)
    
    df_test["Age"] = df_test["Age_Category"]
    df_test.drop(["Age_Category"], axis=1)
    
    ##############################
    # Title
    ## 예) Mr, Mrs 등의 용어가 들어간 이름은 귀족 계급이라 추정, 해당 용어가 들어간 귀족들의 이름 대상자들의
    ## 생존율이 높았음을 확인, 따라서 Name 변수에서 귀족들의 특성을 뽑은 Title 변수를 추가함. 
    ##############################
    df['Title'] = df.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

    df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don',\
                                             'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],\
                                             'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    
    _, df['Title'] = numpy.unique(df['Title'], return_inverse=True)
    
    df_test['Title'] = df_test.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

    df_test['Title'] = df_test['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don',\
                                             'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],\
                                             'Rare')
    df_test['Title'] = df_test['Title'].replace('Mlle', 'Miss')
    df_test['Title'] = df_test['Title'].replace('Ms', 'Miss')
    df_test['Title'] = df_test['Title'].replace('Mme', 'Mrs')
    
    _, df_test['Title'] = numpy.unique(df_test['Title'], return_inverse=True)
    
    ##############################
    # Split training data and test data
    ##############################
    x = df.drop(columns='Survived')
    x = x.drop(columns='Name')
    df_test = df_test.drop(columns = "Name")
    y = df[['Survived']]

    return x, y, df_test, df_test_index

##############################
# Model -> 5perceptron
##############################
def create_model_5dim_layer_perceptron(input_dim, \
                                       activation="relu", \
                                       optimizer="adam", \
                                       out_dim=100, \
                                       dropout=0.5):

    model = Sequential()

    # Input - Hidden1
    model.add(Dense(input_dim=input_dim, units=out_dim))
    model.add(BatchNormalization())
    model.add(Activation(activation))
    model.add(Dropout(dropout))

    # Hidden1 - Hidden2
    model.add(Dense(units=out_dim))
    model.add(BatchNormalization())
    model.add(Activation(activation))
    model.add(Dropout(dropout))

    # Hidden2 - Hidden3
    model.add(Dense(units=out_dim))
    model.add(BatchNormalization())
    model.add(Activation(activation))
    model.add(Dropout(dropout))
    
    model.add(Dense(units=out_dim))
    model.add(BatchNormalization())
    model.add(Activation(activation))
    model.add(Dropout(dropout))

    # Hidden3 - Output
    model.add(Dense(units=1))
    model.add(Activation("sigmoid"))

    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    print(f"activation: {activation}, optimizer={optimizer}")
    return model

## GridSearch를 사용하여 최고의 Parameter 찾기

In [None]:
x_train, y_train, x_test, y_test_index = prepareData()

model = KerasClassifier(build_fn=create_model_5dim_layer_perceptron, verbose=1)
param_grid ={
    'input_dim':[len(x_train.columns)],
    # 'activation': ["relu"],
    # 'optimizer' :["adam"],
    # 'out_dim':[300],
    'activation': ["relu", "sigmoid", "tanh"],
    'optimizer' :["adam", 'adagrad'],
    'out_dim':[500,700],
    'dropout':[0.2, 0.5],
    
}
grid= GridSearchCV(estimator=model,
                   param_grid=param_grid,cv=3)


In [12]:
# Training
# fit = model.fit(x_train, y_train, epochs=40, batch_size=16, verbose=2)
fit = grid.fit(x_train, y_train, epochs=40, batch_size=16, verbose=2)
print("="*30)
print(grid.best_score_)
print(grid.best_params_)


[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
Epoch 3/40
 - 0s - loss: 0.6183 - accuracy: 0.7273
Epoch 4/40
 - 0s - loss: 0.5556 - accuracy: 0.7461
Epoch 5/40
 - 0s - loss: 0.5009 - accuracy: 0.7736
Epoch 6/40
 - 0s - loss: 0.4601 - accuracy: 0.8079
Epoch 7/40
 - 0s - loss: 0.4559 - accuracy: 0.8199
Epoch 8/40
 - 0s - loss: 0.4291 - accuracy: 0.8302
Epoch 9/40
 - 0s - loss: 0.4113 - accuracy: 0.8508
Epoch 10/40
 - 0s - loss: 0.4723 - accuracy: 0.8182
Epoch 11/40
 - 0s - loss: 0.4250 - accuracy: 0.8302
Epoch 12/40
 - 0s - loss: 0.3544 - accuracy: 0.8714
Epoch 13/40
 - 0s - loss: 0.3847 - accuracy: 0.8611
Epoch 14/40
 - 0s - loss: 0.3886 - accuracy: 0.8559
Epoch 15/40
 - 0s - loss: 0.3749 - accuracy: 0.8611
Epoch 16/40
 - 0s - loss: 0.3578 - accuracy: 0.8559
Epoch 17/40
 - 0s - loss: 0.3446 - accuracy: 0.8799
Epoch 18/40
 - 0s - loss: 0.3167 - accuracy: 0.8782
Epoch 19/40
 - 0s - loss: 0.3225 - accuracy: 0.8782
Epoch 20/40
 - 0s - loss: 0.2950 - accuracy: 0.8851
Epoch 21/40
 - 0s - l

In [13]:
print(grid.best_score_)
print(grid.best_params_)

0.8285709818204244
{'activation': 'relu', 'dropout': 0.5, 'input_dim': 316, 'optimizer': 'adagrad', 'out_dim': 500}


In [17]:
model = create_model_5dim_layer_perceptron(len(x_train.columns), \
                                           activation=grid.best_params_['activation'], \
                                           optimizer=grid.best_params_['optimizer'], \
                                           out_dim=grid.best_params_['out_dim'], \
                                           dropout=grid.best_params_['dropout'])
fit = model.fit(x_train, y_train, epochs=28, batch_size=16, verbose=2)

activation: relu, optimizer=adagrad
Epoch 1/28
 - 1s - loss: 0.7690 - accuracy: 0.6949
Epoch 2/28
 - 0s - loss: 0.5298 - accuracy: 0.7851
Epoch 3/28
 - 0s - loss: 0.5225 - accuracy: 0.7840
Epoch 4/28
 - 0s - loss: 0.4691 - accuracy: 0.8114
Epoch 5/28
 - 0s - loss: 0.4843 - accuracy: 0.7977
Epoch 6/28
 - 0s - loss: 0.4245 - accuracy: 0.8206
Epoch 7/28
 - 0s - loss: 0.4114 - accuracy: 0.8343
Epoch 8/28
 - 0s - loss: 0.3799 - accuracy: 0.8480
Epoch 9/28
 - 0s - loss: 0.3894 - accuracy: 0.8617
Epoch 10/28
 - 0s - loss: 0.4164 - accuracy: 0.8457
Epoch 11/28
 - 0s - loss: 0.3957 - accuracy: 0.8526
Epoch 12/28
 - 0s - loss: 0.3791 - accuracy: 0.8571
Epoch 13/28
 - 0s - loss: 0.3536 - accuracy: 0.8629
Epoch 14/28
 - 0s - loss: 0.3738 - accuracy: 0.8560
Epoch 15/28
 - 0s - loss: 0.3518 - accuracy: 0.8560
Epoch 16/28
 - 0s - loss: 0.3695 - accuracy: 0.8720
Epoch 17/28
 - 0s - loss: 0.3225 - accuracy: 0.8709
Epoch 18/28
 - 0s - loss: 0.3328 - accuracy: 0.8766
Epoch 19/28
 - 0s - loss: 0.3522 - ac

In [None]:
# Predict
y_test_proba = model.predict(x_test)
# y_test_proba = grid.predict(x_test)
y_test = numpy.round(y_test_proba).astype(int)

# Combine the data frame of PassengerId and the result
df_output = pandas.concat([y_test_index, pandas.DataFrame(y_test, columns=['Survived'])], axis=1)

# Write result.csv to the current directory
df_output.to_csv('result.csv', index=False)

# Training
# model = create_model_5dim_layer_perceptron(grid.best_params_)


In [19]:
loss, acc=model.evaluate(x_test,y_test)
print(loss, acc)

0.1843599399738905 1.0
