In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show what is available under ../input; the next cell reads train.csv and test.csv from there.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the train and test splits from the input directory.
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

# Stack train on top of test so feature engineering sees every passenger.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported equivalent and builds the same frame.
# The original row labels are kept on purpose (no ignore_index=True), so the
# labels of test_df's rows repeat labels of train_df's rows: later steps split
# data_df back by row position and may rely on label alignment with
# train_df / test_df.
data_df = pd.concat([train_df, test_df])  # The entire data: train + test.

# --- Title extraction and age imputation --------------------------------------
# Title = the first run of letters in Name that is directly followed by a '.'.
# One vectorised call covers every row; the previous per-name loop repeated this
# exact whole-column extraction once per passenger.  A raw string keeps the
# regex escape `\.` valid.
data_df['Title'] = data_df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Replacing rare titles with more common ones
mapping = {'Mlle': 'Miss', 'Major': 'Mr', 'Col': 'Mr', 'Sir': 'Mr', 'Don': 'Mr', 'Mme': 'Miss',
           'Jonkheer': 'Mr', 'Lady': 'Mrs', 'Capt': 'Mr', 'Countess': 'Mrs', 'Ms': 'Miss', 'Dona': 'Mrs'}
data_df['Title'] = data_df['Title'].replace(mapping)

# Fill each missing Age with the median Age of passengers sharing the same title.
# The medians are computed once, before any filling, and looked up by title
# label.  (Looking them up by integer position only works while the sorted
# group keys happen to match `titles` exactly, and positional fallback on a
# string-labelled Series is deprecated in recent pandas.)
titles = ['Dr', 'Master', 'Miss', 'Mr', 'Mrs', 'Rev']
median_age_by_title = data_df.groupby('Title')['Age'].median()
for title in titles:
    missing_age = data_df['Age'].isnull() & (data_df['Title'] == title)
    data_df.loc[missing_age, 'Age'] = median_age_by_title[title]

# Substituting Age values in TRAIN_DF and TEST_DF.
# data_df is train rows followed by test rows, so split by position at
# len(train_df) and assign raw values (no dependence on row labels).
train_df['Age'] = data_df['Age'].iloc[:len(train_df)].values
test_df['Age'] = data_df['Age'].iloc[len(train_df):].values

# Dropping Title feature (it was only needed for the imputation above).
data_df = data_df.drop(columns='Title')

# Family_Size is the sum of the Parch and SibSp columns (the passenger's own
# row is not added to the count).
data_df['Family_Size'] = data_df['Parch'] + data_df['SibSp']

# Substituting Family_Size values in TRAIN_DF and TEST_DF.
# data_df is train rows followed by test rows, so split by position at
# len(train_df) and assign raw values (no dependence on row labels).
train_df['Family_Size'] = data_df['Family_Size'].iloc[:len(train_df)].values
test_df['Family_Size'] = data_df['Family_Size'].iloc[len(train_df):].values

# --- Family / travelling-group survival feature --------------------------------
# Last_Name = the part of Name before the first comma.
data_df['Last_Name'] = data_df['Name'].str.split(',').str[0]

# Assign the filled column back.  Calling fillna(inplace=True) on the
# `data_df['Fare']` selection is not guaranteed to modify data_df (it has no
# effect under pandas Copy-on-Write), which would leave NaN fares in the
# grouping key below.
data_df['Fare'] = data_df['Fare'].fillna(data_df['Fare'].mean())

DEFAULT_SURVIVAL_VALUE = 0.5
data_df['Family_Survival'] = DEFAULT_SURVIVAL_VALUE


def _companions_outcome(group, passenger_id):
    """Summarise the known survival of everyone in ``group`` except one passenger.

    The passenger is excluded by PassengerId rather than by row label: data_df
    stacks two frames without resetting the index, so row labels repeat and
    ``group.drop(label)`` could also remove a different passenger that shares
    the label.  Assumes PassengerId is unique per row (the code below already
    uses it as the key when writing results back).

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group; must contain 'PassengerId' and 'Survived'.
    passenger_id
        PassengerId of the row to leave out.

    Returns
    -------
    int or None
        1 if the maximum 'Survived' among the other rows is 1, otherwise 0 if
        their minimum is 0, otherwise None (no usable 'Survived' value among
        the other rows).
    """
    others = group.loc[group['PassengerId'] != passenger_id, 'Survived']
    if others.max() == 1.0:
        return 1
    if others.min() == 0.0:
        return 0
    return None


# Pass 1: passengers sharing both Last_Name and Fare are treated as one family.
# Grouping a column subset iterates over a copy, so writing to data_df inside
# the loop cannot disturb the iteration.
family_view = data_df[['Survived', 'PassengerId', 'Last_Name', 'Fare']]
for _, family in family_view.groupby(['Last_Name', 'Fare']):
    if len(family) == 1:
        continue
    # A Family group is found.
    for passenger_id in family['PassengerId']:
        outcome = _companions_outcome(family, passenger_id)
        if outcome is not None:
            data_df.loc[data_df['PassengerId'] == passenger_id, 'Family_Survival'] = outcome

print("Number of passengers with family survival information:",
      int((data_df['Family_Survival'] != DEFAULT_SURVIVAL_VALUE).sum()))

# Pass 2: passengers sharing a Ticket are treated as one travelling group.
# Only rows still at 0 or at the default are revisited, so a 1 found in pass 1
# is never overwritten.
ticket_view = data_df[['Survived', 'PassengerId', 'Ticket', 'Family_Survival']]
for _, party in ticket_view.groupby('Ticket'):
    if len(party) == 1:
        continue
    for passenger_id, current in zip(party['PassengerId'], party['Family_Survival']):
        if current not in (0, DEFAULT_SURVIVAL_VALUE):
            continue
        outcome = _companions_outcome(party, passenger_id)
        if outcome is not None:
            data_df.loc[data_df['PassengerId'] == passenger_id, 'Family_Survival'] = outcome

print("Number of passenger with family/group survival information: "
      + str(int((data_df['Family_Survival'] != DEFAULT_SURVIVAL_VALUE).sum())))

# Family_Survival in TRAIN_DF and TEST_DF.
# data_df is train rows followed by test rows, so split by position at
# len(train_df) and assign raw values (no dependence on row labels).
train_df['Family_Survival'] = data_df['Family_Survival'].iloc[:len(train_df)].values
test_df['Family_Survival'] = data_df['Family_Survival'].iloc[len(train_df):].values

# Fare must have no gaps before it is binned.  Assign the result back: calling
# fillna(inplace=True) on the `data_df['Fare']` selection is not guaranteed to
# write through to data_df.  This is a no-op when Fare is already complete.
data_df['Fare'] = data_df['Fare'].fillna(data_df['Fare'].median())

# Making Bins
# For each (column, number of quantile bins): cut the column into equal-sized
# quantile bins on the combined data, encode the bins as integer codes, copy
# the codes into train_df / test_df and drop the raw column from those frames.
for column, n_bins in (('Fare', 5), ('Age', 4)):
    bin_column = column + 'Bin'          # 'FareBin' / 'AgeBin'
    code_column = bin_column + '_Code'   # 'FareBin_Code' / 'AgeBin_Code'

    data_df[bin_column] = pd.qcut(data_df[column], n_bins)

    label = LabelEncoder()
    data_df[code_column] = label.fit_transform(data_df[bin_column])

    # data_df is train rows followed by test rows, so split by position at
    # len(train_df) and assign raw values (no dependence on row labels).
    train_df[code_column] = data_df[code_column].iloc[:len(train_df)].values
    test_df[code_column] = data_df[code_column].iloc[len(train_df):].values

    # Name the axis via `columns=`: pandas 2.0 removed the positional form
    # `drop(labels, 1)`.
    train_df = train_df.drop(columns=[column])
    test_df = test_df.drop(columns=[column])

# Encode Sex as 0 (male) / 1 (female).  The result is assigned back instead of
# calling replace(inplace=True) on a column selection, which is not guaranteed
# to modify the parent frame.  With map(), any value other than these two
# becomes NaN rather than silently staying a string.
sex_codes = {'male': 0, 'female': 1}
train_df['Sex'] = train_df['Sex'].map(sex_codes)
test_df['Sex'] = test_df['Sex'].map(sex_codes)

# Columns that are not fed to the model.
unused_columns = ['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked']
train_df = train_df.drop(columns=unused_columns)
test_df = test_df.drop(columns=unused_columns)

# Features are every remaining column except the target; the test matrix is
# selected with the same column list so both matrices share one column order.
# (`drop('Survived', 1)` relied on a positional axis that pandas 2.0 removed.)
feature_columns = [col for col in train_df.columns if col != 'Survived']
X = train_df[feature_columns]
y = train_df['Survived']
X_test = test_df[feature_columns].copy()

# Standardise using statistics fitted on the training features only, then apply
# the same transform to the test features.  X and X_test become NumPy arrays.
std_scaler = StandardScaler()
X = std_scaler.fit_transform(X)
X_test = std_scaler.transform(X_test)

In [None]:
# Shape of the scaled training matrix: (rows, features).  The second value is
# the number of inputs the network has to accept.
X.shape

In [None]:
from keras.layers import Dense, Input, Activation
from keras.optimizers import SGD
from keras.models import Model

In [None]:
def build_model(n_features=6, hidden_units=10, learning_rate=0.01):
    """Build and compile a small fully connected network.

    Architecture: input -> Dense(hidden_units) + ReLU -> Dense(hidden_units)
    + ReLU -> Dense(1) with no activation.

    Parameters
    ----------
    n_features : int, default 6
        Number of input features (width of the input layer).
    hidden_units : int, default 10
        Number of units in each of the two hidden layers.
    learning_rate : float, default 0.01
        Learning rate passed to the SGD optimizer.

    Returns
    -------
    keras.models.Model
        Model compiled with SGD, loss 'mse' and metric 'accuracy'.

    Notes
    -----
    NOTE(review): the output layer is linear and trained with MSE against the
    0/1 target, so raw predictions are not confined to [0, 1].  A sigmoid
    output with binary cross-entropy is the more conventional setup; it is
    left unchanged here so results stay comparable.
    """
    input_layer = Input(shape=(n_features,), name='input')

    hidden_layer1 = Dense(units=hidden_units, name="HiddenLayer1")(input_layer)
    hidden_layer1 = Activation('relu', name='ActivationLayer1')(hidden_layer1)

    hidden_layer2 = Dense(units=hidden_units, name="HiddenLayer2")(hidden_layer1)
    hidden_layer2 = Activation('relu', name='ActivationLayer2')(hidden_layer2)

    output_layer = Dense(units=1)(hidden_layer2)

    # Pass the rate positionally: the keyword was `lr` in older Keras and was
    # renamed to `learning_rate` later (recent releases reject `lr`), while the
    # first positional parameter of SGD is the learning rate in both.
    opt = SGD(learning_rate)
    model = Model(input_layer, output_layer)
    model.compile(optimizer=opt, loss='mse', metrics=['accuracy'])
    return model


# Size the input layer from the prepared training matrix instead of assuming
# a fixed feature count.
m = build_model(n_features=X.shape[1])
m.summary()

In [None]:
# Fit the network on the full scaled training set: 100 epochs, mini-batches of 5 rows.
m.fit(
    x=X,
    y=y,
    epochs=100,
    batch_size=5,
)

In [None]:
# from sklearn.model_selection import StratifiedKFold

# folds = list(StratifiedKFold(n_splits=7, shuffle=True, random_state=1).split(X, y))
# skf = StratifiedKFold(n_splits=5)

# for j, (train_idx, val_idx) in enumerate(folds):
    
#     print('\nFold ',j)
#     X_train_cv = X[train_idx]
#     y_train_cv = y[train_idx]
#     X_valid_cv = X[val_idx]
#     y_valid_cv= y[val_idx]
    
#     m = build_model()
#     m.fit(X_train_cv, y_train_cv, batch_size=40, epochs=50, validation_data=(X_valid_cv, y_valid_cv), verbose=0)
#     print("Loss = {}, Accuracy = {}".format(m.evaluate(X_valid_cv, y_valid_cv)[0], m.evaluate(X_valid_cv, y_valid_cv)[1])) 


In [None]:
# from sklearn.model_selection import KFold, cross_val_score

# model = build_model()
# k_fold = KFold(n_splits=5)
# cross_val_score(model, X, y, cv=k_fold, scoring='accuracy', n_jobs=-1)

#==> Error: Cannot clone object '<keras.engine.training.Model object at 0x7fd92e530710>'
# (type <class 'keras.engine.training.Model'>): it does not seem to be a scikit-learn estimator as it does not implement a 'get_params' methods.

In [None]:
# Score the test set, then binarise at 0.5: scores >= 0.5 become 1 and
# scores < 0.5 become 0.  Both masks are taken from the raw scores before
# either is applied.
y_pred = m.predict(X_test)
at_or_above_half = y_pred >= 0.5
below_half = y_pred < 0.5
y_pred[at_or_above_half] = 1
y_pred[below_half] = 0
y_pred = y_pred.astype(int)

# PassengerId was dropped from test_df during preprocessing, so read that one
# column again from the raw file and attach the predictions to it.
temp = pd.read_csv("../input/test.csv", usecols=['PassengerId'])
temp['Survived'] = y_pred
temp.to_csv("../working/submission.csv", index=False)

In [None]:
# ## Use sklearn to grid search
# activation = ['relu', 'tanh', 'sigmoid', "hard_sigmoid", 'linear'] # softmax, softplus, softsign
# momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]
# learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
# dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
# weight_constraint = [1, 2, 3, 4, 5]
# neurons = [1, 5, 10, 15, 20, 25, 30]
# init = ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform']
# optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
# ######################################################################
# ## grid search epochs, batch size
# epochs = [1, 10, 50, 100]
# batch_size = [5, 10, 20, 40, 60, 80, 100, 1000, 2000]

# param_grid = {
#     'epochs' : epochs,
#     'batch_size' : batch_size
# }

# from sklearn.model_selection import GridSearchCV
# model = build_model()
# grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)  # ==>  If no scoring is specified, the estimator passed should have a 'score' method. 
#                                                                         # The estimator <keras.engine.training.Model object at 0x7fd92e5f5160> does not.
# # grid_result = grid.fit(X, y)

In [None]:
# # Explore NN using My own Grid-Search
# from sklearn.metrics import accuracy_score
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# def trainNN(epochs, batch_size):
#     m = build_model()
#     m.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=0)
#     y_pred = m.predict(X_test)
#     y_pred[y_pred >= 0.5] = 1
#     y_pred[y_pred < 0.5] = 0
#     y_pred = y_pred.flatten()
    
#     accuracy = round(accuracy_score(y_test, y_pred), 4)
#     print('epochs: {} / batch_size: {} ---- Accuracy: {}'.format(epochs, batch_size, accuracy))
#     return accuracy

# acc = dict()
# for epochs in [1, 10, 50, 100]:
#     for batch_size in [5, 10, 20, 40, 60, 80, 100, 1000, 2000]:
#         acc[trainNN(epochs, batch_size)] = [epochs, batch_size]

# print("Best params: epochs = {}, batch_size = {} gives accuracy = {}".format(acc[max(acc)][0], acc[max(acc)][1], max(acc)))