## Utility Functions Load

In [None]:
# %load utilities.py
def plot_distribution(data):
    """Plot ``data`` against a fitted normal and print its summary moments.

    Two figures are drawn: a seaborn ``distplot`` with a normal fit overlaid,
    then a scipy probability plot. In between, the mean/std of the fitted
    normal and the sample skewness/kurtosis (``data.skew()`` and
    ``data.kurtosis()``) are printed to four decimals.

    Args:
        data: Object accepted by seaborn/scipy that also provides ``skew()``
            and ``kurtosis()`` methods.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import stats

    normal = stats.norm

    # Figure 1: distribution plot with the normal fit drawn on top.
    plt.figure()
    sns.distplot(data, fit=normal)

    fitted_mean, fitted_std = normal.fit(data)
    report = 'mean = {:.4f}\nstd = {:.4f}\nskewness = {:.4f}\nkurtosis = {:.4f}'
    print(report.format(fitted_mean, fitted_std, data.skew(), data.kurtosis()))

    # Figure 2: probability plot, rendered through pyplot.
    plt.figure()
    stats.probplot(data, plot=plt)

def describe_numerical(data):
    """Print the mean, median, first mode and ``describe()`` table of ``data``.

    Args:
        data: Object providing ``mean()``, ``median()``, ``mode()`` and
            ``describe()``. Only the first value returned by ``mode()`` is
            shown.
    """
    # Each statistic is computed lazily, right before it is printed.
    statistics = (
        ('mean:', data.mean),
        ('median:', data.median),
        ('mode:', lambda: data.mode().values[0]),
    )
    for label, compute in statistics:
        print(label, compute())
    print(data.describe())

def missing_value(data):
    """Print the number of null entries in ``data`` and their share.

    The second line is labelled "percentage" but is the raw fraction
    ``null count / len(data)``, i.e. a value between 0 and 1.
    """
    null_count = data.isnull().sum()
    null_share = null_count / len(data)
    print('missing value number:', null_count)
    print('missing value percentage:', null_share)

def submit(test_X_og, pred_y, filename='submit.csv'):
    """Write a two-column submission CSV of passenger ids and predictions.

    Args:
        test_X_og: Object whose ``.index`` holds the passenger ids, one per
            prediction.
        pred_y: Predicted labels, one per id. Any array-like is accepted; it
            is flattened first, so an ``(n, 1)`` column of predictions works
            as well as a flat sequence.
        filename: Destination path. Defaults to ``'submit.csv'``, the value
            that used to be hard-coded.

    Both columns are cast to ``int32`` and the file is written without the
    DataFrame index.
    """
    import numpy as np
    import pandas as pd

    # Build by column name instead of transposing a list of rows. The local is
    # deliberately not named ``submit`` so it does not shadow this function.
    submission = pd.DataFrame({
        'PassengerId': np.asarray(test_X_og.index),
        'Survived': np.asarray(pred_y).ravel(),
    })
    submission = submission.astype('int32')
    submission.to_csv(filename, index=False)

def gridsearchcv(model, param_grid, train_X, train_Y, dev_X, dev_Y):
    """Tune ``model`` by grid search and print three accuracy figures.

    Steps, in order:
      1. Fit a ``GridSearchCV`` (accuracy scoring, shuffled 5-fold ``KFold``)
         on the training data; print its best parameters and best score.
      2. Run ``cross_val_score`` on the search object itself over the training
         data with a second shuffled 5-fold splitter; print the mean score.
      3. Predict on the dev set with the fitted search; print the accuracy.

    Both splitters are created with ``random_state=None``.

    Returns:
        The fitted ``GridSearchCV`` instance.
    """
    from sklearn.model_selection import (cross_val_score, GridSearchCV, KFold)
    from sklearn.metrics import accuracy_score

    def shuffled_five_fold():
        # A separate splitter object is built for each use.
        return KFold(n_splits=5, shuffle=True, random_state=None)

    search = GridSearchCV(model, param_grid=param_grid, scoring='accuracy',
                          cv=shuffled_five_fold())
    search.fit(train_X, train_Y)
    print('grid search best parameters:', search.best_params_)
    print('grid search best scores: {:.4f}'.format(search.best_score_))

    fold_scores = cross_val_score(search, train_X, train_Y, scoring='accuracy',
                                  cv=shuffled_five_fold())
    print('cv score: {:.4f}'.format(fold_scores.mean()))

    dev_predictions = search.predict(dev_X)
    print('dev score: {:.4f}'.format(accuracy_score(dev_Y, dev_predictions)))

    return search

def plot_result(history):
    """Print final-epoch metrics and plot train/dev loss and accuracy curves.

    Args:
        history: Object exposing a ``history`` mapping with the keys
            ``'loss'``, ``'val_loss'``, ``'binary_accuracy'`` and
            ``'val_binary_accuracy'``, each indexable per epoch (for example
            the ``History`` callback returned by ``create_nn``).
    """
    import matplotlib.pyplot as plt

    log = history.history

    # The last entry of each series is the value after the final epoch run.
    print('train set loss: {:.4f}'.format(log['loss'][-1]))
    print('dev set loss: {:.4f}'.format(log['val_loss'][-1]))
    print('train set accuracy: {:.4f}'.format(log['binary_accuracy'][-1]))
    print('dev set accuracy: {:.4f}'.format(log['val_binary_accuracy'][-1]))

    def _plot_pair(train_key, dev_key, title, ylabel, legend):
        # Draw the train and dev series for one metric and show the figure.
        plt.plot(log[train_key])
        plt.plot(log[dev_key])
        plt.title(title)
        plt.ylabel(ylabel)
        plt.xlabel('Epoch')
        plt.legend(legend, loc='upper right')
        plt.show()

    _plot_pair('loss', 'val_loss', 'Model Loss', 'Loss',
               ['Train Loss', 'Dev Loss'])
    # The accuracy figure's y-axis used to read 'Loss Accuracy', a leftover
    # from the loss plot; it shows accuracy only.
    _plot_pair('binary_accuracy', 'val_binary_accuracy', 'Model Accuracy',
               'Accuracy', ['Train Accuracy', 'Dev Accuracy'])

def create_nn(train_X, train_Y, dev_X, dev_Y, l1, l2, lr, batch_size, epochs):
    """Build, compile and train a small dense binary classifier.

    Architecture: Dense(64, relu) -> Dropout(0.5) -> Dense(32, relu) ->
    Dense(1, sigmoid). Both hidden layers carry L1/L2 penalties of strength
    ``l1``/``l2`` on kernel and bias. The first hidden layer additionally uses
    a seeded He-normal kernel initializer and a zero bias initializer.

    The model is compiled with Adam (``lr``), binary cross-entropy loss and
    binary accuracy, then fitted silently with shuffling, validating on the
    dev set, with a ``History`` callback and early stopping on ``val_loss``
    (patience 5, best weights restored).

    Returns:
        Tuple ``(model, history, early_stop)``: the trained model and the two
        callback objects.
    """
    from keras.callbacks import (History, EarlyStopping)
    from keras.models import Sequential
    from keras.layers import Dense, Dropout
    from keras import losses
    from keras import metrics
    from keras import optimizers
    from keras import initializers
    from keras import regularizers

    def penalty():
        # A fresh regularizer object for every kernel/bias that uses one.
        return regularizers.l1_l2(l1=l1, l2=l2)

    model = Sequential()
    stack = (
        Dense(64, activation='relu',
              kernel_initializer=initializers.he_normal(seed=42),
              bias_initializer=initializers.Zeros(),
              kernel_regularizer=penalty(),
              bias_regularizer=penalty()),
        Dropout(rate=0.5, seed=42),
        Dense(32, activation='relu',
              kernel_regularizer=penalty(),
              bias_regularizer=penalty()),
        Dense(1, activation='sigmoid'),
    )
    for layer in stack:
        model.add(layer)

    # NOTE(review): newer Keras releases may expect ``learning_rate`` instead
    # of ``lr`` here - confirm against the installed version.
    model.compile(optimizer=optimizers.Adam(lr=lr),
                  loss=losses.binary_crossentropy,
                  metrics=[metrics.binary_accuracy])

    history = History()
    early_stop = EarlyStopping(monitor='val_loss', patience=5,
                               restore_best_weights=True)

    fit_options = dict(validation_data=[dev_X, dev_Y], shuffle=True, verbose=0,
                       batch_size=batch_size, epochs=epochs,
                       callbacks=[history, early_stop])
    model.fit(train_X, train_Y, **fit_options)

    return model, history, early_stop


## Data Load

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
# from utilities import *
import matplotlib.pyplot as plt
%matplotlib inline
import re
from sklearn_pandas import DataFrameMapper
from sklearn.model_selection import (train_test_split, cross_val_score, GridSearchCV, KFold)
from sklearn import (preprocessing, feature_extraction, linear_model, svm, neighbors, 
                     gaussian_process, tree, ensemble)
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import pickle
import warnings
warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload

# Read both CSVs with their first column as the index; each frame is copied
# once right after loading.
train_og = pd.read_csv('../input/train.csv', index_col=0).copy()
test_X_og = pd.read_csv('../input/test.csv', index_col=0).copy()

print('train set shape:', train_og.shape)
print('test set shape:', test_X_og.shape)

# The first remaining column of the training frame is the target; everything
# after it is a feature.
target_column = train_og.columns[0]
feature_columns = train_og.columns[1:]
train_X_og = train_og[feature_columns]
train_Y_og = train_og[target_column]

# Stack train and test features so later preprocessing sees both.
X_og = pd.concat([train_X_og, test_X_og])

## Feature Classification

In [None]:
# Map the raw CSV headers onto the descriptive names used in later cells.
column_renames = {
    'Pclass': 'class',
    'Name': 'full name',
    'Sex': 'sex',
    'Age': 'age',
    'SibSp': 'family size 01',
    'Parch': 'family size 02',
    'Ticket': 'ticket',
    'Fare': 'fare',
    'Cabin': 'cabin class number',
    'Embarked': 'embarked',
}
X_og = X_og.rename(columns=column_renames)

# print(X_og.columns)

# Feature groups by type, using the renamed columns.
categorical = [
    'class', 'full name', 'sex', 'ticket', 'cabin class number', 'embarked',
]
numerical = [
    'age', 'fare', 'family size 01', 'family size 02',
]

## Check Missing Values 

In [None]:
# Optional diagnostics (disabled): null counts and null fractions per column.
# print('train_Y missing value:', train_Y_og.isnull().sum())
# print()
# print('X_og missing value number:')
# print((X_og.isnull().sum()).sort_values(ascending=False)[:10])
# print()
# print('X_og missing value percentage:')
# print((X_og.isnull().sum()/len(X_og)).sort_values(ascending=False)[:10])

# Work on a deep copy so X_og keeps the unfilled values.
X_fill = X_og.copy(deep=True)

## 'cabin class number' Feature Missing Values

In [None]:
# missing_value(X_og['cabin class number'])

# Missing cabins become the literal string 'None'.
cabin_filled = X_fill['cabin class number'].fillna(value='None')
X_fill['cabin class number'] = cabin_filled

## 'age' Feature Missing Values 

In [None]:
# missing_value(X_og['age'])

# describe_numerical(X_og['age'])

# Missing ages take the median age of the combined feature frame.
age_median = X_fill['age'].median()
X_fill['age'] = X_fill['age'].fillna(age_median)

## 'embarked' & 'fare' Feature Missing Values

In [None]:
# missing_value(X_og['embarked'])

# 'top' in the describe() summary is the most frequent embarkation value;
# it replaces the missing entries.
embarked_summary = X_fill['embarked'].describe()
X_fill['embarked'] = X_fill['embarked'].fillna(embarked_summary['top'])

# missing_value(X_og['fare'])

# describe_numerical(X_og['fare'])

# Missing fares take the median fare.
fare_median = X_fill['fare'].median()
X_fill['fare'] = X_fill['fare'].fillna(fare_median)

## Check Filled Missing Values

In [None]:
# print('X_fill missing value number:')
# print((X_fill.isnull().sum()).sort_values(ascending=False)[:10])
# print('X_fill missing value percentage:')
# print((X_fill.isnull().sum()/len(X_fill)).sort_values(ascending=False)[:10])

## Numerical Feature Skewness & Kurtosis

In [None]:
# for i in numerical: print(i+' skewness:', X_og[i].skew())

# Replace fare by log(1 + fare).
log_fare = np.log1p(X_fill['fare'])
X_fill['fare'] = log_fare

# print('fare skewness:', X_og['fare'].skew())

## Drop 'ticket' Feature

In [None]:
# The ticket column is not used as a feature.
X_fill = X_fill.drop(columns=['ticket'])

## Add Feature 'cabin class' & Drop 'cabin class number' Feature

In [None]:
# The cabin class is the first character of the cabin string (filled-in
# 'None' values therefore become 'N').
cabin_initials = X_fill['cabin class number'].str[0]
X_fill['cabin class'] = cabin_initials.tolist()

X_fill = X_fill.drop(columns=['cabin class number'])

## Add Feature 'family size' & Drop 'family size 01/02' Features

In [None]:
# Family size = both relative counts plus one for the passenger.
relatives = X_fill['family size 01'].add(X_fill['family size 02'])
X_fill['family size'] = relatives.add(1)

X_fill = X_fill.drop(columns=['family size 01', 'family size 02'])

## Add Feature 'alone'

In [None]:
# 1 when the passenger is the only member of their family group, else 0.
travels_alone = X_fill['family size'] == 1
X_fill['alone'] = travels_alone.astype(int)

## Add Feature 'honorific' & Drop 'full name' Feature

In [None]:
# Take the text after the first ', ' in the name and keep what precedes its
# first '.', i.e. the title between surname and given names.
after_surname = X_fill['full name'].str.split(', ', expand=True)[1]
honorific = after_surname.str.split('.').str[0].tolist()

X_fill['honorific'] = honorific

X_fill = X_fill.drop(columns=['full name'])

## Feature Value Transformation

In [None]:
X_pre_transform = X_fill.copy()

# print(X_pre_transform.columns)

# Column groups, each routed to its own encoder/scaler below.
nominal = ['sex', 'alone', 'honorific']
ordinal = ['class', 'embarked', 'cabin class']
discrete = ['family size']
continuous = ['age', 'fare']

feature_steps = [
    (nominal, preprocessing.OneHotEncoder(sparse=False)),
    (ordinal, preprocessing.OrdinalEncoder()),
    (discrete, preprocessing.LabelEncoder()),
    (continuous, preprocessing.StandardScaler()),
]
# df_out=True makes the mapper return a DataFrame rather than an array.
mapper = DataFrameMapper(feature_steps, df_out=True)

X = mapper.fit_transform(X_pre_transform)

# print('X feature set shape:', X.shape)

## Create Train, Dev, Test Sets

In [None]:
# Recover the test rows of the transformed frame by their original index.
test_X = X.loc[test_X_og.index].values

# Split the labelled rows into train and dev parts with a fixed seed and the
# default split proportions.
train_X, dev_X, train_Y, dev_Y = train_test_split(
    X.loc[train_X_og.index].values,
    train_Y_og.values,
    random_state=42)

## Model 01 - LogisticRegression L1 Regularization

In [None]:
# Model 01: L1-penalised logistic regression; the search varies C and solver.
grid1 = {
    'C': [1, 1.5, 2],
    'solver': ['liblinear', 'saga'],
}

# gridsearchcv prints the scores and returns the fitted search object.
model1 = gridsearchcv(
    linear_model.LogisticRegression(penalty='l1', multi_class='ovr', max_iter=1000),
    grid1, train_X, train_Y, dev_X, dev_Y)

In [None]:
# Round-trip the fitted search through pickle. The context managers make sure
# each handle is flushed and closed before the file is reopened.
with open('model1.sav', 'wb') as model_file:
    pickle.dump(model1, model_file)
with open('model1.sav', 'rb') as model_file:
    model1_load = pickle.load(model_file)

# Predict on the test matrix and write PassengerId/Survived as int32 columns.
y1 = model1_load.predict(test_X)
y1_save = pd.DataFrame(data=[test_X_og.index, y1]).T
y1_save.columns = ['PassengerId', 'Survived']
y1_save = y1_save.astype('int32')
y1_save.to_csv('y1.csv', index=False)

## Model 02 - LogisticRegression L2 Regularization

In [None]:
# Model 02: L2-penalised logistic regression; the search varies C and solver.
grid2 = {
    'C': [1, 1.5, 2],
    'solver': ['newton-cg', 'lbfgs', 'sag'],
}

# gridsearchcv prints the scores and returns the fitted search object.
model2 = gridsearchcv(
    linear_model.LogisticRegression(penalty='l2', max_iter=1000),
    grid2, train_X, train_Y, dev_X, dev_Y)

In [None]:
# Round-trip the fitted search through pickle. The context managers make sure
# each handle is flushed and closed before the file is reopened.
with open('model2.sav', 'wb') as model_file:
    pickle.dump(model2, model_file)
with open('model2.sav', 'rb') as model_file:
    model2_load = pickle.load(model_file)

# Predict on the test matrix and write PassengerId/Survived as int32 columns.
y2 = model2_load.predict(test_X)
y2_save = pd.DataFrame(data=[test_X_og.index, y2]).T
y2_save.columns = ['PassengerId', 'Survived']
y2_save = y2_save.astype('int32')
y2_save.to_csv('y2.csv', index=False)

## Model 03 - SupportVectorMachine 

In [None]:
# Model 03: support vector classifier; the search varies C, kernel and degree.
grid3 = {
    'C': [5, 5.5, 6],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [2, 3],
}

# gridsearchcv prints the scores and returns the fitted search object.
model3 = gridsearchcv(
    svm.SVC(decision_function_shape='ovr', random_state=42),
    grid3, train_X, train_Y, dev_X, dev_Y)

In [None]:
# Round-trip the fitted search through pickle. The context managers make sure
# each handle is flushed and closed before the file is reopened.
with open('model3.sav', 'wb') as model_file:
    pickle.dump(model3, model_file)
with open('model3.sav', 'rb') as model_file:
    model3_load = pickle.load(model_file)

# Predict on the test matrix and write PassengerId/Survived as int32 columns.
y3 = model3_load.predict(test_X)
y3_save = pd.DataFrame(data=[test_X_og.index, y3]).T
y3_save.columns = ['PassengerId', 'Survived']
y3_save = y3_save.astype('int32')
y3_save.to_csv('y3.csv', index=False)

## Model 04 - NearestNeighbors

In [None]:
# Model 04: k-nearest neighbours; the search varies neighbour count,
# weighting, search algorithm, leaf size and the Minkowski power p.
grid4 = {
    'n_neighbors': [3, 4, 5, 6, 7, 8, 9],
    'weights': ['uniform', 'distance'],
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [20, 30, 40, 50, 60],
    'p': [1, 2],
}

# gridsearchcv prints the scores and returns the fitted search object.
model4 = gridsearchcv(
    neighbors.KNeighborsClassifier(),
    grid4, train_X, train_Y, dev_X, dev_Y)

In [None]:
# Round-trip the fitted search through pickle. The context managers make sure
# each handle is flushed and closed before the file is reopened.
with open('model4.sav', 'wb') as model_file:
    pickle.dump(model4, model_file)
with open('model4.sav', 'rb') as model_file:
    model4_load = pickle.load(model_file)

# Predict on the test matrix and write PassengerId/Survived as int32 columns.
y4 = model4_load.predict(test_X)
y4_save = pd.DataFrame(data=[test_X_og.index, y4]).T
y4_save.columns = ['PassengerId', 'Survived']
y4_save = y4_save.astype('int32')
y4_save.to_csv('y4.csv', index=False)

## Model 05 - DecisionTree

In [None]:
# Model 05: decision tree; the search varies split criterion, splitter,
# depth and the minimum sample counts for splits and leaves.
grid5 = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [2, 3, 4, 5, 6, 7, 8, 9],
    'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9],
}

# gridsearchcv prints the scores and returns the fitted search object.
model5 = gridsearchcv(
    tree.DecisionTreeClassifier(random_state=42),
    grid5, train_X, train_Y, dev_X, dev_Y)

In [None]:
# Round-trip the fitted search through pickle. The context managers make sure
# each handle is flushed and closed before the file is reopened.
with open('model5.sav', 'wb') as model_file:
    pickle.dump(model5, model_file)
with open('model5.sav', 'rb') as model_file:
    model5_load = pickle.load(model_file)

# Predict on the test matrix and write PassengerId/Survived as int32 columns.
y5 = model5_load.predict(test_X)
y5_save = pd.DataFrame(data=[test_X_og.index, y5]).T
y5_save.columns = ['PassengerId', 'Survived']
y5_save = y5_save.astype('int32')
y5_save.to_csv('y5.csv', index=False)

## Model 06 - RandomForestClassifier

In [None]:
# Model 06: random forest; the search varies forest size, split criterion,
# depth and the minimum sample counts for splits and leaves.
grid6 = {
    'n_estimators': [10, 20, 30, 40, 50],
    'criterion': ['gini', 'entropy'],
    'max_depth': [6, 7, 8],
    'min_samples_split': [2, 3, 4, 5],
    'min_samples_leaf': [1, 2, 3, 4],
}

# gridsearchcv prints the scores and returns the fitted search object.
model6 = gridsearchcv(
    ensemble.RandomForestClassifier(random_state=42),
    grid6, train_X, train_Y, dev_X, dev_Y)

In [None]:
# Round-trip the fitted search through pickle. The context managers make sure
# each handle is flushed and closed before the file is reopened.
with open('model6.sav', 'wb') as model_file:
    pickle.dump(model6, model_file)
with open('model6.sav', 'rb') as model_file:
    model6_load = pickle.load(model_file)

# Predict on the test matrix and write PassengerId/Survived as int32 columns.
y6 = model6_load.predict(test_X)
y6_save = pd.DataFrame(data=[test_X_og.index, y6]).T
y6_save.columns = ['PassengerId', 'Survived']
y6_save = y6_save.astype('int32')
y6_save.to_csv('y6.csv', index=False)

## Model 07 - AdaBoostClassifier

In [None]:
# Model 07: AdaBoost on top of the random forest tuned as model 06.
# model6 is the fitted grid search returned by gridsearchcv. Its `.estimator`
# attribute is only the untuned template that was passed in, so the tuned
# forest has to be taken from `.best_estimator_`.
model7 = ensemble.AdaBoostClassifier(base_estimator=model6.best_estimator_,
                                     random_state=42)

grid7 = {'n_estimators' : [30, 40, 50, 60, 70],
        'learning_rate' : [0.001, 0.01, 1]}

# gridsearchcv prints the scores and returns the fitted search object.
model7 = gridsearchcv(model7, grid7, train_X, train_Y, dev_X, dev_Y)

In [None]:
# Round-trip the fitted search through pickle. The context managers make sure
# each handle is flushed and closed before the file is reopened.
with open('model7.sav', 'wb') as model_file:
    pickle.dump(model7, model_file)
with open('model7.sav', 'rb') as model_file:
    model7_load = pickle.load(model_file)

# Predict on the test matrix and write PassengerId/Survived as int32 columns.
y7 = model7_load.predict(test_X)
y7_save = pd.DataFrame(data=[test_X_og.index, y7]).T
y7_save.columns = ['PassengerId', 'Survived']
y7_save = y7_save.astype('int32')
y7_save.to_csv('y7.csv', index=False)

## Model 08 - GradientBoostingClassifier

In [None]:
# Model 08: gradient boosting; the search varies loss, learning rate, depth
# and the minimum sample counts for splits and leaves.
grid8 = {
    'loss': ['deviance', 'exponential'],
    'learning_rate': [0.001, 0.01, 1],
    'min_samples_split': [2, 3, 4, 5, 6],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'max_depth': [2, 3, 4, 5, 6],
}

# gridsearchcv prints the scores and returns the fitted search object.
model8 = gridsearchcv(
    ensemble.GradientBoostingClassifier(criterion='friedman_mse',
                                        n_estimators=100,
                                        n_iter_no_change=5,
                                        random_state=42),
    grid8, train_X, train_Y, dev_X, dev_Y)

In [None]:
# Round-trip the fitted search through pickle. The context managers make sure
# each handle is flushed and closed before the file is reopened.
with open('model8.sav', 'wb') as model_file:
    pickle.dump(model8, model_file)
with open('model8.sav', 'rb') as model_file:
    model8_load = pickle.load(model_file)

# Predict on the test matrix and write PassengerId/Survived as int32 columns.
y8 = model8_load.predict(test_X)
y8_save = pd.DataFrame(data=[test_X_og.index, y8]).T
y8_save.columns = ['PassengerId', 'Survived']
y8_save = y8_save.astype('int32')
y8_save.to_csv('y8.csv', index=False)

## Model 09 - XGBClassifier

In [None]:
# Model 09: XGBoost classifier; the search varies tree depth and learning rate.
grid9 = {
    'max_depth': [2, 3, 4, 5, 6],
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1],
}

# gridsearchcv prints the scores and returns the fitted search object.
model9 = gridsearchcv(
    XGBClassifier(objective='binary:logistic', eval_metric='error',
                  gamma=0.0001, n_estimators=100, verbosity=2,
                  random_state=42),
    grid9, train_X, train_Y, dev_X, dev_Y)

In [None]:
# Round-trip the fitted search through pickle. The context managers make sure
# each handle is flushed and closed before the file is reopened.
with open('model9.sav', 'wb') as model_file:
    pickle.dump(model9, model_file)
with open('model9.sav', 'rb') as model_file:
    model9_load = pickle.load(model_file)

# Predict on the test matrix and write PassengerId/Survived as int32 columns.
y9 = model9_load.predict(test_X)
y9_save = pd.DataFrame(data=[test_X_og.index, y9]).T
y9_save.columns = ['PassengerId', 'Survived']
y9_save = y9_save.astype('int32')
y9_save.to_csv('y9.csv', index=False)

## Model 10 - VotingClassifier

In [None]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load .sav files that were produced by this notebook.
model1_load = _load_pickle('../input/model1.sav')
model2_load = _load_pickle('../input/model2.sav')
model3_load = _load_pickle('../input/model3.sav')
model4_load = _load_pickle('../input/model4.sav')
model5_load = _load_pickle('../input/model5.sav')
model6_load = _load_pickle('../input/model6.sav')
model7_load = _load_pickle('../input/model7.sav')
model8_load = _load_pickle('../input/model8.sav')
model9_load = _load_pickle('../input/model9.sav')

In [None]:
# Named members of the voting ensemble. Each loaded object is a fitted grid
# search: `.estimator` would be the untuned template originally passed in,
# whereas `.best_estimator_` carries the hyperparameters the search selected.
loaded_searches = [model1_load, model2_load, model3_load,
                   model4_load, model5_load, model6_load,
                   model7_load, model8_load, model9_load]

estimators = [('model{}'.format(number), search.best_estimator_)
              for number, search in enumerate(loaded_searches, start=1)]

In [None]:
# Model 10: hard-voting ensemble over the estimators list. The parameter grid
# is empty, so gridsearchcv is used here only for its scoring printout.
grid10 = {}

# gridsearchcv prints the scores and returns the fitted search object.
model10 = gridsearchcv(
    ensemble.VotingClassifier(estimators=estimators, voting='hard'),
    grid10, train_X, train_Y, dev_X, dev_Y)

In [None]:
# Round-trip the fitted search through pickle. The context managers make sure
# each handle is flushed and closed before the file is reopened.
with open('model10.sav', 'wb') as model_file:
    pickle.dump(model10, model_file)
with open('model10.sav', 'rb') as model_file:
    model10_load = pickle.load(model_file)

# Predict on the test matrix and write PassengerId/Survived as int32 columns.
y10 = model10_load.predict(test_X)
y10_save = pd.DataFrame(data=[test_X_og.index, y10]).T
y10_save.columns = ['PassengerId', 'Survived']
y10_save = y10_save.astype('int32')
y10_save.to_csv('y10.csv', index=False)