## Import packages

In [1]:
from keras.optimizers import Adam
from keras.models import Sequential, load_model
from keras.layers import Dense, AlphaDropout, BatchNormalization
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn.preprocessing import Normalizer, QuantileTransformer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report
from time import time
import pandas as pd

Using TensorFlow backend.


### Helper function to get and process the data

In [2]:
def dataget(train_path, test_path):
    """Load the train/test CSVs and return cleaned, model-ready frames.

    The two files are stacked so that cleaning, imputation and one-hot
    encoding are applied identically to both, then split apart again using
    the ``Survived`` column (null for test rows).

    Parameters
    ----------
    train_path : str
        Path to the training CSV (must contain ``Survived``).
    test_path : str
        Path to the test CSV (no ``Survived`` column).

    Returns
    -------
    original : pandas.DataFrame
        Combined, cleaned frame *before* one-hot encoding (categorical
        dtypes, ``PassengerId`` and ``Survived`` still present).
    train_data : pandas.DataFrame
        One-hot encoded training rows, ``Survived`` as uint8, without
        ``PassengerId``.
    test_data : pandas.DataFrame
        One-hot encoded test rows, with ``PassengerId`` but without
        ``Survived``.

    Raises
    ------
    KeyError
        If a passenger's title is not a key of ``Titles_Dictionary``.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    # Join the train and test data to cleanse and enhance the data.
    # pd.concat replaces DataFrame.append (removed in pandas 2.0); sorting the
    # columns reproduces the alphabetical order append() used to yield when
    # the two frames' columns differ.
    df = pd.concat([train_data, test_data], ignore_index=True).sort_index(axis=1)
    Titles_Dictionary = {
                        "Capt":         "Officer",
                        "Col":          "Officer",
                        "Major":        "Officer",
                        "Jonkheer":     "Royalty",
                        "Don":          "Royalty",
                        "Sir":          "Royalty",
                        "Dr":           "Officer",
                        "Rev":          "Officer",
                        "the Countess": "Royalty",
                        "Dona":         "Royalty",
                        "Mme":          "Mrs",
                        "Mlle":         "Miss",
                        "Ms":           "Mrs",
                        "Mr":           "Mr",
                        "Mrs":          "Mrs",
                        "Miss":         "Miss",
                        "Master":       "Master",
                        "Lady":         "Royalty"
                        }
    ## Extract the title (text between the first ',' and the following '.')
    ## from each Name and map it to a coarse title group
    df['Title'] = df['Name'].apply(
        lambda name: Titles_Dictionary[name.split(',')[1].split('.')[0].strip()])
    ## Fill missing Embarked with 'C' (assign back instead of an in-place
    ## fill on a column selection, which newer pandas may not propagate)
    df['Embarked'] = df['Embarked'].fillna('C')
    ## Note down the Imputed Ages (only Age is flagged, although Fare is
    ## imputed below as well)
    df['Imputed'] = df['Age'].isnull().astype('uint8')
    columns = ['Age', 'Fare']
    groups = ['Title', 'Embarked']
    ## Fill null Age/Fare with the mean of the passenger's (Title, Embarked) group
    df[columns] = df.groupby(groups)[columns].transform(lambda x: x.fillna(x.mean()))
    ## A group with no observed value has a NaN mean and stays null; fall back
    ## to the overall column mean so no row is silently dropped from the
    ## training set or handed to the model with NaN features.
    df[columns] = df[columns].fillna(df[columns].mean())
    ## Convert to categorical data so get_dummies one-hot encodes these columns
    categories = ['Title', 'Sex', 'Pclass', 'SibSp', 'Parch', 'Embarked']
    df[categories] = df[categories].apply(lambda x: x.astype('category'))
    df = df.drop(columns=['Cabin', 'Name', 'Ticket'])
    df = df.round(2)
    original = df.copy()
    df = pd.get_dummies(df, drop_first=True)
    ## Test rows are exactly those without a Survived label
    test_data = df[df.Survived.isnull()].copy()
    test_data = test_data.drop(columns=['Survived'])
    ## dropna removes the unlabeled (test) rows and any row still holding a null
    train_data = df.dropna().copy()
    train_data['Survived'] = train_data['Survived'].astype('uint8')
    train_data = train_data.drop(columns=['PassengerId'])

    return original, train_data, test_data

## Neural Network

In [3]:
def build_model(optimizer=None,
                total_features=1,
                activation='elu',
                units=1,
                dropout_value=0.3,
                multi_layer=True,
                op_activation='sigmoid',
                loadprevmodel=False,
                modelname='Titanic-Kaggle-best'
               ):
    """Build (or reload) a compiled binary-classification network.

    Parameters
    ----------
    optimizer : keras optimizer or str, optional
        Optimizer passed to ``model.compile``. ``None`` (default) creates a
        fresh ``Adam(amsgrad=True)`` on every call; a default instance in the
        signature would be created once at definition time and shared by
        every model this function builds.
    total_features : int
        Number of input features (``input_dim`` of the first Dense layer).
    activation : str
        Hidden-layer activation. ``'selu'`` pairs each hidden layer with
        ``AlphaDropout``; any other value uses ``BatchNormalization``.
    units : int
        Width of the first hidden layer. The optional second hidden layer
        has ``max(3, int(units/2))`` units.
    dropout_value : float
        AlphaDropout rate for the first hidden layer (the second uses 90% of
        it). Only used when ``activation == 'selu'``.
    multi_layer : bool
        Whether to add the second hidden layer.
    op_activation : str
        Activation of the single-unit output layer.
    loadprevmodel : bool
        If True, try to load ``modelname + '.h5'`` and return it instead of
        building a new model; fall back to building when loading raises
        ``IOError``.
    modelname : str
        File stem of the saved model used when ``loadprevmodel`` is True.

    Returns
    -------
    keras model
        The loaded model, or a newly built ``Sequential`` compiled with
        binary cross-entropy loss and the accuracy metric.
    """
    if loadprevmodel:
        try:
            model = load_model(modelname + '.h5')
        except IOError:
            print('Loading previous model failed, Building a new model')
        else:
            print('Model loaded successfully')
            # Return the loaded model; previously it was discarded because a
            # new Sequential was always assigned to `model` right after.
            # NOTE(review): assumes the saved file carries its own compile
            # state (optimizer/loss) - confirm how the .h5 was written.
            return model

    if optimizer is None:
        optimizer = Adam(amsgrad=True)

    model = Sequential()
    model.add(Dense(input_dim=total_features, activation=activation, units=units))
    if activation == 'selu':
        model.add(AlphaDropout(dropout_value))
    else:
        model.add(BatchNormalization())
    if multi_layer:
        model.add(Dense(activation=activation, units=max(3,int(units/2))))
        if activation == 'selu':
            # For a non-negative rate this min() always picks the 0.9-scaled value.
            model.add(AlphaDropout(min(dropout_value, dropout_value * 0.9)))
        else:
            model.add(BatchNormalization())
    model.add(Dense(units=1, activation=op_activation))
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [4]:
# Locations of the training and test CSV files (relative to the notebook).
train_path, test_path = 'train.csv', 'test.csv'

# `original` is the cleaned frame before one-hot encoding; `train_data` and
# `test_data` are the encoded splits returned by dataget.
original, train_data, test_data = dataget(train_path, test_path)

# Exploration frame: a separate copy of `original` without the id column,
# so `original` itself is left untouched.
df = original.drop(columns=['PassengerId'])
print(df.head(10))

     Age Embarked   Fare Parch Pclass     Sex SibSp  Survived   Title  Imputed
0  22.00        S   7.25     0      3    male     1       0.0      Mr        0
1  38.00        C  71.28     0      1  female     1       1.0     Mrs        0
2  26.00        S   7.92     0      3  female     0       1.0    Miss        0
3  35.00        S  53.10     0      1  female     1       1.0     Mrs        0
4  35.00        S   8.05     0      3    male     0       0.0      Mr        0
5  36.24        Q   8.46     0      3    male     0       0.0      Mr        1
6  54.00        S  51.86     0      1    male     0       0.0      Mr        0
7   2.00        S  21.08     1      3    male     3       0.0  Master        0
8  27.00        S  11.13     2      3  female     0       1.0     Mrs        0
9  14.00        C  30.07     0      2  female     1       1.0     Mrs        0


In [5]:
# Summary statistics of the numeric columns of the exploration frame.
print(df.describe())

               Age         Fare    Survived      Imputed
count  1309.000000  1309.000000  891.000000  1309.000000
mean     30.085829    33.285921    0.383838     0.200917
std      13.214767    51.740153    0.486592     0.400839
min       0.170000     0.000000    0.000000     0.000000
25%      22.000000     7.900000    0.000000     0.000000
50%      30.000000    14.450000    0.000000     0.000000
75%      36.240000    31.280000    1.000000     0.000000
max      80.000000   512.330000    1.000000     1.000000


In [6]:
# Column dtypes and non-null counts. df.info() writes its report itself and
# returns None, which is why a trailing "None" appears in the output.
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 10 columns):
Age         1309 non-null float64
Embarked    1309 non-null category
Fare        1309 non-null float64
Parch       1309 non-null category
Pclass      1309 non-null category
Sex         1309 non-null category
SibSp       1309 non-null category
Survived    891 non-null float64
Title       1309 non-null category
Imputed     1309 non-null uint8
dtypes: category(6), float64(3), uint8(1)
memory usage: 40.9 KB
None


In [7]:
# --- Run configuration -------------------------------------------------
search = True            # gates the RandomizedSearchCV cell below
# Model file stems; not referenced by the cells shown here.
modelh5, loadmodelh5 = 'Titanic-Kaggle', 'Titanic-Kaggle-best'
batch_size, epochs = 891, 300

# --- Scalers -----------------------------------------------------------
# Normalizer rescales each sample (row) to unit L1 norm; it is the scaler
# actually applied. The quantile transformer is kept as an alternative
# (swap it in for `normalizer` in the fit_transform call to try it).
normalizer = Normalizer(norm='l1')
quantile_transformer = QuantileTransformer(output_distribution='normal')

# --- Training matrices ---------------------------------------------------
df = train_data
train = df.dropna()
x_train = train.drop(columns=['Survived']).values   # raw feature matrix
y_train = train['Survived'].values                  # labels
X_train = normalizer.fit_transform(x_train)         # scaled feature matrix

In [8]:
# Number of input features seen by the network.
n_features = X_train.shape[1]

# scikit-learn compatible wrapper around build_model; `units` here is only a
# starting value, the search below overrides it per candidate.
clf = KerasClassifier(
    build_model,
    total_features=n_features,
    units=5,
    batch_size=batch_size,
    epochs=epochs,
    verbose=0
)

# Candidate dropout rates: 0.01, 0.06, ..., 0.46.
dr = [pct / 100 for pct in range(1, 50, 5)]

# Search space: hidden width from half to (just under) twice the feature
# count, plus dropout rate, activation and network depth.
param_dist = dict(
    units=list(range(n_features // 2, n_features * 2)),
    dropout_value=dr,
    activation=['elu', 'relu', 'selu'],
    multi_layer=[True, False]
)
n_iter_search = 2

if search:
    random_search = RandomizedSearchCV(
        clf,
        param_distributions=param_dist,
        n_jobs=1,
        n_iter=n_iter_search,
        scoring='accuracy',
        random_state=42
    )

    start = time()
    random_search.fit(X_train, y_train)
    elapsed = time() - start

    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % (elapsed, n_iter_search))

    print(random_search.best_score_)
    print(random_search.best_params_)

RandomizedSearchCV took 90.37 seconds for 2 candidates parameter settings.
0.8002244668911336
{'units': 20, 'multi_layer': False, 'dropout_value': 0.31, 'activation': 'relu'}


In [9]:
# Reference: scikit-learn example comparing the effect of the different scalers/transformers
#http://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#sphx-glr-auto-examples-preprocessing-plot-all-scaling-py

In [10]:
# Hyper-parameters used for the final fit (same values as the best_params_
# printed by the search cell). The original note tagged an identical set
# with "quantile" - presumably the result obtained with the quantile
# transformer; verify before relying on it.
params = dict(units=20, multi_layer=False, dropout_value=0.31, activation='relu')

# Retrain on the full training matrix with a longer schedule.
clf = KerasClassifier(
    build_model,
    total_features=X_train.shape[1],
    batch_size=batch_size,
    epochs=1500,
    verbose=0,
    **params
)
clf.fit(X_train, y_train)

# Predictions are made on the same rows the model was fitted on.
y_true = y_train
y_pred = clf.predict(X_train)

In [11]:
# Per-class precision/recall/F1. y_pred was produced from X_train, the data
# the model was fitted on, so this is a training-set report rather than a
# held-out estimate.
print(classification_report(y_true, y_pred))

             precision    recall  f1-score   support

          0       0.79      0.98      0.88       549
          1       0.94      0.59      0.73       342

avg / total       0.85      0.83      0.82       891



In [12]:
# Build the submission: keep the ids aside, scale the remaining test
# features with the scaler fitted on the training matrix, predict, and save.
df = test_data
resultdf = pd.DataFrame(data=df['PassengerId'])
df = df.drop(columns=['PassengerId'])
# Pass a plain array, matching how the scaler was fitted (on `.values`), so
# fit and transform see the same input type. To try the quantile
# transformer instead, use quantile_transformer.transform(df.values) after
# fitting it on the training matrix.
test_x = normalizer.transform(df.values)
predictions = clf.predict(test_x)
# The Keras wrapper may return a column vector of shape (n, 1); flatten it so
# exactly one value per passenger is assigned.
resultdf['Survived'] = predictions.astype(int).ravel()

resultdf.to_csv('submission.csv', index=False)