In [1]:
import numpy
import pandas as pd
from sklearn import preprocessing
# Seed NumPy's legacy global RNG so draws from numpy.random are repeatable.
# Only NumPy is seeded here; no other library's RNG is touched by this call.
numpy.random.seed(seed=10)

In [2]:
# Load the full passenger table from the spreadsheet (path is relative to the
# notebook's working directory).
all_df = pd.read_excel(io="data/titanic3.xls")

In [3]:
# Keep only the label ('survived') and the columns used downstream; the label
# is listed first because preprocess_data treats column 0 as the label.
cols = [
    'survived', 'name', 'pclass', 'sex', 'age',
    'sibsp', 'parch', 'fare', 'embarked',
]
all_df = all_df.loc[:, cols]

In [4]:
# Draw one uniform [0, 1) sample per row: rows whose sample is below 0.8 go to
# the training frame, every other row goes to the test frame.
msk = numpy.random.rand(all_df.shape[0]) < 0.8
train_df = all_df.loc[msk]
test_df = all_df.loc[numpy.logical_not(msk)]

In [5]:
# Report the row count of the full table and of each split.
print('total:', all_df.shape[0])
print('train:', train_df.shape[0])
print('test:', test_df.shape[0])

total: 1309
train: 1034
test: 275


In [6]:
# define the preprocess function  
def preprocess_data(raw_df):
    """Turn a raw passenger frame into scaled features and a label vector.

    Steps: drop 'name', fill missing 'age' and 'fare' with the column mean of
    the frame passed in, encode 'sex' as female=0 / male=1, one-hot encode
    'embarked', then min-max scale every feature column to [0, 1].

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Must contain 'name', 'age', 'fare', 'sex' and 'embarked'; the first
        column remaining after 'name' is dropped is taken as the label.
        The input frame is not modified.

    Returns
    -------
    (scaled_features, label) : tuple of numpy.ndarray
        Float feature matrix (all columns but the first) and float label
        vector (the first column).

    Raises
    ------
    ValueError
        From ``astype(int)`` when 'sex' holds a value other than 'female' or
        'male' (including missing values), since the mapping yields NaN.

    Notes
    -----
    * The scaler and the fill means are recomputed on every call, so frames
      preprocessed separately are scaled with their own min/max.
    * The number of one-hot columns depends on which 'embarked' values occur
      in ``raw_df``.
    """
    df = raw_df.drop(['name'], axis=1)

    # Impute missing numeric values with the mean of this frame.
    for column in ('age', 'fare'):
        df[column] = df[column].fillna(df[column].mean())

    df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)
    x_one_hot_df = pd.get_dummies(data=df, columns=["embarked"])

    # Cast to float before extracting the array: recent pandas emits boolean
    # dummy columns, which would otherwise make `.values` an object array and
    # hand an object-dtype label vector to the model.
    nd_array = x_one_hot_df.astype(float).values
    feature = nd_array[:, 1:]
    label = nd_array[:, 0]

    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaled_features = minmax_scale.fit_transform(feature)

    return scaled_features, label

In [7]:
# Preprocess the training frame first, then the test frame; each call fits
# its own scaler and fill values on the frame it receives.
(train_features, train_label), (test_features, test_label) = (
    preprocess_data(split_df) for split_df in (train_df, test_df)
)

In [8]:
# Preview the first five scaled feature rows.
train_features[0:5]

array([[0.        , 0.        , 0.38844819, 0.        , 0.        ,
        0.41250333, 0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.00679502, 0.125     , 0.22222222,
        0.2958059 , 0.        , 0.        , 1.        ],
       [0.        , 0.        , 0.02151711, 0.125     , 0.22222222,
        0.2958059 , 0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.40203823, 0.125     , 0.22222222,
        0.2958059 , 0.        , 0.        , 1.        ],
       [0.        , 0.        , 0.33408803, 0.125     , 0.22222222,
        0.2958059 , 0.        , 0.        , 1.        ]])

In [9]:
# Preview the first five training labels.
train_label[0:5]

array([1., 1., 0., 0., 0.])

In [10]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

Using TensorFlow backend.


In [11]:
# Fully connected binary classifier: 9 inputs -> 40 (relu) -> 30 (relu) ->
# 1 (sigmoid). The layers are passed to the constructor in stacking order.
model = Sequential([
    Dense(units=40, input_dim=9, kernel_initializer='uniform', activation='relu'),
    Dense(units=30, kernel_initializer='uniform', activation='relu'),
    Dense(units=1, kernel_initializer='uniform', activation='sigmoid'),
])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 40)                400       
_________________________________________________________________
dense_2 (Dense)              (None, 30)                1230      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 31        
Total params: 1,661
Trainable params: 1,661
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Binary cross-entropy matches the single sigmoid output unit; accuracy is
# tracked as the reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [13]:
# Train for 30 epochs in mini-batches of 30 rows, holding out 10% of the
# training rows for validation; verbose=2 prints one summary line per epoch.
train_history = model.fit(
    x=train_features,
    y=train_label,
    validation_split=0.1,
    epochs=30,
    batch_size=30,
    verbose=2,
)

Train on 930 samples, validate on 104 samples
Epoch 1/30
 - 2s - loss: 0.6901 - acc: 0.5774 - val_loss: 0.6713 - val_acc: 0.7885
Epoch 2/30
 - 0s - loss: 0.6681 - acc: 0.5957 - val_loss: 0.5915 - val_acc: 0.7885
Epoch 3/30
 - 0s - loss: 0.6101 - acc: 0.6677 - val_loss: 0.4942 - val_acc: 0.8173
Epoch 4/30
 - 0s - loss: 0.5454 - acc: 0.7634 - val_loss: 0.4624 - val_acc: 0.7788
Epoch 5/30
 - 0s - loss: 0.5062 - acc: 0.7624 - val_loss: 0.4552 - val_acc: 0.7885
Epoch 6/30
 - 0s - loss: 0.4911 - acc: 0.7645 - val_loss: 0.4399 - val_acc: 0.7788
Epoch 7/30
 - 0s - loss: 0.4823 - acc: 0.7591 - val_loss: 0.4424 - val_acc: 0.7885
Epoch 8/30
 - 0s - loss: 0.4789 - acc: 0.7581 - val_loss: 0.4344 - val_acc: 0.7885
Epoch 9/30
 - 0s - loss: 0.4734 - acc: 0.7645 - val_loss: 0.4274 - val_acc: 0.7885
Epoch 10/30
 - 0s - loss: 0.4697 - acc: 0.7753 - val_loss: 0.4252 - val_acc: 0.8077
Epoch 11/30
 - 0s - loss: 0.4657 - acc: 0.7602 - val_loss: 0.4219 - val_acc: 0.8173
Epoch 12/30
 - 0s - loss: 0.4639 - acc:

In [14]:
# Two hand-written passengers, with values in the same order as the selected
# columns: survived, name, pclass, sex, age, sibsp, parch, fare, embarked.
jack = pd.Series(data=[0, 'Jack', 3, 'male', 23, 1, 0, 5.0000, 'S'])
rose = pd.Series(data=[1, 'Rose', 1, 'female', 20, 1, 0, 100.0000, 'S'])

In [16]:
# Stack the two hand-written passengers into a two-row frame that uses the
# same column names as the main table.
jack_rose_df = pd.DataFrame(
    data=[person.tolist() for person in (jack, rose)],
    columns=[
        'survived', 'name', 'pclass', 'sex', 'age',
        'sibsp', 'parch', 'fare', 'embarked',
    ],
)

In [17]:
# Append the two hand-written rows after the real passengers.
# NOTE(review): the appended rows keep their own index labels (0 and 1), so
# the combined frame has duplicate labels; and because all_df is rebound,
# re-running this cell appends the two rows again.
all_df = pd.concat(objs=[all_df, jack_rose_df], axis=0)

In [18]:
# Show the last two rows of the combined table.
all_df.tail(2)

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked
0,0,Jack,3,male,23.0,1,0,5.0,S
1,1,Rose,1,female,20.0,1,0,100.0,S


In [19]:
# Show the last five rows of the combined table.
all_df.tail(5)

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked
1306,0,"Zakarian, Mr. Mapriededer",3,male,26.5,0,0,7.225,C
1307,0,"Zakarian, Mr. Ortin",3,male,27.0,0,0,7.225,C
1308,0,"Zimmerman, Mr. Leo",3,male,29.0,0,0,7.875,S
0,0,Jack,3,male,23.0,1,0,5.0,S
1,1,Rose,1,female,20.0,1,0,100.0,S


In [20]:
# Preprocess every row of all_df; the scaler and the fill means are fitted
# on this frame, not reused from the training split.
all_feature, label = preprocess_data(raw_df=all_df)

In [21]:
# Score every row; the sigmoid output gives one value in [0, 1] per row.
all_probability = model.predict(x=all_feature)

In [22]:
# Preview the first ten predicted probabilities.
all_probability[0:10]

array([[0.97696316],
       [0.609896  ],
       [0.9721666 ],
       [0.40561366],
       [0.97113806],
       [0.2749112 ],
       [0.9495579 ],
       [0.32654467],
       [0.94781196],
       [0.29248822]], dtype=float32)

In [23]:
pd = all_df
pd.insert( len(all_df.columns), 'probability', all_probability)

In [24]:
pd[-5:]

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked,probability
1306,0,"Zakarian, Mr. Mapriededer",3,male,26.5,0,0,7.225,C,0.244171
1307,0,"Zakarian, Mr. Ortin",3,male,27.0,0,0,7.225,C,0.241578
1308,0,"Zimmerman, Mr. Leo",3,male,29.0,0,0,7.875,S,0.144389
0,0,Jack,3,male,23.0,1,0,5.0,S,0.150749
1,1,Rose,1,female,20.0,1,0,100.0,S,0.969594


In [25]:
pd[-2:]

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked,probability
0,0,Jack,3,male,23.0,1,0,5.0,S,0.150749
1,1,Rose,1,female,20.0,1,0,100.0,S,0.969594
