# Kaggle Titanic Dataset

In [191]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
# from sklearn.preprocessing import LabelEncoder   # prepare for to_categorical()
# from keras.utils import to_categorical  # one-hot encoding from integer values
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [163]:
# Read the Kaggle train and test CSV files from the local data directory.
train_dataset = pd.read_csv("./data/train.csv", sep=',')
test_dataset = pd.read_csv("./data/test.csv", sep=',')

## Preprocessing for both train and test datasets

In [164]:
# Preview the first rows of the raw training data.
train_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [165]:
# Extract Title from Name: https://www.kaggle.com/ash316/eda-to-prediction-dietanic
# a different mapping: http://rstudio-pubs-static.s3.amazonaws.com/227239_a42941af5d7d457398ed3721f9ad0f6f.html

def extractTitle(df):
    """Add a ``Title`` column derived from the salutation in ``df['Name']``.

    The title is the first run of ASCII letters that is immediately followed
    by a period (``Mr`` in ``"Braund, Mr. Owen Harris"``). Rarer titles are
    folded into ``Miss``, ``Mr``, ``Mrs`` or ``Other``; any title that is not
    in the mapping (for example ``Master``) is kept as extracted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Name`` column. It is modified in place.

    Returns
    -------
    None
    """
    title_map = {
        'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
        'Dr': 'Mr', 'Major': 'Mr',
        'Lady': 'Mrs', 'Countess': 'Mrs', 'Dona': 'Mrs',
        'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other',
        'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr',
    }
    # Raw string so that "\." reaches the regex engine as an escaped period;
    # expand=False returns a Series instead of a one-column DataFrame.
    titles = df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
    # Assign the mapped values back to the frame. Calling
    # df['Title'].replace(..., inplace=True) acts on a temporary object under
    # pandas Copy-on-Write and would leave df unchanged.
    df['Title'] = titles.replace(title_map)
    
# Add the Title column to both frames; extractTitle modifies its argument in place.
extractTitle(train_dataset)
extractTitle(test_dataset)

In [166]:
# Missing values noted for the training data: Cabin is NaN in 687 of 891 rows,
# Age in 177 of 891 rows.
# Drop both columns together with Name, Ticket and PassengerId.
dropped_columns = ['Name','Ticket','Cabin','PassengerId','Age']
train_dataset = train_dataset.drop(columns=dropped_columns)
test_dataset = test_dataset.drop(columns=dropped_columns)

In [167]:
# Count the missing values in each remaining training column.
train_dataset.isna().sum()

Survived    0
Pclass      0
Sex         0
SibSp       0
Parch       0
Fare        0
Embarked    2
Title       0
dtype: int64

In [168]:
# Populate the missing Embarked port: "As we saw that maximum passengers boarded from Port S, we replace NaN with S"
# https://www.kaggle.com/ash316/eda-to-prediction-dietanic
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the selection is a temporary object under pandas
# Copy-on-Write, so the in-place form would leave train_dataset unchanged.
train_dataset['Embarked'] = train_dataset['Embarked'].fillna('S')

In [169]:
# Re-check the per-column missing-value counts after filling Embarked.
train_dataset.isna().sum()  # no NA

Survived    0
Pclass      0
Sex         0
SibSp       0
Parch       0
Fare        0
Embarked    0
Title       0
dtype: int64

In [170]:
# Display the test frame (rendered truncated).
# NOTE(review): test_dataset.head() would keep this output short.
test_dataset

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Embarked,Title
0,3,male,0,0,7.8292,Q,Mr
1,3,female,1,0,7.0000,S,Mrs
2,2,male,0,0,9.6875,Q,Mr
3,3,male,0,0,8.6625,S,Mr
4,3,female,1,1,12.2875,S,Mrs
...,...,...,...,...,...,...,...
413,3,male,0,0,8.0500,S,Mr
414,1,female,0,0,108.9000,C,Mrs
415,3,male,0,0,7.2500,S,Mr
416,3,male,0,0,8.0500,S,Mr


In [171]:
# One-hot encode the string columns of both frames.
# dtype=int keeps the dummy columns numeric (0/1) on every pandas version;
# newer versions default to bool, which turns the later to_numpy() result
# into an object array once mixed with the float Fare column.
train_dataset = pd.get_dummies(train_dataset, dtype=int)
test_dataset = pd.get_dummies(test_dataset, dtype=int)

# The two frames are encoded independently, so a category present in only one
# of them would give the model inputs of different width or order. Force the
# test frame onto the training feature columns: dummies absent from the test
# data become all-zero columns, and dummies unseen in training are dropped.
feature_columns = train_dataset.columns.drop('Survived')
test_dataset = test_dataset.reindex(columns=feature_columns, fill_value=0)

In [172]:
# label encode "Embarked", "Sex", and "Title" (alternative to one-hot encoding; left disabled)

# le = LabelEncoder()
# train_dataset['Embarked'] = le.fit_transform(train_dataset['Embarked'].astype(str))
# test_dataset['Embarked'] = le.transform(test_dataset['Embarked'].astype(str))

# le = LabelEncoder()
# train_dataset['Sex'] = le.fit_transform(train_dataset['Sex'])
# test_dataset['Sex'] = le.transform(test_dataset['Sex'])

# le = LabelEncoder()
# train_dataset['Title'] = le.fit_transform(train_dataset['Title'])
# test_dataset['Title'] = le.transform(test_dataset['Title'])

In [173]:
# Preview the training frame after one-hot encoding.
train_dataset.head()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Other
0,0,3,1,0,7.25,0,1,0,0,1,0,0,1,0,0
1,1,1,1,0,71.2833,1,0,1,0,0,0,0,0,1,0
2,1,3,0,0,7.925,1,0,0,0,1,0,1,0,0,0
3,1,1,1,0,53.1,1,0,0,0,1,0,0,0,1,0
4,0,3,0,0,8.05,0,1,0,0,1,0,0,1,0,0


In [174]:
# Preview the test frame after one-hot encoding.
test_dataset.head()

Unnamed: 0,Pclass,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Other
0,3,0,0,7.8292,0,1,0,1,0,0,0,1,0,0
1,3,1,0,7.0,1,0,0,0,1,0,0,0,1,0
2,2,0,0,9.6875,0,1,0,1,0,0,0,1,0,0
3,3,0,0,8.6625,0,1,0,0,1,0,0,1,0,0
4,3,1,1,12.2875,1,0,0,0,1,0,0,0,1,0


In [175]:
# Count missing values per test column; Fare is missing for one passenger.
test_dataset.isna().sum()

Pclass          0
SibSp           0
Parch           0
Fare            1
Sex_female      0
Sex_male        0
Embarked_C      0
Embarked_Q      0
Embarked_S      0
Title_Master    0
Title_Miss      0
Title_Mr        0
Title_Mrs       0
Title_Other     0
dtype: int64

In [176]:
# impute fare for passenger 1044:
# "Since he travelled 3rd class from Southhampton, it seems reasonable to impute his missing fare with the median fare per passenger in this class."
# https://rpubs.com/renrele/titanic

# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the selection is a temporary object under pandas
# Copy-on-Write, so the in-place form would leave the NaN in test_dataset.
test_dataset['Fare'] = test_dataset['Fare'].fillna(7.7958)

In [177]:
# Split the training frame: the Survived column is the target (y) and every
# other column is a feature (x).
train_dataset_y = train_dataset["Survived"]
train_dataset_x = train_dataset.drop(columns=['Survived'])

In [178]:
# standardize every column in train_dataset_x and test_dataset
# # https://datascience.stackexchange.com/a/27616

# # centering and scaling happen independently on each feature
# scaler = StandardScaler()
# train_dataset_x = scaler.fit_transform(train_dataset_x)   
# test_dataset = scaler.transform(test_dataset)

In [179]:
# Standardize only the "Fare" column of train_dataset_x and test_dataset.
scaler = StandardScaler()
# Double brackets select a one-column DataFrame (2-D), which the scaler needs;
# single brackets would return a Series.
train_dataset_x[['Fare']] = scaler.fit_transform(train_dataset_x[['Fare']])
# Reuse the mean and standard deviation learned from the training data.
# Refitting on the test set would put the two frames on different scales.
test_dataset[['Fare']] = scaler.transform(test_dataset[['Fare']])

In [182]:
# Keras needs train_dataset_x, train_dataset_y and test_dataset as NumPy arrays.
# np.asarray converts a DataFrame/Series and returns an existing ndarray
# unchanged, so this cell can be re-run safely (calling .to_numpy() a second
# time would fail because an ndarray has no such method).
train_dataset_x = np.asarray(train_dataset_x)
train_dataset_y = np.asarray(train_dataset_y)
test_dataset = np.asarray(test_dataset)

In [194]:
# Sanity check: all three inputs are plain NumPy arrays.
for converted in (train_dataset_x, train_dataset_y, test_dataset):
    assert isinstance(converted, np.ndarray)

## Sequential Model 

In [195]:
# Number of feature columns (second axis of the training array).
input_dim = train_dataset_x.shape[1]

In [196]:
# Display the feature count.
input_dim

14

In [234]:
# Small fully connected binary classifier: two ReLU hidden layers (4 and 3
# units) followed by a single sigmoid output unit.
model = Sequential([
    Dense(4, input_dim=input_dim, activation='relu'),
    Dense(3, activation='relu'),
    Dense(1, activation='sigmoid'),    # sigmoid because output is 0 or 1
])

In [235]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [236]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_27 (Dense)             (None, 4)                 60        
_________________________________________________________________
dense_28 (Dense)             (None, 3)                 15        
_________________________________________________________________
dense_29 (Dense)             (None, 1)                 4         
Total params: 79
Trainable params: 79
Non-trainable params: 0
_________________________________________________________________


In [237]:
# Train on the training arrays for 150 epochs in mini-batches of 10.
model.fit(
    x=train_dataset_x,
    y=train_dataset_y,
    batch_size=10,
    epochs=150,
)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


<tensorflow.python.keras.callbacks.History at 0x7fc00af32280>

In [238]:
# Loss and accuracy measured on the same arrays the model was trained on.
loss, accuracy = model.evaluate(x=train_dataset_x, y=train_dataset_y)



### Prediction

In [239]:
# Predict on the test array and wrap the model output in a one-column
# DataFrame named "Survived".
predictions = pd.DataFrame(model.predict(test_dataset), columns=["Survived"])

In [240]:
# Round the sigmoid outputs to 0 or 1, then cast the column from float to int.
predictions = predictions.round().astype({'Survived': int})

### Output

In [241]:
# Reload the original test file; its PassengerId column is needed for the submission.
test_dataset_OG = pd.read_csv("./data/test.csv", sep=',')

In [242]:
# Pair each PassengerId with its prediction by row index and save the result
# to my_submission.csv.
output = test_dataset_OG.merge(predictions, left_index=True, right_index=True)
output = output.loc[:, ['PassengerId', 'Survived']]
output.to_csv('my_submission.csv', index=False)