In [37]:
import pandas as pd
from tensorflow.python.keras import  Input
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score

In [95]:
df_train = pd.read_csv("data/titanic/train.csv")


In [96]:
def prepare_cabin(x_data: pd.DataFrame):
  x_data['Cabin'] = df_train['Cabin'].isna().astype(int)
  return x_data

def prepare_age(x_data: pd.DataFrame):
  x_data['Age'] = x_data['Age'].fillna(x_data['Age'].mean())
  return x_data

def prepare_sex(x_data: pd.DataFrame):
  x_data = pd.get_dummies(x_data, columns=['Sex'])
  return x_data

def prepare_embarked(x_data: pd.DataFrame):
  x_data = pd.get_dummies(x_data, columns=['Embarked'])
  return x_data

def prepare_name(x_data: pd.DataFrame):
  def extract_title(name):
    titles = {'mr', 'miss', 'mrs', 'master', 'dr', 'rev'}
    title = name.split('.')[0].split(', ')[1].lower()
    return title if title in titles else 'default'
  x_data['Name'] = x_data['Name'].map(extract_title)
  print(x_data['Name'].value_counts())
  x_data = pd.get_dummies(x_data, columns=['Name'])
  return x_data

def prepare_x_data(x_data):
  x_data = x_data[['Pclass', 'Sex', 'Embarked', 'Fare', 'Age', 'Name']]
  x_data = prepare_age(x_data)
  x_data = prepare_sex(x_data)
  x_data = prepare_embarked(x_data)
  x_data = prepare_name(x_data)
  x_data = prepare_cabin(x_data)
  return x_data

def prepare_prediction(y_data):
  return (y_data > 0.5).astype(int).reshape(len(y_data))

In [97]:
y_train = df_train["Survived"]
x_train = prepare_x_data(df_train)

mr         517
miss       182
mrs        125
master      40
default     14
dr           7
rev          6
Name: Name, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [98]:
x_train.isna().value_counts()

Pclass  Fare   Age    Sex_female  Sex_male  Embarked_C  Embarked_Q  Embarked_S  Name_default  Name_dr  Name_master  Name_miss  Name_mr  Name_mrs  Name_rev  Cabin
False   False  False  False       False     False       False       False       False         False    False        False      False    False     False     False    891
dtype: int64

In [99]:
x_train.head()

Unnamed: 0,Pclass,Fare,Age,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Name_default,Name_dr,Name_master,Name_miss,Name_mr,Name_mrs,Name_rev,Cabin
0,3,7.25,22.0,0,1,0,0,1,0,0,0,0,1,0,0,1
1,1,71.2833,38.0,1,0,1,0,0,0,0,0,0,0,1,0,0
2,3,7.925,26.0,1,0,0,0,1,0,0,0,1,0,0,0,1
3,1,53.1,35.0,1,0,0,0,1,0,0,0,0,0,1,0,0
4,3,8.05,35.0,0,1,0,0,1,0,0,0,0,1,0,0,1


In [100]:
x_train.shape

(891, 16)

In [101]:
input_layer = Input(shape=x_train.shape[1])
layer = Dense(60, activation='relu')(input_layer)
layer = Dense(60, activation='relu')(layer)
layer = Dense(1, activation='sigmoid')(layer)
model = Model(input_layer, layer)
model.compile(optimizer='adam', loss='binary_crossentropy')
model.summary()

Model: "functional_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 16)]              0         
_________________________________________________________________
dense_9 (Dense)              (None, 60)                1020      
_________________________________________________________________
dense_10 (Dense)             (None, 60)                3660      
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 61        
Total params: 4,741
Trainable params: 4,741
Non-trainable params: 0
_________________________________________________________________


In [112]:
model.fit(
  x=x_train,
  y=y_train,
  batch_size=30,
  epochs=20,
  callbacks=[EarlyStopping(monitor='loss', patience=2)],
)

accuracy_score(y_train, prepare_prediction(model.predict(x_train)))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


0.8383838383838383

In [113]:
df_test = pd.read_csv('data/titanic/test.csv')
x_test = prepare_x_data(df_test)

mr         240
miss        78
mrs         72
master      21
default      4
rev          2
dr           1
Name: Name, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [114]:
x_test.head()

Unnamed: 0,Pclass,Fare,Age,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Name_default,Name_dr,Name_master,Name_miss,Name_mr,Name_mrs,Name_rev,Cabin
0,3,7.8292,34.5,0,1,0,1,0,0,0,0,0,1,0,0,1
1,3,7.0,47.0,1,0,0,0,1,0,0,0,0,0,1,0,0
2,2,9.6875,62.0,0,1,0,1,0,0,0,0,0,1,0,0,1
3,3,8.6625,27.0,0,1,0,0,1,0,0,0,0,1,0,0,0
4,3,12.2875,22.0,1,0,0,0,1,0,0,0,0,0,1,0,1


In [115]:
y_test_pred = model.predict(x_test)

In [116]:
submission_df = pd.DataFrame(
  {
    'PassengerId': df_test['PassengerId'],
    'Survived': prepare_prediction(y_test_pred),
  }
)



In [117]:
submission_df.to_csv('data/titanic/prediction.csv', index=False)