In [1]:
import pandas as pd
from tensorflow.python.keras import  Input
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score

In [2]:
df_train = pd.read_csv("../data/titanic/train.csv")

In [4]:
df_train


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [5]:
def prepare_cabin(x_data: pd.DataFrame):
  x_data['Cabin'] = df_train['Cabin'].isna().astype(int)
  return x_data

def prepare_age(x_data: pd.DataFrame):
  x_data['Age'] = x_data['Age'].fillna(x_data['Age'].mean())
  return x_data

def prepare_sex(x_data: pd.DataFrame):
  x_data = pd.get_dummies(x_data, columns=['Sex'])
  # x_data['Sex'] = (x_data['Sex'] == 'male').astype(int)
  return x_data

def prepare_embarked(x_data: pd.DataFrame):
  x_data = pd.get_dummies(x_data, columns=['Embarked'])
  return x_data

def prepare_name(x_data: pd.DataFrame):
  def extract_title(name):
    titles = {'mr', 'miss', 'mrs', 'master', 'dr', 'rev'}
    title = name.split('.')[0].split(', ')[1].lower()
    return title if title in titles else 'default'
  x_data['Name'] = x_data['Name'].map(extract_title)
  print(x_data['Name'].value_counts())
  x_data = pd.get_dummies(x_data, columns=['Name'])
  return x_data

def prepare_x_data(x_data):
  x_data = x_data[['Pclass', 'Sex', 'Embarked', 'Fare', 'Age', 'Name']]
  x_data = prepare_age(x_data)
  x_data = prepare_sex(x_data)
  x_data = prepare_embarked(x_data)
  x_data = prepare_name(x_data)
  x_data = prepare_cabin(x_data)
  return x_data

def prepare_prediction(y_data):
  return (y_data > 0.5).astype(int).reshape(len(y_data))

In [6]:
y_train = df_train["Survived"]
x_train = prepare_x_data(df_train)

mr         517
miss       182
mrs        125
master      40
default     14
dr           7
rev          6
Name: Name, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [8]:
x_train


Unnamed: 0,Pclass,Fare,Age,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Name_default,Name_dr,Name_master,Name_miss,Name_mr,Name_mrs,Name_rev,Cabin
0,3,7.2500,22.000000,0,1,0,0,1,0,0,0,0,1,0,0,1
1,1,71.2833,38.000000,1,0,1,0,0,0,0,0,0,0,1,0,0
2,3,7.9250,26.000000,1,0,0,0,1,0,0,0,1,0,0,0,1
3,1,53.1000,35.000000,1,0,0,0,1,0,0,0,0,0,1,0,0
4,3,8.0500,35.000000,0,1,0,0,1,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,13.0000,27.000000,0,1,0,0,1,0,0,0,0,0,0,1,1
887,1,30.0000,19.000000,1,0,0,0,1,0,0,0,1,0,0,0,0
888,3,23.4500,29.699118,1,0,0,0,1,0,0,0,1,0,0,0,1
889,1,30.0000,26.000000,0,1,1,0,0,0,0,0,0,1,0,0,0


In [9]:
x_train.isna().value_counts()

Pclass  Fare   Age    Sex_female  Sex_male  Embarked_C  Embarked_Q  Embarked_S  Name_default  Name_dr  Name_master  Name_miss  Name_mr  Name_mrs  Name_rev  Cabin
False   False  False  False       False     False       False       False       False         False    False        False      False    False     False     False    891
dtype: int64

In [None]:

x_train.head()

In [10]:
x_train.shape

(891, 16)

In [11]:
input_layer = Input(shape=x_train.shape[1])
layer = Dense(60, activation='relu')(input_layer)
layer = Dense(60, activation='relu')(layer)
layer = Dense(1, activation='sigmoid')(layer)
model = Model(input_layer, layer)
model.compile(optimizer='adam', loss='binary_crossentropy')
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 16)]              0         
_________________________________________________________________
dense (Dense)                (None, 60)                1020      
_________________________________________________________________
dense_1 (Dense)              (None, 60)                3660      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 61        
Total params: 4,741
Trainable params: 4,741
Non-trainable params: 0
_________________________________________________________________


In [12]:
model.fit(
  x=x_train,
  y=y_train,
  batch_size=30,
  epochs=20,
  callbacks=[EarlyStopping(monitor='loss', patience=2)],
)

accuracy_score(y_train, prepare_prediction(model.predict(x_train)))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


0.7003367003367004

In [None]:
df_test = pd.read_csv('../data/titanic/test.csv')
x_test = prepare_x_data(df_test)

In [None]:
x_test.head()

In [None]:
y_test_pred = model.predict(x_test)

In [None]:
submission_df = pd.DataFrame(
  {
    'PassengerId': df_test['PassengerId'],
    'Survived': prepare_prediction(y_test_pred),
  }
)

In [None]:
submission_df.to_csv('../data/titanic/prediction.csv', index=False)