In [1]:
import os

import math
import shutil
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import logging

logger = tf.get_logger()
logger.setLevel(logging.INFO)

pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format


In [2]:
!pip install -q kaggle
#!pip install google.colab

[31mERROR: jupyterlab-git 0.10.1 has requirement nbdime<2.0.0,>=1.1.0, but you'll have nbdime 2.0.0 which is incompatible.[0m
[31mERROR: apache-beam 2.22.0 has requirement httplib2<0.18.0,>=0.8, but you'll have httplib2 0.18.1 which is incompatible.[0m
[31mERROR: apache-beam 2.22.0 has requirement mock<3.0.0,>=1.0.1, but you'll have mock 4.0.2 which is incompatible.[0m


In [7]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 ~/.kaggle/kaggle.json

!ls -l ~/.kaggle/kaggle.json

kaggle.json
-rw------- 1 jupyter jupyter 73 Aug  7 14:55 /home/jupyter/.kaggle/kaggle.json


In [8]:
data_base_path = os.path.join(os.path.curdir, '../data')
titanic_base_path = os.path.join(data_base_path, 'titanic')

!ls $data_base_path

ls: cannot access './../data': No such file or directory


In [9]:
def load_data_from_kaggle(dataset="titanic", data_base_path=data_base_path):
    data_path = os.path.join(os.path.curdir, f"{data_base_path}/{dataset}")
    !kaggle competitions download -c $dataset --path $data_path --force
    !unzip -o $data_path/titanic.zip -d $data_path

In [10]:
load_data_from_kaggle(dataset="titanic")

Downloading titanic.zip to ././../data/titanic
  0%|                                               | 0.00/34.1k [00:00<?, ?B/s]
100%|██████████████████████████████████████| 34.1k/34.1k [00:00<00:00, 25.1MB/s]
Archive:  ././../data/titanic/titanic.zip
  inflating: ././../data/titanic/gender_submission.csv  
  inflating: ././../data/titanic/test.csv  
  inflating: ././../data/titanic/train.csv  


In [11]:
def load_titanic_dateset(titanic_path=titanic_base_path):
    gender_submission_csv_path = os.path.join(titanic_path, "gender_submission.csv")
    train_csv_path = os.path.join(titanic_path, "train.csv")
    test_csv_path = os.path.join(titanic_path, "test.csv")
    return pd.read_csv(gender_submission_csv_path), pd.read_csv(train_csv_path), pd.read_csv(test_csv_path)

In [12]:
gender_sub_df, train_df, test_df = load_titanic_dateset(titanic_path=titanic_base_path)

In [14]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.3,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.1,,S


## Build a neural network model

In [15]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [16]:
median_age = None
most_embarked_from = None
train_fare_mean = None
train_fare_std = None

def drop_columns(df):
    df.drop("Name", axis=1, inplace=True)
    df.drop("PassengerId", axis=1, inplace=True)
    df.drop("Ticket", axis=1, inplace=True)
    df.drop("Cabin", axis=1, inplace=True)
    df.drop("Sex_female", axis=1, inplace=True)
    df.drop('SibSp', axis=1, inplace=True)
    df.drop('Parch', axis=1, inplace=True)
    return df


def fill_missing_values(df):
    copy_df = df.copy()
    
    global median_age
    global most_embarked_from
    global train_fare_mean
    global train_fare_std
    
    median_age = median_age or copy_df["Age"].median(skipna=True)
    most_embarked_from = most_embarked_from or copy_df["Embarked"].value_counts().idxmax()
    train_fare_mean = train_fare_mean or copy_df["Fare"].mean(skipna=True)
    train_fare_std = train_fare_std or copy_df["Fare"].std(skipna=True)
    
    copy_df["Age"].fillna(median_age, inplace=True)
    copy_df["Embarked"].fillna(most_embarked_from, inplace=True)
    copy_df["Fare"] = (copy_df["Fare"] - train_fare_mean) / train_fare_std
    
    return copy_df
    
def create_categorical(df):
    copy_df = pd.get_dummies(df, columns=["Pclass", "Embarked", "Title", "Sex"])

    return copy_df

def add_engineered(df):
    df['TravelAlone'] = np.where((df["SibSp"] + df["Parch"])>0, 0, 1)
    df['TravelAlone'] = df['TravelAlone'].astype('uint8')
    
    df['Title'] = df.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    
    df['Title'] = df['Title'].replace(['Mlle','Mme','Ms'], 'Miss') # Mlle = Mademoiselle
    return df

def change_types(df):
    df['TravelAlone'] = df['TravelAlone'].astype('uint8')
    if 'Survived' in df.columns:
        df['Survived'] = df['Survived'].astype('uint8')
    return df

def clean_data(df):
    copy_df = df.copy()
    
    copy_df = fill_missing_values(copy_df)
    copy_df = add_engineered(copy_df)
    copy_df = create_categorical(copy_df)
    copy_df = drop_columns(copy_df)
    copy_df = change_types(copy_df)
    return copy_df

In [17]:
train_df_clean = clean_data(train_df)
test_df_clean = clean_data(test_df)


In [28]:
test_df_clean.head() 


Unnamed: 0,Age,Fare,TravelAlone,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare,Sex_male
0,34.5,-0.5,1,0,0,1,0,1,0,0,0,1,0,0,1
1,47.0,-0.5,0,0,0,1,0,0,1,0,0,0,1,0,0
2,62.0,-0.5,1,0,1,0,0,1,0,0,0,1,0,0,1
3,27.0,-0.5,1,0,0,1,0,0,1,0,0,1,0,0,1
4,22.0,-0.4,0,0,0,1,0,0,1,0,0,0,1,0,0


In [19]:
train_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Survived      891 non-null    uint8  
 1   Age           891 non-null    float64
 2   Fare          891 non-null    float64
 3   TravelAlone   891 non-null    uint8  
 4   Pclass_1      891 non-null    uint8  
 5   Pclass_2      891 non-null    uint8  
 6   Pclass_3      891 non-null    uint8  
 7   Embarked_C    891 non-null    uint8  
 8   Embarked_Q    891 non-null    uint8  
 9   Embarked_S    891 non-null    uint8  
 10  Title_Master  891 non-null    uint8  
 11  Title_Miss    891 non-null    uint8  
 12  Title_Mr      891 non-null    uint8  
 13  Title_Mrs     891 non-null    uint8  
 14  Title_Rare    891 non-null    uint8  
 15  Sex_male      891 non-null    uint8  
dtypes: float64(2), uint8(14)
memory usage: 26.2 KB


In [20]:
for column in train_df_clean.columns:
    print(column)

Survived
Age
Fare
TravelAlone
Pclass_1
Pclass_2
Pclass_3
Embarked_C
Embarked_Q
Embarked_S
Title_Master
Title_Miss
Title_Mr
Title_Mrs
Title_Rare
Sex_male


In [21]:
featcols = {
  colname : tf.feature_column.numeric_column(colname) \
    #for colname in 'Age,Fare,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Sex_male,TravelAlone'.split(',')
    for colname in train_df_clean.columns if colname != 'Survived'
}
featcols.keys()

dict_keys(['Age', 'Fare', 'TravelAlone', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Rare', 'Sex_male'])

In [25]:
# Split into train and eval
msk = np.random.rand(len(train_df_clean)) < 0.8
traindf = train_df_clean[msk]
evaldf = train_df_clean[~msk]

BATCH_SIZE= 20
OUTDIR = '../models'


def make_input_fn(df, mode, batch_size = BATCH_SIZE):
    global mean_train_fare
    if mode == tf.estimator.ModeKeys.TRAIN:
        num_epochs = None # loop indefinetly
        shuffle=True
        y = df["Survived"]
    elif mode == tf.estimator.ModeKeys.EVAL:
        num_epochs = 1 # one run and it's over
        shuffle=False
        y = df["Survived"]
    elif mode == tf.estimator.ModeKeys.PREDICT:
        num_epochs = 1 # one run and it's over
        shuffle=False
        y = None
    return tf.compat.v1.estimator.inputs.pandas_input_fn(x = df[list(featcols.keys())],
                                                y = y,
                                                num_epochs = num_epochs,
                                                batch_size = batch_size, 
                                                shuffle = shuffle)

def train_input_fn(df, batch_size=BATCH_SIZE):
    return make_input_fn(df, mode=tf.estimator.ModeKeys.TRAIN, batch_size=batch_size)

def eval_input_fn(df):
    return make_input_fn(df, mode=tf.estimator.ModeKeys.EVAL, batch_size=len(df))

def test_input_fn(df):
    return make_input_fn(df, mode=tf.estimator.ModeKeys.PREDICT, batch_size=len(df))

In [26]:
NUM_CLASSES = 2
estimator = None

def train_and_evaluate(output_dir, num_train_steps):
    myopt = tf.keras.optimizers.Ftrl(learning_rate = 0.01, l1_regularization_strength=0.001) # note the learning rate
    #ada_optimizer=tf.compat.v1.train.ProximalAdagradOptimizer(learning_rate=0.1, l1_regularization_strength=0.001)
    
    adam_opt = tf.keras.optimizers.Adam(learning_rate=0.001 ) # note the learning rate

    estimator = tf.estimator.DNNClassifier(
                        model_dir = output_dir, 
                        feature_columns = featcols.values(),
                        hidden_units=[1024, 256, 32],
                        optimizer = adam_opt,
                        dropout=0.2,
                        n_classes=NUM_CLASSES)
    
    #estimator = tf.estimator.DNNClassifier(
    #feature_columns=[categorical_feature_a_emb, categorical_feature_b_emb],
    #hidden_units=[1024, 512, 256])
  
    #def my_auc(labels, predictions):
    #    auc_metric = tf.keras.metrics.AUC(name="my_auc")
    #    auc_metric.update_state(y_true=labels, y_pred=predictions['logistic'])
    #    return {'auc': auc_metric}

    #estimator = tf.compat.v1.estimator.add_metrics(estimator, rmse)
    
    train_spec = tf.estimator.TrainSpec(input_fn = train_input_fn(df = traindf, batch_size = BATCH_SIZE),
                                      max_steps = num_train_steps)
    eval_spec = tf.estimator.EvalSpec(input_fn = eval_input_fn(df = evaldf),
                                        steps = None,                                        
                                        start_delay_secs = 1, # start evaluating after N seconds
                                        throttle_secs = 10  # evaluate every N seconds)
                                     )
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    
    return estimator
    
# Run training    
shutil.rmtree(OUTDIR, ignore_errors = True) # start fresh each time
estimator = train_and_evaluate(OUTDIR, num_train_steps = (100 * len(traindf)) / BATCH_SIZE) 

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '../models', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Not using Distribute Coordinator.
INFO:tensorflow:Running training and evaluation locally (non-distributed).

In [40]:
results = estimator.predict(input_fn=test_input_fn(test_df_clean))

In [52]:
predicted_classes = [p["classes"] for p in results]
print('Number of predictions: {}'.format(len(predicted_classes)))


Number of predictions: 416


In [49]:
test_id = test_df['PassengerId']

submission = pd.DataFrame()
submission['PassengerId'] = test_id
submission['Survived'] = predicted_classes
submission['Survived'] = submission['Survived'].astype(int)

print('Submission shape: {}'.format(submission.shape))
submission.head()

Submission shape: (418, 2)


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [50]:
submission.to_csv('titanic_DNN_submission_1.csv',index=False)