In [10]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras import layers

In [11]:
file_path = "/Users/kiyoshitakeuchi/Desktop/Machine Learning/titanic/"
file_name = "train.csv"
test_file_name = "test.csv"

In [12]:
titanic_train = pd.read_csv(file_path + file_name)
titanic_train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [13]:
titanic_test = pd.read_csv(file_path + test_file_name)
titanic_test.head(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


The dataframes have a lot of NaNs which will cause a problem. So before continueing, we will get rid of the NaNs.

In [14]:
def remove_nan(data_frame): 
    mean = data_frame.select_dtypes(include = 'number').median()
    mean = mean.to_dict()
    data_frame.fillna(value=mean, inplace=True)
    data_frame.fillna(value="?", inplace=True)
    return

remove_nan(titanic_test)
remove_nan(titanic_train)

In [15]:
training_set = titanic_train.sample(frac = .80)
validation_set = titanic_train.drop(training_set.index)

In [16]:
print(len(training_set))
print(len(validation_set))

713
178


In [17]:
training_set.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
208,209,1,3,"Carr, Miss. Helen ""Ellen""",female,16.0,0,0,367231,7.75,?,Q
761,762,0,3,"Nirva, Mr. Iisakki Antino Aijo",male,41.0,0,0,SOTON/O2 3101272,7.125,?,S
552,553,0,3,"O'Brien, Mr. Timothy",male,28.0,0,0,330979,7.8292,?,Q
288,289,1,2,"Hosono, Mr. Masabumi",male,42.0,0,0,237798,13.0,?,S
96,97,0,1,"Goldschmidt, Mr. George B",male,71.0,0,0,PC 17754,34.6542,A5,C
390,391,1,1,"Carter, Mr. William Ernest",male,36.0,1,2,113760,120.0,B96 B98,S
152,153,0,3,"Meo, Mr. Alfonzo",male,55.5,0,0,A.5. 11206,8.05,?,S
690,691,1,1,"Dick, Mr. Albert Adrian",male,31.0,1,0,17474,57.0,B20,S
714,715,0,2,"Greenberg, Mr. Samuel",male,52.0,0,0,250647,13.0,?,S
99,100,0,2,"Kantor, Mr. Sinai",male,34.0,1,0,244367,26.0,?,S


Here is where the actual pre-processing will begin. 

In [19]:
titanic_features = training_set.copy()
titanic_labels = titanic_features.pop('Survived')

In [20]:
def preprocess_data(data_frame):    
    inputs = {}

    for name, column in data_frame.items():
        dtype = column.dtype
        if dtype == object:
            dtype = tf.string
        else:
            dtype = tf.float32
        inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=dtype)
        
    numeric_inputs = {name:input for name,input in inputs.items() 
                      if input.dtype==tf.float32}


    x = layers.Concatenate()(list(numeric_inputs.values()))
    norm = layers.Normalization()
    norm.adapt(np.array(titanic_train[numeric_inputs.keys()]))
    all_numeric_inputs = norm(x)

    preprocessed_inputs = [all_numeric_inputs]
    
    for name, input in inputs.items():
        if input.dtype == tf.float32:
            continue 

        lookup = layers.StringLookup(vocabulary=np.unique(data_frame[name]))
        one_hot = layers.CategoryEncoding(num_tokens=lookup.vocabulary_size())
        x = lookup(input)
        x = one_hot(x)
        preprocessed_inputs.append(x)
    
    preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs)

    titanic_preprocessing = tf.keras.Model(inputs, preprocessed_inputs_cat)
    
    return [inputs, titanic_preprocessing]

In [22]:
preprocess_data(titanic_features)[1]

<keras.engine.functional.Functional at 0x1441de970>

In [37]:
def titanic_model(data_frame):
    body = tf.keras.Sequential([
    layers.Dense(64),
    layers.Dense(30),
    layers.Dense(1)
    ])
    
    inputs, preprocessing_head = preprocess_data(data_frame)
    preprocessed_inputs = preprocessing_head(inputs)
    result = body(preprocessed_inputs)
    model = tf.keras.Model(inputs, result)
    

    model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                optimizer=tf.keras.optimizers.Adam())
    return model

In [38]:
titanic_features_dict = {name: np.array(value) 
                         for name, value in titanic_features.items()}

In [39]:
titanic_test_features_dict = {name: np.array(value) 
                         for name, value in titanic_test.copy().items()}

In [40]:
titanic_model = titanic_model(titanic_features)

In [46]:
titanic_model.fit(x=titanic_features_dict, y=titanic_labels, validation_split = 0.2, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x144fc0610>

In [47]:
array = titanic_model.predict(titanic_test_features_dict)

In [48]:
import csv

with open('titanic_solution_2.csv', 'w', newline='') as file:
    fieldnames = ['PassengerId', 'Survived']
    writer = csv.DictWriter(file, fieldnames=fieldnames)

    writer.writeheader()
    n = 892
    for element in array:
        if element > 0:
            writer.writerow({'PassengerId': n, 'Survived': 1})
            n += 1
        else:
            writer.writerow({'PassengerId': n, 'Survived': 0})
            n += 1        

In [53]:
solution_file_path = "/Users/kiyoshitakeuchi/Documents/GitHub/Machine-Learning/Kaggle/titanic_solution.csv"
fileObject = csv.reader(solution_file_path)



row_count = sum(1 for row in fileObject)
print(row_count)

84


In [54]:
pandas_solution = pd.read_csv(solution_file_path)
pandas_solution.head(419)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
