## Start the Docker container

### Run the following commands:
docker run -it --rm -p 8888:8888 -v /Users/sylvain/Data_Science/Kaggle/competition_titanic/datasets:/home/jovyan/datasets jupyter-server

In [168]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.callbacks import EarlyStopping

import pandas as pd
import numpy as np
import matplotlib as plt

<h3>Data preprocessing

In [169]:
def cleanData(df):
    
    # Replace the value of the missing "Embarked" by "S" (the most common)
    df.loc[df.loc[:,"Embarked"].isna(), "Embarked"] = df.loc[df.loc[:,"Embarked"].isna(), "Embarked"].apply(lambda x: "S")
    # Replace the missing pclass value with the most common one
    df.loc[df.loc[:,"Pclass"].isna(), "Pclass"] = df.loc[df.loc[:,"Pclass"].isna(), "Pclass"].apply(lambda x: 3)
    # Replace the missing Fare values with the mean one
    df.loc[df.loc[:,"Fare"].isna(), "Fare"] = df.loc[df.loc[:,"Fare"].isna(), "Fare"].apply(lambda x:df.loc[:,"Fare"].mean())
    # fill the missing age values
    df.loc[df.loc[:,"Age"].isna(), "Age"] = df.loc[df.loc[:,"Age"].isna(), "Age"].apply(lambda x: df.loc[:,"Age"].mean())
    # df = guessAge(df=df)
    
    # drop Name, Ticket and Cabin
    df.drop(
        labels=["Name", "Ticket", "Cabin"],
        inplace=True,
        axis=1
        ) 

    # replace Sex and Embarked with dummy variables
    df = pd.get_dummies(data=df)
    
    return df

In [170]:
def scaleData(df=pd.DataFrame):
	'''
 	This function will scale the columns of the data Frame which are not binary and return it
  	'''
	columnsName = df.columns
	indexName = df.index
	scaler = StandardScaler()
	scaledColumns = pd.DataFrame(scaler.fit_transform(X=df.loc[:, ["Age", "Fare"]]), columns=["Age", "Fare"], index=indexName)
	
	return df

In [171]:
def featureEngineering_1(df:pd.DataFrame):
    
	# define the helper functions
	def isClassMember(x, classTicket):
		if x == classTicket:
			return 1
		return 0

	def singleOrCouple(x):
		if x>0:
			return 1
		return 0

	def oneOrMore(x):
		if x>0:
			return 1
		return 0

	# encode the new features
	df["siblings"] = df.loc[:,"SibSp"].apply(lambda x: oneOrMore(x))
	df["couple"] = df.loc[:,"Parch"].apply(lambda x:singleOrCouple(x))
	df["firstClass"] = df.loc[:,"Pclass"].apply(lambda x:isClassMember(x, 1))
	df["secondClass"] = df.loc[:,"Pclass"].apply(lambda x:isClassMember(x, 2))
	df["thirdClass"] = df.loc[:,"Pclass"].apply(lambda x:isClassMember(x, 3))
	
	# drop the old ones
	df = df.drop(labels=["Pclass", "SibSp", "Parch"], axis=1)
 
	return df

In [172]:
def dfPipeline(df:pd.DataFrame):
	df = cleanData(df)
	df = featureEngineering_1(df)
	df = scaleData(df)
	return df

In [173]:
# Load the Dataset
df = pd.read_csv(
	filepath_or_buffer="datasets/train.csv",
 index_col=0
)

In [174]:
# Clean and scale the data
df = dfPipeline(df)
# create the features and target datasets
features = df.copy()
features.drop(labels="Survived", axis=1, inplace=True)
target = df.loc[:,"Survived"].copy()
# create the train and test datasets
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=42)

### Neural network

In [180]:
INPUT_SHAPE = [len(x_train.columns)]
X=features
Y=target
x=x_train
y=y_train

model = keras.Sequential([
	layers.Dense(units = 256, input_shape = INPUT_SHAPE, activation="relu"),
	layers.BatchNormalization(),
	layers.Dropout(0.3),
 	layers.Dense(units = 512, activation="relu"),
	# layers.BatchNormalization(),
	layers.Dropout(0.3),
  	layers.Dense(units = 256, activation="relu"),
	# layers.BatchNormalization(),
	layers.Dropout(0.3),
	layers.Dense(units = 1, activation = "sigmoid")
])

model.compile(
	optimizer="adam",
	loss="binary_crossentropy",
	metrics=["binary_accuracy"],
)

early_stopping = EarlyStopping(
	patience = 50,
 	min_delta = 0.01,
	restore_best_weights = True,
)

# train the model
training_history = model.fit(
	x=X,
	y=Y,
	validation_data = (
		x_test,
		y_test,
	),
	batch_size = 256,
	epochs = 1250,
 	callbacks = [early_stopping],
	verbose = False,
)

# see the training history
history_df = pd.DataFrame(training_history.history)
history_df.loc[10:,:].plot()
best_val_acc = history_df.loc[:,"val_binary_accuracy"].max()
print(f"The best validation accuray is = {best_val_acc}")
mean_val_acc = history_df.iloc[-10:,3].mean()
print(f"The mean validation accuray is = {mean_val_acc}")
mean_val_acc = history_df.iloc[-10:,1].mean()
print(f"The mean accuray is = {mean_val_acc}")

In [176]:
'''
The best validation accuray is = 0.834080696105957
The mean validation accuray is = 0.8255605340003968
The mean accuray is = 0.8191919267177582
'''

'\nThe best validation accuray is = 0.834080696105957\nThe mean validation accuray is = 0.8255605340003968\nThe mean accuray is = 0.8191919267177582\n'

<h3>Make a prediction for the competition</h3>

In [178]:
# Load the Dataset
df = pd.read_csv(filepath_or_buffer="datasets/test.csv", index_col=0)

# Save the index
resDic = {"PassengerId": df.index}

# Clean and scale the data
df = dfPipeline(df)

# make the prediction
temp = model.predict(df)
pred = []
for arr in temp:
    pred.append(arr[0])
resDic["Survived"] = pred

# save it in a DataFrame
prediction = pd.DataFrame(data=resDic)

# make the prediction an integer
prediction.loc[:,"Survived"] = prediction.loc[:,"Survived"].apply(lambda x:round(x))

# save the prediction
prediction.to_csv(path_or_buf="datasets/predictionNN6.csv", index=False)



  prediction.loc[:,"Survived"] = prediction.loc[:,"Survived"].apply(lambda x:round(x))
