## Start the Docker container

### Run the following command:
```bash
docker run -it --rm -p 8888:8888 -v /Users/sylvain/Data_Science/Kaggle/competition_titanic/datasets:/home/jovyan/datasets jupyter-server
```

In [13]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow import keras
from keras import layers

import pandas as pd
import numpy as np
import matplotlib as plt

<h3>Data preprocessing</h3>

In [2]:
def modelAgePredictor(train_path="datasets/train.csv"):
    """Train a linear regression that estimates the age of a passenger.

    The model is fitted on the rows of the CSV file that have no missing
    value, using ``SibSp``, ``Parch``, ``Fare`` and the one-hot encoded
    ``Sex`` column as features and ``Age`` as the target.

    Args:
        train_path: Path of the CSV file to learn from. Its first column is
            used as the index. Defaults to the notebook's training set.

    Returns:
        The fitted ``LinearRegression`` estimator.
    """
    # Imported locally: the notebook's import cell does not bring
    # LinearRegression into scope, so the name would otherwise be undefined.
    from sklearn.linear_model import LinearRegression

    df_age = pd.read_csv(filepath_or_buffer=train_path, index_col=0)
    df_age = df_age.loc[:, ["Sex", "Age", "SibSp", "Parch", "Fare"]]
    # One-hot encode "Sex", then discard every row that still has a gap
    # (this removes the passengers whose age is unknown).
    df_age = pd.get_dummies(data=df_age).dropna(axis=0)

    age_features = df_age.drop(columns="Age")
    age_target = df_age.loc[:, "Age"]

    age_predictor = LinearRegression()
    age_predictor.fit(X=age_features, y=age_target)

    return age_predictor

def guessAge(df):
    """Fill the missing ``Age`` cells of *df* with model-predicted values.

    The predictions come from the regression built by
    ``modelAgePredictor``. *df* is modified in place and also returned.

    Args:
        df: Frame with at least the ``Sex``, ``Age``, ``SibSp``, ``Parch``
            and ``Fare`` columns.

    Returns:
        The same frame, with no missing value left in ``Age``.
    """
    missing_age = df.loc[:, "Age"].isna()
    if not missing_age.any():
        # Nothing to impute: skip the model training and avoid asking the
        # estimator to predict on zero rows.
        return df

    agePredictor = modelAgePredictor()
    # Encode the whole frame before selecting rows, so the dummy columns do
    # not depend on which categories happen to occur among the rows whose
    # age is missing.
    encoded = pd.get_dummies(data=df.loc[:, ["Sex", "SibSp", "Parch", "Fare"]])
    df.loc[missing_age, "Age"] = agePredictor.predict(X=encoded.loc[missing_age])

    return df

In [3]:
def cleanData(df):
    """Fill missing values, drop the free-text columns and one-hot encode.

    The fills and the column drop are applied to *df* itself; the returned
    frame is the new object produced by ``pd.get_dummies``.

    Args:
        df: Frame with the ``Embarked``, ``Pclass``, ``Fare``, ``Age``,
            ``Name``, ``Ticket`` and ``Cabin`` columns.

    Returns:
        The cleaned frame, with the remaining text columns replaced by
        dummy variables.
    """
    # Missing "Embarked" becomes "S" (the most common value)
    df["Embarked"] = df["Embarked"].fillna("S")
    # Missing "Pclass" becomes 3 (the most common value)
    df["Pclass"] = df["Pclass"].fillna(3)
    # Missing "Fare" and "Age" become the column mean, computed once per
    # column. guessAge(df=df) is the regression-based alternative for "Age".
    df["Fare"] = df["Fare"].fillna(df["Fare"].mean())
    df["Age"] = df["Age"].fillna(df["Age"].mean())

    # Name, Ticket and Cabin are not used as features
    df.drop(labels=["Name", "Ticket", "Cabin"], axis=1, inplace=True)

    # Replace Sex and Embarked with dummy variables
    return pd.get_dummies(data=df)

In [4]:
def scaleData(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* whose non-binary columns are standardised.

    ``Pclass``, ``Age`` and ``Fare`` are passed through a ``StandardScaler``
    fitted on *df* itself; every other column is left as it is. Because the
    scaler is fitted on the frame it receives, two different frames are
    scaled with different statistics.

    Args:
        df: Frame containing at least the ``Pclass``, ``Age`` and ``Fare``
            columns.

    Returns:
        A new frame with the same columns and index as *df*; the input is
        not modified.
    """
    numeric_columns = ["Pclass", "Age", "Fare"]
    scaled_columns = pd.DataFrame(
        StandardScaler().fit_transform(X=df.loc[:, numeric_columns]),
        columns=numeric_columns,
        index=df.index,
    )

    # Write the scaled values back; previously they were computed and then
    # dropped, so the frame came back unscaled.
    scaled = df.copy()
    scaled[numeric_columns] = scaled_columns

    return scaled

In [5]:
# Read the training set, using its first column as the index
df = pd.read_csv("datasets/train.csv", index_col=0)
# Clean the data (the scaling step is currently switched off)
df = cleanData(df)
# df = scaleData(df)
# Separate the predictors from the "Survived" label
features = df.drop(columns="Survived")
target = df["Survived"].copy()
# Hold out 20% of the rows for evaluation, with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

### Neural network

In [None]:
# Use TensorFlow's current distribution strategy (the default one works on
# CPU and on a single GPU) and report how many replicas it keeps in sync.
strategy = tf.distribute.get_strategy()
replica_count = strategy.num_replicas_in_sync
print("Number of accelerators: ", replica_count)

In [None]:
# `layers.Dense` cannot be built without its `units` argument. "Survived" is
# a 0/1 label, so the network ends in a single sigmoid unit that outputs a
# survival probability; hidden layers can be inserted in front of it.
model = keras.Sequential([
    layers.Dense(units=1, activation="sigmoid"),
])

<h3>Make a prediction for the competition</h3>

In [None]:
# Load the competition test set, using its first column as the index
df = pd.read_csv(filepath_or_buffer="datasets/test.csv", index_col=0)
# Save the index before the frame is transformed
resDic = {"PassengerId": df.index}
# Clean and scale the data
# NOTE(review): the training cell has `df = scaleData(df)` commented out;
# the model must be trained with the same preprocessing as is applied here.
df = cleanData(df)
df = scaleData(df)
# Choose a model to assign
# NOTE(review): `tree_cv` is not defined in the visible part of this
# notebook; confirm where it is created before running this cell.
bestModel = tree_cv
# Make the prediction
resDic["Survived"] = bestModel.predict(X=df)
# Save it in a DataFrame
prediction = pd.DataFrame(data=resDic)
# Make the prediction an integer. The whole column is reassigned because
# writing through `.loc[:, "Survived"]` keeps the column's existing dtype.
prediction["Survived"] = prediction["Survived"].astype(int)
# Save the prediction
prediction.to_csv(path_or_buf="datasets/prediction.csv", index=False)

In [None]:
# Reload the raw training set and preview its first 50 rows
df = pd.read_csv("datasets/train.csv", index_col=0)
df.head(n=50)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
