## Start the Docker container

### Run the following commands:
docker run -it --rm -p 8888:8888 -v /Users/sylvain/Data_Science/Kaggle/competition_titanic/datasets:/home/jovyan/datasets jupyter-server

In [9]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.tree import plot_tree
from sklearn.metrics import f1_score
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import GridSearchCV

from tensorflow import keras
from keras import layers

import pandas as pd
import numpy as np
import matplotlib as plt

<h3>Missing Age Data (model training and helper function)</h3>

In [10]:
def modelAgePredictor():
	'''
	Build and fit a linear regression that estimates the age of a passenger.

	The file "datasets/train.csv" is read, restricted to the columns Sex, Age,
	SibSp, Parch and Fare, one-hot encoded, and stripped of every row that
	still holds a missing value. Age is the regression target; the remaining
	columns are the predictors.

	Returns the fitted LinearRegression instance.
	'''
	passengers = (
		pd.read_csv(filepath_or_buffer="datasets/train.csv", index_col=0)
		.loc[:, ["Sex", "Age", "SibSp", "Parch", "Fare"]]
		.pipe(pd.get_dummies)
		.dropna(axis=0)
	)

	# pop() takes Age out of the frame and hands it back as the target,
	# leaving only the predictor columns behind.
	knownAges = passengers.pop("Age")

	return LinearRegression().fit(X=passengers, y=knownAges)

def guessAge(df):
	'''
	Fill the missing "Age" cells of df with values predicted by the age model.

	Parameters
	----------
	df : pandas.DataFrame
		Must contain the columns "Age", "Sex", "SibSp", "Parch" and "Fare".
		The frame is modified in place.

	Returns
	-------
	pandas.DataFrame
		The same df object, with every missing age replaced by a prediction.
	'''
	missingAge = df.loc[:, "Age"].isna()
	# Nothing to impute: skip training the model and avoid calling predict()
	# on an empty frame, which scikit-learn rejects.
	if not missingAge.any():
		return df

	agePredictor = modelAgePredictor()
	features_age = pd.get_dummies(data=df.loc[missingAge, ["Sex", "SibSp", "Parch", "Fare"]])
	# The rows lacking an age may not contain every category seen during
	# training (e.g. only one sex), so line the dummy columns up with the
	# columns the model was fitted on; absent categories become all-zero.
	features_age = features_age.reindex(columns=agePredictor.feature_names_in_, fill_value=0)

	df.loc[missingAge, "Age"] = agePredictor.predict(X=features_age)

	return df

# Data preprocessing
<h3>Missing values</h3>
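- Cabin: the column is dropped for now, so its missing values need no replacement
<br>
- Age: missing values are replaced by the mean age (a regression-based estimate, <code>guessAge</code>, is available as an alternative)
<br>
- Embarked: the two missing values are replaced by "S", the most common port
<br>
- Pclass and Fare: a missing class is replaced by 3 (the most common one), a missing fare by the mean fare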
<h3>Non numerical values</h3>
- Name: we will drop it
<br>
- Sex: we will encode it (2 categories)
<br>
- Ticket: we will drop it
<br>
- Cabin: we will first drop it, and then try to find a way to encode it and use it
<br>
- Embarked: we will encode it (3 categories)

In [11]:
def cleanData(df):
    '''
    Impute the missing values, drop the unused text columns and one-hot
    encode the remaining categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger data containing at least the columns "Embarked",
        "Pclass", "Fare", "Age", "Name", "Ticket" and "Cabin".
        The frame is NOT modified; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        A new frame without "Name", "Ticket" and "Cabin", without missing
        values in the imputed columns, and with every remaining text column
        replaced by dummy columns. The dummy columns depend on the categories
        present in df.
    '''
    cleaned = df.copy()

    # Missing "Embarked" -> "S" (the most common port)
    cleaned["Embarked"] = cleaned["Embarked"].fillna("S")
    # Missing "Pclass" -> 3 (the most common class)
    cleaned["Pclass"] = cleaned["Pclass"].fillna(3)
    # Missing "Fare" / "Age" -> mean of the known values of that column.
    # The mean is computed once, not once per missing row.
    cleaned["Fare"] = cleaned["Fare"].fillna(cleaned["Fare"].mean())
    cleaned["Age"] = cleaned["Age"].fillna(cleaned["Age"].mean())
    # Alternative for Age: model-based imputation via guessAge(df=...)

    # Name, Ticket and Cabin are not used as features
    cleaned = cleaned.drop(columns=["Name", "Ticket", "Cabin"])

    # Replace Sex and Embarked with dummy variables
    return pd.get_dummies(data=cleaned)

<h3>Scale the Data (if necessary)</h3>

In [12]:
def scaleData(df: pd.DataFrame):
	'''
	Standardise the non-binary columns of the data frame and return the result.

	The columns "Pclass", "Age" and "Fare" are replaced by their standardised
	values (StandardScaler: zero mean, unit variance). All other columns, the
	column order and the index are kept as they are.

	Parameters
	----------
	df : pandas.DataFrame
		Frame containing at least the columns "Pclass", "Age" and "Fare".
		It is not modified.

	Returns
	-------
	pandas.DataFrame
		A copy of df with the three columns scaled.

	Notes
	-----
	A new scaler is fitted on every call, so the scaling depends on the frame
	passed in. Apply this function either to both the training and the
	prediction data or to neither of them.
	'''
	columnsToScale = ["Pclass", "Age", "Fare"]

	scaled = df.copy()
	# Whole-column assignment so the integer Pclass column can become float.
	scaled[columnsToScale] = StandardScaler().fit_transform(X=df.loc[:, columnsToScale])

	return scaled

<h3>Load the Dataset and split it</h3>

In [13]:
# Read the training set; the first CSV column becomes the index
df = pd.read_csv(filepath_or_buffer="datasets/train.csv", index_col=0)

# Impute and encode. Scaling is left out here, so every model below is
# fitted on unscaled features.
df = cleanData(df)

# Everything except "Survived" is a feature; "Survived" is the target
features = df.drop(columns="Survived")
target = df["Survived"].copy()

# Hold out 20% of the rows for evaluation (fixed seed for repeatability)
x_train, x_test, y_train, y_test = train_test_split(
	features,
	target,
	test_size=0.2,
	random_state=42,
)

<h4>K-Nearest-Neighbors</h4>

In [14]:
# Best held-out accuracy seen so far and the neighbour count that produced it
bestScore, bestK = 0, 0

for k in range(1, 20):
	knnModel = KNeighborsClassifier(n_neighbors=k).fit(X=x_train, y=y_train)
	score = knnModel.score(X=x_test, y=y_test)

	# Strict comparison: on a tie the smaller k is kept
	if score > bestScore:
		bestScore, bestK = score, k

print(f"The best score is: {bestScore} and was obtaines with k={bestK}")

The best score is: 0.7430167597765364 and was obtaines with k=13


<h4>Decision Tree</h4>

In [15]:
# Compare the split-quality criteria at a fixed depth and seed
for hyperParameter in ("gini", "entropy", "log_loss"):
	treeModel = DecisionTreeClassifier(
		max_depth=5,
		random_state=42,
		criterion=hyperParameter,
	).fit(X=x_train, y=y_train)
	score = treeModel.score(X=x_test, y=y_test)
	print(f"The decision tree {hyperParameter} has a score of {score}")

The decision tree gini has a score of 0.7988826815642458
The decision tree entropy has a score of 0.8044692737430168
The decision tree log_loss has a score of 0.8044692737430168


<h4>Support Vector Machine</h4>

In [16]:
# Best held-out accuracy so far, with the C / kernel pair that achieved it
bestScore, bestC, bestKernel = 0, "", ""

# Every (C, kernel) combination, C varying slowest
candidates = [
	(c, kernel)
	for c in [0.001, 0.01, 0.1, 1]
	for kernel in ["linear", "poly", "rbf", "sigmoid"]
]

for c, kernel in candidates:
	svmModel = SVC(C=c, kernel=kernel).fit(X=x_train, y=y_train)
	score = svmModel.score(X=x_test, y=y_test)
	# Strict comparison: on a tie the earlier combination is kept
	if score > bestScore:
		bestScore, bestC, bestKernel = score, c, kernel

print(f"The SVM classifier with c = {bestC}, and kernel = {bestKernel}, has an accuracy score of: {bestScore}")


The SVM classifier with c = 0.01, and kernel = linear, has an accuracy score of: 0.7821229050279329


<h4>Logistic Regression</h4>

In [17]:
# Fit on the training split, then display the accuracy on the held-out split
logRegModel = LogisticRegression(solver="newton-cg", max_iter=100)
logRegModel.fit(X=x_train, y=y_train).score(X=x_test, y=y_test)

0.8100558659217877

In [18]:
# Refit the same model on the full data set; the displayed value is the
# accuracy on that same data (fit() returns the estimator, so it can be chained)
logRegModel.fit(X=features, y=target).score(X=features, y=target)

0.8013468013468014

In [19]:
# hyperParameter = [{
#     "C": [0.3, 1, 3],
#     "kernel": ["linear", "rbf", "poly"],
#     "gamma": ["scale", "auto"],
#     "decision_function_shape": ["ovo", "ovr"],
# }]

# svmEstimator = SVC()

# svm_cv = GridSearchCV(
# 	estimator=svmEstimator,
# 	param_grid=hyperParameter,
#  	cv=4,
#   	scoring="accuracy"
# )

# svm_cv.fit(X=x_train, y=y_train)
# svm_cv.best_score_

<h4>Search for the best parameters for a tree classifier</h4>

In [None]:
# Hyper-parameter grid explored for the decision tree
parameters = {
     'criterion': ['gini', 'entropy'],
     'splitter': ['best', 'random'],
     'max_depth': [2*n for n in range(1,10)],
     'max_features': ['sqrt'],
     'min_samples_leaf': [1, 2, 4],
     'min_samples_split': [2, 5, 10],
}

# With splitter='random' and max_features='sqrt' the tree draws random
# numbers while it is built; a fixed seed keeps the search reproducible
# from one run to the next (same seed as the other cells).
tree = DecisionTreeClassifier(random_state=42)

# Exhaustive search over the grid with 8-fold cross-validation
tree_cv = GridSearchCV(
     estimator=tree,
     param_grid=parameters,
     cv=8
)

tree_cv.fit(X=features, y=target)
tree_cv.best_score_

0.826063787001287

<h3>Make a prediction for the competition</h3>

In [None]:
# Load the competition test set
df = pd.read_csv(filepath_or_buffer="datasets/test.csv", index_col=0)
# Keep the passenger ids for the submission file
resDic = {"PassengerId": df.index}
# Clean the data the same way as the training data. No scaling is applied:
# the models were fitted on unscaled features (scaling is disabled in the
# training cell), so the test features must stay unscaled as well.
df = cleanData(df)
# get_dummies only creates columns for the categories present in this file;
# line the columns up with the training features (same set, same order),
# filling any dummy column that is absent here with 0.
df = df.reindex(columns=features.columns, fill_value=0)
# Choose the model used for the submission
bestModel = tree_cv
# Predict and store the result as plain integers
resDic["Survived"] = bestModel.predict(X=df).astype(int)
# Assemble the submission table
prediction = pd.DataFrame(data=resDic)
# Save the prediction
prediction.to_csv(path_or_buf="datasets/prediction.csv", index=False)