In [22]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt


# Load the dataset
data_train = pd.read_csv("./Train.csv")

# Extract information about the data

In [23]:
# Fill nan value with mean value: 
data_train = data_train.fillna(data_train.median())
data_train["Embarked"] = data_train["Embarked"].fillna("S")

In [24]:
# Display columns

print ("Columns : {}".format(data_train.columns))
print ("Nb rows : {}".format(data_train.shape[0]))

Columns : Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Nb rows : 891


In [10]:
data_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.002015,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,29.699118,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


# Preprocessing

In [25]:
# We remove useless columns : 
data_train = data_train.drop(columns=['Cabin','Ticket','Name'])

In [26]:
# We transform string to numeric values : 
data_train.loc[data_train["Sex"] == "male", "Sex"] = 1
data_train.loc[data_train["Sex"] == "female", "Sex"] = 0

data_train.loc[data_train["Embarked"]=="S","Embarked"] = 0
data_train.loc[data_train["Embarked"]=="C","Embarked"] = 1
data_train.loc[data_train["Embarked"]=="Q","Embarked"] = 2



In [27]:
#Display unique values by columns to be sure that we have only numbers

for col in data_train.columns:
    print("Unique values for {}".format(col))
    print("\t {}".format(data_train[col].unique()))

Unique values for PassengerId
	 [  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
 235 236 237 238 23

In [37]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

x = data_train.drop(columns=["PassengerId", "Survived"])
y = data_train["Survived"]

x_preprocess = scaler.fit_transform(x)

# Model 1 : Random Forest 

In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

for n in [10, 20, 30, 50, 60]:
    algo = RandomForestClassifier(n_estimators=n, random_state=0)
    scores = cross_val_score(algo, x, y, cv=3)

    print ("Score using {} estimators : {}".format(n, scores.mean()))

Score using 10 estimators : 0.7946127946127945
Score using 20 estimators : 0.7912457912457912
Score using 30 estimators : 0.7957351290684623
Score using 50 estimators : 0.8035914702581369
Score using 60 estimators : 0.7957351290684623


# Model 2 : Simple Neural net

In [52]:
from sklearn.neural_network import MLPClassifier

for n in [10, 100, 200, 500]:
    algo = MLPClassifier(hidden_layer_sizes=[n], activation="tanh", max_iter=400, random_state=0)
    scores = cross_val_score(algo, x_preprocess, y, cv=3)
    print ("Score using {} neurones : {}".format(n, scores.mean()))


Score using 10 neurones : 0.7957351290684626
Score using 100 neurones : 0.7901234567901234
Score using 200 neurones : 0.7890011223344556
Score using 500 neurones : 0.7878787878787877


# Model 3 : SVM 

In [56]:
from sklearn import svm

algo = svm.SVC()
scores = cross_val_score(algo, x_preprocess, y, cv=3)

print ("Mean : {}".format(scores.mean()))


Mean : 0.7867564534231201
