# Titanic: Machine Learning from Disaster

### Challenge from [Kaggle](https://www.kaggle.com/c/titanic)

import [numpy](http://www.numpy.org/)

import [pandas](http://pandas.pydata.org/)

import [scikit-learn](http://scikit-learn.org/stable)

In [462]:
import numpy as np
from sklearn import tree
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

### Get DataSets of train and test

In [463]:
# Load the labelled training set. The path has no directory component, so the
# file is looked up relative to the current working directory.
train_path = "train.csv"
train = pd.read_csv(train_path)

# Load the test set the same way.
test_path = "test.csv"
test = pd.read_csv(test_path)

### Show columns for observation and analysis

In [464]:
# Preview the first rows of the training set to see which columns are available.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [465]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,,0.0,0.0,7.9104
50%,446.0,0.0,3.0,,0.0,0.0,14.4542
75%,668.5,1.0,3.0,,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


### Check the number of people that survived in the end

In [466]:
# Absolute number of passengers per value of the Survived column.
train['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [467]:
# Same breakdown as relative frequencies, scaled from fractions to percentages.
train['Survived'].value_counts(normalize = True) * 100

0    61.616162
1    38.383838
Name: Survived, dtype: float64

### How many male passengers survived?

In [468]:
# Survival outcome counts restricted to the rows whose Sex is 'male'.
male_outcomes = train.loc[train['Sex'] == 'male', 'Survived']
print(male_outcomes.value_counts())

0    468
1    109
Name: Survived, dtype: int64


In [469]:
# Survival outcome of male passengers as percentages of all male passengers.
male_outcomes = train.loc[train['Sex'] == 'male', 'Survived']
male_outcome_pct = male_outcomes.value_counts(normalize=True) * 100
print(male_outcome_pct)

0    81.109185
1    18.890815
Name: Survived, dtype: float64


### Check the number of missing values in each column

In [470]:
# Report, column by column, how many entries of the training set are null.
missing_per_column = train.isnull().sum()
for col, n_missing in missing_per_column.items():
    print("Number of missing data on " + col, n_missing)

Number of missing data on PassengerId 0
Number of missing data on Survived 0
Number of missing data on Pclass 0
Number of missing data on Name 0
Number of missing data on Sex 0
Number of missing data on Age 177
Number of missing data on SibSp 0
Number of missing data on Parch 0
Number of missing data on Ticket 0
Number of missing data on Fare 0
Number of missing data on Cabin 687
Number of missing data on Embarked 2


### Convert the numeric age into a life period (child, adult, elderly)

In [471]:
# Bucket each passenger's age into a coarse life period:
#   0 = child (Age < 18), 1 = adult (18 <= Age <= 60), 2 = old (Age > 60).
# Rows with a missing Age match none of the masks and stay NaN at this point.
#
# `.loc[mask, column]` writes into `train` itself. The chained form
# `train.Period[mask] = ...` assigns through an intermediate Series, which
# pandas does not guarantee to propagate back to the frame (and which is a
# no-op when Copy-on-Write is enabled).
train['Period'] = float('NaN')
train.loc[train['Age'] < 18, 'Period'] = 0
train.loc[train['Age'] >= 18, 'Period'] = 1
# Applied last so that it overrides the adult code for ages above 60.
train.loc[train['Age'] > 60, 'Period'] = 2
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Period
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1.0


In [472]:
# Print how many passengers fall into each age period, one section per
# period, separated by a dashed rule.
period_names = ["Child", "Adult", "Old man"]
report_sections = []
for period_code, period_name in enumerate(period_names):
    in_period = train['Period'] == period_code
    period_counts = train.loc[in_period, 'Period'].value_counts()
    report_sections.append(period_name + " " + str(period_counts))
print("\n--------------------\n".join(report_sections))

Child 0.0    113
Name: Period, dtype: int64
--------------------
Adult 1.0    579
Name: Period, dtype: int64
--------------------
Old man 2.0    22
Name: Period, dtype: int64


### Check the percentage of people that survived in each age period

In [473]:
# For every age period, print its label followed by the survival breakdown
# (in percent) of the passengers in that period.
for period_code, period_name in enumerate(("Child", "Adult", "Old man")):
    if period_code > 0:
        # Dashed rule between sections, but not before the first one.
        print("--------------------")
    print(period_name)
    outcomes = train.loc[train['Period'] == period_code, 'Survived']
    print(outcomes.value_counts(normalize=True) * 100)

Child
1    53.982301
0    46.017699
Name: Survived, dtype: float64
--------------------
Adult
0    61.312608
1    38.687392
Name: Survived, dtype: float64
--------------------
Old man
0    77.272727
1    22.727273
Name: Survived, dtype: float64


In [474]:
# Passengers whose Period is still NaN (no age bucket was assigned) are
# given code 1, the same code used for the 18-60 "adult" bucket.
train['Period'] = train['Period'].fillna(1)

### Replace the sex text with a number that represents it

In [475]:
# Encode Sex as an integer: 'male' -> 0, 'female' -> 1.
#
# The column is rebuilt with a single map() and assigned back to the frame.
# The chained form `train['Sex'][mask] = code` assigns through an
# intermediate Series, which pandas does not guarantee to write back into
# `train` (it is a no-op when Copy-on-Write is enabled).
# Values that are not in the mapping (including already-encoded 0/1) are
# passed through unchanged, so re-running this cell is harmless.
sex_codes = {'male': 0, 'female': 1}
train['Sex'] = train['Sex'].map(lambda value: sex_codes.get(value, value))

In [476]:
# Check that the Sex column now holds the numeric codes.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Period
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,1.0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,1.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,1.0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,1.0


### Fill the null values of column Embarked with 'S'

In [477]:
# Passengers with no recorded Embarked value are assigned 'S'.
train['Embarked'] = train['Embarked'].fillna('S')

In [478]:
# Re-run the null count per column of the training set after the fills above.
missing_per_column = train.isnull().sum()
for col, n_missing in missing_per_column.items():
    print("Number of missing data on " + col, n_missing)

Number of missing data on PassengerId 0
Number of missing data on Survived 0
Number of missing data on Pclass 0
Number of missing data on Name 0
Number of missing data on Sex 0
Number of missing data on Age 177
Number of missing data on SibSp 0
Number of missing data on Parch 0
Number of missing data on Ticket 0
Number of missing data on Fare 0
Number of missing data on Cabin 687
Number of missing data on Embarked 0
Number of missing data on Period 0


### Replace the Embarked text with a number that represents it

In [479]:
# Encode the Embarked letter as an integer: 'S' -> 0, 'C' -> 1, 'Q' -> 2.
#
# The column is rebuilt with a single map() and assigned back to the frame.
# The chained form `train['Embarked'][mask] = code` assigns through an
# intermediate Series, which pandas does not guarantee to write back into
# `train` (it is a no-op when Copy-on-Write is enabled).
# Values that are not in the mapping (including already-encoded codes) are
# passed through unchanged, so re-running this cell is harmless.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
train['Embarked'] = train['Embarked'].map(
    lambda value: embarked_codes.get(value, value)
)

In [480]:
# Check that the Embarked column now holds the numeric codes.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Period
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1,1.0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0,1.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0,1.0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0,1.0


### Replace the Fare value with a number that represents a range

In [481]:
# Bucket Fare into five ranges stored in a new PassengerFare column:
#   0: [0, 20)   1: [20, 40)   2: [40, 60)   3: [60, 80)   4: [80, inf)
# Rows whose Fare is missing (or negative) match no threshold and stay NaN.
#
# Thresholds are applied in ascending order so that each higher bucket
# overwrites the lower ones. `.loc[mask, column]` writes into `train` itself;
# the chained form `train['PassengerFare'][mask] = ...` assigns through an
# intermediate Series and is not guaranteed to reach the frame.
train['PassengerFare'] = float('NaN')
for fare_bucket, lower_bound in enumerate((0, 20, 40, 60, 80)):
    train.loc[train['Fare'] >= lower_bound, 'PassengerFare'] = fare_bucket

### Set new train

In [482]:
# Keep only the model inputs plus the Survived label, dropping every row that
# still has a missing value in one of those columns.
#
# dropna() is applied to the selection and its result is bound directly,
# instead of calling dropna(inplace=True) on a frame derived from `train`,
# which triggers pandas' SettingWithCopyWarning.
model_columns = ['Pclass', 'Sex', 'Parch', 'Embarked', 'Period', 'Fare', 'Survived']
new_train = train[model_columns].dropna(axis=0)

# Compare shapes to see how many rows/columns the selection kept.
print("Train shape " + str(train.shape))
print("New_train shape " + str(new_train.shape))

Train shape (891, 14)
New_train shape (891, 7)


In [484]:
# Split the cleaned frame into the input matrix and the label vector,
# both as plain NumPy arrays.
feature_columns = ['Pclass', 'Sex', 'Parch', 'Embarked', 'Period', 'Fare']
features_one = new_train[feature_columns].values
target = new_train['Survived'].values

### Starting the Machine Learning technique

In [485]:
# Create a decision tree with the library's default settings and train it on
# the prepared features; fit() returns the fitted estimator itself.
my_tree_one = tree.DecisionTreeClassifier().fit(features_one, target)

In [486]:
# Pair every feature name with the importance the fitted tree gave it.
# feature_list is in the same column order as the training matrix.
feature_list = ['Pclass', 'Sex', 'Parch', 'Embarked', 'Period', 'Fare']
importances = my_tree_one.feature_importances_

for feature_name, importance in zip(feature_list, importances):
    print("Feature: " + str(feature_name) + " Importance: " + str(importance))

Feature: Pclass Importance: 0.105220921236
Feature: Sex Importance: 0.377157654272
Feature: Parch Importance: 0.0570357612539
Feature: Embarked Importance: 0.0308478501997
Feature: Period Importance: 0.0618173369388
Feature: Fare Importance: 0.3679204761


In [487]:
# Score the tree on the very same rows it was fitted on. This is a training
# score, not an estimate of how well it predicts unseen passengers.
print(my_tree_one.score(features_one, target))

0.928170594837


### Formatting of test dataset

In [488]:
# Report, column by column, how many entries of the test set are null.
missing_per_column = test.isnull().sum()
for col, n_missing in missing_per_column.items():
    print("Number of missing data on " + col, n_missing)

Number of missing data on PassengerId 0
Number of missing data on Pclass 0
Number of missing data on Name 0
Number of missing data on Sex 0
Number of missing data on Age 86
Number of missing data on SibSp 0
Number of missing data on Parch 0
Number of missing data on Ticket 0
Number of missing data on Fare 1
Number of missing data on Cabin 327
Number of missing data on Embarked 0


In [489]:
# Fill the gaps in Fare and Age with the median of the respective test-set column.
fare_median = test['Fare'].median()
age_median = test['Age'].median()
test['Fare'] = test['Fare'].fillna(fare_median)
test['Age'] = test['Age'].fillna(age_median)

In [490]:
# Apply to the test set the same encodings that were applied to the training
# set. Every write goes through a whole-column assignment or
# `.loc[mask, column]`; the chained form `test[col][mask] = value` assigns
# through an intermediate Series and is not guaranteed to reach `test`
# (it is a no-op when pandas Copy-on-Write is enabled).

# Sex: 'male' -> 0, 'female' -> 1. Values outside the mapping (including
# already-encoded ones) pass through unchanged, so a re-run is harmless.
sex_codes = {'male': 0, 'female': 1}
test['Sex'] = test['Sex'].map(lambda value: sex_codes.get(value, value))

# Embarked: missing values become 'S', then 'S' -> 0, 'C' -> 1, 'Q' -> 2.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
test['Embarked'] = test['Embarked'].fillna('S').map(
    lambda value: embarked_codes.get(value, value)
)

# Period: 0 = Age < 18, 1 = 18 <= Age <= 60, 2 = Age > 60.
# The "> 60" rule runs last so it overrides the adult code.
test['Period'] = float('NaN')
test.loc[test['Age'] < 18, 'Period'] = 0
test.loc[test['Age'] >= 18, 'Period'] = 1
test.loc[test['Age'] > 60, 'Period'] = 2

# PassengerFare: 0: [0, 20)  1: [20, 40)  2: [40, 60)  3: [60, 80)  4: [80, inf).
# Ascending thresholds let each higher bucket overwrite the lower ones.
test['PassengerFare'] = float('NaN')
for fare_bucket, lower_bound in enumerate((0, 20, 40, 60, 80)):
    test.loc[test['Fare'] >= lower_bound, 'PassengerFare'] = fare_bucket

In [491]:
# Re-run the null count per column of the test set after preprocessing.
missing_per_column = test.isnull().sum()
for col, n_missing in missing_per_column.items():
    print("Number of missing data on " + col, n_missing)

Number of missing data on PassengerId 0
Number of missing data on Pclass 0
Number of missing data on Name 0
Number of missing data on Sex 0
Number of missing data on Age 0
Number of missing data on SibSp 0
Number of missing data on Parch 0
Number of missing data on Ticket 0
Number of missing data on Fare 0
Number of missing data on Cabin 327
Number of missing data on Embarked 0
Number of missing data on Period 0
Number of missing data on PassengerFare 0


In [492]:
# Build the test matrix with the same columns, in the same order, that the
# tree was trained on, show it, and predict survival for every test passenger.
feature_columns = ['Pclass', 'Sex', 'Parch', 'Embarked', 'Period', 'Fare']
test_features = test[feature_columns].values
print(test_features)

my_prediction = my_tree_one.predict(test_features)

[[3 0 0 2 1.0 7.8292]
 [3 1 0 0 1.0 7.0]
 [2 0 0 2 2.0 9.6875]
 ..., 
 [3 0 0 0 1.0 7.25]
 [3 0 0 0 1.0 8.05]
 [3 0 1 1 1.0 22.3583]]


### Creating the solution csv

In [493]:
# Assemble the submission: one "Survived" column of predictions, indexed by
# the integer PassengerId of each test row, written out as CSV with the
# index column labelled "PassengerId".
PassengerId = test["PassengerId"].values.astype(int)
my_solution = pd.DataFrame(
    data=my_prediction,
    index=PassengerId,
    columns=["Survived"],
)
print(my_solution.shape)
my_solution.to_csv("my_solution.csv", index_label=["PassengerId"])

(418, 1)
