In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under /kaggle/input.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

/kaggle/input/titanic/test.csv
/kaggle/input/titanic/train.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Read the Titanic train and test CSV files into DataFrames.
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')
# Keep the test PassengerId column aside: it is dropped from `test` before
# modelling and is needed again when the submission file is written.
test_passenger_id = test['PassengerId']

In [3]:
# Preview the first rows of the test set to check which columns were loaded.
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
# Preview the first rows of the training set (same columns as test, plus Survived).
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Summary statistics (count, mean, std, quartiles) for the numeric training columns.
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# List the distinct passenger classes present in the training set, in ascending order.
np.sort(train['Pclass'].unique())

array([1, 2, 3])

In [7]:
# PassengerId >> just a unique number from every passenger
# Survived >> it's possible to see it's a binary value (1-Survived, 0-Died)
# Pclass >> it can be 1, 2 or 3
# Age >> every column has 891 entries except this one, so it contains null values (891-714=177, almost 20%, so we can't just drop those rows)
# SibSp = Sibling = brother, sister, stepbrother, stepsister // Spouse = husband, wife (mistresses and fiancés were ignored)
# >> it varies a lot, with a maximum value of 8, maybe more SibSp means more difficult to survive
#Parch = Parent = mother, father//Child = daughter, son, stepdaughter, stepson
# >> it varies a lot, with a maximum value of 6, maybe more Parch means more difficult to survive
#Fare >> how much the passenger paid for the ticket

In [8]:
# The null values in the Age column need to be replaced.
# First, count how many passengers with a null Age died and how many survived.
age_missing = train['Age'].isnull()
died = train.loc[age_missing & (train['Survived'] == 0), 'Sex'].count()
survived = train.loc[age_missing & (train['Survived'] == 1), 'Sex'].count()
print(died, 'passengers died with null age')
print(survived, 'passengers survived with null age')
print('total:', survived+died)

125 passengers died with null age
52 passengers survived with null age
total: 177


In [9]:
# Count the females with a null Age, split by outcome.
# The conditions are combined into a single boolean mask and applied once with
# .loc, rather than chaining [...][...][...], which applies full-length masks
# to an already-filtered Series and relies on pandas re-aligning them.
female_age_missing = (train['Sex'] == 'female') & train['Age'].isnull()
fem_died = train.loc[female_age_missing & (train['Survived'] == 0), 'Sex'].count()
fem_survived = train.loc[female_age_missing & (train['Survived'] == 1), 'Sex'].count()
print(fem_died, 'females died with null age')
print(fem_survived, 'females survived with null age')
print('total:', fem_died + fem_survived)

17 females died with null age
36 females survived with null age
total: 53


In [10]:
# Count the males with a null Age, split by outcome.
# One combined boolean mask is applied with .loc instead of chaining several
# full-length masks onto an already-filtered Series.
male_age_missing = (train['Sex'] == 'male') & train['Age'].isnull()
men_died = train.loc[male_age_missing & (train['Survived'] == 0), 'Sex'].count()
men_survived = train.loc[male_age_missing & (train['Survived'] == 1), 'Sex'].count()
print(men_died, 'males died with null age')
print(men_survived, 'males survived with null age')
print('total:', men_died+men_survived)

108 males died with null age
16 males survived with null age
total: 124


In [11]:
# 86.4% of the passengers with a null Age who died were men.
# Break those men down by passenger class.
# The class-independent part of the filter is built once, then combined with
# the class condition into a single mask (no chained boolean indexing).
male_age_missing_died = (
    (train['Sex'] == 'male') & train['Age'].isnull() & (train['Survived'] == 0)
)
for x in range(1, 4):
    y = train.loc[(train['Pclass'] == x) & male_age_missing_died, 'PassengerId'].count()
    print('From the', x ,'class:', y)

From the 1 class: 16
From the 2 class: 7
From the 3 class: 85


In [12]:
# Most of the men with a null Age who died were in 3rd class; summarise their
# other columns.
# A single combined mask is used: indexing an already-filtered DataFrame with a
# full-length mask makes pandas emit a "Boolean Series key will be reindexed to
# match DataFrame index" UserWarning.
train[
    (train['Pclass'] == 3)
    & (train['Sex'] == 'male')
    & train['Age'].isnull()
    & (train['Survived'] == 0)
].describe()

  


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,85.0,85.0,85.0,0.0,85.0,85.0,85.0
mean,459.529412,0.0,3.0,,0.529412,0.129412,13.162644
std,255.857549,0.0,0.0,,1.736089,0.482756,14.312055
min,6.0,0.0,3.0,,0.0,0.0,6.8583
25%,224.0,0.0,3.0,,0.0,0.0,7.7375
50%,491.0,0.0,3.0,,0.0,0.0,7.8958
75%,649.0,0.0,3.0,,0.0,0.0,8.7125
max,879.0,0.0,3.0,,8.0,2.0,69.55


In [13]:
# For the 3rd-class men with a null Age who died, count how many have each SibSp value.
# The loop-invariant part of the filter is built once and combined with the
# SibSp condition into one mask, instead of chaining boolean indexers.
third_class_men_age_missing_died = (
    (train['Pclass'] == 3)
    & (train['Sex'] == 'male')
    & train['Age'].isnull()
    & (train['Survived'] == 0)
)
for x in range (0, 9):
    y = train.loc[(train['SibSp'] == x) & third_class_men_age_missing_died, 'SibSp'].count()
    print('For SibSp = ', x, ':', y)

For SibSp =  0 : 71
For SibSp =  1 : 8
For SibSp =  2 : 1
For SibSp =  3 : 1
For SibSp =  4 : 0
For SibSp =  5 : 0
For SibSp =  6 : 0
For SibSp =  7 : 0
For SibSp =  8 : 4


In [14]:
# For the 3rd-class men with a null Age who died, count how many have each Parch value.
# The loop-invariant part of the filter is built once and combined with the
# Parch condition into one mask, instead of chaining boolean indexers.
third_class_men_age_missing_died = (
    (train['Pclass'] == 3)
    & (train['Sex'] == 'male')
    & train['Age'].isnull()
    & (train['Survived'] == 0)
)
for x in range (0, 9):
    y = train.loc[(train['Parch'] == x) & third_class_men_age_missing_died, 'Parch'].count()
    print('For Parch = ', x, ':', y)

For Parch =  0 : 79
For Parch =  1 : 1
For Parch =  2 : 5
For Parch =  3 : 0
For Parch =  4 : 0
For Parch =  5 : 0
For Parch =  6 : 0
For Parch =  7 : 0
For Parch =  8 : 0


In [15]:
# Most men with a null Age have no SibSp and no Parch, so look at the age
# distribution of the comparable passengers whose age IS known: 3rd-class men
# travelling alone who died.
# All conditions are combined into one mask instead of six chained indexers.
comparable_men = (
    (train['Parch'] == 0)
    & (train['Pclass'] == 3)
    & (train['SibSp'] == 0)
    & (train['Sex'] == 'male')
    & train['Age'].notnull()
    & (train['Survived'] == 0)
)
train.loc[comparable_men, 'Age'].describe()

count    161.000000
mean      29.338509
std       11.252028
min       11.000000
25%       21.000000
50%       26.000000
75%       34.500000
max       74.000000
Name: Age, dtype: float64

In [16]:
# Women and children were saved first, so the passengers with a null Age are
# treated as adult men: fill every missing Age with 26 in both datasets.
for frame in (train, test):
    frame['Age'] = frame['Age'].fillna(26)

In [17]:
# Confirm that Age has no null values left (its count should now match the other columns).
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,28.964276,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.085607,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,26.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [18]:
# Count the non-null entries per column to see which variables are usable.
train.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            891
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

In [19]:
# More than 75% of Cabin is empty, so it is not a useful column; for this first
# attempt several other columns are dropped as well.
unused_columns = ['Cabin', 'PassengerId', 'Ticket', 'Fare', 'Embarked', 'Name']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

In [20]:
# Sanity check: count how many men died and how many survived.
# Vectorised comparisons replace the Python-level sum(map(lambda ...)) over the Series.
male_outcomes = train.loc[train['Sex'] == 'male', 'Survived']
print((male_outcomes == 0).sum(), 'men died')
print((male_outcomes == 1).sum(), 'men survived')

468 men died
109 men survived


In [21]:
# Preview the training set after the unused columns were dropped.
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
0,0,3,male,22.0,1,0
1,1,1,female,38.0,1,0
2,1,3,female,26.0,0,0
3,1,1,female,35.0,1,0
4,0,3,male,35.0,0,0


In [22]:
# Turn the continuous Age values into discrete age categories, using the same
# bin edges and labels for both datasets.
age_bins = [0, 12, 22, 45, 60, 80]
age_labels = ['Child', 'Young Adult', 'Adult', 'Old Adult', 'Senior']
train['Age'] = pd.cut(train['Age'], bins=age_bins, labels=age_labels)
test['Age'] = pd.cut(test['Age'], bins=age_bins, labels=age_labels)

In [23]:
# One-hot encode the categorical columns (Sex and the binned Age).
train = pd.get_dummies(train)
# The two frames are encoded independently, so align the test columns to the
# training feature columns (everything except the Survived target): a dummy
# column missing from test is added as all zeros, and the column order is made
# identical to what the model will be fitted on.
test = pd.get_dummies(test).reindex(columns=train.columns.drop('Survived'), fill_value=0)

In [24]:
# Separate the feature columns (X) from the target column (y).
# The target is selected by name rather than by position, so the split does not
# silently depend on Survived being the first column.
y = train['Survived']
X = train.drop(columns='Survived')
print(y)

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64


In [25]:
# Hold out part of the training data for validation.
# random_state is fixed so the split (and the accuracy reported below) is
# reproducible across Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [26]:
# Fit a random forest classifier and measure its accuracy on the held-out split.
# random_state is fixed so the fitted model and its score are reproducible.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
# accuracy_score expects (y_true, y_pred); the value is the same either way,
# but the conventional order avoids mistakes if the metric is swapped later.
acc = accuracy_score(y_test, y_pred)
print(acc)

0.7982062780269058




In [27]:
# Predict survival for the test set with the fitted random forest (rfc).
y_pred_rfc = rfc.predict(test)


In [28]:
# Put the saved passenger ids back and attach the predictions as the Survived column.
test = test.assign(PassengerId=test_passenger_id, Survived=y_pred_rfc)


In [29]:
# Write the two submission columns to CSV, without the DataFrame index.
submission = test[['PassengerId', 'Survived']]
submission.to_csv('submission_dtc.csv', index=False)
