In [1]:
import pandas as pd
from IPython.display import display
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the training and test splits from the local titanic_data/ directory.
data, test_data = (
    pd.read_csv('titanic_data/train.csv'),
    pd.read_csv('titanic_data/test.csv'),
)

# Brief data analysis

In [3]:
# List the column labels of the training frame.
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
def print_nan_columns(frame):
    """Print one line for every column of ``frame`` holding at least one NaN."""
    for name, has_nan in frame.isna().any().items():
        if has_nan:
            print('column {} has NaN value'.format(name))


# Report the incomplete columns of each split, separated by a blank line.
print('In training set: ')
print_nan_columns(data)

print('')

print('In testing set')
print_nan_columns(test_data)

In training set: 
column Age has NaN value
column Cabin has NaN value
column Embarked has NaN value

In testing set
column Age has NaN value
column Fare has NaN value
column Cabin has NaN value


In [5]:
# Tabulate how many passengers fall into each 'Survived' class.
# The labels are set with rename_axis() / reset_index(name=...) rather than by
# renaming the "index" and "Survived" columns afterwards: pandas >= 2.0 labels
# the value_counts() result differently (index named 'Survived', values named
# 'count'), so the old rename() would put "counts" on the class column and
# leave the real counts under "count".
(
    data['Survived']
    .value_counts()
    .rename_axis('Survived_type')
    .reset_index(name='counts')
)

Unnamed: 0,Survived_type,counts
0,0,549
1,1,342


In [6]:
# Survival counts among passengers that have a recorded 'Cabin' value.
# notna() tests for missingness directly instead of checking each element's
# Python type, and .loc selects the 'Survived' column in the same step.
data.loc[data['Cabin'].notna(), 'Survived'].value_counts()

1    136
0     68
Name: Survived, dtype: int64

In [7]:
# Survival counts among passengers whose 'Cabin' value is missing.
# isna() tests for missingness directly instead of checking each element's
# Python type, and .loc selects the 'Survived' column in the same step.
data.loc[data['Cabin'].isna(), 'Survived'].value_counts()

0    481
1    206
Name: Survived, dtype: int64

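* The existence of a Cabin number clearly reduces the impurity of the data: passengers with a recorded cabin mostly survived (136 of 204), while passengers without one mostly did not (481 of 687 died).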

In [8]:
# Add a boolean column that is True when a 'Cabin' value is recorded and
# False when it is missing. notna() is vectorised and expresses "not missing"
# directly, rather than inspecting the Python type of every element.
data['HasCabinNum'] = data['Cabin'].notna()
test_data['HasCabinNum'] = test_data['Cabin'].notna()

In [9]:
# Encode 'Sex' as an integer: 'male' becomes 0, any other value becomes 1
# (1 stands for 'female').
# NOTE: this overwrites the column, so running the cell a second time compares
# the already-encoded integers with 'male' and turns every row into 1.
data['Sex'] = data['Sex'].ne('male').astype('int64')
test_data['Sex'] = test_data['Sex'].ne('male').astype('int64')

In [10]:
# There is a high correlation between 'Ticket' and 'Fare';
# uncomment the loop below to display the rows of each 'Ticket' group.
# for pair in data[['Ticket', 'Fare', 'Survived']].groupby('Ticket'):
#     display(pair[1])

In [11]:
# Count the training rows whose 'Age' is exactly 0 (True) versus not (False).
# Missing ages compare unequal to 0, so they are counted under False.
data['Age'].eq(0).value_counts()

False    891
Name: Age, dtype: int64

In [12]:
 # Count the test rows whose 'Age' is exactly 0 (True) versus not (False).
 # Missing ages compare unequal to 0, so they are counted under False.
 test_data['Age'].eq(0).value_counts()

False    418
Name: Age, dtype: int64

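* Fill NaN values in column 'Age' with int 0 (the two checks above show that no passenger in either set has an age of 0, so 0 unambiguously marks a missing age)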

In [13]:
# Substitute 0 for every missing 'Age' in both splits.
for frame in (data, test_data):
    frame['Age'] = frame['Age'].fillna(0)

* Fill NaN values in column 'Embarked' with 'unknown_location'

In [14]:
# Give every missing 'Embarked' value an explicit placeholder label in both splits.
for frame in (data, test_data):
    frame['Embarked'] = frame['Embarked'].fillna('unknown_location')

In [15]:
# Show, for each split, the rows whose 'Embarked' value is the placeholder label.
for frame in (data, test_data):
    display(frame[frame['Embarked'] == 'unknown_location'])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,HasCabinNum
61,62,1,1,"Icard, Miss. Amelie",1,38.0,0,0,113572,80.0,B28,unknown_location,True
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",1,62.0,0,0,113572,80.0,B28,unknown_location,True


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,HasCabinNum


* Fill NaN values in column 'Fare' with -1

In [16]:
# Substitute -1 for every missing 'Fare' in both splits.
for frame in (data, test_data):
    frame['Fare'] = frame['Fare'].fillna(-1)

# Model training

In [17]:
# Columns handed to the encoder. 'PassengerId' stays first because later cells
# separate it from the real features.
FEATURE_COLUMNS = ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'HasCabinNum']

# One-hot encode the string-typed column ('Embarked'); the other columns pass
# through unchanged.
X_train = pd.get_dummies(data[FEATURE_COLUMNS])
y_train = data['Survived']

# Encode the test split the same way, then align it to the training layout.
# reindex() adds any dummy column that exists only in training (here
# 'Embarked_unknown_location') filled with 0, drops any dummy column the model
# never saw, and guarantees the same column order as X_train. Appending the
# missing column by hand only lined up because its name happens to sort last.
X_test = pd.get_dummies(test_data[FEATURE_COLUMNS]).reindex(columns=X_train.columns, fill_value=0)

In [18]:
# Fit a decision tree (fixed random_state for repeatable results) on every
# column of X_train except the leading 'PassengerId' identifier.
train_features = X_train.drop(columns=['PassengerId'])
clf = DecisionTreeClassifier(random_state=0)
clf.fit(train_features, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [19]:
# Accuracy on the same rows the tree was fitted on: this is a training score,
# not an estimate of performance on unseen passengers.
clf.score(X_train.drop(columns=['PassengerId']), y_train)

0.9865319865319865

In [20]:
# Build the submission table: one predicted 'Survived' value per test row,
# next to that row's 'PassengerId'. The identifier column is excluded from the
# features passed to the classifier.
test_features = X_test.drop(columns=['PassengerId'])
result = pd.DataFrame({
    'PassengerId': X_test['PassengerId'],
    'Survived': clf.predict(test_features),
})

In [21]:
# Write the predictions to '<submission_name>_submission.csv' in the working
# directory, without the DataFrame index column.
submission_name = 'First'
submission_path = '{}_submission.csv'.format(submission_name)
result.to_csv(submission_path, index=False)