In [36]:
import sklearn
import pandas as pd
import numpy as np
%matplotlib inline

In [37]:
# Load the training set from a path relative to the notebook's working directory.
data = pd.read_csv('./data/train.csv')

In [38]:
# Summary statistics for the numeric columns (note the lower count for Age).
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [39]:
# Preview the first rows to see the raw column types and formats.
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [40]:
# Number of missing values per column.
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [41]:
def data_process(data):
    """Turn a raw passenger frame into an all-numeric feature table.

    Steps, in order:

    1. Remove the ``Name``, ``Ticket``, ``Cabin`` and ``PassengerId`` columns.
    2. Replace missing ``Age`` and ``Fare`` values with the column mean and
       missing ``Embarked`` values with the most frequent value.
    3. One-hot encode ``Pclass``, ``Sex`` and ``Embarked`` (the original
       columns are replaced by ``<column>_<value>`` indicator columns that
       are appended after the remaining columns).

    The frame passed in is never modified, so the function can be called
    repeatedly on the same frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame holding the columns listed above together with ``Age``
        and ``Fare``. Any other column (for example ``Survived``) is passed
        through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the imputed and encoded columns.

    Raises
    ------
    KeyError
        If one of the four columns to remove is absent from ``data``.

    Notes
    -----
    The imputation values and the set of indicator columns are computed from
    the frame that is passed in. Two frames that contain different category
    values therefore yield different columns.
    """
    # drop() without inplace returns a new frame; the caller's frame keeps
    # every column, including PassengerId.
    processed_data = data.drop(['Name', 'Ticket', 'Cabin', 'PassengerId'], axis=1)

    processed_data['Age'] = processed_data['Age'].fillna(processed_data['Age'].mean())
    processed_data['Fare'] = processed_data['Fare'].fillna(processed_data['Fare'].mean())
    processed_data['Embarked'] = processed_data['Embarked'].fillna(processed_data['Embarked'].mode()[0])
    processed_data['Embarked'] = processed_data['Embarked'].astype(str)

    # A single call encodes the three categorical columns, removes the
    # originals and appends the indicators in the listed order.
    processed_data = pd.get_dummies(processed_data, columns=['Pclass', 'Sex', 'Embarked'])

    return processed_data

In [42]:
# Pass a copy (as the test-set cell does) so the raw training frame is never
# modified by preprocessing and this cell can be re-run safely.
processed_data = data_process(data.copy())

In [43]:
# Separate the predictors from the label column.
features = processed_data.drop(['Survived'], axis=1)
target = processed_data['Survived']

In [44]:
import sklearn.model_selection

In [45]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=1,
)

In [46]:
from sklearn.tree import DecisionTreeClassifier

In [47]:
from sklearn.model_selection import GridSearchCV

In [48]:
# Decision tree with default hyper-parameters; random_state is fixed so
# repeated fits produce the same tree.
clf = DecisionTreeClassifier(random_state=2333)

In [49]:
# Fit on the training split only; X_test / y_test stay unseen for scoring.
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=2333, splitter='best')

In [50]:
# Mean accuracy on the held-out split.
clf.score(X_test, y_test)


0.75977653631284914

In [63]:
# Load the unlabeled test set that predictions are generated for.
test_data = pd.read_csv('./data/test.csv')

In [64]:
# Apply the same preprocessing as for the training data. A copy is passed so
# test_data itself keeps PassengerId, which is read when building the output.
processed_test_data = data_process(test_data.copy())

In [65]:
# Align the test columns with the training feature layout before predicting.
# One-hot encoding is done per frame, so a category absent from the test file
# would otherwise leave a column missing or change the column order; absent
# indicator columns are filled with 0.
res = clf.predict(processed_test_data.reindex(columns=features.columns, fill_value=0))

In [66]:
# Submission table: passenger ids next to the predicted labels.
output = test_data[['PassengerId']].assign(Survived=res)

In [67]:
# Write the predictions without the index column.
# NOTE(review): the target file name has no .csv extension; confirm this is intended.
output.to_csv("output", index=False)

In [68]:
from sklearn.tree import export_graphviz

In [69]:
# Write the fitted tree in Graphviz DOT format. The feature names are passed
# as a plain list of strings rather than a pandas Index.
export_graphviz(clf, out_file='dtree.dot', feature_names=list(features.columns))