In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
# Load the Titanic train/test CSVs into DataFrames.
# NOTE(review): both paths are relative to the notebook's working directory and
# point into a Desktop folder two levels up; the cell fails on any machine
# where the files are not at that location.
train_df = pd.read_csv('../../Desktop/titanic_train.csv')
test_df = pd.read_csv('../../Desktop/titanic_test.csv')

In [3]:
# Target vector: the 'Survived' column, captured here before that column is
# dropped from train_df further down.
y = train_df['Survived']

In [4]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
train_df.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Davies, Mr. Alfred J",male,,,,1601.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [6]:
# Impute missing values with statistics taken from the training set only.
# The results are assigned back to the column instead of calling
# `fillna(..., inplace=True)` on a column selection: that form is chained
# assignment, which emits a FutureWarning in pandas 2.x and leaves the frame
# unchanged once Copy-on-Write is enabled.
age_median = train_df['Age'].median()
fare_median = train_df['Fare'].median()

train_df['Age'] = train_df['Age'].fillna(age_median)
test_df['Age'] = test_df['Age'].fillna(age_median)
# 'S' is the most frequent Embarked value in the training data
# (see the describe() output above).
train_df['Embarked'] = train_df['Embarked'].fillna('S')
test_df['Fare'] = test_df['Fare'].fillna(fare_median)

In [7]:
# One-hot encode the categorical columns and append the indicator columns to
# each frame. Each pair is (source column, prefix of the generated columns).
# The encoding is done per frame, so a category that occurs in only one of the
# frames produces a column the other frame lacks.
dummy_specs = [
    ('Pclass', 'PClass'),
    ('Sex', 'Sex'),
    ('SibSp', 'SibSp'),
    ('Parch', 'Parch'),
    ('Embarked', 'Embarked'),
]

train_dummies = [pd.get_dummies(train_df[source], prefix=prefix)
                 for source, prefix in dummy_specs]
test_dummies = [pd.get_dummies(test_df[source], prefix=prefix)
                for source, prefix in dummy_specs]

train_df = pd.concat([train_df] + train_dummies, axis=1)
test_df = pd.concat([test_df] + test_dummies, axis=1)

In [8]:
# Remove the raw columns that have been one-hot encoded, together with the
# identifier / text columns that are not used as features. The training frame
# additionally loses 'Survived', which was saved in `y` earlier.
non_feature_columns = ['Pclass', 'Name', 'Sex', 'SibSp', 'Parch', 'Ticket',
                       'Cabin', 'Embarked', 'PassengerId']

train_df = train_df.drop(columns=['Survived'] + non_feature_columns)
test_df = test_df.drop(columns=non_feature_columns)

In [9]:
train_df.shape, test_df.shape

((891, 24), (418, 25))

In [10]:
set(test_df.columns) - set(train_df.columns)

{'Parch_9'}

In [11]:
# Align the test features with the training features instead of dropping one
# hard-coded column: indicator columns that exist only in the test set (here
# 'Parch_9') are removed, any column seen only in training is added as all
# zeros, and the column order is made identical to train_df.
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

In [12]:
train_df.head()

Unnamed: 0,Age,Fare,PClass_1,PClass_2,PClass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,22.0,7.25,0,0,1,0,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
1,38.0,71.2833,1,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,1,0,0
2,26.0,7.925,0,0,1,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
3,35.0,53.1,1,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
4,35.0,8.05,0,0,1,0,1,1,0,0,...,1,0,0,0,0,0,0,0,0,1


In [13]:
test_df.head()

Unnamed: 0,Age,Fare,PClass_1,PClass_2,PClass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,34.5,7.8292,0,0,1,0,1,1,0,0,...,1,0,0,0,0,0,0,0,1,0
1,47.0,7.0,0,0,1,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
2,62.0,9.6875,0,1,0,0,1,1,0,0,...,1,0,0,0,0,0,0,0,1,0
3,27.0,8.6625,0,0,1,0,1,1,0,0,...,1,0,0,0,0,0,0,0,0,1
4,22.0,12.2875,0,0,1,1,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1


In [14]:
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np

In [15]:
# Feature matrix. This is an alias of train_df (no copy is made), so any later
# in-place change to train_df is visible through x as well.
x = train_df

In [59]:
# Hold out 30% of the labelled rows for validation; the fixed seed keeps the
# split reproducible across runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    x, y, test_size=0.3, random_state=17
)

In [60]:
x_train.shape, x_valid.shape

((623, 24), (268, 24))

In [61]:
# Baseline model: a depth-2 decision tree with a fixed seed (unfitted at this point).
first_tree = DecisionTreeClassifier(max_depth=2, random_state=17)

In [62]:
# Mean score of the baseline tree over 5 cross-validation folds of the
# training split.
cross_val_score(first_tree, x_train, y_train, cv=5).mean()

0.7849032258064517

In [63]:
# Fit the baseline tree on the training split only; x_valid stays unseen.
first_tree.fit(x_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=2, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=17, splitter='best')

In [65]:
first_tree.score(x_train, y_train)

0.8009630818619583

In [64]:
# Predictions of the baseline tree on the held-out validation split.
tree_valid_pred = first_tree.predict(x_valid)

In [54]:
accuracy_score(y_valid, tree_valid_pred)

0.753731343283582

In [35]:
# One minus the mean of the target. With the 0/1 labels shown in head() this is
# the share of non-survivors, i.e. the accuracy of always predicting 0.
1 - np.mean(y)

0.6161616161616161

In [24]:
pip install pydotplus

Note: you may need to restart the kernel to use updated packages.


In [25]:
pip install graphviz

Note: you may need to restart the kernel to use updated packages.


In [30]:
import os

# Make the Graphviz executables shipped inside a conda environment
# (<CONDA_PREFIX>\Library\bin\graphviz on Windows) discoverable via PATH.
# - os.pathsep / os.path.join replace the hard-coded ';' and backslashes.
# - The step is skipped when CONDA_PREFIX is not set instead of raising KeyError.
# - The directory is appended only once, so re-running the cell is harmless.
conda_prefix = os.environ.get('CONDA_PREFIX')
if conda_prefix:
    graphviz_bin = os.path.join(conda_prefix, 'Library', 'bin', 'graphviz')
    current_path = os.environ.get('PATH', '')
    if graphviz_bin not in current_path.split(os.pathsep):
        os.environ['PATH'] = (current_path + os.pathsep + graphviz_bin
                              if current_path else graphviz_bin)

In [31]:
from ipywidgets import Image
from io import StringIO
import pydotplus
from sklearn.tree import export_graphviz

# Render the fitted baseline tree inline: export it as Graphviz DOT text into
# an in-memory buffer, let pydotplus turn that into a PNG, and display the PNG.
# feature_names is passed as a plain list because some scikit-learn releases
# validate this parameter strictly and reject a pandas Index.
dot_data = StringIO()
export_graphviz(first_tree, feature_names=list(x.columns),
                out_file=dot_data, filled=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(value=graph.create_png())

Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x02\x89\x00\x00\x01g\x08\x06\x00\x00\x00\xcaq\xd3N\x…

In [None]:
def write_to_submission_file(predicted_labels, out_file, train_num=891,
                             target='Survived', index_label='PassengerId'):
    """Write predictions to a submission CSV.

    Parameters
    ----------
    predicted_labels : array-like
        Predicted labels, one per test row. Lists, NumPy arrays and pandas
        Series are accepted; only the values are used, any index is ignored.
    out_file : str or file-like
        Destination passed straight to ``DataFrame.to_csv``.
    train_num : int, default 891
        Number of rows in the training set. Output ids start at
        ``train_num + 1`` and increase by one per prediction.
    target : str, default 'Survived'
        Header of the prediction column.
    index_label : str, default 'PassengerId'
        Header of the id column.
    """
    # Convert up front: lists have no .shape, and a Series would otherwise be
    # re-aligned against the new id index and produce NaNs.
    labels = np.asarray(predicted_labels)
    first_id = train_num + 1
    # Turn the predictions into a one-column frame indexed by the row ids.
    predicted_df = pd.DataFrame(labels,
                                index=np.arange(first_id,
                                                first_id + labels.shape[0]),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [None]:
from sklearn.tree import export_graphviz

In [None]:
# Write the best tree found by the grid search to 'titanic_tree.dot' in the
# current working directory.
# NOTE(review): tree_grid is only created and fitted in cells that appear
# further down in this notebook; this cell raises NameError on a fresh
# top-to-bottom run until those cells are moved above it.
# feature_names is passed as a plain list because some scikit-learn releases
# validate this parameter strictly and reject a pandas Index.
export_graphviz(tree_grid.best_estimator_, out_file='titanic_tree.dot',
                feature_names=list(x.columns), filled=True)

In [49]:
x.columns

Index(['Age', 'Fare', 'PClass_1', 'PClass_2', 'PClass_3', 'Sex_female',
       'Sex_male', 'SibSp_0', 'SibSp_1', 'SibSp_2', 'SibSp_3', 'SibSp_4',
       'SibSp_5', 'SibSp_8', 'Parch_0', 'Parch_1', 'Parch_2', 'Parch_3',
       'Parch_4', 'Parch_5', 'Parch_6', 'Embarked_C', 'Embarked_Q',
       'Embarked_S'],
      dtype='object')

In [None]:
from os import system

In [None]:
# Export the best tree of the grid search as a .dot file and convert it to PNG.
# Fixes relative to the previous version of this cell:
#   * export_graphviz returns None when out_file is given, so storing its
#     result in the variable holding the open file made .close() fail and left
#     the (truncated) file handle open; export_graphviz now writes to the path
#     itself and no handle is opened here.
#   * the .dot file is written to the same location the `dot` command reads.
#   * `dot` reads the .dot file and writes a separate .png, instead of reading
#     a non-existent 'C:.dot' and overwriting the .dot file.
# NOTE(review): the output directory is an absolute, machine-specific path.
tree_output_dir = "C:/Users/maksn_000/Lesson/Lesson03"
dot_path = tree_output_dir + "/titanic_tree.dot"
png_path = tree_output_dir + "/titanic_tree.png"

export_graphviz(tree_grid.best_estimator_, out_file=dot_path,
                feature_names=list(x.columns), filled=True)

# Paths are quoted so directories containing spaces survive the shell.
exit_status = system(f'dot -Tpng "{dot_path}" -o "{png_path}"')
if exit_status != 0:
    raise RuntimeError(
        f"Graphviz 'dot' exited with status {exit_status}; "
        "is Graphviz installed and on PATH?"
    )

In [48]:
from sklearn.model_selection import GridSearchCV

In [47]:
# Hyper-parameter grid for the decision-tree search.
# NOTE(review): range(2, 3) produces the single value 2, so the grid search
# compares exactly one candidate; confirm that the upper bound is intended.
tree_params = dict(max_depth=[depth for depth in range(2, 3)])

In [46]:
# 5-fold cross-validated grid search over tree_params, using first_tree as the
# estimator template; n_jobs=-1 asks scikit-learn to use all available cores.
# Depends on tree_params being defined first (the recorded NameError below
# comes from running this cell before the cell that defines it).
tree_grid = GridSearchCV(first_tree, tree_params, cv=5, n_jobs=-1)

NameError: name 'tree_params' is not defined

In [45]:
%time
tree_grid.fit(x_train,y_train)



Wall time: 0 ns


NameError: name 'tree_grid' is not defined

In [44]:
tree_grid.best_score_, tree_grid.best_params_

NameError: name 'tree_grid' is not defined

In [43]:
# Validation predictions from the grid-search model (GridSearchCV.predict uses
# the refitted best estimator).
# NOTE(review): this reuses the name tree_valid_pred from the baseline-tree
# cell. When this cell fails (see the recorded NameError), the variable keeps
# the baseline predictions and the accuracy cell below silently reports the
# baseline score again.
tree_valid_pred = tree_grid.predict(x_valid)

NameError: name 'tree_grid' is not defined

In [42]:
from sklearn.metrics import accuracy_score

In [41]:
accuracy_score(y_valid,tree_valid_pred)

0.753731343283582

In [40]:
1 - np.mean(y)

0.6161616161616161