In [137]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import confusion_matrix, accuracy_score
import graphviz

# reading the dataset

In [138]:
train = pd.read_csv('./dataset/train.csv')
test = pd.read_csv('./dataset/test.csv')
submissao = pd.read_csv('./dataset/gender_submission.csv')

# Data Manipulation | Transformation
>> ## Editing Key and Answer Variable

In [139]:
train = train.set_index(['PassengerId'])
train = train.drop(['Name', 'Ticket', 'Cabin'], axis=1)
train['Embarked'] = np.where(train['Embarked'] == 'C', 0, np.where(train['Embarked'] == 'Q', 1, 2))
train = pd.get_dummies(train)
train = train.drop(['Sex_male'], axis=1)
train = train[['Survived',	'Pclass',	'Sex_female', 'Age',	'SibSp',	'Parch',	'Fare',	'Embarked']]
train

Unnamed: 0_level_0,Survived,Pclass,Sex_female,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,0,22.0,1,0,7.2500,2
2,1,1,1,38.0,1,0,71.2833,0
3,1,3,1,26.0,0,0,7.9250,2
4,1,1,1,35.0,1,0,53.1000,2
5,0,3,0,35.0,0,0,8.0500,2
...,...,...,...,...,...,...,...,...
887,0,2,0,27.0,0,0,13.0000,2
888,1,1,1,19.0,0,0,30.0000,2
889,0,3,1,,1,2,23.4500,2
890,1,1,0,26.0,0,0,30.0000,0


# Exploring missing values

In [140]:
train.isnull().sum()
#test.isnull().sum()
age_median = train['Age'].median()
train.fillna(age_median, inplace=True)
train.isnull().sum()

Survived      0
Pclass        0
Sex_female    0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64

# Describe

In [141]:
train.describe()

Unnamed: 0,Survived,Pclass,Sex_female,Age,SibSp,Parch,Fare,Embarked
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.352413,29.361582,0.523008,0.381594,32.204208,1.536476
std,0.486592,0.836071,0.47799,13.019697,1.102743,0.806057,49.693429,0.791503
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,22.0,0.0,0.0,7.9104,1.0
50%,0.0,3.0,0.0,28.0,0.0,0.0,14.4542,2.0
75%,1.0,3.0,1.0,35.0,1.0,0.0,31.0,2.0
max,1.0,3.0,1.0,80.0,8.0,6.0,512.3292,2.0


# Decision Tree Classifier

In [142]:
x_trains, x_tests, y_trains, y_tests = train_test_split(train.drop(
    ['Survived'], axis=1), train['Survived'], test_size=0.2, random_state=100)
arvore = DecisionTreeClassifier(criterion='gini', max_depth=4)
arvore.fit(x_trains, y_trains)
feat_names = ['Pclass', 'Sex_female', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
graph_dot = export_graphviz(arvore, class_names=[
                            'Morto', 'Vivo'], feature_names=feat_names, out_file=None, filled=True)
g = graphviz.Source(graph_dot, format='pdf')
g.render('decision_tree')
previsao = arvore.predict(x_tests)
previsao
confusao = confusion_matrix(y_tests, previsao)
confusao
acerto = accuracy_score(y_tests, previsao)
'acerto = {0:.2f}%'.format(acerto*100)


'acerto = 80.45%'

## Up the decision_tree Solution

In [143]:
sub_test = test.drop(['Cabin', 'Name', 'Ticket'], axis=1)
sub_test['Embarked'] = np.where(
    sub_test['Embarked'] == 'C', 0, np.where(sub_test['Embarked'] == 'Q', 1, 2))
sub_test = pd.get_dummies(sub_test)
sub_test.fillna(age_median, inplace=True)
sub_test = sub_test.drop(['Sex_male'], axis=1)
sub_test = sub_test.set_index(['PassengerId'])
sub_test = sub_test[['Pclass', 'Sex_female', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

submissao['Survived'] = arvore.predict(sub_test)
submissao.to_csv('submission.csv', index=False)
submissao
'''
erros = abs(submissao.Survived - submissao.Surviveds).sum()
percentual = (erros/418)*100
acertos = 100 - percentual
acertos'''

'\nerros = abs(submissao.Survived - submissao.Surviveds).sum()\npercentual = (erros/418)*100\nacertos = 100 - percentual\nacertos'