In [176]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import seaborn as sns


# reading the dataset

In [177]:
train = pd.read_csv('./dataset/train.csv')
test = pd.read_csv('./dataset/test.csv')

train.shape
test.shape


(418, 11)

# Editing Key and Answer Variable

In [178]:
train = train.set_index(['PassengerId'])
train = train.drop(['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1)
train

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,3,male,22.0,1,0,7.2500
2,1,1,female,38.0,1,0,71.2833
3,1,3,female,26.0,0,0,7.9250
4,1,1,female,35.0,1,0,53.1000
5,0,3,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...
887,0,2,male,27.0,0,0,13.0000
888,1,1,female,19.0,0,0,30.0000
889,0,3,female,,1,2,23.4500
890,1,1,male,26.0,0,0,30.0000


# Exploring missing values

In [179]:
train.isnull().sum()
#test.isnull().sum()


Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
dtype: int64

In [180]:
age_median = train['Age'].median()


# fill NA

In [181]:
train.fillna(age_median, inplace=True)
train.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
dtype: int64

# Describe


In [182]:
train.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.361582,0.523008,0.381594,32.204208
std,0.486592,0.836071,13.019697,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,22.0,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,35.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


# Data Manipulation | Transformation

In [183]:
treinamento = pd.get_dummies(train)
data_tree = treinamento
treinamento

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,22.0,1,0,7.2500,0,1
2,1,1,38.0,1,0,71.2833,1,0
3,1,3,26.0,0,0,7.9250,1,0
4,1,1,35.0,1,0,53.1000,1,0
5,0,3,35.0,0,0,8.0500,0,1
...,...,...,...,...,...,...,...,...
887,0,2,27.0,0,0,13.0000,0,1
888,1,1,19.0,0,0,30.0000,1,0
889,0,3,28.0,1,2,23.4500,1,0
890,1,1,26.0,0,0,30.0000,0,1


In [184]:
treinamento=treinamento.drop(['Sex_male'], axis=1)
treinamento = treinamento.rename(columns={'Sex_female': 'Female'})
treinamento = treinamento[['Survived','Fare', 'Female', 'Age','SibSp', 'Parch', 'Pclass']]
treinando = pd.get_dummies(treinamento['Pclass'])
treinamento = treinamento.drop(['Pclass'], axis=1)
treinando = treinando.rename(
    columns={1: 'Pclass_1', 2: 'Pclass_2', 3: 'Pclass_3'})
treinamento = treinamento.join(treinando)
treinamento


Unnamed: 0_level_0,Survived,Fare,Female,Age,SibSp,Parch,Pclass_1,Pclass_2,Pclass_3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,7.2500,0,22.0,1,0,0,0,1
2,1,71.2833,1,38.0,1,0,1,0,0
3,1,7.9250,1,26.0,0,0,0,0,1
4,1,53.1000,1,35.0,1,0,1,0,0
5,0,8.0500,0,35.0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
887,0,13.0000,0,27.0,0,0,0,1,0
888,1,30.0000,1,19.0,0,0,1,0,0
889,0,23.4500,1,28.0,1,2,0,0,1
890,1,30.0000,0,26.0,0,0,1,0,0


# Training

In [222]:
x_train, x_test, y_train, y_test = train_test_split(treinamento.drop(
    ['Survived'], axis=1), treinamento['Survived'], test_size=0.3, random_state=0)

f'treino: {x_train.shape}, teste: {x_test.shape}'


'treino: (623, 10), teste: (268, 10)'

# Decision Tree Classifier

In [227]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import confusion_matrix, accuracy_score
import graphviz
x_trains, x_tests, y_trains, y_tests = train_test_split(data_tree.drop(
    ['Survived', 'Sex_male'], axis=1), data_tree['Survived'], test_size=0.15, random_state=100)
arvore = DecisionTreeClassifier()
arvore.fit(x_trains, y_trains)
feat_names = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Female']
graph_dot = export_graphviz(arvore, class_names=['Morto', 'Vivo'], feature_names=feat_names, out_file=None, filled=True)
g = graphviz.Source(graph_dot, format='pdf')
g.render('decision_tree')
previsao = arvore.predict(x_tests)
previsao
confusao = confusion_matrix(y_tests, previsao)
confusao
acerto = accuracy_score(y_tests, previsao)
'acerto = {0:.2f}%'.format(acerto*100)


'acerto = 82.84%'

# Random Forest

In [223]:
model = rfc(n_estimators=700, criterion='gini', max_depth=3, random_state=100)

model.fit(x_train, y_train)

probability = model.predict_proba(treinamento.drop('Survived', axis=1))[:, 1]

classification = model.predict(treinamento.drop('Survived', axis=1))

treinamento['probability'] = probability
treinamento['classification'] = classification

treinamento


Unnamed: 0_level_0,Survived,Fare,Female,Age,SibSp,Parch,Pclass_1,Pclass_2,Pclass_3,probability,classification
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,7.2500,0,22.0,1,0,0,0,1,0.083860,0
2,1,71.2833,1,38.0,1,0,1,0,0,0.941544,1
3,1,7.9250,1,26.0,0,0,0,0,1,0.177856,0
4,1,53.1000,1,35.0,1,0,1,0,0,0.945817,1
5,0,8.0500,0,35.0,0,0,0,0,1,0.085856,0
...,...,...,...,...,...,...,...,...,...,...,...
887,0,13.0000,0,27.0,0,0,0,1,0,0.113394,0
888,1,30.0000,1,19.0,0,0,1,0,0,0.941908,1
889,0,23.4500,1,28.0,1,2,0,0,1,0.151746,0
890,1,30.0000,0,26.0,0,0,1,0,0,0.543536,1


In [224]:
result = treinamento[['Survived', 'probability', 'classification']]
result

Unnamed: 0_level_0,Survived,probability,classification
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,0.083860,0
2,1,0.941544,1
3,1,0.177856,0
4,1,0.945817,1
5,0,0.085856,0
...,...,...,...
887,0,0.113394,0
888,1,0.941908,1
889,0,0.151746,0
890,1,0.543536,1


# ACCURACY

In [225]:
#model.fit(x_train, y_train)
x_test_pred = model.predict(x_test)

accuracy = accuracy_score(x_test_pred, y_test)

accuracy

0.8208955223880597