In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import seaborn as sns


# reading the dataset

In [2]:
train = pd.read_csv('./dataset/train.csv')
test = pd.read_csv('./dataset/test.csv')

train.shape
test.shape


(418, 11)

# Editing Key and Answer Variable

In [3]:
train = train.set_index(['PassengerId'])
train = train.drop(['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1)


# Exploring missing values

In [4]:
train.isnull().sum()
test.isnull().sum()


PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

# fill NA

In [5]:
train.fillna(0, inplace=True)
train.isnull().sum()


Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
dtype: int64

# Describe


In [6]:
train.describe()


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,23.799293,0.523008,0.381594,32.204208
std,0.486592,0.836071,17.596074,1.102743,0.806057,49.693429
min,0.0,1.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,6.0,0.0,0.0,7.9104
50%,0.0,3.0,24.0,0.0,0.0,14.4542
75%,1.0,3.0,35.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


# Data Manipulation | Transformation

In [7]:
train['Women'] = np.where(train['Sex'] == 'female', 1, 0)
train['Pclass_1'] = np.where(train['Pclass'] == 1, 1, 0)
train['Pclass_2'] = np.where(train['Pclass'] == 2, 1, 0)
train['Pclass_3'] = np.where(train['Pclass'] == 3, 1, 0)

train = train.drop(['Pclass', 'Sex'], axis=1)

train.head()


Unnamed: 0_level_0,Survived,Age,SibSp,Parch,Fare,Women,Pclass_1,Pclass_2,Pclass_3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,22.0,1,0,7.25,0,0,0,1
2,1,38.0,1,0,71.2833,1,1,0,0
3,1,26.0,0,0,7.925,1,0,0,1
4,1,35.0,1,0,53.1,1,1,0,0
5,0,35.0,0,0,8.05,0,0,0,1


# Training

In [8]:
x_train, x_test, y_train, y_test = train_test_split(train.drop(
    ['Survived'], axis=1), train['Survived'], test_size=0.3, random_state=0)

f'treino: {x_train.shape}, teste: {x_test.shape}'


'treino: (623, 8), teste: (268, 8)'

# Random Forest

In [9]:
rndforest = rfc(n_estimators=1000, criterion='gini', max_depth=5)

rndforest.fit(x_train, y_train)

probability = rndforest.predict_proba(train.drop('Survived', axis=1))[:, 1]

classification = rndforest.predict(train.drop('Survived', axis=1))

train['probability'] = probability
train['classification'] = classification

train


Unnamed: 0_level_0,Survived,Age,SibSp,Parch,Fare,Women,Pclass_1,Pclass_2,Pclass_3,probability,classification
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,22.0,1,0,7.2500,0,0,0,1,0.135407,0
2,1,38.0,1,0,71.2833,1,1,0,0,0.916680,1
3,1,26.0,0,0,7.9250,1,0,0,1,0.478101,0
4,1,35.0,1,0,53.1000,1,1,0,0,0.919905,1
5,0,35.0,0,0,8.0500,0,0,0,1,0.122352,0
...,...,...,...,...,...,...,...,...,...,...,...
887,0,27.0,0,0,13.0000,0,0,1,0,0.196141,0
888,1,19.0,0,0,30.0000,1,1,0,0,0.834884,1
889,0,0.0,1,2,23.4500,1,0,0,1,0.439289,0
890,1,26.0,0,0,30.0000,0,1,0,0,0.423577,0


In [16]:
result = train[['Survived', 'probability', 'classification']]
result

Unnamed: 0_level_0,Survived,probability,classification
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,0.135407,0
2,1,0.916680,1
3,1,0.478101,0
4,1,0.919905,1
5,0,0.122352,0
...,...,...,...
887,0,0.196141,0
888,1,0.834884,1
889,0,0.439289,0
890,1,0.423577,0
