## **Otimização de Hiperparâmetros**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold

In [3]:
treino = pd.read_csv('train.csv')
teste  = pd.read_csv('test.csv')

In [4]:
treino.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
teste.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [10]:
treino.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
treino['Embarked'].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [5]:
# Preencher 'Embarket' nulo com a Moda, que é o valor que MAIS APARECE.
moda_embarked = treino['Embarked'].value_counts().index[0]
moda_embarked
treino.loc[treino['Embarked'].isna(), 'Embarked'] = moda_embarked
treino.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Preencher Age nulo com a média
mean_age = treino.loc[~treino['Age'].isna(), 'Age'].mean()
treino.loc[treino['Age'].isna(), 'Age'] = mean_age
treino.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [15]:
treino.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [7]:
X_treino = treino.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin'], axis=1)
X_treino['Sex'] = np.where(treino['Sex'] == 'male', 1, 0)
X_treino['Embarked'] = X_treino['Embarked'].map({'C':0, 'S':1, 'Q':2})
y_treino = treino['Survived']

In [8]:
# Preencher 'Embarket' e 'Age' nulos com cálculo do TREINO ( TESTE é produção e nós não conhecemos produção)
teste.loc[teste['Embarked'].isna(), 'Embarked'] = moda_embarked
teste.loc[teste['Age'].isna(), 'Age'] = mean_age

In [9]:
X_teste = teste.drop(columns=['Name', 'Ticket', 'Cabin'], axis='columns')
X_teste['Sex'] = np.where(X_teste['Sex'] == 'male', 1, 0)
X_teste['Embarked'] = X_teste['Embarked'].map({'C':0, 'S':1, 'Q':2})

In [24]:
X_teste.shape

(418, 8)

## **Modelo Sem Otimização de Hiperparmetros**

In [10]:
from sklearn.ensemble import RandomForestClassifier
from numpy import mean, std

randomforest_sem_otimizacao = RandomForestClassifier()

cv = KFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_val_score(randomforest_sem_otimizacao, X_treino, y_treino, cv=cv, scoring='accuracy')

print('Accuracy sem Otimização: %.4f (%.3f)' % (mean(scores), std(scores)))

Accuracy sem Otimização: 0.8284 (0.046)


## **GridSearch - Otimização de Hiperparâmetros**

In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold

kfold = KFold(n_splits=5, shuffle=True)

param_grid = {
    'bootstrap':[True],
    'max_depth':[6, 10],
    'max_features':['auto', 'sqrt'],
    'min_samples_leaf':[3, 5],
    'min_samples_split':[4,6],
    'n_estimators':[100, 350]
}

randomforest_com_gridsearch = RandomForestClassifier()

score_gridsearch = GridSearchCV(randomforest_com_gridsearch, 
                                param_grid=param_grid,
                                cv=kfold,
                                scoring='accuracy',
                                return_train_score=True,
                                verbose=True)

score_gridsearch.fit(X_treino, y_treino)

In [47]:
score_gridsearch.best_estimator_

In [12]:
print('Accuracy com GridSearch: %.5f' % (score_gridsearch.best_score_))

Accuracy com GridSearch: 0.82267


## **RandomizedSearchCV - Otimização de Hiperparâmetros**

In [None]:
from sklearn.model_selection import RandomizedSearchCV

param_scape = {
    'bootstrap':[True],
    'max_depth':[6, 8, 10, 12, 14],
    'max_features':['auto', 'sqrt', 'log2'],
    'min_samples_leaf':[2, 3, 4, 5],
    'min_samples_split':[2, 3, 4, 5],
    'n_estimators':[100, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000]
}

randomforest_com_randomsearch = RandomForestClassifier()

score_randomsearch = RandomizedSearchCV(randomforest_com_randomsearch, 
                                        param_scape,
                                        n_iter=32,
                                        scoring='accuracy',
                                        cv=5,
                                        return_train_score=True,
                                        verbose=True)

score_randomsearch.fit(X_treino, y_treino)

In [14]:
score_randomsearch.best_estimator_

In [15]:
print('Accuracy com RandomizedSearch: %.5f' % (score_randomsearch.best_score_))

Accuracy com RandomizedSearch: 0.82942


## **Bayesian Optimization - Otimização de Hiperparâmetros**

In [18]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

search_space = {
    'bootstrap': Categorical([True, False]),
    'max_depth': Integer(6, 20),
    'max_features': Categorical(['sqrt', 'log2']),
    'min_samples_leaf': Integer(2, 10),
    'min_samples_split': Integer(2,10),
    'n_estimators':Integer(100, 500)
}

randomforest_com_bayesiansearch = RandomForestClassifier()

score_bayesiansearch = BayesSearchCV(randomforest_com_bayesiansearch, 
                        search_space, 
                        n_iter=32,
                        scoring='accuracy',
                        cv=5)

score_bayesiansearch.fit(X_treino, y_treino)

In [19]:
score_bayesiansearch.best_estimator_

In [20]:
print('Accuracy com RandomizedSearch: %.5f' % (score_bayesiansearch.best_score_))

Accuracy com RandomizedSearch: 0.82716
