# **Machine Learning e Validação Cruzada - Machine Learning 19**

### Bibliotecas

In [9]:
# Bibliotecas
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, KFold

### Base de dados

In [2]:
# dados
ds = pd.read_csv('../data/titanic.csv')
ds.isnull().sum()

Sex           0
Embarked      0
Age         177
Pclass        0
SibSp         0
Parch         0
Fare          0
Survived      0
dtype: int64

In [3]:
# Agrupando as idades por gênero e classe
idade = ds.groupby(['Pclass', 'Sex'])['Age'].transform('mean')
idade = round(idade, 1)
# Substituído o valor NAs
ds['Age'] = ds['Age'].fillna(idade)
ds.head(10)

Unnamed: 0,Sex,Embarked,Age,Pclass,SibSp,Parch,Fare,Survived
0,male,S,22.0,3,1,0,7.25,0
1,female,C,38.0,1,1,0,71.2833,1
2,female,S,26.0,3,0,0,7925.0,1
3,female,S,35.0,1,1,0,53.1,1
4,male,S,35.0,3,0,0,8.05,0
5,male,Q,26.5,3,0,0,8.4583,0
6,male,S,54.0,1,0,0,51.8625,0
7,male,S,2.0,3,3,1,21075.0,0
8,female,S,27.0,3,0,2,11.1333,1
9,female,C,14.0,2,1,0,30.0708,1


### Divisão entre treino e teste

In [4]:
X = ds.iloc[:, 0:7].values
y = ds.iloc[:, 7].values
X

array([['male', 'S', 22.0, ..., 1, 0, 7.25],
       ['female', 'C', 38.0, ..., 1, 0, 71.2833],
       ['female', 'S', 26.0, ..., 0, 0, 7925.0],
       ...,
       ['female', 'S', 21.8, ..., 1, 2, 23.45],
       ['male', 'C', 26.0, ..., 0, 0, 30.0],
       ['male', 'Q', 32.0, ..., 0, 0, 7.75]], dtype=object)

### Transformação dos atributos categóricos em atributos numéricos

In [5]:
# Bibliotecas
from sklearn.preprocessing import LabelEncoder
# Transformação dos atributos categóricos em atributos numéricos, passando o índice de cada coluna categórica
labelencoder1 = LabelEncoder()
X[:,0] = labelencoder1.fit_transform(X[:,0])

labelencoder2 = LabelEncoder()
X[:,1] = labelencoder2.fit_transform(X[:,1])

X

array([[1, 2, 22.0, ..., 1, 0, 7.25],
       [0, 0, 38.0, ..., 1, 0, 71.2833],
       [0, 2, 26.0, ..., 0, 0, 7925.0],
       ...,
       [0, 2, 21.8, ..., 1, 2, 23.45],
       [1, 0, 26.0, ..., 0, 0, 30.0],
       [1, 1, 32.0, ..., 0, 0, 7.75]], dtype=object)

### Modelo de ML com Validação cruzada

In [6]:
# treinamento do modelo
resultados_naive_bayes_cv = []
resultados_naive_bayes_cv_300 = []
resultados_logistica_cv = []
resultados_logistica_cv_300 = []
resultados_forest_cv = []
resultados_forest_cv_300 = []
for i in range(50):
    kfold = KFold(n_splits = 10, shuffle = True, random_state = i)
    
    naive_bayes = GaussianNB()
    scores = cross_val_score(naive_bayes, X, y, cv = kfold)
    resultados_naive_bayes_cv_300.append(scores)
    resultados_naive_bayes_cv.append(scores.mean())

    logistica = LogisticRegression()
    scores = cross_val_score(logistica, X, y, cv = kfold)
    resultados_logistica_cv_300.append(scores)
    resultados_logistica_cv.append(scores.mean())
    
    random_forest = RandomForestClassifier()
    scores = cross_val_score(random_forest, X, y, cv = kfold)
    resultados_forest_cv_300.append(scores)
    resultados_forest_cv.append(scores.mean())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [7]:
# tamanho da lista
len(resultados_naive_bayes_cv), len(resultados_naive_bayes_cv_300)

(50, 50)

### Trasformando os dados

In [10]:
# redimensionando os dados
resultados_naive_bayes_cv = np.array(resultados_naive_bayes_cv)
resultados_naive_bayes_cv_300 = np.array(np.asarray(resultados_naive_bayes_cv_300).reshape(-1))
resultados_logistica_cv = np.array(resultados_logistica_cv)
resultados_logistica_cv_300 = np.array(np.asarray(resultados_logistica_cv_300).reshape(-1))
resultados_forest_cv = np.array(resultados_forest_cv)
resultados_forest_cv_300 = np.array(np.asarray(resultados_forest_cv_300).reshape(-1))

In [11]:
# Média
resultados_naive_bayes_cv.mean(), resultados_logistica_cv.mean(), resultados_forest_cv.mean()

(0.6700681647940074, 0.6869885143570537, 0.8072751560549313)