In [1]:
# Importação das bibliotecas
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from yellowbrick.classifier import ConfusionMatrix

In [2]:
# Load the soybean dataset from a CSV file in the working directory.
dataSoybean = pd.read_csv(filepath_or_buffer='soybean.csv')
# Bare last expression: displays (rows, columns) of the loaded frame.
dataSoybean.shape

(683, 36)

In [3]:
# Split the frame into predictors and target by position.
# Every column except the last is kept as a feature (no leading column is
# dropped), and the last column is the class label. Negative indexing avoids
# hard-coding the column count of this particular file (36 columns).
atributos = dataSoybean.iloc[:, :-1].values
classes = dataSoybean.iloc[:, -1].values
# .values yields NumPy arrays, so from here on these are no longer DataFrames.
atributos[0:2]

array([['october', 'normal', 'gt-norm', 'norm', 'yes', 'same-lst-yr',
        'low-areas', 'pot-severe', 'none', '90-100', 'abnorm', 'abnorm',
        'absent', 'dna', 'dna', 'absent', 'absent', 'absent', 'abnorm',
        'no', 'above-sec-nde', 'brown', 'present', 'firm-and-dry',
        'absent', 'none', 'absent', 'norm', 'dna', 'norm', 'absent',
        'absent', 'norm', 'absent', 'norm'],
       ['august', 'normal', 'gt-norm', 'norm', 'yes', 'same-lst-two-yrs',
        'scattered', 'severe', 'fungicide', '80-89', 'abnorm', 'abnorm',
        'absent', 'dna', 'dna', 'absent', 'absent', 'absent', 'abnorm',
        'yes', 'above-sec-nde', 'brown', 'present', 'firm-and-dry',
        'absent', 'none', 'absent', 'norm', 'dna', 'norm', 'absent',
        'absent', 'norm', 'absent', 'norm']], dtype=object)

In [4]:
# Replace every categorical feature column with integer codes, in place.
# The number of columns is read from the array itself rather than hard-coded,
# so the loop stays correct if the feature count changes.
# NOTE: the same encoder object is refit on each iteration, so after the loop
# it only retains the mapping learned for the last column.
labelencoder = LabelEncoder()

for coluna in range(atributos.shape[1]):
    atributos[:, coluna] = labelencoder.fit_transform(atributos[:, coluna])

In [5]:
# One-hot encode only the feature at column index 1; every other column is
# passed through unchanged. In the output shown for this cell the encoded
# column becomes 3 indicator columns placed first, followed by the remaining
# 34 features (35 -> 37 columns in total).
# NOTE: this cell reassigns `atributos`, so running it a second time would
# one-hot encode column 1 of the already-transformed array.
onehotencoder = make_column_transformer(
    (OneHotEncoder(categories='auto', sparse_output=False), [1]),
    remainder="passthrough",
)
atributos = onehotencoder.fit_transform(atributos)
atributos[0:2]

array([[0.0, 0.0, 1.0, 6, 1, 3, 2, 4, 1, 2, 2, 1, 1, 0, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 2, 2, 1, 3, 1, 4, 4, 2, 1, 1, 2, 1, 2],
       [0.0, 0.0, 1.0, 2, 1, 3, 2, 3, 2, 3, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
        1, 2, 1, 1, 2, 2, 1, 3, 1, 4, 4, 2, 1, 1, 2, 1, 2]], dtype=object)

In [6]:
#Disabled: the lines below would drop the first one-hot column to avoid the dummy variable trap; since they are commented out, all columns are kept
#atributos = atributos[:,1:]
#atributos = atributos.astype('float')
#atributos

In [7]:
# Label-encode the target: each class name becomes an integer code.
# The fitted encoder is stored under its own name, separate from the encoder
# used for the feature columns.
labelencoder_classes = LabelEncoder().fit(classes)
classes = labelencoder_classes.transform(classes)
classes

array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  7,  7,  7,  7,  7,  7,  7,
        7,  7,  7, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 15, 15, 15, 15,
       15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
       15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
       15, 15,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 11, 11,
       11, 11, 11, 11, 11, 11, 11, 11,  5,  5,  5,  5,  5,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  3,  3,  3,
        3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
       17, 17, 17, 17, 17, 17, 17, 17, 17, 17,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, 14, 14, 14, 14,
       14, 14, 14, 14, 14, 14,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1

In [8]:
# Hold out 30% of the rows for testing; random_state=0 fixes the shuffle so
# the same split is produced on every run.
(
    atributos_treinamento,
    atributos_teste,
    classes_treinamento,
    classes_teste,
) = train_test_split(atributos, classes, test_size=0.3, random_state=0)
# Print the four partition sizes on one line (train X, test X, train y, test y).
print(*map(len, (atributos_treinamento, atributos_teste, classes_treinamento, classes_teste)))

478 205 478 205


In [9]:
# Fit a Gaussian Naive Bayes classifier on the training partition.
# NOTE(review): the features are integer codes of categorical values, which
# GaussianNB treats as continuous -- consider whether CategoricalNB fits better.
naive_bayes = GaussianNB()
naive_bayes.fit(X=atributos_treinamento, y=classes_treinamento)

In [10]:
# Predict the encoded class for every row of the held-out test partition.
previsoes = naive_bayes.predict(X=atributos_teste)
previsoes

array([ 1, 15, 13, 17,  8,  3,  6,  4,  5,  1,  7,  3,  2,  2,  1,  5,  1,
        1, 18, 15,  2,  6, 14, 14, 10,  5,  2, 10,  4, 15, 12,  1, 14, 12,
       12,  4,  6,  7,  5,  1,  6,  1, 15,  2,  3,  0,  1, 14, 14,  1, 14,
        6,  2, 12,  6, 12,  1, 10,  6, 11,  4, 11,  1,  1,  3,  2, 18, 15,
        2,  2,  8,  1,  1,  5, 18, 15, 15, 12, 12, 10,  5, 12,  1, 10, 17,
       15,  2,  1, 12, 15,  7,  5, 10,  2,  2,  2,  5, 15, 11,  1,  2,  1,
        5,  1,  7, 11,  2, 14,  1,  8,  2, 14,  1,  1,  7, 14, 12, 18,  6,
       15, 15,  7, 12,  2,  0, 15,  5,  1,  2,  9, 14,  2, 18,  1, 17,  4,
        6, 10, 14, 12,  6, 11,  1, 17,  1,  1,  1, 16, 12, 14,  0, 15, 12,
       15,  3,  3,  6,  3, 17, 13,  9, 14,  1,  7,  4,  8, 12,  3,  7,  6,
       16, 14, 15,  1,  8, 15, 12,  1, 12,  2,  1,  7,  1,  1, 10, 12,  6,
       12,  9, 15,  9, 12,  2,  1, 12, 15, 16,  1, 13,  1,  2,  9, 17,  4,
        9])

In [11]:
# Build the confusion matrix for the test partition
# (rows: true classes, columns: predicted classes).
confusao = confusion_matrix(y_true=classes_teste, y_pred=previsoes)
confusao

array([[ 3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0, 26,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,
         0,  0,  0],
       [ 0,  0, 14,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  8,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  7,  0,  0,  0, 10,  0,  0,  0,  0,  0,  0,  1,  0,  5,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0,  0, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  9,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  5,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  6,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  0,  0,  0,  0,  0

In [12]:
# Accuracy: fraction of test rows whose predicted class matches the true class.
taxa_acerto = accuracy_score(y_true=classes_teste, y_pred=previsoes)
taxa_acerto

0.8585365853658536