In [1]:
# Importação das bibliotecas
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer

In [2]:
dataset = pd.read_csv("soybean.csv")
dataset

Unnamed: 0,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,germination,...,sclerotia,fruit-pods,fruit-spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots,class
0,october,normal,gt-norm,norm,yes,same-lst-yr,low-areas,pot-severe,none,90-100,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
1,august,normal,gt-norm,norm,yes,same-lst-two-yrs,scattered,severe,fungicide,80-89,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
2,july,normal,gt-norm,norm,yes,same-lst-yr,scattered,severe,fungicide,lt-80,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
3,july,normal,gt-norm,norm,yes,same-lst-yr,scattered,severe,none,80-89,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
4,october,normal,gt-norm,norm,yes,same-lst-two-yrs,scattered,pot-severe,none,lt-80,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
678,april,?,?,?,?,?,upper-areas,?,?,?,...,?,?,?,?,?,?,?,?,?,2-4-d-injury
679,april,lt-normal,?,lt-norm,?,diff-lst-year,scattered,?,?,?,...,?,dna,?,?,?,?,?,?,rotted,herbicide-injury
680,june,lt-normal,?,lt-norm,?,diff-lst-year,scattered,?,?,?,...,?,dna,?,?,?,?,?,?,rotted,herbicide-injury
681,april,lt-normal,?,lt-norm,?,same-lst-yr,whole-field,?,?,?,...,?,dna,?,?,?,?,?,?,rotted,herbicide-injury


In [3]:
#separação dos variáveis, ignoro primeira pois não tem valor semântico
X = dataset.iloc[:,0:35].values
y = dataset.iloc[:, 35].values
#temos um arry e não mais um data frame
X[0:2]

array([['october', 'normal', 'gt-norm', 'norm', 'yes', 'same-lst-yr',
        'low-areas', 'pot-severe', 'none', '90-100', 'abnorm', 'abnorm',
        'absent', 'dna', 'dna', 'absent', 'absent', 'absent', 'abnorm',
        'no', 'above-sec-nde', 'brown', 'present', 'firm-and-dry',
        'absent', 'none', 'absent', 'norm', 'dna', 'norm', 'absent',
        'absent', 'norm', 'absent', 'norm'],
       ['august', 'normal', 'gt-norm', 'norm', 'yes', 'same-lst-two-yrs',
        'scattered', 'severe', 'fungicide', '80-89', 'abnorm', 'abnorm',
        'absent', 'dna', 'dna', 'absent', 'absent', 'absent', 'abnorm',
        'yes', 'above-sec-nde', 'brown', 'present', 'firm-and-dry',
        'absent', 'none', 'absent', 'norm', 'dna', 'norm', 'absent',
        'absent', 'norm', 'absent', 'norm']], dtype=object)

In [4]:
labelencoder = LabelEncoder()

for x in range(35):
    X[:, x] = labelencoder.fit_transform(X[:, x])

In [5]:
#one hot encoder coluna credit_history
#deve adicionar 5 colunas
onehotencoder = make_column_transformer((OneHotEncoder(categories='auto', sparse_output=False), [1]), remainder="passthrough")
X = onehotencoder.fit_transform(X)
X[0:2]

array([[0.0, 0.0, 1.0, 6, 1, 3, 2, 4, 1, 2, 2, 1, 1, 0, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 2, 2, 1, 3, 1, 4, 4, 2, 1, 1, 2, 1, 2],
       [0.0, 0.0, 1.0, 2, 1, 3, 2, 3, 2, 3, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
        1, 2, 1, 1, 2, 2, 1, 3, 1, 4, 4, 2, 1, 1, 2, 1, 2]], dtype=object)

In [6]:
#Excluimos a variável para evitar a dummy variable trap
X = X[:,1:]
X = X.astype('float')
X

array([[0., 1., 6., ..., 2., 1., 2.],
       [0., 1., 2., ..., 2., 1., 2.],
       [0., 1., 3., ..., 2., 1., 2.],
       ...,
       [1., 0., 4., ..., 0., 0., 3.],
       [1., 0., 1., ..., 0., 0., 3.],
       [1., 0., 4., ..., 0., 0., 3.]])

In [7]:
#Laber encoder com a classe
labelencoder_Y = LabelEncoder()
y = labelencoder_Y.fit_transform(y)
classe_dummy = np_utils.to_categorical(y)
classe_dummy

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [8]:
#separação em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, classe_dummy, test_size = 0.2, random_state = 0)
print(len(X_train),len(X_test),len(y_train),len(y_test))

546 137 546 137


In [9]:
y_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [10]:
X_test

array([[0., 1., 4., ..., 2., 1., 2.],
       [1., 0., 2., ..., 0., 0., 3.],
       [1., 0., 5., ..., 0., 0., 3.],
       ...,
       [0., 1., 6., ..., 2., 1., 2.],
       [1., 0., 4., ..., 2., 1., 3.],
       [0., 1., 6., ..., 2., 1., 2.]])

In [11]:
classifier = Sequential()
classifier.add(Dense(units = 18, input_dim = 36))
classifier.add(Dense(units = 38))
classifier.add(Dense(units = 19, activation = 'softmax'))
classifier.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 18)                666       
                                                                 
 dense_1 (Dense)             (None, 38)                722       
                                                                 
 dense_2 (Dense)             (None, 19)                741       
                                                                 
Total params: 2,129
Trainable params: 2,129
Non-trainable params: 0
_________________________________________________________________


In [12]:
classifier.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
classifier.fit(X_train, y_train, epochs = 100, validation_data = (X_test, y_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100


Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x198279b3190>

In [13]:
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)
y_pred[0:2]



array([[False, False, False, False, False,  True, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False,  True, False, False,
        False]])

In [14]:
y_teste_matrix = [np.argmax(t) for t in y_test]
y_previsao_matrix = [np.argmax(t) for t in y_pred]

In [15]:
confusao = confusion_matrix(y_teste_matrix, y_previsao_matrix)
confusao

array([[ 2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 5, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,
         0,  0],
       [ 0,  0, 11,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 1,  0,  0,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 0,  0,  0,  0,  5,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 2,  1,  0,  0,  0, 14,  0,  0,  0,  0,  0,  0,  1,  0,  1,  0,
         0,  0],
       [ 0,  0,  0,  0,  0,  0,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  6,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  3,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  6,  0,  0,  0,  0,  0,
         0,  0],
       [ 0,  0,  0,  