### Importando Bibliotecas

In [1]:
import pandas as pd  # tratar dados
from sklearn.model_selection import train_test_split  # divisão treino-teste
from sklearn.linear_model import LogisticRegression  # modelo de Regressão Logística
from sklearn.metrics import classification_report, confusion_matrix  # métricas para avaliar a precisão do modelo

### Importando Dataset

In [2]:
# Dataset source: https://www.kaggle.com/datasets/uciml/mushroom-classification
df = pd.read_csv('mushrooms.csv')  # load the CSV into the DataFrame bound to `df`

In [3]:
df.head()  # show the first 5 rows of the DataFrame

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
df.columns  # show the attribute (column) names of the DataFrame

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

#### Informação dos atributos
**class**: atributo dependente, tendo 2 valores possíveis: e = edible (comestível), p = poisonous (venenoso)\
**cap-shape**: b = bell, c = conical, x = convex, f = flat, k = knobbed, s = sunken\
**cap-surface**: f = fibrous, g = grooves, y = scaly, s = smooth\
**cap-color**: n = brown, b = buff, c = cinnamon, g = gray, r = green, p = pink, u = purple, e = red, w = white, y = yellow\
**bruises**: t = bruises, f = no\
**odor**: a = almond, l = anise, c = creosote, y = fishy, f = foul, m = musty, n = none, p = pungent, s = spicy\
**gill-attachment**: a = attached, d = descending, f = free, n = notched\
**gill-spacing**: c = close, w = crowded, d = distant\
**gill-size**: b = broad, n = narrow\
**gill-color**: k = black, n = brown, b = buff, h = chocolate, g = gray, r = green, o = orange, p = pink, u = purple, e = red, w = white, y = yellow\
**stalk-shape**: e = enlarging, t = tapering\
**stalk-root**: b = bulbous, c = club, u = cup, e = equal, z = rhizomorphs, r = rooted, ? = missing\
**stalk-surface-above-ring**: f = fibrous, y = scaly, k = silky, s = smooth\
**stalk-surface-below-ring**: f = fibrous, y = scaly, k = silky, s = smooth\
**stalk-color-above-ring**: n = brown, b = buff, c = cinnamon, g = gray, o = orange, p = pink, e = red, w = white, y = yellow\
**stalk-color-below-ring**: n = brown, b = buff, c = cinnamon, g = gray, o = orange, p = pink, e = red, w = white, y = yellow\
**veil-type**: p = partial, u = universal\
**veil-color**: n = brown, o = orange, w = white, y = yellow\
**ring-number**: n = none, o = one, t = two\
**ring-type**: c = cobwebby, e = evanescent, f = flaring, l = large, n = none, p = pendant, s = sheathing, z = zone\
**spore-print-color**: k = black, n = brown, b = buff, h = chocolate, r = green, o = orange, u = purple, w = white, y = yellow\
**population**: a = abundant, c = clustered, n = numerous, s = scattered, v = several, y = solitary\
**habitat**: g = grasses, l = leaves, m = meadows, p = paths, u = urban, w = waste, d = woods\

### Analisando o dataset

In [5]:
# Raise the display limit to the frame's own column count so pandas does not
# elide any column in the summary below.
pd.set_option('display.max_columns', df.shape[1])

# Summary (count / unique / top / freq) for every column, non-numeric included.
df.describe(include='all')

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,2,5,4,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,t,b,s,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,4608,3776,5176,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [6]:
df.dtypes  # data type of each attribute

class                       object
cap-shape                   object
cap-surface                 object
cap-color                   object
bruises                     object
odor                        object
gill-attachment             object
gill-spacing                object
gill-size                   object
gill-color                  object
stalk-shape                 object
stalk-root                  object
stalk-surface-above-ring    object
stalk-surface-below-ring    object
stalk-color-above-ring      object
stalk-color-below-ring      object
veil-type                   object
veil-color                  object
ring-number                 object
ring-type                   object
spore-print-color           object
population                  object
habitat                     object
dtype: object

Primeiramente, é possível perceber que o atributo veil-type possui apenas um valor em todos os dados e, portanto, não será útil para o modelo.\
Como todos os atributos são categóricos, é preciso transformá-los em numéricos por meio de one-hot encoding, utilizando a função get_dummies da biblioteca pandas.

In [7]:
# Remove 'veil-type': it holds a single value in every row, so it carries no
# information for the model. DataFrame.drop returns a NEW frame, so the result
# must be assigned back; otherwise the constant column survives and ends up
# one-hot encoded together with the other predictors.
df = df.drop(columns='veil-type')
aux = df['class']  # set the target aside so it is not one-hot encoded
df = pd.get_dummies(df.drop(columns='class'))  # one-hot encode every predictor column
df['class'] = aux  # re-attach the target as the last column

In [8]:
df.head()  # show the first 5 rows of the DataFrame after the data preparation

Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,cap-color_b,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w,class
0,0,0,0,0,0,1,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0,p
1,0,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0,e
2,1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0,e
3,0,0,0,0,0,1,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0,p
4,0,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0,e


### Separando os dados de treino e teste

In [9]:
# Separate the target ('class') from the predictors (every remaining column).
y = df['class']
X = df.drop(columns='class')

In [10]:
# 70/30 train/test split. The shuffle is seeded through random_state so that
# re-running the notebook on a fresh kernel yields the same partition, and
# therefore the same metrics further down.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=42
)

### Selecionando modelo e setando hiperparâmetros

In [11]:
modelo = LogisticRegression()  # create the model object (no hyperparameters passed, so defaults apply)
modelo.fit(X_train, y_train)  # train the model on the training split

In [12]:
modelo_test = modelo.predict(X_test)  # predictions for the test split

### Avaliando resultados

In [13]:
# Confusion matrix of true vs. predicted labels on the test split.
matriz_confusao = confusion_matrix(y_test, modelo_test)
print(matriz_confusao)

# Classification report for the same predictions.
relatorio = classification_report(y_test, modelo_test)
print(relatorio)

[[1251    0]
 [   0 1187]]
              precision    recall  f1-score   support

           e       1.00      1.00      1.00      1251
           p       1.00      1.00      1.00      1187

    accuracy                           1.00      2438
   macro avg       1.00      1.00      1.00      2438
weighted avg       1.00      1.00      1.00      2438

