In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

import seaborn as sns

In [3]:
# Load the raw table; the path is relative to the notebook's working directory.
data = pd.read_csv("./mushrooms.csv")


In [4]:
# Preview the first rows to confirm the file parsed into the expected columns.
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [5]:
# (number of rows, number of columns) of the raw table.
data.shape

(8124, 23)

In [7]:
# Count rows per target label to see how balanced the classes are.
data['class'].value_counts()

e    4208
p    3916
Name: class, dtype: int64

In [8]:
# Print the frequency table of every column, walking the frame in column order.
for _, column_values in data.items():
    print(column_values.value_counts())

e    4208
p    3916
Name: class, dtype: int64
x    3656
f    3152
k     828
b     452
s      32
c       4
Name: cap-shape, dtype: int64
y    3244
s    2556
f    2320
g       4
Name: cap-surface, dtype: int64
n    2284
g    1840
e    1500
y    1072
w    1040
b     168
p     144
c      44
u      16
r      16
Name: cap-color, dtype: int64
f    4748
t    3376
Name: bruises, dtype: int64
n    3528
f    2160
y     576
s     576
a     400
l     400
p     256
c     192
m      36
Name: odor, dtype: int64
f    7914
a     210
Name: gill-attachment, dtype: int64
c    6812
w    1312
Name: gill-spacing, dtype: int64
b    5612
n    2512
Name: gill-size, dtype: int64
b    1728
p    1492
w    1202
n    1048
g     752
h     732
u     492
k     408
e      96
y      86
o      64
r      24
Name: gill-color, dtype: int64
t    4608
e    3516
Name: stalk-shape, dtype: int64
b    3776
?    2480
e    1120
c     556
r     192
Name: stalk-root, dtype: int64
s    5176
k    2372
f     552
y      24
Name: stalk-surf

In [9]:
# Integer codes for the two raw target labels: 'e' becomes 0 and 'p' becomes 1.
class_mapping = dict(e=0, p=1)


In [10]:
# Encode the target column as integers using class_mapping.
# Series.map turns every value that is not a key of the mapping into NaN, so
# running this cell a second time on the already-encoded column would wipe all
# labels. Only map while the raw labels are still present, which makes the cell
# safe to re-run.
if data['class'].isin(list(class_mapping)).all():
    data['class'] = data['class'].map(class_mapping)

# Fail loudly if any label falls outside the mapping instead of silently
# carrying NaN (or unmapped strings) into the model.
assert data['class'].isin(list(class_mapping.values())).all(), \
    "unexpected values in 'class' column"


In [11]:
# One-hot encode the categorical feature columns. 'class' was already mapped to
# integers above, so get_dummies leaves it as a single numeric column.
data2 = pd.get_dummies(data)


In [12]:
# Inspect the encoded frame: one indicator column per feature/category pair.
data2.head()

Unnamed: 0,class,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,1,0,0,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,1,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0


In [13]:
from sklearn.model_selection import train_test_split


In [14]:
# Hold out 20% of the rows as the test set.
# The target is selected by name rather than by position, so the split does not
# depend on 'class' being the first column of data2.
# random_state pins the shuffle so the partition, and every score computed from
# it, is reproducible after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(
    data2.drop('class', axis=1),  # every column except the target
    data2['class'],               # integer-encoded target
    test_size=0.2,
    random_state=42,
)

In [15]:
# Carve a validation set out of the training portion: 20% of the remaining 80%,
# i.e. 16% of all rows. random_state pins the shuffle for reproducibility.
# NOTE: this cell overwrites x_train/y_train, so re-running it on its own
# shrinks the training set again; re-run the preceding split cell first.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42
)


In [16]:
from sklearn.linear_model import LogisticRegression


In [17]:
# Baseline classifier with the library's default hyperparameters.
lr = LogisticRegression()


In [18]:
# Fit on the training split only; the validation and test splits are not used here.
lr.fit(x_train,y_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [19]:
# Predicted labels for the rows the model was fitted on.
y_pred_train = lr.predict(x_train)


In [20]:
# Predicted labels for the validation split.
y_pred_val = lr.predict(x_val)


In [21]:
# Predicted labels for the held-out test split.
y_pred_test = lr.predict(x_test)


In [22]:
# Mean accuracy on the training split.
lr.score(x_train,y_train)


1.0

In [23]:
# Mean accuracy on the validation split.
lr.score(x_val,y_val)


1.0

In [24]:
# Mean accuracy on the held-out test split.
# NOTE(review): the recorded output is a perfect score; worth a sanity check
# that no feature column trivially encodes the target.
lr.score(x_test,y_test)


1.0

In [25]:
from sklearn.metrics import classification_report


In [26]:
# Per-class precision/recall/F1 on the training split.
# Argument order is (true labels, predicted labels).
print(classification_report(y_train,y_pred_train))


             precision    recall  f1-score   support

          0       1.00      1.00      1.00      2656
          1       1.00      1.00      1.00      2543

avg / total       1.00      1.00      1.00      5199



In [27]:
# Per-class precision/recall/F1 on the held-out test split.
# Argument order is (true labels, predicted labels).
print(classification_report(y_test,y_pred_test))


             precision    recall  f1-score   support

          0       1.00      1.00      1.00       854
          1       1.00      1.00      1.00       771

avg / total       1.00      1.00      1.00      1625



In [28]:
# Per-class precision/recall/F1 on the validation split.
# Argument order is (true labels, predicted labels).
print(classification_report(y_val,y_pred_val))


             precision    recall  f1-score   support

          0       1.00      1.00      1.00       698
          1       1.00      1.00      1.00       602

avg / total       1.00      1.00      1.00      1300



In [29]:
from sklearn.metrics import confusion_matrix


In [30]:
# Confusion matrix on the training split: rows are true labels, columns are
# predicted labels, so off-diagonal entries are misclassifications.
confusion_matrix(y_train,y_pred_train)


array([[2656,    0],
       [   0, 2543]], dtype=int64)

In [31]:
# Confusion matrix on the held-out test split: rows are true labels, columns
# are predicted labels, so off-diagonal entries are misclassifications.
confusion_matrix(y_test,y_pred_test)


array([[854,   0],
       [  0, 771]], dtype=int64)

In [32]:
# Confusion matrix on the validation split: rows are true labels, columns are
# predicted labels, so off-diagonal entries are misclassifications.
confusion_matrix(y_val,y_pred_val)


array([[698,   0],
       [  0, 602]], dtype=int64)