In [122]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

# Data cleaning

In [123]:
data = pd.read_csv("mushrooms.csv")

In [124]:
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [125]:
data.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [126]:
data.nunique()

class                        2
cap-shape                    6
cap-surface                  4
cap-color                   10
bruises                      2
odor                         9
gill-attachment              2
gill-spacing                 2
gill-size                    2
gill-color                  12
stalk-shape                  2
stalk-root                   5
stalk-surface-above-ring     4
stalk-surface-below-ring     4
stalk-color-above-ring       9
stalk-color-below-ring       9
veil-type                    1
veil-color                   4
ring-number                  3
ring-type                    5
spore-print-color            9
population                   6
habitat                      7
dtype: int64

In [127]:
data=data.drop("veil-type",axis=1)

# Preprocesing

In [128]:
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,s,w,w,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,s,w,w,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,s,w,w,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,s,w,w,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,s,w,w,w,o,e,n,a,g


In [129]:
data.nunique()

class                        2
cap-shape                    6
cap-surface                  4
cap-color                   10
bruises                      2
odor                         9
gill-attachment              2
gill-spacing                 2
gill-size                    2
gill-color                  12
stalk-shape                  2
stalk-root                   5
stalk-surface-above-ring     4
stalk-surface-below-ring     4
stalk-color-above-ring       9
stalk-color-below-ring       9
veil-color                   4
ring-number                  3
ring-type                    5
spore-print-color            9
population                   6
habitat                      7
dtype: int64

In [130]:
data_1=pd.get_dummies(data,drop_first=True)

In [131]:
data_1.head()

Unnamed: 0,class_p,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_g,cap-surface_s,cap-surface_y,cap-color_c,...,population_n,population_s,population_v,population_y,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,1,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,1,0,0,...,1,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0
3,1,0,0,0,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [132]:
data_1.shape

(8124, 96)

In [133]:
y=data_1["class_p"]

In [134]:
x=data_1.drop("class_p",axis=1)

In [135]:
y.head()

0    1
1    0
2    0
3    1
4    0
Name: class_p, dtype: uint8

In [136]:
x.head()

Unnamed: 0,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_g,cap-surface_s,cap-surface_y,cap-color_c,cap-color_e,...,population_n,population_s,population_v,population_y,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,0,0,0,1,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,0,0,0,0,1,0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
3,0,0,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
4,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [137]:
x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=0)

In [138]:
x_train, x_valid, y_train, y_valid = train_test_split( x_train, y_train, test_size=0.2, random_state=0)

In [139]:
for i in range(1,12):
    pca = PCA(n_components=i*5)
    pca.fit(x_train)
    print(i*5, pca.explained_variance_ratio_.sum())

5 0.4782000986863801
10 0.6369867545914093
15 0.735968429783551
20 0.8103316779792226
25 0.864725823780191
30 0.9082423478068136
35 0.9398745355686449
40 0.9623614346406615
45 0.9776022067467506
50 0.9858256973850699
55 0.9911400440709981


In [140]:
pca = PCA(n_components=40)
pca.fit(x_train)

PCA(copy=True, iterated_power='auto', n_components=40, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [141]:
x_train_pca=pca.transform(x_train)
x_valid_pca=pca.transform(x_valid)
x_test_pca=pca.transform(x_test)

# Model training and validation

### Decision Tree

In [142]:
for n in range(3,20):
    for c in ["gini","entropy"]:
        for mf in ["auto",None]:
            clf = DecisionTreeClassifier(max_depth=n, criterion=c, max_features=mf)
            clf.fit(x_train_pca, y_train)
            y_pred=clf.predict(x_valid_pca)
            score=accuracy_score(y_valid,y_pred)
            score_r=recall_score(y_valid,y_pred)
            score_p=precision_score(y_valid,y_pred)
            if score_r>0.995:
                print(n, c, mf, score, score_r, score_p)

10 entropy None 0.9976923076923077 0.9952 1.0
17 gini auto 0.9938461538461538 0.9968 0.9904610492845787
17 gini None 0.9969230769230769 0.9952 0.9983948635634029
17 entropy auto 0.9969230769230769 1.0 0.9936406995230525
19 gini None 0.9969230769230769 0.9968 0.9968
19 entropy None 0.9969230769230769 0.9952 0.9983948635634029


In [143]:
Best_model_d = DecisionTreeClassifier(max_depth=19, criterion="entropy", max_features=None)
Best_model_d.fit(x_train_pca, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=19,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [177]:
y_pred=Best_model_d.predict(x_valid_pca)

In [178]:
accuracy_score(y_valid,y_pred)

0.9961538461538462

In [179]:
y_pred=Best_model_d.predict(x_test_pca)

In [180]:
accuracy_score(y_test,y_pred)

0.9981538461538462

In [181]:
recall_score(y_test,y_pred)

0.9974126778783958

In [182]:
precision_score(y_test,y_pred)

0.9987046632124352

### Random Forest

In [148]:
Best_models_f = []
for n in range(3,20):
    for c in ["gini","entropy"]:
        for mf in ["auto",None]:
            clf = RandomForestClassifier(max_depth=n, criterion=c, max_features=mf)
            clf.fit(x_train_pca, y_train)
            y_pred=clf.predict(x_valid_pca)
            score=accuracy_score(y_valid,y_pred)
            score_r=recall_score(y_valid,y_pred)
            score_p=precision_score(y_valid,y_pred)
            if score_r>0.995:
                print(n, c, mf, score, score_r, score_p)
                Best_models_f.append(clf)

6 gini auto 0.9946153846153846 0.9984 0.9904761904761905
6 entropy None 0.9984615384615385 0.9984 0.9984
7 gini auto 0.9976923076923077 0.9952 1.0
7 entropy auto 0.9976923076923077 0.9968 0.9983974358974359
7 entropy None 0.9984615384615385 0.9968 1.0
8 entropy auto 0.9984615384615385 0.9968 1.0
8 entropy None 0.9976923076923077 0.9968 0.9983974358974359
9 gini auto 0.9992307692307693 0.9984 1.0
9 entropy auto 0.9992307692307693 0.9984 1.0
9 entropy None 0.9976923076923077 0.9952 1.0
10 gini auto 1.0 1.0 1.0
10 entropy auto 0.9984615384615385 0.9968 1.0
10 entropy None 0.9969230769230769 0.9968 0.9968
11 gini auto 0.9992307692307693 0.9984 1.0
11 gini None 0.9976923076923077 0.9952 1.0
11 entropy auto 1.0 1.0 1.0
11 entropy None 0.9976923076923077 0.9952 1.0
12 gini auto 1.0 1.0 1.0
12 entropy auto 1.0 1.0 1.0
12 entropy None 0.9976923076923077 0.9952 1.0
13 gini auto 1.0 1.0 1.0
13 entropy auto 1.0 1.0 1.0
13 entropy None 0.9984615384615385 0.9968 1.0
14 gini auto 0.9992307692307693 0

In [172]:
Best_model_f=Best_models_f[5]

In [173]:
y_pred=Best_model_f.predict(x_test_pca)

In [174]:
accuracy_score(y_test,y_pred)

0.9993846153846154

In [175]:
recall_score(y_test,y_pred)

0.9987063389391979

In [176]:
precision_score(y_test,y_pred)

1.0

### Logistic Regression

In [152]:
for c in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:
    clf = LogisticRegression(C=c)
    clf.fit(x_train_pca, y_train)
    y_pred=clf.predict(x_valid_pca)
    score=accuracy_score(y_valid,y_pred)
    score_r=recall_score(y_valid,y_pred)
    score_p=precision_score(y_valid,y_pred)
    print(c, score, score_r, score_p)

0.001 0.9338461538461539 0.8752 0.9855855855855856
0.01 0.9823076923076923 0.9808 0.9823717948717948
0.1 0.9861538461538462 0.984 0.9871589085072231
1 0.9923076923076923 0.9952 0.9888712241653418
10 0.9946153846153846 0.9936 0.9951923076923077
100 0.9969230769230769 0.9968 0.9968
1000 0.9992307692307693 0.9984 1.0


In [153]:
Best_model_l = LogisticRegression(C=1000)
Best_model_l.fit(x_train_pca, y_train)

LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [154]:
y_pred=Best_model_l.predict(x_test_pca)

In [155]:
accuracy_score(y_test,y_pred)

1.0

### SVM

In [156]:
for c in [1, 10, 100]:
    for k in ["rbf","linear"]:
        if k=="rbf":
            for g in [1e-3, 1e-4, 1e-5]:
                clf=SVC(kernel=k, gamma=g, C=c)   
                clf.fit(x_train_pca, y_train)
                y_pred=clf.predict(x_valid_pca)
                score=accuracy_score(y_valid,y_pred)
                score_r=recall_score(y_valid,y_pred)
                score_p=precision_score(y_valid,y_pred)
                print(c, k, g, score, score_r, score_p)
        else:
            clf=SVC(kernel=k, C=c)   
            clf.fit(x_train_pca, y_train)
            y_pred=clf.predict(x_valid_pca)
            score=accuracy_score(y_valid,y_pred)
            score_r=recall_score(y_valid,y_pred)
            score_p=precision_score(y_valid,y_pred)
            print(c, k, score, score_r, score_p)

1 rbf 0.001 0.9723076923076923 0.976 0.9667194928684627
1 rbf 0.0001 0.8976923076923077 0.7952 0.9900398406374502
1 rbf 1e-05 0.5192307692307693 0.0 0.0
1 linear 0.9946153846153846 0.9952 0.9936102236421726
10 rbf 0.001 0.9861538461538462 0.9808 0.9903069466882067
10 rbf 0.0001 0.9723076923076923 0.976 0.9667194928684627
10 rbf 1e-05 0.8976923076923077 0.7952 0.9900398406374502
10 linear 0.9976923076923077 0.9968 0.9983974358974359
100 rbf 0.001 0.9923076923076923 0.9952 0.9888712241653418
100 rbf 0.0001 0.9861538461538462 0.9808 0.9903069466882067
100 rbf 1e-05 0.9723076923076923 0.976 0.9667194928684627
100 linear 0.9984615384615385 0.9984 0.9984


In [157]:
Best_model_s = SVC(C=10, kernel="linear", probability=True)
Best_model_s.fit(x_train_pca, y_train)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=True, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [158]:
y_pred=Best_model_s.predict(x_test_pca)

In [167]:
accuracy_score(y_test,y_pred)

0.9987692307692307

In [168]:
recall_score(y_test,y_pred)

0.9987063389391979

In [169]:
precision_score(y_test,y_pred)

0.9987063389391979

### XGBoost

In [170]:
clf = XGBClassifier()
clf.fit(x_train_pca, y_train)
y_pred=clf.predict(x_valid_pca)
score=accuracy_score(y_valid,y_pred)
score_r=recall_score(y_valid,y_pred)
score_p=precision_score(y_valid,y_pred)
print(score, score_r, score_p)

1.0 1.0 1.0


In [171]:
y_pred=clf.predict(x_test_pca)
accuracy_score(y_test,y_pred)

1.0