In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


In [2]:
# Importing the dataset
dataset = pd.read_csv('mushrooms.csv')
dataset.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
dataset.shape

(8124, 23)

In [4]:
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
for col in dataset.columns:
    dataset[col] = label_encoder.fit_transform(dataset[col])
dataset.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [5]:
# Support Vector Machine
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

X = dataset.iloc[:, 1:23].values
y = dataset.iloc[:, 0].values

In [6]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [7]:

classifier = SVC(kernel = 'rbf', random_state = 0, verbose=True)
classifier.fit(X_train, y_train)

[LibSVM]

SVC(random_state=0, verbose=True)

In [8]:
y_pred = classifier.predict(X_test)

In [9]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[1051    3]
 [  15  962]]


In [10]:
accuracy_score(y_test, y_pred)

0.9911373707533235

In [11]:
# Cross Validation (K-Fold)
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X, y = y, cv = 10)
print("\n\nAccuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

Accuracy: 91.72 %
Standard Deviation: 12.89 %


In [12]:
# Grid Search (Hyperparameter Tuning)
from sklearn.model_selection import GridSearchCV
parameters = {'C': [1, 10, 100], 
               'kernel': ['linear' , 'rbf', 'poly', 'sigmoid']}
grid_search = GridSearchCV(estimator = SVC(),
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10)
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 12 candidates, totalling 120 fits
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][

GridSearchCV(cv=10, estimator=SVC(random_state=0, verbose=True),
             param_grid={'C': [1, 10, 100],
                         'kernel': ['linear', 'rbf', 'poly', 'sigmoid']},
             scoring='accuracy', verbose=True)

In [15]:
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print(grid_search.cv_results_)
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

{'mean_fit_time': array([ 0.64110007,  0.15600109,  0.11149895,  1.29469986,  2.23280089,
        0.07999973,  0.11000021,  0.43409963, 23.15972657,  0.07599938,
        0.11580153,  0.43140054]), 'std_fit_time': array([1.14347935e-01, 2.28084461e-03, 2.24800771e-03, 1.84913566e-01,
       3.88253993e-01, 1.26470670e-03, 3.76845223e-03, 9.84707889e-02,
       1.61424508e+01, 2.00041574e-03, 4.99552356e-03, 9.87640998e-02]), 'mean_score_time': array([0.00919981, 0.02929883, 0.00580101, 0.07289999, 0.00819917,
       0.01009996, 0.0019999 , 0.04329982, 0.00669937, 0.00649946,
       0.00189865, 0.04299948]), 'std_score_time': array([8.71689821e-04, 6.39858043e-04, 4.00630266e-04, 1.86759903e-03,
       1.07576059e-03, 7.00055170e-04, 6.64157308e-07, 8.93422388e-03,
       1.00544211e-03, 5.00656775e-04, 2.99899778e-04, 8.70575972e-03]), 'param_C': masked_array(data=[1, 1, 1, 1, 10, 10, 10, 10, 100, 100, 100, 100],
             mask=[False, False, False, False, False, False, False, False,

In [14]:
# Random Search (Hyperparameter Tuning)
from sklearn.model_selection import RandomizedSearchCV
parameters = {'C': [1, 10, 100], 'kernel': ['linear', 'rbf', 'poly', 'sigmoid']}
random_search = RandomizedSearchCV(estimator = SVC(),
                                   param_distributions = parameters,
                                   scoring = 'accuracy',
                                   cv = 10)
random_search.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

RandomizedSearchCV(cv=10, estimator=SVC(random_state=0, verbose=True),
                   param_distributions={'C': [1, 10, 100],
                                        'kernel': ['linear', 'rbf', 'poly',
                                                   'sigmoid']},
                   scoring='accuracy', verbose=True)

In [18]:
best_accuracy = random_search.best_score_
best_parameters = random_search.best_params_
print(random_search.cv_results_)
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

{'mean_fit_time': array([0.11720004, 0.07990305, 0.10910051, 0.43900118, 1.29410033,
       2.23990021, 0.1094003 , 0.64880037, 0.15390036, 0.42850022]), 'std_fit_time': array([0.00509412, 0.00144401, 0.00242701, 0.09572039, 0.18811892,
       0.38372549, 0.00352754, 0.10839702, 0.00341952, 0.0993127 ]), 'mean_score_time': array([0.00169971, 0.00999706, 0.00549951, 0.04359875, 0.07629969,
       0.00779974, 0.00209985, 0.01009963, 0.02950015, 0.04299982]), 'std_score_time': array([0.00045801, 0.00044681, 0.00050044, 0.00884483, 0.0135791 ,
       0.0008716 , 0.00030022, 0.00280848, 0.00066999, 0.00905575]), 'param_kernel': masked_array(data=['poly', 'rbf', 'poly', 'sigmoid', 'sigmoid', 'linear',
                   'poly', 'linear', 'rbf', 'sigmoid'],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fill_value='?',
            dtype=object), 'param_C': masked_array(data=[100, 10, 1, 10, 1, 10, 10, 1, 1, 100],
           

### Model Selection

In [22]:
#model selection
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

models =[('LR', LogisticRegression()), ('DT', DecisionTreeClassifier()), ('RF', RandomForestClassifier()), ('SVM', SVC())]

hyparams = {
            'LR': {
                    'C': [1, 10, 100], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
            }, 
            'DT': {
                'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random'], 'max_depth': [1, 10, 100]
            },
            'RF': {
                'n_estimators': [10, 100, 200], 'criterion': ['gini', 'entropy'], 'max_depth': [1, 10, 100]
            },
            'SVM': {
                'C': [1, 10, 100], 'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
            }
}
for name, model in models:
    print(name)
    grid_search = GridSearchCV(estimator = model,
                               param_grid = hyparams[name],
                               scoring = 'accuracy',
                               cv = 10, verbose=True)
    grid_search.fit(X_train, y_train)
    best_accuracy = grid_search.best_score_
    best_parameters = grid_search.best_params_
    print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
    print("Best Parameters:", best_parameters)
    print("------------------------------------------")


LR
Fitting 10 folds for each of 15 candidates, totalling 150 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Accuracy: 96.37 %
Best Parameters: {'C': 100, 'solver': 'newton-cg'}
------------------------------------------
DT
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Best Accuracy: 100.00 %
Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'splitter': 'best'}
------------------------------------------
RF
Fitting 10 folds for each of 18 candidates, totalling 180 fits
Best Accuracy: 100.00 %
Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'n_estimators': 10}
------------------------------------------
SVM
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Best Accuracy: 100.00 %
Best Parameters: {'C': 10, 'kernel': 'rbf'}
------------------------------------------


In [None]:
# Linear, Logistic Regression
# KNN
# SVM
# Decision Tree
# Random Forest
# Self study: Naive Bayes, XGBoost