In [None]:
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 26 16:00:07 2021

In [None]:
@author: Ganesh Prasad
"""
# Load Libraries

In [None]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
#import graphviz

Import the dataset

In [None]:
df = pd.read_csv("D:/datatrained_project/practice projects/mashroom/mushrooms.csv")

Have a look into data

In [None]:
print("\n Have a look into records", df.head(10))

column and data type

In [None]:
print("\n Column and Data Type", df.info())

shape of data type

In [None]:
print("\n Shape of Dataset", df.shape)

stats of dataset

In [None]:
print("\n Describe the Data ", df.describe())

validate class is having only two attribute

In [None]:
print("\n Unique value in Class ", df["class"].unique())

Class distribution for e and p

In [None]:
print("\n Count of the class for e and p", df['class'].value_counts())

In [None]:
nan_in_df = df.isnull().values.sum()

Print the dataframe for NAN

In [None]:
print("Null value in input column",nan_in_df)

In [None]:
print("Null in feature column",df['class'].isnull().values.sum())

In [None]:
for column in df.columns:
    df[column] = pd.get_dummies(df[column])

In [None]:
print("\n data after one hot encoding",df.head(10))

class distribution

In [None]:
print(df.groupby('class').size())

Unimodal Data Visualizations

histograms

In [None]:
df.hist(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1)
pyplot.show()

Multimodal Data Visualizations

correlation matrix

In [None]:
plt.figure(figsize=(14,12))
sns.heatmap(df.corr(),linewidths=.1,cmap="Purples", annot=True, annot_kws={"size": 7})
plt.yticks(rotation=0)
plt.savefig("corr.png", format='png', dpi=900, bbox_inches='tight')

# Validation Dataset

Split-out validation dataset

In [None]:
array = df.values
X = array[:,1:22]
Y = array[:,0]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y,test_size=validation_size, random_state=seed)

Test options and evaluation metric

In [None]:
num_folds = 10
seed = 7
scoring = 'accuracy'

Spot-Check Algorithms

In [None]:
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

In [None]:
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=num_folds, random_state=seed,shuffle=True)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# Output
# LR: 0.949684 (0.005679)
# LDA: 0.939374 (0.007613)
# KNN: 0.973072 (0.006961)
# CART: 0.979228 (0.003844)
# NB: 0.893216 (0.016749)
# SVM: 0.978920 (0.004460)

Compare Algorithms

In [None]:
fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(names)
pyplot.show()

Evaluate Algorithms: Standardize/Normalize data

to avoid data leakage when we transform the data. A good way to avoid leakage is to use pipelines<br>
that standardize the data and build the # model for each fold in the cross-validation test harness.<br>
That way we can get a fair estimation of how each model with standardized data might perform on unseen data.

Standardize the dataset

In [None]:
pipelines = []
pipelines.append(('ScaledLR', Pipeline([('Scaler', StandardScaler()),('LR', LogisticRegression())])))
pipelines.append(('ScaledLDA', Pipeline([('Scaler', StandardScaler()),('LDA', LinearDiscriminantAnalysis())])))
pipelines.append(('ScaledKNN', Pipeline([('Scaler', StandardScaler()),('KNN', KNeighborsClassifier())])))
pipelines.append(('ScaledCART', Pipeline([('Scaler', StandardScaler()),('CART', DecisionTreeClassifier())])))
pipelines.append(('ScaledNB', Pipeline([('Scaler', StandardScaler()),('NB', GaussianNB())])))
pipelines.append(('ScaledSVM', Pipeline([('Scaler', StandardScaler()),('SVM', SVC())])))
results = []
names = []

In [None]:
for name, model in pipelines:
    kfold = KFold(n_splits=num_folds, random_state=seed,shuffle=True)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
##Output
# ScaledLR: 0.953223 (0.006505)
# ScaledLDA: 0.939374 (0.007613)
# ScaledKNN: 0.976611 (0.005237)
# ScaledCART: 0.979228 (0.003844)
# ScaledNB: 0.910600 (0.006561)
# ScaledSVM: 0.978766 (0.004228)

Compare Algorithms

In [None]:
fig = pyplot.figure()
fig.suptitle('Scaled Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(names)
pyplot.show()

After investigate tuning the parameters for two algorithms that show promise from<br>
In the spot-checking highest score is for : KNN and SVM.<br>
Output is<br>
ScaledLR: 0.953223 (0.006505)<br>
ScaledLDA: 0.939374 (0.007613)<br>
ScaledKNN: 0.976611 (0.005237)<br>
ScaledCART: 0.979228 (0.003844)<br>
ScaledNB: 0.910600 (0.006561)<br>
ScaledSVM: 0.978766 (0.004228)

Tuning KNN

Tune scaled KNN

In [None]:
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
neighbors = [1,3,5,7,9,11,13,15,17,19,21]
param_grid = dict(n_neighbors=neighbors)
model = KNeighborsClassifier()
kfold = KFold(n_splits=num_folds, random_state=seed,shuffle=True)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(rescaledX, Y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

utput after tunning of KNN<br>
Best: 0.979074 using {'n_neighbors': 1}<br>
0.979074 (0.004468) with: {'n_neighbors': 1}<br>
0.973533 (0.006632) with: {'n_neighbors': 3}<br>
0.977688 (0.005348) with: {'n_neighbors': 5}<br>
0.978613 (0.004319) with: {'n_neighbors': 7}<br>
0.977843 (0.004361) with: {'n_neighbors': 9}<br>
0.976458 (0.004718) with: {'n_neighbors': 11}<br>
0.976920 (0.005102) with: {'n_neighbors': 13}<br>
0.976920 (0.005102) with: {'n_qneighbors': 15}<br>
0.976920 (0.005102) with: {'n_neighbors': 17}<br>
0.976920 (0.005102) with: {'n_neighbors': 19}<br>
0.976920 (0.005102) with: {'n_neighbors': 21}<br>
Best: 0.979689 using {'C': 1.7, 'kernel': 'rbf'}

Tuning SVM<br>
tune two key parameters of the SVM algorithm, the value of C (how much to relax the<br>
margin) and the type of kernel. The default for SVM (the SVC class) is to use the Radial<br>
Basis Function (RBF) kernel with a C value set to 1.0. Like with KNN, we will perform a grid<br>
search using 10-fold cross-validation with a standardized copy of the training dataset. We will<br>
try a number of simpler kernel types and C values with less bias and more bias (less than and<br>
more than 1.0 respectively).

une scaled SVM

In [None]:
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
c_values = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.3, 1.5, 1.7, 2.0]
kernel_values = ['linear', 'poly', 'rbf', 'sigmoid']
param_grid = dict(C=c_values, kernel=kernel_values)
model = SVC()
kfold = KFold(n_splits=num_folds, random_state=seed,shuffle=True)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(rescaledX, Y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

0.958147 (0.005995) with: {'C': 0.1, 'kernel': 'linear'}<br>
0.938297 (0.010319) with: {'C': 0.1, 'kernel': 'poly'}<br>
0.972610 (0.007311) with: {'C': 0.1, 'kernel': 'rbf'}<br>
0.913678 (0.005089) with: {'C': 0.1, 'kernel': 'sigmoid'}<br>
0.959224 (0.006860) with: {'C': 0.3, 'kernel': 'linear'}<br>
0.973072 (0.007195) with: {'C': 0.3, 'kernel': 'poly'}<br>
0.972918 (0.007226) with: {'C': 0.3, 'kernel': 'rbf'}<br>
0.897522 (0.009191) with: {'C': 0.3, 'kernel': 'sigmoid'}<br>
0.959224 (0.006860) with: {'C': 0.5, 'kernel': 'linear'}<br>
0.973995 (0.005739) with: {'C': 0.5, 'kernel': 'poly'}<br>
0.974303 (0.005723) with: {'C': 0.5, 'kernel': 'rbf'}<br>
0.897059 (0.007856) with: {'C': 0.5, 'kernel': 'sigmoid'}<br>
0.959224 (0.006860) with: {'C': 0.7, 'kernel': 'linear'}<br>
0.978766 (0.004228) with: {'C': 0.7, 'kernel': 'poly'}<br>
0.978920 (0.004406) with: {'C': 0.7, 'kernel': 'rbf'}<br>
0.897521 (0.008708) with: {'C': 0.7, 'kernel': 'sigmoid'}<br>
0.959224 (0.006860) with: {'C': 0.9, 'kernel': 'linear'}<br>
0.978613 (0.004264) with: {'C': 0.9, 'kernel': 'poly'}<br>
0.978766 (0.004228) with: {'C': 0.9, 'kernel': 'rbf'}<br>
0.897828 (0.008950) with: {'C': 0.9, 'kernel': 'sigmoid'}<br>
0.959224 (0.006860) with: {'C': 1.0, 'kernel': 'linear'}<br>
0.978613 (0.004264) with: {'C': 1.0, 'kernel': 'poly'}<br>
0.978766 (0.004228) with: {'C': 1.0, 'kernel': 'rbf'}<br>
0.897675 (0.008790) with: {'C': 1.0, 'kernel': 'sigmoid'}<br>
0.959224 (0.006860) with: {'C': 1.3, 'kernel': 'linear'}<br>
0.979228 (0.004198) with: {'C': 1.3, 'kernel': 'poly'}<br>
0.978920 (0.004242) with: {'C': 1.3, 'kernel': 'rbf'}<br>
0.897675 (0.008461) with: {'C': 1.3, 'kernel': 'sigmoid'}<br>
0.959224 (0.006860) with: {'C': 1.5, 'kernel': 'linear'}<br>
0.979228 (0.004198) with: {'C': 1.5, 'kernel': 'poly'}<br>
0.978920 (0.004242) with: {'C': 1.5, 'kernel': 'rbf'}<br>
0.897368 (0.006993) with: {'C': 1.5, 'kernel': 'sigmoid'}<br>
0.959224 (0.006860) with: {'C': 1.7, 'kernel': 'linear'}<br>
0.979228 (0.004198) with: {'C': 1.7, 'kernel': 'poly'}<br>
0.979689 (0.003626) with: {'C': 1.7, 'kernel': 'rbf'}<br>
0.897368 (0.006993) with: {'C': 1.7, 'kernel': 'sigmoid'}<br>
0.959224 (0.006860) with: {'C': 2.0, 'kernel': 'linear'}<br>
0.979536 (0.003770) with: {'C': 2.0, 'kernel': 'poly'}<br>
0.979689 (0.003626) with: {'C': 2.0, 'kernel': 'rbf'}<br>
0.897059 (0.008438) with: {'C': 2.0, 'kernel': 'sigmoid'}

#########Ensemble Methods###########

evaluate four different ensemble machine learning<br>
algorithms, two boosting and two bagging methods:<br>
1. Boosting Methods: AdaBoost (AB) and Gradient Boosting (GBM).<br>
2. Bagging Methods: Random Forests (RF) and Extra Trees (ET).

ensembles

In [None]:
ensembles = []
ensembles.append(('AB', AdaBoostClassifier()))
ensembles.append(('GBM', GradientBoostingClassifier()))
ensembles.append(('RF', RandomForestClassifier()))
ensembles.append(('ET', ExtraTreesClassifier()))
results = []
names = []
for name, model in ensembles:
    kfold = KFold(n_splits=num_folds, random_state=seed,shuffle=True)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# Output is
#AB: 0.940913 (0.005893)
#GBM: 0.967533 (0.007102)
#RF: 0.979536 (0.003510)
#ET: 0.979382 (0.003716)

Finalize model<br>
SVM showed the most promise as a low complexity and stable model for this problem. In<br>
so Finalize the model by training it on the entire training dataset and make<br>
predictions for the hold-out validation dataset to confirm

prepare the model

In [None]:
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
model = SVC(C=1.5)
model.fit(rescaledX, Y_train)
# estimate accuracy on validation dataset
rescaledValidationX = scaler.transform(X_validation)
predictions = model.predict(rescaledValidationX)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))

#Output is

0.9796923076923076<br>
[[754  31]<br>
 [  2 838]]<br>
              precision    recall  f1-score   support<br>
<br>
           0       1.00      0.96      0.98       785<br>
           1       0.96      1.00      0.98       840<br>
<br>
    accuracy                           0.98      1625<br>
   macro avg       0.98      0.98      0.98      1625<br>
weighted avg       0.98      0.98      0.98      1625