## Mushroom data analysis

## Helper Libraries

In [132]:
# Import libraries
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from plotnine import ggplot, aes, geom_density, geom_line, geom_point, ggtitle

## Dataframe, Variables, and Splitting Setup

In [209]:
data=pd.read_csv("mushrooms.csv")

In [None]:
# This dataset contains 8124 observations and 23 features
data.shape

In [None]:
# All features are categorical, including the target variable. This is a binary classification problem: each mushroom is either poisonous or edible.
data.head()

In [None]:
# The data set has no missing values
data.isnull().sum()

In [None]:
# Check the unique values of each feature
for i in data.columns:
  print(i, data[i].unique())

In [210]:
# Drop the 'veil-type' column, which has no connection with the rest of the data.
data.drop(['veil-type'], axis=1, inplace=True)

In [333]:
def plot_col(col, hue=None, color=None, labels=None):
    """Draw a count plot of one column of the module-level ``data`` frame.

    Parameters
    ----------
    col : str
        Name of the column whose category counts are plotted on the x axis.
    hue : str, optional
        Name of a second column used to split every bar; when given, the
        legend is relabelled to ('Poisonous', 'Edible').
    color : sequence of colours, optional
        Palette handed to seaborn. Defaults to red / light green.
    labels : iterable of str, optional
        Replacement x tick labels, in bar order. Iterating a dict yields its
        keys, so a ``{name: code}`` dict can be passed directly.
    """
    # A fresh list per call instead of a shared mutable default argument.
    palette = ['red', 'lightgreen'] if color is None else color

    fig, ax = plt.subplots(figsize=(15, 7))
    # The column must be passed as ``x=``: recent seaborn versions bind the
    # first positional argument to ``data``, which clashes with ``data=data``.
    sns.countplot(x=col, hue=hue, palette=palette, saturation=0.6, data=data, dodge=True, ax=ax)
    ax.set(title = f"Mushroom {col.title()} Quantity", xlabel=f"{col.title()}", ylabel="Quantity")
    if labels is not None:
        ax.set_xticklabels(list(labels))
    if hue is not None:
        ax.legend(('Poisonous', 'Edible'), loc=0)

In [None]:
# The two classes of the target variable are approximately balanced
class_dict = ('Poisonous', 'Edible')
plot_col(col='class', labels=class_dict)

In [None]:
print(data.groupby('class').size())

In [None]:
# Plot the different cap variables against the target variable
shape_dict = {"bell":"b","conical":"c","convex":"x","flat":"f", "knobbed":"k","sunken":"s"}
labels = ('convex', 'bell', 'sunken', 'flat', 'knobbed', 'conical')
plot_col(col='cap-shape', hue='class', labels=labels)

In [None]:
# Cap-colour names mapped to their single-letter dataset codes. The keys are
# used as the x tick labels, so they are listed in the bar order of the plot.
# Codes 'w', 'b' and 'c' stand for white, buff and cinnamon.
color_dict = {"brown":"n","yellow":"y", "white":"w", "gray":"g", "red":"e","pink":"p",
              "buff":"b", "purple":"u", "cinnamon":"c", "green":"r"}
plot_col(col='cap-color', hue='class', labels=color_dict)

In [None]:
surface_dict = {"smooth":"s", "scaly":"y", "fibrous":"f","grooves":"g"}
plot_col(col='cap-surface', hue='class', labels=surface_dict)

In [341]:
def get_labels(order, a_dict):
    """Translate a sequence of codes into the names that map to them.

    For every code in ``order`` (in that order) each key of ``a_dict`` whose
    value equals the code is collected, following the dict's own ordering.
    Codes without a matching value contribute nothing to the result.
    """
    return [
        name
        for code in order
        for name, mapped_code in a_dict.items()
        if code == mapped_code
    ]

In [342]:
# Mushroom Population & Habitat Percentage
pop_dict = {"abundant":"a","clustered":"c","numerous":"n","scattered":"s","several":"v","solitary":"y"}
hab_dict = {"grasses":"g","leaves":"l","meadows":"m","paths":"p","urban":"u","waste":"w","woods":"d"}

In [None]:
# Pie chart of the share of each population type
f, ax = plt.subplots(figsize=(15, 10))
# Count once and reuse the result for the labels, the explode offsets and the plot
pop_counts = data['population'].value_counts()
order = list(pop_counts.index)
pop_labels = get_labels(order, pop_dict)
# One offset per wedge (0.00, 0.01, 0.02, ...), sized from the data rather than hard-coded
explode = tuple(i / 100 for i in range(len(pop_counts)))
pop_counts.plot.pie(explode=explode, autopct='%1.1f%%', labels=pop_labels, shadow=True, ax=ax)
ax.set_title('Mushroom Population Type Percentage');

In [None]:
# Pie chart of the share of each habitat type
f, ax = plt.subplots(figsize=(15, 10))
# Count once and reuse the result for the labels, the explode offsets and the plot
hab_counts = data['habitat'].value_counts()
order = list(hab_counts.index)
hab_labels = get_labels(order, hab_dict)
# One offset per wedge (0.00, 0.01, 0.02, ...), sized from the data rather than hard-coded
explode = tuple(i / 100 for i in range(len(hab_counts)))
hab_counts.plot.pie(explode=explode, autopct='%1.1f%%', labels=hab_labels, shadow=True, ax=ax)
ax.set_title('Mushroom Habitat Type Percentage');

## Building Machine Learning Models

## Logistic Regression using the One Hot Encoding technique and an 80/20 split strategy

In [134]:
# separate target and features columns
y= data["class"].values
X= data.drop(["class"],axis=1)

# split the dataset into train and test sets (80-20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
#print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Evaluate logistic regression on the Mushroom dataset with one-hot encoding

# one-hot encode input variables
# The encoder is fitted on the training split only; handle_unknown='ignore'
# encodes a category that appears only in the test split as all zeros instead
# of raising an error at transform time.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
X_train = onehot_encoder.fit_transform(X_train)
X_test = onehot_encoder.transform(X_test)

# encode target variable
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# define the model
model = LogisticRegression()
# fit on the training set
model.fit(X_train, y_train)

# evaluate the model
print("Test Accuracy: {}%".format(round(accuracy_score(y_test, model.predict(X_test))*100,2)))

## K-nearest Neighbors Algorithm using the One Hot Encoding technique and an 80/20 split strategy

In [136]:
# Tune k for KNN on the one-hot encoded Mushroom training set

# create KNN model object
# Tune the same estimator type that is evaluated afterwards: the target is a
# 0/1 class label, so a classifier (not a regressor) is the model to tune.
knn = KNeighborsClassifier()

# define loss function
# On 0/1 labels the RMSE of the predicted classes is sqrt(misclassification
# rate), so minimising it is equivalent to maximising accuracy.
loss = 'neg_root_mean_squared_error'

# create 10 fold CV object
kfold = KFold(n_splits=10, random_state=11, shuffle=True)

# Create grid of hyperparameter values
hyper_grid = {'n_neighbors': range(2, 17)}

# Tune a knn model using grid search
grid_search = GridSearchCV(knn, hyper_grid, cv=kfold, scoring=loss)
results = grid_search.fit(X_train, y_train)


In [None]:
# Best model's k value
# Printed explicitly: a notebook only auto-displays the last expression of a
# cell, which here is the plot.
print("Best k:", results.best_params_['n_neighbors'])

# Cross validated grid search

# Plot all RMSE results
# The k column is taken from the searched grid so it cannot drift out of sync with it
all_rmse = pd.DataFrame({'k': list(hyper_grid['n_neighbors']),
                         'RMSE': np.abs(results.cv_results_['mean_test_score'])})

(ggplot(all_rmse, aes(x='k', y='RMSE'))
 + geom_line()
 + geom_point()
 + ggtitle("Cross validated grid search results"))

In [None]:
## Build KNN with the best k value and report the prediction accuracy
# Use the k selected by the grid search instead of a hard-coded value
best_k = results.best_params_['n_neighbors']
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)

print("Test Accuracy: {}%".format(round(accuracy_score(y_test, knn.predict(X_test))*100,2)))


## Logistic Regression using the One Hot Encoding technique and a 70/30 split strategy

In [None]:
# separate target and features columns
y= data["class"].values
X= data.drop(["class"],axis=1)

# split the dataset into train and test sets (70-30)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=11)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Evaluate logistic regression on the Mushroom dataset with one-hot encoding

# one-hot encode input variables
# The encoder is fitted on the training split only; handle_unknown='ignore'
# encodes a category that appears only in the test split as all zeros instead
# of raising an error at transform time.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
X_train = onehot_encoder.fit_transform(X_train)
X_test = onehot_encoder.transform(X_test)

# encode target variable
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# define the model
model = LogisticRegression()
# fit on the training set
model.fit(X_train, y_train)

# evaluate the model
print("Test Accuracy: {}%".format(round(accuracy_score(y_test, model.predict(X_test))*100,2)))

## K-nearest Neighbors Algorithm using the One Hot Encoding technique and a 70/30 split strategy

In [141]:
# Tune k for KNN on the one-hot encoded Mushroom training set

# create KNN model object
# Tune the same estimator type that is evaluated afterwards: the target is a
# 0/1 class label, so a classifier (not a regressor) is the model to tune.
knn = KNeighborsClassifier()

# define loss function
# On 0/1 labels the RMSE of the predicted classes is sqrt(misclassification
# rate), so minimising it is equivalent to maximising accuracy.
loss = 'neg_root_mean_squared_error'

# create 10 fold CV object
kfold = KFold(n_splits=10, random_state=123, shuffle=True)

# Create grid of hyperparameter values
hyper_grid = {'n_neighbors': range(2, 17)}

# Tune a knn model using grid search
grid_search = GridSearchCV(knn, hyper_grid, cv=kfold, scoring=loss)
results = grid_search.fit(X_train, y_train)

In [None]:
# Best model's k value
# Printed explicitly: a notebook only auto-displays the last expression of a
# cell, which here is the plot.
print("Best k:", results.best_params_['n_neighbors'])

# Cross validated grid search

# Plot all RMSE results
# The k column is taken from the searched grid so it cannot drift out of sync with it
all_rmse = pd.DataFrame({'k': list(hyper_grid['n_neighbors']),
                         'RMSE': np.abs(results.cv_results_['mean_test_score'])})

(ggplot(all_rmse, aes(x='k', y='RMSE'))
 + geom_line()
 + geom_point()
 + ggtitle("Cross validated grid search results"))

In [None]:
## Build KNN with the best k value and report the prediction accuracy
# Use the k selected by the grid search instead of a hard-coded value
best_k = results.best_params_['n_neighbors']
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
print("Test Accuracy: {}%".format(round(accuracy_score(y_test, knn.predict(X_test))*100,2)))

## K-nearest Neighbors Algorithm using the LabelEncoder technique and an 80/20 split strategy

In [144]:
from sklearn.preprocessing import LabelEncoder
# Label-encode every column. DataFrame.apply already visits each column once,
# so no surrounding loop over the columns is needed.
le = LabelEncoder()
data = data.apply(le.fit_transform)

# separate target and features columns
dfX = data.iloc[:,1:]
dfY = data['class']

# split the dataset into train and test sets (80-20)
X_train, X_test, y_train, y_test = train_test_split(dfX, dfY, test_size=0.2, random_state=11)

In [145]:
# Tune k for KNN on the label-encoded Mushroom training set

# create KNN model object
# Tune the same estimator type that is evaluated afterwards: the target is a
# 0/1 class label, so a classifier (not a regressor) is the model to tune.
knn = KNeighborsClassifier()

# define loss function
# On 0/1 labels the RMSE of the predicted classes is sqrt(misclassification
# rate), so minimising it is equivalent to maximising accuracy.
loss = 'neg_root_mean_squared_error'

# create 10 fold CV object
kfold = KFold(n_splits=10, random_state=123, shuffle=True)

# Create grid of hyperparameter values
hyper_grid = {'n_neighbors': range(2, 17)}

# Tune a knn model using grid search
grid_search = GridSearchCV(knn, hyper_grid, cv=kfold, scoring=loss)
results = grid_search.fit(X_train, y_train)

In [None]:
# Best model's k value
results.best_estimator_.get_params().get('n_neighbors')

In [None]:
## Build KNN with the best k value and report the prediction accuracy
# Use the k selected by the grid search instead of a hard-coded value
best_k = results.best_params_['n_neighbors']
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)

print("Test Accuracy: {}%".format(round(accuracy_score(y_test, knn.predict(X_test))*100,2)))

## Logistic regression using the LabelEncoder technique and an 80/20 split strategy

In [None]:
# Evaluate logistic regression on the Mushroom dataset with LabelEncoder

# define the model
model = LogisticRegression(solver='newton-cg')
# fit on the training set
model.fit(X_train, y_train)

# evaluate the model
print("Test Accuracy: {}%".format(round(accuracy_score(y_test, model.predict(X_test))*100,2)))

## Logistic regression using the LabelEncoder technique and a 70/30 split strategy

In [149]:
from sklearn.preprocessing import LabelEncoder
# Label-encode every column. DataFrame.apply already visits each column once,
# so no surrounding loop over the columns is needed.
# NOTE(review): if `data` was already encoded by an earlier cell, this should
# leave the integer codes unchanged - confirm.
le = LabelEncoder()
data = data.apply(le.fit_transform)

# separate target and features columns
dfX = data.iloc[:,1:]
dfY = data['class']

# split the dataset into train and test sets (70-30)
X_train, X_test, y_train, y_test = train_test_split(dfX, dfY, test_size=0.3, random_state=11)

In [None]:
# Evaluate logistic regression on the Mushroom dataset with LabelEncoder

# define the model
model = LogisticRegression(solver='newton-cg')
# fit on the training set
model.fit(X_train, y_train)

# evaluate the model
print("Test Accuracy: {}%".format(round(accuracy_score(y_test, model.predict(X_test))*100,2)))

## K-nearest Neighbors Algorithm using the LabelEncoder technique and a 70/30 split strategy

In [151]:
# Tune k for KNN on the label-encoded Mushroom training set

# create KNN model object
# Tune the same estimator type that is evaluated afterwards: the target is a
# 0/1 class label, so a classifier (not a regressor) is the model to tune.
knn = KNeighborsClassifier()

# define loss function
# On 0/1 labels the RMSE of the predicted classes is sqrt(misclassification
# rate), so minimising it is equivalent to maximising accuracy.
loss = 'neg_root_mean_squared_error'

# create 10 fold CV object
kfold = KFold(n_splits=10, random_state=123, shuffle=True)

# Create grid of hyperparameter values
hyper_grid = {'n_neighbors': range(2, 17)}

# Tune a knn model using grid search
grid_search = GridSearchCV(knn, hyper_grid, cv=kfold, scoring=loss)
results1 = grid_search.fit(X_train, y_train)

In [None]:
# Best model's k value
results1.best_estimator_.get_params().get('n_neighbors')

In [None]:
## Build KNN with the best k value and report the prediction accuracy
# Use the k selected by the grid search instead of a hard-coded value
best_k = results1.best_params_['n_neighbors']
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
print("Test Accuracy: {}%".format(round(accuracy_score(y_test, knn.predict(X_test))*100,2)))

## K-nearest Neighbors Algorithm and Logistic Regression using the LabelEncoder technique and 5/10-kfold split strategy

In [203]:
## Data processing
from sklearn.preprocessing import LabelEncoder
# Label-encode every column. DataFrame.apply already visits each column once,
# so no surrounding loop over the columns is needed.
le = LabelEncoder()
data = data.apply(le.fit_transform)

In [None]:
dfX = data.iloc[:,1:]
dfY = data['class']
dfX.head(5)

In [211]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import KFold
# Shuffled K-fold splitter shared by the trainwith() runs. A fixed random_state
# keeps the folds, and therefore the reported accuracies, reproducible between
# runs, like the other splits in this notebook.
# For the 10-fold variant use n_splits=10 instead.
skf = KFold(n_splits=5, shuffle=True, random_state=11)

In [206]:
# Mean cross-validated accuracy (in percent) of every model passed to trainwith()
accurecy_report = {}


def trainwith(modelname):
    """Cross-validate an estimator on the module-level ``dfX`` / ``dfY``.

    The folds come from the module-level ``skf`` splitter and rows are picked
    positionally with ``.iloc``. For each fold the estimator is refitted and
    its confusion matrix and classification report are printed. The mean
    accuracy over all folds is printed at the end and stored, rounded to two
    decimals, in ``accurecy_report`` under the estimator object itself.
    """
    print(f'using model: {modelname} '.center(100, '='), '\n')
    fold_scores = []
    for fold_no, (train_idx, test_idx) in enumerate(skf.split(dfX, dfY), start=1):
        print(f'Fold Number {fold_no} '.center(100, '='), '\n')
        modelname.fit(dfX.iloc[train_idx], dfY.iloc[train_idx])

        print('Confusion Matrix'.center(70, '-'), '\n')
        fold_pred = modelname.predict(dfX.iloc[test_idx])
        fold_truth = dfY.iloc[test_idx]
        print(confusion_matrix(fold_truth, fold_pred), '\n')

        print('Classification Report'.center(70, '-'), '\n')
        print(classification_report(fold_truth, fold_pred))
        fold_scores.append(accuracy_score(fold_truth, fold_pred))

    print('=' * 100)
    mean_percent = np.array(fold_scores).mean() * 100
    print('The average accurecy of this model is {:.02f}%'.format(mean_percent))
    accurecy_report[modelname] = round(mean_percent, 2)

In [None]:
KNN_model = KNeighborsClassifier(n_neighbors=2)
trainwith(KNN_model)

In [None]:
LR_model = LogisticRegression(solver='newton-cg')
trainwith(LR_model)

## K-nearest Neighbors Algorithm and Logistic Regression using the One hot Encoding technique and 5/10-kfold split strategy

In [None]:
# One-hot encoding for the k-fold experiments.
# The LabelEncoder cells earlier in this notebook overwrite `data` with integer
# codes. On such a frame pd.get_dummies leaves the numeric columns untouched
# and the 'p'/'e' mapping matches nothing, so the raw CSV is read again here
# (minus the dropped 'veil-type' column).
raw = pd.read_csv("mushrooms.csv").drop(columns=['veil-type'])
# dtype=int gives plain 0/1 indicator columns regardless of the pandas version
features = pd.get_dummies(raw.drop(columns=['class']), dtype=int)
target = raw['class'].map({'p': 0, 'e': 1})
print('First 5 rows of new encoded feature columns:\n',features.head())
print('First 5 rows of new encoded target class of mushroom poisonous = 0 edible = 1:\n',target.head())
dfX = features.values
dfY = target.values


In [213]:
# Mean cross-validated accuracy (in percent) of every model passed to trainwith()
accurecy_report = {}


def trainwith(modelname):
    """Cross-validate an estimator on the module-level ``dfX`` / ``dfY``.

    The folds come from the module-level ``skf`` splitter and rows are picked
    by direct indexing with the fold's index arrays. For each fold the
    estimator is refitted and its confusion matrix and classification report
    are printed. The mean accuracy over all folds is printed at the end and
    stored, rounded to two decimals, in ``accurecy_report`` under the
    estimator object itself.
    """
    print(f'using model: {modelname} '.center(100, '='), '\n')
    fold_scores = []
    for fold_no, (train_idx, test_idx) in enumerate(skf.split(dfX, dfY), start=1):
        print(f'Fold Number {fold_no} '.center(100, '='), '\n')
        modelname.fit(dfX[train_idx], dfY[train_idx])

        print('Confusion Matrix'.center(70, '-'), '\n')
        fold_pred = modelname.predict(dfX[test_idx])
        fold_truth = dfY[test_idx]
        print(confusion_matrix(fold_truth, fold_pred), '\n')

        print('Classification Report'.center(70, '-'), '\n')
        print(classification_report(fold_truth, fold_pred))
        fold_scores.append(accuracy_score(fold_truth, fold_pred))

    print('=' * 100)
    mean_percent = np.array(fold_scores).mean() * 100
    print('The average accurecy of this model is {:.02f}%'.format(mean_percent))
    accurecy_report[modelname] = round(mean_percent, 2)

In [None]:
KNN_model = KNeighborsClassifier(n_neighbors=2)
trainwith(KNN_model)

In [None]:
LR_model = LogisticRegression(solver='newton-cg')
trainwith(LR_model)