# **Case**:
Given the mushroom dataset, compare the performance of the Decision Tree and AdaBoost algorithms.

Use hyperparameter tuning to get the best parameters and accuracy. 

# **Import Libraries and Load Data**

In [28]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Read the mushroom CSV; the relative path is resolved against the current
# working directory.
df = pd.read_csv(filepath_or_buffer='../Data/mushrooms.csv')
# Preview the first 15 rows of the loaded frame.
df.head(n=15)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
5,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g
6,e,b,s,w,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,n,m
7,e,b,y,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,s,m
8,p,x,y,w,t,p,f,c,n,p,...,s,w,w,p,w,o,p,k,v,g
9,e,b,s,y,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,s,m


# **Check Data Information**

In [29]:
# Print the index range, column names, non-null counts and dtypes of `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

# **Check Data Description**

In [30]:
# Per-column summary of `df` (displayed as the cell output).
df.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


# **Check Null Column**

In [31]:
# Count missing values per column (`isna` is the same check as `isnull`).
df.isna().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

# **Feature Extraction and Label Encoding**

In [32]:
# Feature Extraction
# Features are every column except the target column 'class'.
X = df.drop(columns='class')
y = df['class']

# Label encoding for feature data.
from sklearn.preprocessing import LabelEncoder
# NOTE: in `X.apply(LabelEncoder().fit_transform)` the expression
# `LabelEncoder()` is evaluated only once, so a single encoder object is
# re-fitted on every column and no per-column mapping is kept afterwards.
# Fit one encoder per column explicitly and keep them in a dict, so each
# feature's category -> integer mapping can be inspected or inverted later
# (e.g. `feature_encoders['odor'].classes_`). The encoded values are the same.
feature_encoders = {column: LabelEncoder().fit(X[column]) for column in X.columns}
X = X.apply(lambda column: feature_encoders[column.name].transform(column))

# Label encoding for target data ('e' -> 1, 'p' -> 0).
# Presumably 'e' = edible and 'p' = poisonous -- confirm against the dataset docs.
y = y.map({'e': 1, 'p': 0})
# `Series.map` silently produces NaN for labels missing from the mapping;
# fail loudly here instead of passing NaN targets to the models.
assert y.notna().all(), "unexpected value in 'class' column; expected only 'e' or 'p'"

## **Check Feature Data**

In [33]:
# Preview the first 10 rows of the encoded feature frame.
X.head(10)

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,5,2,4,1,6,1,0,1,4,0,...,2,7,7,0,2,1,4,2,3,5
1,5,2,9,1,0,1,0,0,4,0,...,2,7,7,0,2,1,4,3,2,1
2,0,2,8,1,3,1,0,0,5,0,...,2,7,7,0,2,1,4,3,2,3
3,5,3,8,1,6,1,0,1,5,0,...,2,7,7,0,2,1,4,2,3,5
4,5,2,3,0,5,1,1,0,4,1,...,2,7,7,0,2,1,0,3,0,1
5,5,3,9,1,0,1,0,0,5,0,...,2,7,7,0,2,1,4,2,2,1
6,0,2,8,1,0,1,0,0,2,0,...,2,7,7,0,2,1,4,2,2,3
7,0,3,8,1,3,1,0,0,5,0,...,2,7,7,0,2,1,4,3,3,3
8,5,3,8,1,6,1,0,1,7,0,...,2,7,7,0,2,1,4,2,4,1
9,0,2,9,1,0,1,0,0,2,0,...,2,7,7,0,2,1,4,2,3,3


## **Check Target Data**

In [34]:
# Preview the first 10 encoded target values.
y.head(10)

0    0
1    1
2    1
3    0
4    1
5    1
6    1
7    1
8    0
9    1
Name: class, dtype: int64

# **Split Data**

In [35]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# **Train Model**

## **Decision Tree**

## **Function to Check Mean Accuracy and STD**

In [36]:
from sklearn.model_selection import cross_val_score
def check_model(model, X_train, y_train, cv=5, std_threshold=0.1, min_mean_score=0.7):
    """Cross-validate ``model`` and print a short summary of the fold scores.

    Parameters
    ----------
    model
        Estimator handed unchanged to ``cross_val_score``.
    X_train, y_train
        Training features and target handed to ``cross_val_score``.
    cv
        Forwarded to ``cross_val_score`` as ``cv`` (default 5, the value that
        used to be hard-coded).
    std_threshold
        Standard deviation of the fold scores above which the performance is
        reported as inconsistent (default 0.1).
    min_mean_score
        Mean fold score below which the model is reported as performing
        poorly (default 0.7). Only checked when the standard deviation is
        within ``std_threshold``.

    Returns
    -------
    None
        The summary is printed, nothing is returned.
    """
    scores = cross_val_score(model, X_train, y_train, cv=cv)

    # Summary statistics over the fold scores (np.std uses its default ddof=0).
    mean_score = np.mean(scores)
    std_dev = np.std(scores)

    print(f'Mean Cross-Validation Score: {mean_score:.4f}')
    print(f'Standard Deviation: {std_dev:.4f}')

    # Interpretation: the spread is checked first, then the mean.
    print('Interpretation:')
    if std_dev > std_threshold:
        print("The model's performance is inconsistent across folds.")
    elif mean_score < min_mean_score:
        print("The model is performing poorly.")
    else:
        print("The model appears to generalize well.")


## **Hyperparameter Tuning**

In [37]:
# Hyperparameters are passed when the estimator is instantiated; GridSearchCV
# tries every combination in `param_grid` with 5-fold cross-validation on the
# training split and keeps the best-scoring one.
from sklearn.model_selection import GridSearchCV

# Base estimator for the search. `random_state` is fixed (same seed as the
# train/test split) because the tree permutes the features at every split, so
# without a seed the search result can change between runs.
dt = DecisionTreeClassifier(random_state=42)

# define the grid of hyperparameters (2 * 6 * 5 * 5 = 300 candidates)
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, 15, 20, 25, None],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 5, 10, 15]
}

# perform GridSearch, scored by accuracy, using all available cores
dt_grid = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

# fit the model
dt_grid.fit(X_train, y_train)

# print best hyperparameters
print(f'Best hyperparameters: {dt_grid.best_params_}')

# print the mean cross-validated accuracy of the best candidate
print(f'\nBest model accuracy: {dt_grid.best_score_}')

Fitting 5 folds for each of 300 candidates, totalling 1500 fits
Best hyperparameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2}

Best model accuracy: 1.0


## **Model Evaluation**

In [38]:
# GridSearchCV (default refit=True) has already refit the best parameter
# combination on the whole training split, so `best_estimator_` is used as-is;
# fitting it again would only repeat that work.
dt = dt_grid.best_estimator_

# predict train set
y_pred_train = dt.predict(X_train)

# predict test set
y_pred_dt = dt.predict(X_test)

# calculate train and test data accuracy score
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_dt)

# cross-validation summary of the selected model on the training split
check_model(dt, X_train, y_train)

# print train and test data accuracy score
print(f"\nAccuracy on train set: {acc_train * 100:.2f}%")
print(f"Accuracy on test set: {acc_test * 100:.2f}%")

Mean Cross-Validation Score: 1.0000
Standard Deviation: 0.0000
Interpretation:
The model appears to generalize well.

Accuracy on train set: 100.00%
Accuracy on test set: 100.00%


## **Adaboost**

## **Hyperparameter Tuning**

In [39]:
# Base AdaBoostClassifier for the search. `random_state` is fixed (same seed
# as the train/test split) so that the search is reproducible; per the
# scikit-learn docs it seeds each base estimator at each boosting iteration.
ada = AdaBoostClassifier(algorithm='SAMME', random_state=42)

# define the grid of hyperparameters (3 * 7 * 5 = 105 candidates);
# the base learner is a decision tree of depth 1, 2 or 3
param_grid = {
    'estimator': [DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=2), DecisionTreeClassifier(max_depth=3)],
    'n_estimators': [5, 10, 25, 50, 100, 200, 400],
    'learning_rate': [0.001, 0.01, 0.1, 1.0, 10]
}

# perform GridSearch, scored by accuracy, using all available cores
ada_grid = GridSearchCV(estimator=ada, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

# fit the model
ada_grid.fit(X_train, y_train)

# print best hyperparameters
print(f'Best hyperparameters: {ada_grid.best_params_}')

# print the mean cross-validated accuracy of the best candidate
print(f'\nBest model accuracy: {ada_grid.best_score_}')

Fitting 5 folds for each of 105 candidates, totalling 525 fits
Best hyperparameters: {'estimator': DecisionTreeClassifier(max_depth=1), 'learning_rate': 1.0, 'n_estimators': 200}

Best model accuracy: 1.0


## **Model Evaluation**

In [40]:
# GridSearchCV (default refit=True) has already refit the best parameter
# combination on the whole training split, so `best_estimator_` is used as-is;
# fitting it again would only repeat that work.
ada = ada_grid.best_estimator_

# predict train set
y_pred_train = ada.predict(X_train)

# predict test set
y_pred_ada = ada.predict(X_test)

# calculate train and test data accuracy score
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_ada)

# cross-validation summary of the selected model on the training split
check_model(ada, X_train, y_train)

# print train and test data accuracy score
print(f"\nAccuracy on train set: {acc_train * 100:.2f}%")
print(f"Accuracy on test set: {acc_test * 100:.2f}%")

Mean Cross-Validation Score: 1.0000
Standard Deviation: 0.0000
Interpretation:
The model appears to generalize well.

Accuracy on train set: 100.00%
Accuracy on test set: 100.00%
