In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import warnings

data = pd.read_csv("mushroom.csv")

In [2]:
data

Unnamed: 0.1,Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,poisonous
0,0,convex,smooth,brown,bruises,pungent,free,close,narrow,black,...,white,white,partial,white,one,pendant,black,scattered,urban,p
1,1,convex,smooth,yellow,bruises,almond,free,close,broad,black,...,white,white,partial,white,one,pendant,brown,numerous,grasses,e
2,2,bell,smooth,white,bruises,anise,free,close,broad,brown,...,white,white,partial,white,one,pendant,brown,numerous,meadows,e
3,3,convex,scaly,white,bruises,pungent,free,close,narrow,brown,...,white,white,partial,white,one,pendant,black,scattered,urban,p
4,4,convex,smooth,gray,no,none,free,crowded,broad,black,...,white,white,partial,white,one,evanescent,brown,abundant,grasses,e
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,8119,knobbed,smooth,brown,no,none,attached,close,broad,yellow,...,orange,orange,partial,orange,one,pendant,buff,clustered,leaves,e
8120,8120,convex,smooth,brown,no,none,attached,close,broad,yellow,...,orange,orange,partial,brown,one,pendant,buff,several,leaves,e
8121,8121,flat,smooth,brown,no,none,attached,close,broad,brown,...,orange,orange,partial,orange,one,pendant,buff,clustered,leaves,e
8122,8122,knobbed,scaly,brown,no,fishy,free,close,narrow,buff,...,white,white,partial,white,one,evanescent,white,several,leaves,p


In [3]:
data.drop(['Unnamed: 0'],axis=1,inplace=True)

In [4]:
data['stalk-root'].fillna('unknown', inplace=True)
data.isnull().sum()

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
poisonous                   0
dtype: int64

In [5]:
data['cap-shape_surface'] = data['cap-shape'] + '_' + data['cap-surface']  # Cap shape and surface texture are key morphological features for species identification
data['cap-color_odor'] = data['cap-color'] + '_' + data['odor']  # Certain cap colors and odors are strong indicators of toxicity
data['odor_bruises'] = data['odor'] + '_' + data['bruises'].astype(str)  # Bruising combined with odor often signals poisonous mushrooms
data['gill-color_spore-print'] = data['gill-color'] + '_' + data['spore-print-color']  # Gill and spore print colors are critical for differentiating species
data['ring-number_type'] = data['ring-number'] + '_' + data['ring-type']  # Ring number and type help distinguish between mushroom species
data['population_habitat'] = data['population'] + '_' + data['habitat']  # Population density in specific habitats helps identify species
data['odor_habitat'] = data['odor'] + '_' + data['habitat']  # Odors common to certain habitats can indicate mushroom edibility
data['cap-shape_surface_color'] = data['cap-shape'] + '_' + data['cap-surface'] + '_' + data['cap-color']  # A combination of cap features improves species classification
data['gill-size_spacing'] = data['gill-size'] + '_' + data['gill-spacing']  # Gill size and spacing are important biological traits for classification

**These interactions may capture relationships between categorical attributes that aren't obvious when looked at individually**

**Encode ordinal features using LabelEncoder to convert categorical values into ordinal integers, preserving the inherent order of 'gill-size', 'gill-spacing', and 'ring-number'**

In [6]:
ordinal_features = ['gill-size', 'gill-spacing', 'ring-number']
label_encoder = LabelEncoder()
for feature in ordinal_features:
    data[feature] = label_encoder.fit_transform(data[feature])

**Apply one-hot encoding to nominal features to convert categorical variables into binary indicators, facilitating model training by removing the first category to avoid multicollinearity**


In [7]:
nominal_features = ['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 
                    'gill-attachment', 'gill-color', 'stalk-shape', 'stalk-root', 
                    'stalk-surface-above-ring', 'stalk-surface-below-ring', 
                    'stalk-color-above-ring', 'stalk-color-below-ring', 
                    'veil-type', 'veil-color', 'ring-type', 'spore-print-color', 
                    'population', 'habitat']
data = pd.get_dummies(data, columns=nominal_features, drop_first=True,dtype=int)

**Apply one-hot encoding to interaction features to convert combined categorical variables into binary indicators,enhancing the model's ability to capture relationships between feature interactions while avoiding multicollinearity by dropping the first category**

In [8]:
interaction_features = ['cap-shape_surface', 'cap-color_odor', 'odor_bruises', 
                        'gill-color_spore-print', 'ring-number_type', 
                        'population_habitat', 'odor_habitat', 
                        'cap-shape_surface_color', 'gill-size_spacing']
data = pd.get_dummies(data, columns=interaction_features, drop_first=True,dtype=int)

**Map the 'poisonous' target column from categorical labels ('e' for edible, 'p' for poisonous) to binary values (0 for edible, 1 for poisonous) for numerical representation in the model**


In [9]:
data['poisonous'] = data['poisonous'].map({'e': 0, 'p': 1})

In [10]:
data

Unnamed: 0,gill-spacing,gill-size,ring-number,poisonous,cap-shape_conical,cap-shape_convex,cap-shape_flat,cap-shape_knobbed,cap-shape_sunken,cap-surface_grooves,...,cap-shape_surface_color_knobbed_smooth_buff,cap-shape_surface_color_knobbed_smooth_gray,cap-shape_surface_color_knobbed_smooth_pink,cap-shape_surface_color_knobbed_smooth_red,cap-shape_surface_color_knobbed_smooth_white,cap-shape_surface_color_sunken_fibrous_brown,cap-shape_surface_color_sunken_fibrous_gray,gill-size_spacing_broad_crowded,gill-size_spacing_narrow_close,gill-size_spacing_narrow_crowded
0,0,1,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8120,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8121,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8122,0,1,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


**Separate features (X) and target variable (y) for model training**

In [11]:
X = data.drop(columns=['poisonous'])  # Features
y = data['poisonous']  # Target variable

**Split the dataset into training and testing sets (80% train, 20% test) for model evaluation**

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Display the shapes of training and testing datasets to confirm the split sizes**

In [13]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(6499, 342)
(6499,)
(1625, 342)
(1625,)


In [14]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

y_pred_log_reg = log_reg.predict(X_test)
print("Logistic Regression Report:")
print(classification_report(y_test, y_pred_log_reg))

Logistic Regression Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       843
           1       1.00      1.00      1.00       782

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



**Logistic Regression Implementation for Mushroom Classification: This code initializes a Logistic Regression model, fits it to the training dataset, and evaluates its performance on the test dataset by predicting whether mushrooms are 'poisonous' or 'edible'. The classification report shows perfect precision, recall, and f1-score of 1.00 for both classes, indicating excellent model performance with an overall accuracy of 100%.**


In [15]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

y_pred_knn = knn.predict(X_test)
print("K Nearest Neighbors Report:")
print(classification_report(y_test, y_pred_knn))

K Nearest Neighbors Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       843
           1       1.00      1.00      1.00       782

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



**K Nearest Neighbors Implementation for Mushroom Classification: This code initializes a KNN model, fits it to the training dataset, and evaluates its performance on the test dataset by predicting whether mushrooms are 'poisonous' or 'edible'. The classification report shows perfect precision, recall, and f1-score of 1.00 for both classes, indicating excellent model performance with an overall accuracy of 100%.**


In [16]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
print("Random Forest Report:")
print(classification_report(y_test, y_pred_rf))

Random Forest Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       843
           1       1.00      1.00      1.00       782

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



**Random Forest Implementation for Mushroom Classification: This code initializes a Random Forest model, fits it to the training dataset, and evaluates its performance on the test dataset by predicting whether mushrooms are 'poisonous' or 'edible'. The classification report indicates perfect precision, recall, and f1-score of 1.00 for both classes, demonstrating excellent model performance with an overall accuracy of 100%.**


In [17]:
svc = SVC()
svc.fit(X_train, y_train)

y_pred_svc = svc.predict(X_test)
print("Support Vector Classifier Report:")
print(classification_report(y_test, y_pred_svc))

Support Vector Classifier Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       843
           1       1.00      1.00      1.00       782

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



**Support Vector Classifier Implementation for Mushroom Classification: This code initializes a Support Vector Classifier model, fits it to the training dataset, and evaluates its performance on the test dataset by predicting whether mushrooms are 'poisonous' or 'edible'. The classification report shows perfect precision, recall, and f1-score of 1.00 for both classes, indicating outstanding model performance with an overall accuracy of 100%.**


In [18]:
log_reg_param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'solver': ['liblinear', 'saga'],'max_iter': [500]}
knn_param_grid = {'n_neighbors': range(1, 31), 'weights': ['uniform', 'distance']}
rf_param_grid = {'n_estimators': [10, 50, 100], 'max_depth': [None, 10, 20, 30]}
svc_param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}

**Hyperparameter Grids for Model Optimization: The `log_reg_param_grid` specifies the hyperparameters for Logistic Regression, including the inverse regularization strength `C`, solver options, and maximum iterations for convergence. The `knn_param_grid` defines the range of neighbors and weight functions for K-Nearest Neighbors. The `rf_param_grid` outlines the number of trees (`n_estimators`) and their maximum depth for Random Forest. Lastly, the `svc_param_grid` delineates the regularization parameter `C` and kernel types for Support Vector Classifier. These grids will be utilized for hyperparameter tuning using GridSearchCV to identify the optimal settings for each classification model.**


In [19]:
def perform_grid_search(model, param_grid, X_train, y_train):
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

**Grid Search Function for Model Optimization: The `perform_grid_search` function takes a machine learning model, a parameter grid, and training data as input. It utilizes `GridSearchCV` to perform cross-validated grid search over the specified parameter grid (`param_grid`) with 5-fold cross-validation to evaluate the accuracy of the model. After fitting the model to the training data (`X_train` and `y_train`), it returns the best estimator found during the search, which corresponds to the optimal hyperparameter configuration. This function is crucial for enhancing model performance by systematically exploring the hyperparameter space.**


In [20]:
warnings.filterwarnings("ignore", category=UserWarning, module='sklearn.linear_model')
best_log_reg = perform_grid_search(LogisticRegression(), log_reg_param_grid, X_train, y_train)
best_knn = perform_grid_search(KNeighborsClassifier(), knn_param_grid, X_train, y_train)
best_rf = perform_grid_search(RandomForestClassifier(), rf_param_grid, X_train, y_train)
best_svc = perform_grid_search(SVC(), svc_param_grid, X_train, y_train)

**Hyperparameter Optimization with Grid Search: The following code first suppresses user warnings related to the logistic regression model to maintain a clean output. Then, it calls the `perform_grid_search` function for each of the specified classification models (Logistic Regression, K Nearest Neighbors, Random Forest, and Support Vector Classifier) using their respective parameter grids (`log_reg_param_grid`, `knn_param_grid`, `rf_param_grid`, and `svc_param_grid`). Each model is fitted to the training data (`X_train`, `y_train`) to identify the best hyperparameter configuration, which is stored in `best_log_reg`, `best_knn`, `best_rf`, and `best_svc` variables. This step is essential for optimizing the performance of the models before final evaluation.**


In [21]:
print("Best Logistic Regression:")
print(best_log_reg)

print("Best K Nearest Neighbors:")
print(best_knn)

print("Best Random Forest:")
print(best_rf)

print("Best Support Vector Classifier:")
print(best_svc)

Best Logistic Regression:
LogisticRegression(C=1, max_iter=500, solver='liblinear')
Best K Nearest Neighbors:
KNeighborsClassifier(n_neighbors=1)
Best Random Forest:
RandomForestClassifier(n_estimators=10)
Best Support Vector Classifier:
SVC(C=0.1, kernel='linear')


**Display of Best Hyperparameters: The following code prints the best hyperparameter configurations obtained from the grid search for each classification model (Logistic Regression, K Nearest Neighbors, Random Forest, and Support Vector Classifier). This output reveals the optimal settings used for each model, which were determined based on cross-validation accuracy during the grid search process. The displayed configurations include the values of hyperparameters that yield the best performance on the training data.**


# **The implemented classification models successfully distinguished between 'poisonous' and 'edible' mushrooms, achieving perfect accuracy across all evaluated models. Through hyperparameter tuning via GridSearchCV, the optimal configurations for each model were identified, enhancing their predictive performance on the dataset.**