In [50]:
# =====================================================
# IMPORT LIBRARIES
# =====================================================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    matthews_corrcoef
)

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb


In [51]:
from google.colab import files
uploaded = files.upload()

Saving mushrooms.csv to mushrooms (2).csv


In [52]:
# =====================================================
# LOAD DATA
# =====================================================

df = pd.read_csv("mushrooms.csv")

print("Dataset shape:", df.shape)
df.head()


Dataset shape: (8124, 23)


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [53]:
# =====================================================
# ENCODE CATEGORICAL FEATURES
# =====================================================

X = df.drop("class", axis=1)
y = df["class"]

# Encode all categorical columns
encoders = {}

for column in X.columns:
    encoder = LabelEncoder()
    X[column] = encoder.fit_transform(X[column])
    encoders[column] = encoder

# Encode target
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(y)

print("Feature size:", X.shape[1])
print("Total instances:", X.shape[0])


Feature size: 22
Total instances: 8124


In [54]:
# =====================================================
# SPLIT DATA
# =====================================================

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=7,
    stratify=y
)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


Train size: (5686, 22)
Test size: (2438, 22)


In [55]:
# =====================================================
# MODEL EVALUATION FUNCTION
# =====================================================

def compute_metrics(model, X_tr, X_te, y_tr, y_te):

    model.fit(X_tr, y_tr)
    preds = model.predict(X_te)

    # AUC handling
    if hasattr(model, "predict_proba"):
        probas = model.predict_proba(X_te)

        if len(np.unique(y_te)) == 2:
            auc = roc_auc_score(y_te, probas[:, 1])
        else:
            auc = roc_auc_score(y_te, probas, multi_class="ovr")
    else:
        auc = np.nan

    scores = {
        "Accuracy": accuracy_score(y_te, preds),
        "Precision": precision_score(y_te, preds, average="weighted"),
        "Recall": recall_score(y_te, preds, average="weighted"),
        "F1 Score": f1_score(y_te, preds, average="weighted"),
        "AUC Score": auc,
        "MCC Score": matthews_corrcoef(y_te, preds)
    }

    return scores


In [56]:
# =====================================================
# INITIALIZE MODELS
# =====================================================

model_collection = {
    "Logistic Regression": LogisticRegression(max_iter=1200),
    "Decision Tree": DecisionTreeClassifier(max_depth=None),
    "KNN": KNeighborsClassifier(n_neighbors=7),
    "Gaussian Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=250),
    "XGBoost": xgb.XGBClassifier(
        use_label_encoder=False,
        eval_metric="logloss"
    )
}


In [57]:
# =====================================================
# TRAIN & EVALUATE
# =====================================================

performance_summary = {}

for model_name, model_object in model_collection.items():
    metrics_result = compute_metrics(
        model_object,
        X_train,
        X_test,
        y_train,
        y_test
    )
    performance_summary[model_name] = metrics_result

results_table = pd.DataFrame(performance_summary).T
results_table.round(4)



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,AUC Score,MCC Score
Logistic Regression,0.9491,0.9493,0.9491,0.9491,0.9816,0.8982
Decision Tree,0.9984,0.9984,0.9984,0.9984,0.9983,0.9967
KNN,0.9951,0.9951,0.9951,0.9951,1.0,0.9902
Gaussian Naive Bayes,0.9282,0.9282,0.9282,0.9282,0.9543,0.8562
Random Forest,1.0,1.0,1.0,1.0,1.0,1.0
XGBoost,0.9984,0.9984,0.9984,0.9984,1.0,0.9967


In [58]:
# =====================================================
# SORT RESULTS
# =====================================================

results_table.sort_values(by="Accuracy", ascending=False).round(4)



Unnamed: 0,Accuracy,Precision,Recall,F1 Score,AUC Score,MCC Score
Random Forest,1.0,1.0,1.0,1.0,1.0,1.0
Decision Tree,0.9984,0.9984,0.9984,0.9984,0.9983,0.9967
XGBoost,0.9984,0.9984,0.9984,0.9984,1.0,0.9967
KNN,0.9951,0.9951,0.9951,0.9951,1.0,0.9902
Logistic Regression,0.9491,0.9493,0.9491,0.9491,0.9816,0.8982
Gaussian Naive Bayes,0.9282,0.9282,0.9282,0.9282,0.9543,0.8562
