In [None]:
!pip install ucimlrepo



## 1. Imports

In [None]:
import pandas as pd
import numpy as np

from ucimlrepo import fetch_ucirepo

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC

## 2. Load Dataset

In [None]:
# Fetch the Mushroom dataset
mushroom = fetch_ucirepo(id=73)

# Features and target
X = mushroom.data.features
y = mushroom.data.targets

print("Dataset Loaded")
print("Shape:", X.shape)
print("\nClass Distribution (Proportion):")
print(y.value_counts(normalize=True))

print("\nMetadata:")
print(mushroom.metadata)

print("\nVariables:")
print(mushroom.variables)



Dataset Loaded
Shape: (8124, 22)

Class Distribution (Proportion):
poisonous
e            0.517971
p            0.482029
Name: proportion, dtype: float64

Metadata:
{'uci_id': 73, 'name': 'Mushroom', 'repository_url': 'https://archive.ics.uci.edu/dataset/73/mushroom', 'data_url': 'https://archive.ics.uci.edu/static/public/73/data.csv', 'abstract': 'From Audobon Society Field Guide; mushrooms described in terms of physical characteristics; classification: poisonous or edible', 'area': 'Biology', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 8124, 'num_features': 22, 'feature_types': ['Categorical'], 'demographics': [], 'target_col': ['poisonous'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1981, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5959T', 'creators': [], 'intro_paper': None, 'additional_info': {'summary': "This data set includes descriptions of hypothetical samp

## 3. Explore Data

In [None]:
print("\nFirst 5 Rows:")
display(X.head())

print("\nFeature Names:")
print(list(X.columns))

print("\nMissing Values Per Feature:")
print(X.isna().sum())



First 5 Rows:


Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,s,n,t,p,f,c,n,k,e,...,s,w,w,p,w,o,p,k,s,u
1,x,s,y,t,a,f,c,b,k,e,...,s,w,w,p,w,o,p,n,n,g
2,b,s,w,t,l,f,c,b,n,e,...,s,w,w,p,w,o,p,n,n,m
3,x,y,w,t,p,f,c,n,n,e,...,s,w,w,p,w,o,p,k,s,u
4,x,s,g,f,n,f,w,b,k,t,...,s,w,w,p,w,o,e,n,a,g



Feature Names:
['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']

Missing Values Per Feature:
cap-shape                      0
cap-surface                    0
cap-color                      0
bruises                        0
odor                           0
gill-attachment                0
gill-spacing                   0
gill-size                      0
gill-color                     0
stalk-shape                    0
stalk-root                  2480
stalk-surface-above-ring       0
stalk-surface-below-ring       0
stalk-color-above-ring         0
stalk-color-below-ring         0
veil-type                      0
veil-color                     0
ring-number                    0


## 4. Handle Missing Values
- Treat 'stalk-root' missing values as a separate category

In [None]:
# Only 'stalk-root' has missing values
X['stalk-root'] = X['stalk-root'].fillna('missing')

print("\nAfter Handling Missing Values:")
print(X['stalk-root'].value_counts())



After Handling Missing Values:
stalk-root
b          3776
missing    2480
e          1120
c           556
r           192
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['stalk-root'] = X['stalk-root'].fillna('missing')


## 5. Train/Validation/Test Split
- 70% train, 15% validation, 15% test  
- Stratified splitting  
- Fixed random seed = 42

In [None]:
# First split: Train (70%) + Temp (30%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# Second split: Validation (15%) + Test (15%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

print("\nSplit Complete")
print("Train Size:", X_train.shape)
print("Validation Size:", X_val.shape)
print("Test Size:", X_test.shape)



Split Complete
Train Size: (5686, 22)
Validation Size: (1219, 22)
Test Size: (1219, 22)


## 6. Preprocessing Pipeline
- One-Hot Encoding for all categorical features  
- StandardScaler (with_mean=False) for consistency  
- Fit on training set and transform validation/test sets  
- Convert sparse matrices to dense for SVM

In [None]:
categorical_features = X.columns.tolist()

# ColumnTransformer
preprocess = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)]
)

# Full pipeline (OHE + StandardScaler)
full_pipeline = Pipeline([
    ('preprocess', preprocess),
    ('scaler', StandardScaler(with_mean=False))  # works with sparse
])

# Fit on training data
X_train_prep = full_pipeline.fit_transform(X_train)
X_val_prep = full_pipeline.transform(X_val)
X_test_prep = full_pipeline.transform(X_test)

# For SVM:
X_train_dense = X_train_prep.toarray()
X_val_dense = X_val_prep.toarray()
X_test_dense = X_test_prep.toarray()


## 7. Baseline Models
- Majority-class baseline  
- Random baseline  
- Shuffle-label baseline (using Decision Tree)

In [None]:
def evaluate_baseline(model, X_train, y_train, X_test, y_test, name):
    model.fit(X_train, y_train.values.ravel())
    y_pred = model.predict(X_test)
    print(f"\n{name} Results")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred, zero_division=1))

# Majority baseline
majority = DummyClassifier(strategy="most_frequent")
evaluate_baseline(majority, X_train_prep, y_train, X_test_prep, y_test, "Majority Baseline")

# Random baseline
random = DummyClassifier(strategy="stratified", random_state=42)
evaluate_baseline(random, X_train_prep, y_train, X_test_prep, y_test, "Random Baseline")

# Shuffle-label baseline
y_train_shuffled = y_train.sample(frac=1, random_state=42).reset_index(drop=True)
shuffle_tree = DecisionTreeClassifier(class_weight='balanced', random_state=42)
evaluate_baseline(shuffle_tree, X_train_prep, y_train_shuffled, X_test_prep, y_test, "Shuffle-Label Baseline")



Majority Baseline Results
Accuracy: 0.5184577522559475
[[632   0]
 [587   0]]
              precision    recall  f1-score   support

           e       0.52      1.00      0.68       632
           p       1.00      0.00      0.00       587

    accuracy                           0.52      1219
   macro avg       0.76      0.50      0.34      1219
weighted avg       0.75      0.52      0.35      1219


Random Baseline Results
Accuracy: 0.5020508613617719
[[322 310]
 [297 290]]
              precision    recall  f1-score   support

           e       0.52      0.51      0.51       632
           p       0.48      0.49      0.49       587

    accuracy                           0.50      1219
   macro avg       0.50      0.50      0.50      1219
weighted avg       0.50      0.50      0.50      1219


Shuffle-Label Baseline Results
Accuracy: 0.4856439704675964
[[317 315]
 [312 275]]
              precision    recall  f1-score   support

           e       0.50      0.50      0.50       6

## 8. SVM Model (Final Model)
- Initialize SVM with class_weight='balanced'  
- Fit on training data  
- Predict and evaluate on test set  
- Compute accuracy, confusion matrix, classification report

In [None]:
# SVM with class_weight='balanced'
svm_model = SVC(class_weight='balanced', random_state=42)

# Fit model
svm_model.fit(X_train_dense, y_train.values.ravel())

# Predict
y_pred_svm = svm_model.predict(X_test_dense)

# Evaluate
print("\nSVM Results")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print(confusion_matrix(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm, zero_division=1))



SVM Results
Accuracy: 0.9991796554552912
[[632   0]
 [  1 586]]
              precision    recall  f1-score   support

           e       1.00      1.00      1.00       632
           p       1.00      1.00      1.00       587

    accuracy                           1.00      1219
   macro avg       1.00      1.00      1.00      1219
weighted avg       1.00      1.00      1.00      1219



## 9. Cross-Validation
- 5-fold Stratified CV on training set  
- Use poisonous-class F1 as scoring metric  
- Report individual fold scores and mean F1

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer, f1_score

# Use F1-score for poisonous class as main metric
poison_f1 = make_scorer(f1_score, pos_label='p')

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(svm_model, X_train_dense, y_train.values.ravel(), cv=cv, scoring=poison_f1)

print("5-Fold CV Poisonous F1 scores:", cv_scores)
print("Mean Poisonous F1:", cv_scores.mean())


5-Fold CV Poisonous F1 scores: [1.         1.         0.99908676 1.         0.99908676]
Mean Poisonous F1: 0.9996347031963471


## Summary
We applied a consistent preprocessing pipeline to ensure fair and reproducible evaluation. The mushroom dataset was split using a stratified 70/15/15 train/validation/test split with a fixed random seed (42). Missing values occurred only in the “stalk-root” feature, which were treated as a valid “missing” category rather than imputed. All 22 input features are categorical, so we applied One-Hot Encoding (handle_unknown="ignore") fitted exclusively on the training set, followed by standardization of the resulting one-hot vectors using StandardScaler to maintain consistency across models. Because recall for poisonous mushrooms is safety-critical, we used class_weight="balanced" in the SVM classifier. Model performance was evaluated using 5-fold cross-validation on the training set, with poisonous-class F1 as the primary metric. Baseline comparisons included the majority-class classifier (≈52% accuracy), the analytical random baseline (≈50%), and a shuffle-label baseline, where labels were randomly permuted and the full pipeline retrained to confirm no data leakage. The final SVM model achieved near-perfect poisonous recall and F1, confirming its suitability as the chosen classifier.