In [1]:
# some useful mysklearn package import statements and reloads
import importlib

import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable 

import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyDecisionTreeClassifier

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

# Mushroom Classification

## Dataset
- Size: 500 instances
- Features: 9 categorical features (simplified to at most 3 categories each)
- Target: label (edible or poisonous)

### Features:
1. cap-color: gray, brown, other
2. odor: foul, none, other
3. stalk-surface-above-ring: smooth, silky, other
4. stalk-surface-below-ring: smooth, silky, other
5. stalk-color-above-ring: pink, white, other
6. stalk-color-below-ring: pink, white, other
7. ring-type: pendant, evanescent, other
8. population: several, other
9. habitat: wood, grass, other

In [2]:
# Load the mushroom dataset
mushroom_table = MyPyTable()
mushroom_table.load_from_file('input_data/mushroom_reduced.csv')

print(f"Dataset loaded: {len(mushroom_table.data)} instances")
print(f"Columns: {mushroom_table.column_names}")

Dataset loaded: 500 instances
Columns: ['cap-color', 'odor', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'ring-type', 'population', 'habitat', 'label']


In [3]:
# Helper function to run 10-fold cross-validation and display results
def evaluate_feature_subset(X, y, subset_name, feature_names):
    """Run 10-fold cross-validation and compute metrics for feature subset."""
    print("\n" + "="*80)
    print(f"{subset_name}")
    print(f"Features: {', '.join(feature_names)}")
    print("="*80)
    
    # Perform 10-fold cross-validation
    folds = myevaluation.kfold_split(X, n_splits=10)
    
    all_y_true = []
    all_y_pred = []
    
    for train_indices, test_indices in folds:
        # Split data
        X_train = [X[i] for i in train_indices]
        y_train = [y[i] for i in train_indices]
        X_test = [X[i] for i in test_indices]
        y_test = [y[i] for i in test_indices]
        
        # Train and predict
        tree = MyDecisionTreeClassifier()
        tree.fit(X_train, y_train)
        y_pred = tree.predict(X_test)
        
        all_y_true.extend(y_test)
        all_y_pred.extend(y_pred)
    
    # Compute metrics
    accuracy = myevaluation.accuracy_score(all_y_true, all_y_pred)
    error_rate = 1 - accuracy
    
   
    labels = sorted(list(set(all_y_true)))
    print(f"Accuracy:    {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"Error Rate:  {error_rate:.4f} ({error_rate*100:.2f}%)")
    
    # Per class metrics
    print(f"\nPer-Class Performance Metrics:")
    print(f"{'Class':<15} {'Precision':>12} {'Recall':>12} {'F1-Score':>12}")
    print("-" * 53)
    
    for label in labels:
        precision = myevaluation.binary_precision_score(all_y_true, all_y_pred, label)
        recall = myevaluation.binary_recall_score(all_y_true, all_y_pred, label)
        f1 = myevaluation.binary_f1_score(all_y_true, all_y_pred, label)
        print(f"{label:<15} {precision:>12.4f} {recall:>12.4f} {f1:>12.4f}")
    
    # Confusion
    conf_matrix = myevaluation.confusion_matrix(all_y_true, all_y_pred, labels)
    myutils.print_confusion_matrix(conf_matrix, labels, "\nConfusion Matrix")
    
    return accuracy, tree

## Step 1: Using Only the Odor Feature

This is a baseline set of results showing how accurately the model can predict whether a mushroom is edible or poisonous using only the odor feature.

In [5]:
odor_idx = mushroom_table.column_names.index('odor')
label_idx = mushroom_table.column_names.index('label')

X_odor = [[row[odor_idx]] for row in mushroom_table.data]
y = [row[label_idx] for row in mushroom_table.data]

acc_odor, tree_odor = evaluate_feature_subset(X_odor, y, "Odor Only", ['odor'])


Odor Only
Features: odor
Accuracy:    0.8460 (84.60%)
Error Rate:  0.1540 (15.40%)

Per-Class Performance Metrics:
Class              Precision       Recall     F1-Score
-----------------------------------------------------
edible                0.0000       0.0000       0.0000
poisonous             0.0000       0.0000       0.0000


Confusion Matrix:
----------------------------------------
                   edible poisonous | Total
----------------------------------------
         edible       191        63 | 254
      poisonous        14       232 | 246
----------------------------------------
          Total       205       295 | 500


## Step 2: Multiple Feature Subsets

Testing 4 different feature subsets to find the best combination for predicting mushroom edibility.

In [6]:
# feature subsets
feature_subsets = [
    {
        'name': 'Subset 1: Odor and Cap Color',
        'features': ['odor', 'cap-color']
    },
    {
        'name': 'Subset 2: Odor and Stalk Features',
        'features': ['odor', 'stalk-surface-above-ring', 'stalk-color-above-ring']
    },
    {
        'name': 'Subset 3: Odor and Ring and Habitat',
        'features': ['odor', 'habitat']
    },
    {
        'name': 'Subset 4: Comprehensive (5 features)',
        'features': ['odor', 'cap-color', 'stalk-color-below-ring', 'population', 'habitat']
    }
]

# Store results to compare
results = [{'name': 'Odor Only', 'accuracy': acc_odor, 'features': ['odor']}]

for subset in feature_subsets:
    # Get feature indices
    feature_indices = [mushroom_table.column_names.index(f) for f in subset['features']]
    
    # get features
    X_subset = [[row[idx] for idx in feature_indices] for row in mushroom_table.data]
    
    # Evaluate
    acc, tree = evaluate_feature_subset(X_subset, y, subset['name'], subset['features'])
    results.append({'name': subset['name'], 'accuracy': acc, 'features': subset['features'], 'tree': tree})


Subset 1: Odor and Cap Color
Features: odor, cap-color
Accuracy:    0.8460 (84.60%)
Error Rate:  0.1540 (15.40%)

Per-Class Performance Metrics:
Class              Precision       Recall     F1-Score
-----------------------------------------------------
edible                0.0000       0.0000       0.0000
poisonous             0.0000       0.0000       0.0000


Confusion Matrix:
----------------------------------------
                   edible poisonous | Total
----------------------------------------
         edible       191        63 | 254
      poisonous        14       232 | 246
----------------------------------------
          Total       205       295 | 500

Subset 2: Odor and Stalk Features
Features: odor, stalk-surface-above-ring, stalk-color-above-ring
Accuracy:    0.8340 (83.40%)
Error Rate:  0.1660 (16.60%)

Per-Class Performance Metrics:
Class              Precision       Recall     F1-Score
-----------------------------------------------------
edible                0

In [7]:
# Summary comparison
print("\n" + "="*80)
print("SUMMARY")
print("="*80)
print()
print(f"{'Subset':<45} {'# Features':>10} {'Accuracy':>12} {'Error Rate':>12}")
print("-"*80)

for result in results:
    error_rate = 1 - result['accuracy']
    print(f"{result['name']:<45} {len(result['features']):>10} "
          f"{result['accuracy']:>12.4f} {error_rate:>12.4f}")

# Find best subset
best_result = max(results, key=lambda x: x['accuracy'])
print("\n" + "="*80)
print("BEST SUBSET")
print("="*80)
print(f"Name:      {best_result['name']}")
print(f"Features:  {', '.join(best_result['features'])}")
print(f"Accuracy:  {best_result['accuracy']:.4f} ({best_result['accuracy']*100:.2f}%)")
print(f"Error:     {(1-best_result['accuracy']):.4f} ({(1-best_result['accuracy'])*100:.2f}%)")
print("="*80)

# Show improvement over baseline
baseline_acc = results[0]['accuracy']
improvement = best_result['accuracy'] - baseline_acc
if improvement > 0:
    print(f"\nImprovement over baseline (Odor Only): +{improvement:.4f} "
          f"({improvement*100:.2f} percentage points)")
elif improvement < 0:
    print(f"\nChange from baseline (Odor Only): {improvement:.4f} "
          f"({improvement*100:.2f} percentage points)")
else:
    print(f"\nPerformance equals baseline (Odor Only)")



SUMMARY

Subset                                        # Features     Accuracy   Error Rate
--------------------------------------------------------------------------------
Odor Only                                              1       0.8460       0.1540
Subset 1: Odor and Cap Color                           2       0.8460       0.1540
Subset 2: Odor and Stalk Features                      3       0.8340       0.1660
Subset 3: Odor and Ring and Habitat                    2       0.8780       0.1220
Subset 4: Comprehensive (5 features)                   5       0.9080       0.0920

BEST SUBSET
Name:      Subset 4: Comprehensive (5 features)
Features:  odor, cap-color, stalk-color-below-ring, population, habitat
Accuracy:  0.9080 (90.80%)
Error:     0.0920 (9.20%)

Improvement over baseline (Odor Only): +0.0620 (6.20 percentage points)


## Decision Rules from Best Model

Training the best feature subset on the entire dataset to extract decision rules.

In [8]:
# Train best model on entire dataset
print("\n" + "="*80)
print("DECISION RULES FROM BEST MODEL")
print("="*80)
print(f"\nFeatures used: {best_result['features']}\n")

# Get feature indices for best subset
best_feature_indices = [mushroom_table.column_names.index(f) for f in best_result['features']]
X_best = [[row[idx] for idx in best_feature_indices] for row in mushroom_table.data]

# Train on entire dataset
final_tree = MyDecisionTreeClassifier()
final_tree.fit(X_best, y)

# Print decision rules
final_tree.print_decision_rules(attribute_names=best_result['features'], class_name='label')


DECISION RULES FROM BEST MODEL

Features used: ['odor', 'cap-color', 'stalk-color-below-ring', 'population', 'habitat']

IF odor == foul AND stalk-color-below-ring == other AND habitat == grass THEN label = poisonous
IF odor == foul AND stalk-color-below-ring == other AND habitat == other AND population == other THEN label = poisonous
IF odor == foul AND stalk-color-below-ring == other AND habitat == other AND population == several AND cap-color == brown THEN label = poisonous
IF odor == foul AND stalk-color-below-ring == other AND habitat == other AND population == several AND cap-color == gray THEN label = poisonous
IF odor == foul AND stalk-color-below-ring == other AND habitat == other AND population == several AND cap-color == other THEN label = poisonous
IF odor == foul AND stalk-color-below-ring == other AND habitat == wood THEN label = poisonous
IF odor == foul AND stalk-color-below-ring == pink AND habitat == grass THEN label = poisonous
IF odor == foul AND stalk-color-below-

## Analysis and Pruning Discussion

In [9]:
print("\n" + "="*80)
print("ANALYSIS")
print("="*80)

print("""
- Rules with very few instances are not very applicable and can be pruned and replaced with majority class
- Multiple rules leading to the same class with similar conditions can be merged them
- For each internal node, if removing that subtree and replacing with majority class doesn't affect accuracy on validation set it can be pruned
- Leaf nodes with low confidence can be replace with parent's majority or mark as uncertain
- The most important features can be kept and rules that split on less important features in the tree can be pruned
- Pruning decisions should be based on: validation set results, instance counts at each node, classification confidence for leaf nodes
""")


ANALYSIS

- Rules with very few instances are not very applicable and can be pruned and replaced with majority class
- Multiple rules leading to the same class with similar conditions can be merged them
- For each internal node, if removing that subtree and replacing with majority class doesn't affect accuracy on validation set it can be pruned
- Leaf nodes with low confidence can be replace with parent's majority or mark as uncertain
- The most important features can be kept and rules that split on less important features in the tree can be pruned
- Pruning decisions should be based on: validation set results, instance counts at each node, classification confidence for leaf nodes



## Conclusions
- The odor feature alone provides strong and reliable baseline performance.
- Adding more features beyond odor may provide marginal improvements, but risks overfitting on the training data.
- The subset with the highest accuracy balances predictive power with model simplicity.

- Pruning the decision tree would:
   - Reduce overfitting
   - Improve generalization 
   - Create more interpretable rules
   - Keep high accuracy with reduced complexity