In [6]:
import sys
import importlib
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Configuration
# `subname` is the importable name of the module that supplies the ID3 helper
# functions bound below; `datasets` lists the CSV files evaluated by main().
subname = "EC_F_PES2UG23CS379_Lab3"  # Replace with your actual module name
datasets = [
    {"name": "Mushrooms", "path": "mushrooms.csv"},
    {"name": "Tic-Tac-Toe", "path": "tictactoe.csv"},
    {"name": "Nursery", "path": "Nursery.csv"}
]

framework = 'numpy'  # Using NumPy framework
# Verbosity switches read by test_single_dataset(): print the finished tree /
# trace every step of tree construction.
print_tree_flag = False
print_construction_flag = False

print(f" Framework: NumPy/sklearn")
print(f" Using NumPy arrays for all data processing")

# Import your module
# Any exception raised while importing is reported; the run then stops with a
# non-zero exit status so the failure is not mistaken for success.
try:
    mymodule = importlib.import_module(subname)
    print(f" Successfully imported module: {subname}")
except Exception as e:
    print(f" Error importing module '{subname}': {e}")
    print("Please ensure your module is named correctly and is available.")
    sys.exit(1)

# Import required functions from your module
# These four names are bound at module level; construct_tree() calls
# get_entropy_of_dataset and get_selected_attribute directly.
try:
    get_selected_attribute = mymodule.get_selected_attribute
    get_information_gain = mymodule.get_information_gain
    get_avg_info_of_attribute = mymodule.get_avg_info_of_attribute
    get_entropy_of_dataset = mymodule.get_entropy_of_dataset
    print(" All required functions imported successfully")
except AttributeError as e:
    print(f" Error: Missing required function in module '{subname}': {e}")
    print("Required functions: get_selected_attribute, get_information_gain, get_avg_info_of_attribute, get_entropy_of_dataset")
    sys.exit(1)

def calculate_accuracy(y_true, y_pred):
    """Return the fraction of non-``None`` predictions that equal the true label.

    Samples whose prediction is ``None`` are left out of both the numerator
    and the denominator. ``0.0`` is returned when no usable prediction exists.
    """
    truth = np.array(y_true)
    guesses = np.array(y_pred)

    # Boolean mask selecting the samples that actually received a prediction.
    keep = np.array([guess is not None for guess in guesses])
    if not keep.any():
        return 0.0

    truth = truth[keep]
    guesses = guesses[keep]

    sample_count = len(truth)
    if sample_count <= 0:
        return 0.0

    return np.sum(truth == guesses) / sample_count

def _per_class_prf(y_true, y_pred, classes):
    """Return parallel lists (precisions, recalls, f1s, supports), one entry per class."""
    precisions, recalls, f1s, supports = [], [], [], []

    for cls in classes:
        tp = np.sum((y_true == cls) & (y_pred == cls))
        fp = np.sum((y_true != cls) & (y_pred == cls))
        fn = np.sum((y_true == cls) & (y_pred != cls))

        # A zero denominator means the class was never predicted (precision)
        # or never present (recall); score it as 0.0 instead of dividing.
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
        supports.append(np.sum(y_true == cls))

    return precisions, recalls, f1s, supports


def calculate_precision_recall_f1(y_true, y_pred, average='weighted'):
    """Calculate averaged precision, recall, and F1-score with NumPy.

    Args:
        y_true: True labels (array-like).
        y_pred: Predicted labels (array-like); ``None`` entries are ignored
            together with their true label.
        average: ``'weighted'`` averages the per-class scores weighted by the
            number of true samples of each class; ``'macro'`` takes the plain
            mean over classes.

    Returns:
        Tuple ``(precision, recall, f1)``. All zeros when there is no usable
        prediction.

    Raises:
        ValueError: If ``average`` is neither ``'weighted'`` nor ``'macro'``.
    """
    # Reject unknown modes explicitly; falling through would return None and
    # surface later as an unpacking error in the caller.
    if average not in ('weighted', 'macro'):
        raise ValueError(
            f"Unsupported average {average!r}; expected 'weighted' or 'macro'"
        )

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # Handle None predictions
    valid_mask = np.array([pred is not None for pred in y_pred])
    if not np.any(valid_mask):
        return 0.0, 0.0, 0.0

    y_true_valid = y_true[valid_mask]
    y_pred_valid = y_pred[valid_mask]

    # Classes are the union of labels that occur in either vector.
    classes = np.unique(np.concatenate([y_true_valid, y_pred_valid]))
    precisions, recalls, f1s, supports = _per_class_prf(y_true_valid, y_pred_valid, classes)

    if average == 'macro':
        return np.mean(precisions), np.mean(recalls), np.mean(f1s)

    total_support = sum(supports)
    if total_support == 0:
        return 0.0, 0.0, 0.0

    weighted_precision = sum(p * s for p, s in zip(precisions, supports)) / total_support
    weighted_recall = sum(r * s for r, s in zip(recalls, supports)) / total_support
    weighted_f1 = sum(f * s for f, s in zip(f1s, supports)) / total_support

    return weighted_precision, weighted_recall, weighted_f1

def calculate_per_class_metrics(y_true, y_pred):
    """Build a per-class report of confusion counts and derived scores.

    Predictions equal to ``None`` are dropped together with their true label.
    The result maps each label seen in either vector to a dict holding
    precision, recall, F1, specificity, support and the four confusion
    counts. An empty dict is returned when no usable prediction exists.
    """
    truth = np.array(y_true)
    guesses = np.array(y_pred)

    usable = np.array([guess is not None for guess in guesses])
    if not np.any(usable):
        return {}

    truth = truth[usable]
    guesses = guesses[usable]

    def ratio(numerator, denominator):
        # Undefined ratios (empty denominator) are reported as 0.0.
        return numerator / denominator if denominator > 0 else 0.0

    report = {}
    for label in np.unique(np.concatenate([truth, guesses])):
        actual_pos = truth == label
        actual_neg = truth != label
        called_pos = guesses == label
        called_neg = guesses != label

        hits = np.sum(actual_pos & called_pos)
        false_alarms = np.sum(actual_neg & called_pos)
        misses = np.sum(actual_pos & called_neg)
        correct_rejections = np.sum(actual_neg & called_neg)

        precision = ratio(hits, hits + false_alarms)
        recall = ratio(hits, hits + misses)

        report[label] = {
            'precision': precision,
            'recall': recall,
            'f1_score': ratio(2 * precision * recall, precision + recall),
            'specificity': ratio(correct_rejections, correct_rejections + false_alarms),
            'support': np.sum(actual_pos),
            'true_positives': hits,
            'false_positives': false_alarms,
            'false_negatives': misses,
            'true_negatives': correct_rejections,
        }

    return report

def calculate_tree_complexity_metrics(tree):
    """Measure the size and depth of a decision tree.

    A node counts as internal only when it is a dict with a 'branches' key;
    anything else (class label, ``None``, malformed node) counts as a leaf.
    The root sits at depth 0.

    Returns:
        Dict with 'max_depth', 'num_nodes', 'num_leaves' and
        'num_internal_nodes'.
    """
    deepest = 0
    leaf_count = 0
    internal_count = 0

    # Explicit stack of (node, depth) pairs instead of recursion.
    pending = [(tree, 0)]
    while pending:
        node, depth = pending.pop()
        deepest = max(deepest, depth)

        if isinstance(node, dict) and 'branches' in node:
            internal_count += 1
            pending.extend((child, depth + 1) for child in node['branches'].values())
        else:
            leaf_count += 1

    return {
        'max_depth': deepest,
        'num_nodes': leaf_count + internal_count,
        'num_leaves': leaf_count,
        'num_internal_nodes': internal_count
    }

def evaluate_decision_tree(tree, X_test, y_test, cols, class_names=None):
    """Score a decision tree on held-out samples and print a metrics report.

    Every row of ``X_test`` is classified with ``predict_single_sample``; the
    predictions are compared with ``y_test`` to obtain accuracy plus weighted
    and macro precision/recall/F1, per-class metrics and tree-size statistics.
    ``class_names`` is accepted but not used by this function.

    Returns:
        Dict containing all computed metrics, the raw prediction list and
        'prediction_rate' (share of samples with a non-None prediction).
    """
    predictions = [predict_single_sample(tree, sample, cols) for sample in X_test]

    # Aggregate scores
    accuracy = calculate_accuracy(y_test, predictions)
    weighted_scores = calculate_precision_recall_f1(y_test, predictions, average='weighted')
    macro_scores = calculate_precision_recall_f1(y_test, predictions, average='macro')
    precision, recall, f1 = weighted_scores
    precision_macro, recall_macro, f1_macro = macro_scores

    per_class_metrics = calculate_per_class_metrics(y_test, predictions)
    tree_stats = calculate_tree_complexity_metrics(tree)

    divider = '=' * 40

    # Performance section
    print(f"\n OVERALL PERFORMANCE METRICS")
    print(f"{divider}")
    print(f"Accuracy:             {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"Precision (weighted): {precision:.4f}")
    print(f"Recall (weighted):    {recall:.4f}")
    print(f"F1-Score (weighted):  {f1:.4f}")
    print(f"Precision (macro):    {precision_macro:.4f}")
    print(f"Recall (macro):       {recall_macro:.4f}")
    print(f"F1-Score (macro):     {f1_macro:.4f}")

    # Complexity section
    print(f"\n TREE COMPLEXITY METRICS")
    print(f"{divider}")
    print(f"Maximum Depth:        {tree_stats['max_depth']}")
    print(f"Total Nodes:          {tree_stats['num_nodes']}")
    print(f"Leaf Nodes:           {tree_stats['num_leaves']}")
    print(f"Internal Nodes:       {tree_stats['num_internal_nodes']}")

    # Share of samples for which the tree produced a prediction at all.
    attempted = len(predictions)
    answered = sum(1 for guess in predictions if guess is not None)
    if attempted > 0:
        prediction_rate = answered / attempted
    else:
        prediction_rate = 0

    return {
        'accuracy': accuracy,
        'precision_weighted': precision,
        'recall_weighted': recall,
        'f1_weighted': f1,
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
        'f1_macro': f1_macro,
        'per_class_metrics': per_class_metrics,
        'tree_complexity': tree_stats,
        'predictions': predictions,
        'prediction_rate': prediction_rate
    }

def preprocess_data(df):
    """Label-encode every column of ``df`` and print a short encoding preview.

    Each column (the target included) gets its own fitted ``LabelEncoder``.
    The input frame is left untouched; encoding happens on a copy.

    Returns:
        Tuple ``(encoded_frame, encoders)`` where ``encoders`` maps each
        column name to the encoder fitted on it.
    """
    encoded = df.copy()

    print(" Original dataset info:")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print("\nFirst few encoded samples:")

    # Only the first three columns and the last (target) column are previewed.
    leading_columns = list(df.columns)[:3]

    encoders = {}
    for name in encoded.columns:
        encoder = LabelEncoder()
        encoded[name] = encoder.fit_transform(encoded[name])
        encoders[name] = encoder

        if name in leading_columns or name == df.columns[-1]:
            raw_sample = df[name].unique()[:5]
            code_sample = encoded[name].unique()[:5]
            print(f"  {name}: {raw_sample} → {code_sample}")

    return encoded, encoders

def _majority_class(data):
    """Return the most frequent value of the last column of ``data`` as an int."""
    labels, counts = np.unique(data[:, -1], return_counts=True)
    return int(labels[np.argmax(counts)])


def construct_tree(data, cols, used_attributes=None, level=0, max_depth=6, print_construction=False):
    """
    Construct decision tree using ID3 algorithm with NumPy

    Relies on two module-level helpers: ``get_entropy_of_dataset(data)`` and
    ``get_selected_attribute(data)``. Only the gain dictionary returned by the
    latter is used; its keys are treated as column indices into ``data`` and
    ``cols``. The attribute to split on is re-selected here so that attributes
    already used on the current path are excluded.

    Args:
        data: NumPy array with features and target (last column)
        cols: List of column names
        used_attributes: Set of already used attribute indices
        level: Current tree depth level
        max_depth: Maximum allowed depth
        print_construction: Whether to print construction details

    Returns:
        Tree structure (dict for internal nodes, int for leaf nodes).
        ``None`` is returned for empty input, or when an error occurs and even
        the majority-class fallback cannot be computed.
    """
    if used_attributes is None:
        used_attributes = set()

    # Base case: empty data
    if len(data) == 0:
        return None

    # Ensure data is NumPy array
    data = np.array(data, dtype=np.float32)

    def majority_leaf(reason):
        # Close the current branch with the most common class; `reason` only
        # appears in the optional construction trace.
        label = _majority_class(data)
        if print_construction:
            print_node_info(f"Leaf: Class {label} ({reason})", level)
        return label

    # Calculate entropy of current dataset
    entropy = get_entropy_of_dataset(data)
    if print_construction:
        print_node_info(f"Entropy = {entropy:.4f}", level)

    # Base case: pure node (entropy ≈ 0)
    if entropy < 1e-10:  # Using small threshold for floating point comparison
        target_values = data[:, -1].astype(int)
        pure_class = int(target_values[0])  # All same class
        if print_construction:
            print_node_info(f"Leaf: Class {pure_class} (pure node)", level)
        return pure_class

    # Base case: max depth reached
    if level >= max_depth:
        return majority_leaf("max depth")

    # Base case: no more attributes available
    num_features = len(cols) - 1
    if len(used_attributes) >= num_features:
        return majority_leaf("no attributes")

    try:
        # Only the gains are needed; the helper's own pick ignores which
        # attributes were already consumed on this path.
        gain_dict, _ = get_selected_attribute(data)

        # Filter out already used attributes
        available_gains = {attr: gain for attr, gain in gain_dict.items()
                           if attr not in used_attributes}

        # Base case: no available attributes or no information gain
        if not available_gains or max(available_gains.values()) <= 0:
            return majority_leaf("no gain")

        # Select the attribute with highest information gain
        selected_attribute = max(available_gains, key=available_gains.get)

        if print_construction:
            print_node_info(f"Split on: {cols[selected_attribute]} (gain: {available_gains[selected_attribute]:.4f})", level)

        # Create tree node
        tree_node = {
            'attribute': selected_attribute,
            'attribute_name': cols[selected_attribute],
            'gain': available_gains[selected_attribute],
            'level': level,
            'branches': {}
        }

        # Copy so that sibling branches do not see each other's choices.
        new_used_attributes = used_attributes.copy()
        new_used_attributes.add(selected_attribute)

        # One branch per value observed in this node's data. Every value comes
        # from the column itself, so each subset holds at least one row.
        for value in np.unique(data[:, selected_attribute]):
            subset_data = data[data[:, selected_attribute] == value]
            value_int = int(value)

            if print_construction:
                print_node_info(f"Branch {cols[selected_attribute]} = {value_int} ({len(subset_data)} samples)", level)

            # Recursive call
            tree_node['branches'][value_int] = construct_tree(
                subset_data,
                cols,
                new_used_attributes,
                level + 1,
                max_depth,
                print_construction
            )

        return tree_node

    except Exception as e:
        # Best effort: a failure in the helper functions turns this node into
        # a majority-class leaf instead of aborting the whole build.
        if print_construction:
            print_node_info(f"Error: {e}", level)

        # Fallback to majority class
        try:
            return _majority_class(data)
        except Exception:
            # Deliberately not a bare except, so KeyboardInterrupt and
            # SystemExit still propagate.
            return None

def predict_single_sample(tree, sample, cols):
    """
    Classify one sample by walking the decision tree from the root.

    Args:
        tree: Decision tree (dict for internal nodes, int for leaves)
        sample: Single data sample (array-like of attribute values)
        cols: Column names (not used during traversal)

    Returns:
        Predicted class (int), or None when the walk hits a malformed node,
        an out-of-range attribute index, or a value with no branch
    """
    node = tree

    # Descend until a leaf (integer class label) is reached.
    while not isinstance(node, (int, np.integer)):
        if not (isinstance(node, dict) and 'attribute' in node):
            return None

        column = node['attribute']
        if column >= len(sample):
            return None

        key = int(sample[column])
        children = node['branches']
        if key not in children:
            # Value was never seen for this attribute while building the tree.
            return None

        node = children[key]

    return int(node)

def print_tree_structure(tree, cols, level=0, prefix=""):
    """Print the tree as an indented outline using box-drawing characters.

    Leaves print as ``Class <label>``; anything that is neither a leaf nor a
    dict with an 'attribute' key prints as ``None``. ``level`` and ``prefix``
    carry the recursion state, and ``cols`` is only passed through.
    """
    if isinstance(tree, (int, np.integer)):
        print(f"{prefix}├── Class {tree}")
        return

    if not (isinstance(tree, dict) and 'attribute' in tree):
        print(f"{prefix}├── None")
        return

    at_root = level == 0
    root_name = tree['attribute_name']
    root_gain = tree.get('gain', 0)

    # The root is announced here; deeper nodes are announced by their parent.
    if at_root:
        print(f"Root: {root_name} (gain: {root_gain:.4f})")

    children = list(tree['branches'].items())
    final_index = len(children) - 1

    for position, (value, child) in enumerate(children):
        if at_root:
            print(f"├── = {value}:")
            child_prefix = "│   "
        elif position == final_index:
            print(f"{prefix}└── = {value}:")
            child_prefix = prefix + "    "
        else:
            print(f"{prefix}├── = {value}:")
            child_prefix = prefix + "│   "

        if isinstance(child, (int, np.integer)):
            print(f"{child_prefix}├── Class {child}")
        elif isinstance(child, dict):
            child_name = child['attribute_name']
            child_gain = child.get('gain', 0)
            print(f"{child_prefix}├── {child_name} (gain: {child_gain:.4f})")
            print_tree_structure(child, cols, level + 1, child_prefix)
        else:
            print(f"{child_prefix}├── None")

def print_node_info(message, level):
    """Print ``message`` tagged with its tree level and indented two spaces per level."""
    print("Level {}: {}{}".format(level, "  " * level, message))

def test_single_dataset(dataset_info):
    """Run the full load / encode / train / evaluate pipeline on one dataset.

    Args:
        dataset_info: Dict with a "name" (display label) and a "path"
            (CSV file location). The last CSV column is used as the target.

    Returns:
        None if the CSV cannot be loaded. Otherwise a dict with
        'dataset_name' and 'success'; on success it also carries 'results'
        (output of evaluate_decision_tree) and 'dataset_info', on failure an
        'error' message.
    """
    dataset_name = dataset_info["name"]
    data_path = dataset_info["path"]

    print(f"\n{'='*80}")
    print(f" TESTING DATASET: {dataset_name.upper()}")
    print(f" File: {data_path}")
    print(f" Framework: NumPy")
    print(f"{'='*80}")

    try:
        # Load dataset
        df = pd.read_csv(data_path)
        print(f" Successfully loaded {data_path}")
    except Exception as e:
        # A missing or unreadable file is reported and skipped; main() drops
        # the None result.
        print(f" Error loading dataset {data_path}: {e}")
        return None

    print(f" Target column: '{df.columns[-1]}' (last column)")

    # Preprocess data
    # Every column, including the target, is label-encoded.
    df_processed, label_encoders = preprocess_data(df)

    # Convert to NumPy array
    dataset = df_processed.values.astype(np.float32)
    cols = list(df_processed.columns)

    print(f"\n DATASET SUMMARY")
    print(f"{'='*40}")
    print(f"Shape: {dataset.shape}")
    print(f"Features: {len(cols) - 1}")
    print(f"Data type: {type(dataset)}")

    # Get class distribution
    unique_classes, class_counts = np.unique(dataset[:, -1], return_counts=True)
    unique_classes = unique_classes.astype(int)
    class_counts = class_counts.astype(int)

    print(f"Classes: {len(unique_classes)} → {unique_classes.tolist()}")
    print(f"Distribution: {dict(zip(unique_classes, class_counts))}")

    try:
        print(f"\n DECISION TREE CONSTRUCTION")
        print(f"{'='*40}")

        # Train-test split
        # 80% of the rows train the tree; the remainder is held out for testing.
        total_samples = len(dataset)
        train_split = 0.8
        train_size = int(total_samples * train_split)

        print(f"Total samples: {total_samples:,}")
        print(f"Training: {train_size:,} samples")
        print(f"Testing: {total_samples - train_size:,} samples")

        # Shuffle and split data
        # Fixed seed keeps the split reproducible; note this reseeds NumPy's
        # global random state.
        np.random.seed(42)
        indices = np.random.permutation(total_samples)
        dataset_shuffled = dataset[indices]

        train_data = dataset_shuffled[:train_size]
        test_data = dataset_shuffled[train_size:]

        # Adjust max depth based on dataset complexity
        # len(cols) counts the target column too.
        if len(cols) > 20:  # Very complex datasets
            max_depth = 8
        elif len(cols) > 15:  # Complex datasets
            max_depth = 7
        elif len(cols) > 10:  # Medium datasets
            max_depth = 6
        else:  # Simple datasets
            max_depth = 5

        print(f" Building tree (max depth: {max_depth})...")

        # Construct decision tree
        tree = construct_tree(
            train_data,
            cols=cols,
            max_depth=max_depth,
            print_construction=print_construction_flag
        )

        if tree is not None:
            print(f" Decision tree construction completed!")

            if print_tree_flag:
                print(f"\n DECISION TREE STRUCTURE")
                print("="*60)
                print_tree_structure(tree, cols)
                print()

            # Prepare test data
            # Features are all columns but the last; the last is the label.
            X_test = test_data[:, :-1]
            y_test = test_data[:, -1]

            # Get class names for reporting
            # Maps each encoded class id back to its original label.
            target_col = cols[-1]
            class_names = None
            if target_col in label_encoders:
                le = label_encoders[target_col]
                class_names = {i: le.inverse_transform([i])[0] for i in range(len(le.classes_))}

            # Evaluate the tree
            evaluation_results = evaluate_decision_tree(tree, X_test, y_test, cols, class_names)

            return {
                'dataset_name': dataset_name,
                'success': True,
                'results': evaluation_results,
                'dataset_info': {
                    'shape': df.shape,
                    'num_features': len(cols) - 1,
                    'num_classes': len(unique_classes),
                    'class_distribution': dict(zip(unique_classes, class_counts))
                }
            }

        else:
            print(" Tree construction failed!")
            return {
                'dataset_name': dataset_name,
                'success': False,
                'error': 'Tree construction returned None'
            }

    except Exception as e:
        # Any failure during splitting, training or evaluation is recorded as
        # a failed result so the remaining datasets can still be processed.
        print(f" Error in tree construction: {e}")
        import traceback
        traceback.print_exc()
        return {
            'dataset_name': dataset_name,
            'success': False,
            'error': str(e)
        }

def print_summary_report(all_results):
    """Print a cross-dataset report: metrics table, best accuracy, and failures.

    ``all_results`` is a list of result dicts as produced by
    ``test_single_dataset``; entries are split on their 'success' flag.
    """
    banner = '=' * 100

    print(f"\n{banner}")
    print(f" COMPREHENSIVE SUMMARY REPORT - NUMPY FRAMEWORK")
    print(f"{banner}")

    # Split the outcomes in a single pass.
    passed, failed = [], []
    for outcome in all_results:
        (passed if outcome['success'] else failed).append(outcome)

    if passed:
        print(f"\n SUCCESSFUL EVALUATIONS: {len(passed)}/{len(all_results)}")
        print(f"{'='*80}")

        # Results table
        print(f"{'Dataset':<15} {'Accuracy':<10} {'Precision':<10} {'Recall':<10} {'F1-Score':<10} {'Depth':<7} {'Nodes':<7}")
        print(f"{'-'*80}")

        for outcome in passed:
            scores = outcome['results']
            shape_stats = scores['tree_complexity']
            label = outcome['dataset_name']
            acc = scores['accuracy']
            prec = scores['precision_weighted']
            rec = scores['recall_weighted']
            f_score = scores['f1_weighted']
            depth = shape_stats['max_depth']
            nodes = shape_stats['num_nodes']

            print(f"{label:<15} {acc:<10.4f} {prec:<10.4f} {rec:<10.4f} {f_score:<10.4f} {depth:<7} {nodes:<7}")

        # Best performer
        winner = max(passed, key=lambda entry: entry['results']['accuracy'])
        print(f"\n BEST ACCURACY: {winner['dataset_name']} ({winner['results']['accuracy']:.1%})")

        # Dataset analysis
        print(f"\n DATASET COMPLEXITY ANALYSIS")
        print(f"{'='*50}")

        for outcome in passed:
            facts = outcome['dataset_info']
            shape_stats = outcome['results']['tree_complexity']

            print(f"\n{outcome['dataset_name']}:")
            print(f"  • Size: {facts['shape'][0]:,} samples × {facts['num_features']} features")
            print(f"  • Classes: {facts['num_classes']} classes")
            print(f"  • Tree: {shape_stats['max_depth']} levels, {shape_stats['num_nodes']} nodes")
            print(f"  • Performance: {outcome['results']['accuracy']:.1%} accuracy")

    if failed:
        print(f"\n FAILED EVALUATIONS: {len(failed)}")
        for outcome in failed:
            print(f"  • {outcome['dataset_name']}: {outcome.get('error', 'Unknown error')}")

    print(f"\n{banner}")

def main():
    """Evaluate every configured dataset, print the summary, and return the results.

    Datasets whose evaluation yields a falsy result (for example a CSV that
    could not be loaded) are left out of the returned list.
    """
    dataset_count = len(datasets)

    print(f" DECISION TREE EVALUATION - NUMPY FRAMEWORK")
    print(f"{'='*60}")
    print(f" Datasets to process: {dataset_count}")
    print(f" Framework: NumPy/sklearn")

    collected = []
    for position, entry in enumerate(datasets, start=1):
        print(f"\n Processing {position}/{dataset_count}: {entry['name']}")
        outcome = test_single_dataset(entry)
        if outcome:
            collected.append(outcome)

    # Cross-dataset overview
    print_summary_report(collected)

    success_count = sum(1 for outcome in collected if outcome['success'])

    print(f"\n EVALUATION COMPLETE!")
    print(f"Framework: NumPy")
    print(f"Datasets processed: {len(collected)}")
    print(f"Successful: {success_count}")

    return collected

# Run the evaluation only when this file is executed directly (script or
# notebook cell), not when it is imported; the returned list is kept in
# `results` for later inspection.
if __name__ == "__main__":
    results = main()

 Framework: NumPy/sklearn
 Using NumPy arrays for all data processing
 Successfully imported module: EC_F_PES2UG23CS379_Lab3
 All required functions imported successfully
 DECISION TREE EVALUATION - NUMPY FRAMEWORK
 Datasets to process: 3
 Framework: NumPy/sklearn

 Processing 1/3: Mushrooms

 TESTING DATASET: MUSHROOMS
 File: mushrooms.csv
 Framework: NumPy
 Successfully loaded mushrooms.csv
 Target column: 'class' (last column)
 Original dataset info:
Shape: (8124, 23)
Columns: ['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat', 'class']

First few encoded samples:
  cap-shape: ['x' 'b' 's' 'f' 'k'] → [5 0 4 2 3]
  cap-surface: ['s' 'y' 'f' 'g'] → [2 3 0 1]
  cap-color: ['n' 'y' '