Importing library functions

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

Configuration

In [None]:
# --- Data source -----------------------------------------------------------
# Raw mushroom records; the loading cell reads this with header=None, so the
# column labels come from COLUMN_NAMES below.
DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'

# Column labels in file order. 'class' is the prediction target; the other
# 22 entries are categorical descriptors.
COLUMN_NAMES = [
    # target
    'class',
    # cap
    'cap-shape', 'cap-surface', 'cap-color',
    # bruising / smell
    'bruises', 'odor',
    # gill
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    # stalk
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    # veil / ring
    'veil-type', 'veil-color', 'ring-number', 'ring-type',
    # spores and environment
    'spore-print-color', 'population', 'habitat',
]

# --- Experiment settings ---------------------------------------------------
TEST_SIZE = 0.2                          # fraction of rows held out for testing
NF_LIST = [1, 50, 100, 150, 200, 250]    # n_estimators values tried for the random forest
RANDOM_STATE = 42                        # seed shared by the split and every model

Loading Data

In [None]:
# Fetch the raw CSV straight from DATA_URL. The file carries no header row,
# so the labels are supplied explicitly from COLUMN_NAMES.
print('Downloading dataset...')
df = pd.read_csv(
    DATA_URL,
    names=COLUMN_NAMES,
    header=None,
)
print('Loaded:', df.shape)

Downloading dataset...
Loaded: (8124, 23)


Basic EDA

In [None]:
# First rows of the raw frame, as a quick sanity check on the parsed columns.
print('\n--- Basic info ---')
first_rows = df.head()
print(first_rows)

# How many rows fall into each value of the target column.
print('\nValue counts for target:')
class_counts = df['class'].value_counts()
print(class_counts)


--- Basic info ---
  class cap-shape cap-surface cap-color bruises odor gill-attachment  \
0     p         x           s         n       t    p               f   
1     e         x           s         y       t    a               f   
2     e         b           s         w       t    l               f   
3     p         x           y         w       t    p               f   
4     e         x           s         g       f    n               f   

  gill-spacing gill-size gill-color  ... stalk-surface-below-ring  \
0            c         n          k  ...                        s   
1            c         b          k  ...                        s   
2            c         b          n  ...                        s   
3            c         n          n  ...                        s   
4            w         b          k  ...                        s   

  stalk-color-above-ring stalk-color-below-ring veil-type veil-color  \
0                      w                      w         p   

Counting missing values

In [None]:
# The raw file marks missing entries with a literal '?', so count those cells
# per column and show only the columns that contain at least one.
print('\nMissing value counts (raw "?" entries):')
is_placeholder = df.eq('?')
missing_counts = is_placeholder.sum()
print(missing_counts.loc[missing_counts.gt(0)])


Missing value counts (raw "?" entries):
stalk-root    2480
dtype: int64


Replace the '?' placeholder with NaN so that pandas' missing-value tools (isna, dropna) recognise it

In [None]:
# Turn every '?' cell into NaN so pandas' own missing-value helpers see it.
df = df.replace(to_replace='?', value=np.nan)

# Tally NaNs once per column, then report only the columns that have any.
nan_per_column = df.isna().sum()
print('\nMissing counts after replacing with NaN:')
print(nan_per_column[nan_per_column > 0])


Missing counts after replacing with NaN:
stalk-root    2480
dtype: int64


EDA plots

In [None]:
# Create the output directory if it is missing (no error when it already exists).
os.makedirs('outputs', exist_ok=True)

# Bar chart of row counts per target class, written to disk and then closed
# so the figure does not stay open in memory.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='class', ax=ax)
ax.set_title('Class distribution')
fig.savefig('outputs/class_distribution.png', bbox_inches='tight')
plt.close(fig)

Preprocessing

Drop rows with missing values (this removes the 2,480 rows whose stalk-root is missing, about 31% of the data)

In [None]:
# Simple strategy: discard any row that has at least one NaN, then renumber
# the index from zero so it is contiguous again.
print('\nDropping rows with missing values (simple approach). Count before:', df.shape[0])
df_clean = df.dropna(axis=0, how='any').reset_index(drop=True)
print('Count after:', df_clean.shape[0])


Dropping rows with missing values (simple approach). Count before: 8124
Count after: 5644


Separate X, y

In [None]:
# Target: map the single-letter class codes to integers ('e' -> 0, 'p' -> 1).
# Later cells label 0 as 'edible' and 1 as 'poisonous' in their reports.
target_codes = {'e': 0, 'p': 1}
y = df_clean['class'].map(target_codes)

# Features: every remaining column.
X = df_clean.drop(columns='class')

One-hot encode X

In [None]:
# Expand each categorical column into indicator columns. With prefix_sep='='
# the generated names read as '<column>=<value>'.
X_enc = pd.get_dummies(data=X, prefix_sep='=')
print('One-hot encoded shape:', X_enc.shape)

One-hot encoded shape: (5644, 98)


Train-test split

In [None]:
# Hold out TEST_SIZE of the rows for evaluation. The split is seeded with
# RANDOM_STATE and stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X_enc,
    y,
    test_size=TEST_SIZE,
    stratify=y,
    random_state=RANDOM_STATE,
)
print('\nTrain shape:', X_train.shape, 'Test shape:', X_test.shape)


Train shape: (4515, 98) Test shape: (1129, 98)


Decision Tree baseline

In [None]:
# Baseline: one decision tree with default hyper-parameters, seeded with
# RANDOM_STATE. fit() returns the estimator itself, so it can be chained.
dt = DecisionTreeClassifier(random_state=RANDOM_STATE).fit(X_train, y_train)

# Score the tree on the held-out split.
y_pred_dt = dt.predict(X_test)
acc_dt = accuracy_score(y_test, y_pred_dt)
print('\nDecision Tree test accuracy:', acc_dt)

# Per-class precision / recall / F1; label 0 is shown as 'edible', 1 as 'poisonous'.
dt_report = classification_report(
    y_test,
    y_pred_dt,
    target_names=['edible', 'poisonous'],
)
print('\nDecision Tree classification report:\n', dt_report)


Decision Tree test accuracy: 1.0

Decision Tree classification report:
               precision    recall  f1-score   support

      edible       1.00      1.00      1.00       698
   poisonous       1.00      1.00      1.00       431

    accuracy                           1.00      1129
   macro avg       1.00      1.00      1.00      1129
weighted avg       1.00      1.00      1.00      1129



Save confusion matrix (Decision Tree)

In [None]:
# Confusion matrix of the decision tree on the test split, drawn as an
# annotated heatmap and saved to the outputs directory.
class_labels = ['edible', 'poisonous']
cm_dt = confusion_matrix(y_test, y_pred_dt)

fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm_dt,
    annot=True,
    fmt='d',  # matrix cells are integer counts
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_ylabel('True')
ax.set_xlabel('Pred')
ax.set_title('Confusion Matrix - Decision Tree')
fig.savefig('outputs/confusion_matrix_decision_tree.png', bbox_inches='tight')
plt.close(fig)

Random Forest

In [None]:
# Train one random forest per candidate size in NF_LIST and record its
# accuracy on both the training and the test split.
results = []
for n in NF_LIST:
    print(f'\nTraining RandomForest (n_estimators={n})...')
    rf = RandomForestClassifier(
        n_estimators=n,
        random_state=RANDOM_STATE,
        n_jobs=-1,  # use all available cores
    )
    rf.fit(X_train, y_train)

    # Score the train split first, then the test split.
    acc_train = accuracy_score(y_train, rf.predict(X_train))
    acc_test = accuracy_score(y_test, rf.predict(X_test))
    print(f' Train acc: {acc_train:.4f} Test acc: {acc_test:.4f}')

    results.append(dict(n_estimators=n, train_accuracy=acc_train, test_accuracy=acc_test))


Training RandomForest (n_estimators=1)...
 Train acc: 1.0000 Test acc: 1.0000

Training RandomForest (n_estimators=50)...
 Train acc: 1.0000 Test acc: 1.0000

Training RandomForest (n_estimators=100)...
 Train acc: 1.0000 Test acc: 1.0000

Training RandomForest (n_estimators=150)...
 Train acc: 1.0000 Test acc: 1.0000

Training RandomForest (n_estimators=200)...
 Train acc: 1.0000 Test acc: 1.0000

Training RandomForest (n_estimators=250)...
 Train acc: 1.0000 Test acc: 1.0000


Results table

In [None]:
# One row per forest size, built from the records collected during the sweep,
# then persisted as CSV without the index column.
res_df = pd.DataFrame(data=results)
res_df.to_csv(
    'outputs/rf_n_estimators_results.csv',
    index=False,
)
print('\nSaved results to outputs/rf_n_estimators_results.csv')


Saved results to outputs/rf_n_estimators_results.csv


Plot accuracy vs n_estimators

In [None]:
# Line chart of accuracy against forest size: solid line for the test split,
# dashed line for the training split.
fig, ax = plt.subplots(figsize=(8, 5))
forest_sizes = res_df['n_estimators']
ax.plot(forest_sizes, res_df['test_accuracy'], marker='o', label='test_accuracy')
ax.plot(forest_sizes, res_df['train_accuracy'], marker='o', linestyle='--', label='train_accuracy')
ax.set_xlabel('n_estimators')
ax.set_ylabel('Accuracy')
ax.set_title('Random Forest accuracy vs n_estimators')
ax.legend()
ax.grid(True)
fig.savefig('outputs/accuracy_vs_n_estimators.png', bbox_inches='tight')
plt.close(fig)

Choose best RF

In [None]:
# Pick n_estimators by cross-validated accuracy on the TRAINING split only.
# Ranking the candidates by res_df['test_accuracy'] would reuse the held-out
# test set for model selection, which makes any test score later reported for
# the chosen model optimistically biased.
CV_FOLDS = 5
candidate_sizes = res_df['n_estimators'].tolist()
cv_df = pd.DataFrame({
    'n_estimators': candidate_sizes,
    'cv_accuracy': [
        cross_val_score(
            RandomForestClassifier(n_estimators=size, random_state=RANDOM_STATE, n_jobs=-1),
            X_train,
            y_train,
            cv=CV_FOLDS,
            scoring='accuracy',
        ).mean()
        for size in candidate_sizes
    ],
})

# idxmax returns the first maximal row, so ties resolve to the earliest
# candidate in the table.
best_row = cv_df.loc[cv_df['cv_accuracy'].idxmax()]
best_n = int(best_row['n_estimators'])
print('\nBest RF n_estimators by cross-validated training accuracy:', best_n)


Best RF n_estimators by test accuracy: 1


Retrain best RF and compare with Decision Tree

In [None]:
# Refit a forest with the selected size, using the same seed and parallelism
# as the sweep, and score it on the test split.
best_rf = RandomForestClassifier(
    n_estimators=best_n,
    random_state=RANDOM_STATE,
    n_jobs=-1,
).fit(X_train, y_train)

y_pred_rf = best_rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)
print('Best Random Forest test accuracy:', acc_rf)

# Per-class precision / recall / F1; label 0 is shown as 'edible', 1 as 'poisonous'.
rf_report = classification_report(
    y_test,
    y_pred_rf,
    target_names=['edible', 'poisonous'],
)
print('\nRandom Forest classification report:\n', rf_report)

Best Random Forest test accuracy: 1.0

Random Forest classification report:
               precision    recall  f1-score   support

      edible       1.00      1.00      1.00       698
   poisonous       1.00      1.00      1.00       431

    accuracy                           1.00      1129
   macro avg       1.00      1.00      1.00      1129
weighted avg       1.00      1.00      1.00      1129



Save confusion matrix (RF)

In [None]:
# Confusion matrix of the selected random forest on the test split, drawn as
# an annotated heatmap and saved to the outputs directory.
class_labels = ['edible', 'poisonous']
cm_rf = confusion_matrix(y_test, y_pred_rf)

fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm_rf,
    annot=True,
    fmt='d',  # matrix cells are integer counts
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_ylabel('True')
ax.set_xlabel('Pred')
ax.set_title(f'Confusion Matrix - Random Forest (n={best_n})')
fig.savefig('outputs/confusion_matrix_random_forest.png', bbox_inches='tight')
plt.close(fig)

Feature importances

In [None]:
# Pair each one-hot column with the importance the fitted forest assigned to
# it, and keep the 20 highest-ranked rows.
imp_df = (
    pd.DataFrame({
        'feature': X_enc.columns,
        'importance': best_rf.feature_importances_,
    })
    .sort_values(by='importance', ascending=False)
    .iloc[:20]
)

# Horizontal bar chart: one bar per feature, most important first.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=imp_df, x='importance', y='feature', ax=ax)
ax.set_title('Top 20 feature importances (Random Forest)')
fig.tight_layout()
fig.savefig('outputs/feature_importances_top20.png', bbox_inches='tight')
plt.close(fig)

Save summary

In [None]:
# Single-row summary: headline test accuracy of each model plus the forest
# size that was selected. Each value is wrapped in a one-element list so the
# dict converts to a one-row DataFrame.
summary = dict(
    decision_tree_test_accuracy=[acc_dt],
    random_forest_test_accuracy=[acc_rf],
    best_n_estimators=[best_n],
)
summary_df = pd.DataFrame(data=summary)
summary_df.to_csv(
    'outputs/model_summary.csv',
    index=False,
)
print('\nSaved model summary to outputs/model_summary.csv')

print('\nDone. Check the outputs/ directory for plots and CSVs.')


Saved model summary to outputs/model_summary.csv

Done. Check the outputs/ directory for plots and CSVs.
