# Fungal Fun
ML approach

In [11]:
#Imports/seeds
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    confusion_matrix, 
    classification_report, 
    roc_curve, 
    auc
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

## Dataset loading

In [3]:
def load_and_preprocess_data(filepath):
    # Load the dataset
    df = pd.read_csv(filepath)
    

    feature_columns = [
        'cap-diameter', 'cap-shape', 'cap-surface', 'cap-color', 
        'does-bruise-or-bleed', 
        'gill-attachment', 'gill-spacing', 'gill-color',
        'stem-height', 'stem-width', 'stem-root', 'stem-surface', 'stem-color', 
        'veil-type', 'veil-color', 'has-ring', 'ring-type', 
        'spore-print-color',
        'habitat', 'season'
    ]
    

    # Remove columns with high percentage of null values (>40%)
    null_percentages = df.isnull().mean()
    columns_to_drop = null_percentages[null_percentages > 0.4].index.tolist()
    df = df.drop(columns=columns_to_drop)
    
    # Drop any remaining rows with NA values
    df = df.dropna()
    
    # Separate features and target
    X = df[feature_columns]
    y = df['class']  # Assuming 'class' is the column indicating edible/poisonous
    
    # Encode categorical features
    for column in X.columns:
        le = LabelEncoder()
        X[column] = le.fit_transform(X[column].astype(str))
    
    # Encode target variable
    le_y = LabelEncoder()
    y = le_y.fit_transform(y)
    
    return X, y, le_y

In [7]:
#Improved
def load_and_preprocess_data(filepath):
    # Load the dataset
    df = pd.read_csv(filepath)
    
    # Print out all column names for diagnostic purposes
    print("Available columns in the dataset:")
    print(df.columns.tolist())
    
    # Identify columns to drop based on null percentage
    null_percentages = df.isnull().mean()
    columns_to_drop = null_percentages[null_percentages > 0.4].index.tolist()
    print("\nColumns to be dropped due to high null percentage:")
    print(columns_to_drop)
    
    # Drop columns with high null percentage
    df = df.drop(columns=columns_to_drop)
    
    # Drop any remaining rows with NA values
    df = df.dropna()
    
    # Identify the actual feature columns (excluding the target)
    feature_columns = [col for col in df.columns if col != 'class']
    
    # Separate features and target
    X = df[feature_columns]
    y = df['class']
    
    # Encode categorical features
    for column in X.columns:
        le = LabelEncoder()
        X[column] = le.fit_transform(X[column].astype(str))
    
    # Encode target variable
    le_y = LabelEncoder()
    y = le_y.fit_transform(y)
    
    # Print some additional diagnostic information
    print(f"\nFeature columns used: {feature_columns}")
    print(f"Number of features: {len(feature_columns)}")
    print(f"Number of samples: {len(X)}")
    
    return X, y, le_y

## Visualization

In [13]:
# Visualization Functions
def plot_training_history(history):
    plt.figure(figsize=(12,4))
    
    # Accuracy plot
    plt.subplot(1,2,1)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Model Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    
    # Loss plot
    plt.subplot(1,2,2)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    
    plt.tight_layout()
    plt.savefig('training_history.png')
    plt.close()

def plot_confusion_matrix(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8,6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('confusion_matrix.png')
    plt.close()

def plot_roc_curve(y_true, y_scores):
    fpr, tpr, _ = roc_curve(y_true, y_scores)
    roc_auc = auc(fpr, tpr)
    
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, 
             label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.savefig('roc_curve.png')
    plt.close()


## Neural Net

In [6]:
def create_neural_network(input_shape):
    model = Sequential([
        # Input layer with BatchNormalization
        Dense(64, activation='relu', input_shape=(input_shape,)),
        BatchNormalization(),
        Dropout(0.3),
        
        # Hidden layers with different configurations
        Dense(32, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        
        Dense(16, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),
        
        # Output layer
        Dense(1, activation='sigmoid')
    ])
    
    # Compile the model
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model


ERROR! Session/line number was not unique in database. History logging moved to new session 56


In [17]:
# Main execution
def main():
    # Load and preprocess data
    X, y, label_encoder = load_and_preprocess_data('/Users/bcmain/Desktop/poisonous_mushrooms.csv')
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Create and train the model
    model = create_neural_network(X_train.shape[1])
    
    # Early stopping to prevent overfitting
    early_stopping = EarlyStopping(
        monitor='val_loss', 
        patience=10, 
        restore_best_weights=True
    )
    
    # Train the model
    history = model.fit(
        X_train_scaled, y_train,
        epochs=5,
        batch_size=32,
        validation_split=0.2,
        callbacks=[early_stopping],
        verbose=1
    )
    
    # Evaluate the model
    test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test, verbose=0)
    print(f"\nTest Accuracy: {test_accuracy:.4f}")
    
    # Predictions
    y_pred = (model.predict(X_test_scaled) > 0.5).astype(int)
    y_scores = model.predict(X_test_scaled)
    
    # Detailed Classification Report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    # Visualizations
    plot_training_history(history)
    plot_confusion_matrix(y_test, y_pred)
    plot_roc_curve(y_test, y_scores)
    
    # Save the model
    model.save('mushroom_classifier.h5')
    
    return model, history

if __name__ == "__main__":
    model, history = main()




Available columns in the dataset:
['id', 'class', 'cap-diameter', 'cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-height', 'stem-width', 'stem-root', 'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color', 'habitat', 'season']

Columns to be dropped due to high null percentage:
['gill-spacing', 'stem-root', 'stem-surface', 'veil-type', 'veil-color', 'spore-print-color']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = le.fit_transform(X[column].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = le.fit_transform(X[column].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = le.fit_transform(X[column].astype(str))
A value is trying to be set on a copy of a slice 


Feature columns used: ['id', 'cap-diameter', 'cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-color', 'stem-height', 'stem-width', 'stem-color', 'has-ring', 'ring-type', 'habitat', 'season']
Number of features: 15
Number of samples: 1930108




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Test Accuracy: 0.9731

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.97    179132
           1       0.98      0.97      0.97    206890

    accuracy                           0.97    386022
   macro avg       0.97      0.97      0.97    386022
weighted avg       0.97      0.97      0.97    386022

Visualizations have been saved:
1. training_history.png
2. confusion_matrix.png
3. roc_curve.png


  saving_api.save_model(
