In [1]:
### Machine Learning Models with Scikit-learn: Template

In [2]:
# Step 1: Load the Data
import pandas as pd

# Remote CSV holding the mushroom dataset used throughout this template
url = 'https://raw.githubusercontent.com/fenago/datasets/refs/heads/main/mushrooms.csv'

# Read the CSV straight from the URL into a DataFrame
data = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows to sanity-check the columns and values
data.head(n=5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
# Step 2: Identify the Target Variable and Encode It
# Map the target variable (class) from its letter codes to integers.
# NOTE(review): 'e' / 'p' presumably stand for edible / poisonous — confirm
# against the dataset description.
class_codes = {'e': 0, 'p': 1}

# Guard against re-running this cell: Series.map() turns every value that is
# not a key of the mapping into NaN, so mapping an already-encoded column
# would wipe out all labels (and Step 3 would then drop every row).
already_encoded = data['class'].isin(list(class_codes.values())).all()
if not already_encoded:
    data['class'] = data['class'].map(class_codes)

In [4]:
# Step 3: Handle Missing Data
# Remove rows containing missing values (if any). Reassigning the result
# instead of using inplace=True keeps the cell easy to chain and re-run.
# NOTE(review): dropna() only removes real NaN values. If the CSV encodes
# missing entries with a placeholder token (e.g. '?'), those rows are kept and
# the token is treated as an ordinary category — confirm against the dataset.
data = data.dropna()

In [6]:
# Step 4: Examine Unique Values and Clean Features
# Count the distinct (non-missing) values found in every column; columns with
# a single value or with very many values are candidates for removal.
unique_values = data.nunique(axis=0, dropna=True)
print(unique_values)

class                        2
cap-shape                    6
cap-surface                  4
cap-color                   10
bruises                      2
odor                         9
gill-attachment              2
gill-spacing                 2
gill-size                    2
gill-color                  12
stalk-shape                  2
stalk-root                   5
stalk-surface-above-ring     4
stalk-surface-below-ring     4
stalk-color-above-ring       9
stalk-color-below-ring       9
veil-type                    1
veil-color                   4
ring-number                  3
ring-type                    5
spore-print-color            9
population                   6
habitat                      7
dtype: int64


In [7]:
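# Optionally drop columns that cannot help the model: those with a single unique value (e.g. veil-type in the output above) or with too many unique values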
# data.drop(['column_name'], axis=1, inplace=True)

In [8]:
# Step 5: Convert Categorical Variables
# One-hot encode the categorical columns into dummy/indicator columns.
# Columns that are already numeric (such as the encoded target) pass through
# unchanged.
data_encoded = pd.get_dummies(data=data)

In [9]:
# Step 6: Split the Data for Training and Testing
from sklearn.model_selection import train_test_split

# Features are every encoded column except the target; the target is 'class'
X = data_encoded.drop(columns=['class'])
y = data_encoded['class']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Step 7: Choose a Model
from sklearn.ensemble import RandomForestClassifier

# Baseline classifier with default hyperparameters and a fixed seed for
# reproducibility (swap in any other scikit-learn classifier here)
model = RandomForestClassifier(
    random_state=42,
)

# Other models to try
# from sklearn.linear_model import LogisticRegression
# model = LogisticRegression()

# from sklearn.svm import SVC
# model = SVC()

In [11]:
# Step 8: Explore Model Parameters
# Show every hyperparameter of the chosen model together with its current
# value; these names are the keys that can be tuned in the search steps.
print(model.get_params(deep=True))

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}


In [12]:
# Step 9: Train the Model
# Fit the classifier on the training features and labels
model.fit(X=X_train, y=y_train)

In [13]:
# Step 10: Evaluate Model Performance
from sklearn.metrics import classification_report, confusion_matrix

# Score the held-out test features with the trained model
y_pred = model.predict(X_test)

# Report per-class precision/recall/F1, followed by the raw confusion counts
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)
print(
    "Confusion Matrix:\n",
    confusion_matrix(y_true=y_test, y_pred=y_pred),
)

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       843
           1       1.00      1.00      1.00       782

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625

Confusion Matrix:
 [[843   0]
 [  0 782]]


In [14]:
# Step 11: Tune Hyperparameters with Grid Search

from sklearn.model_selection import GridSearchCV

# Candidate values per hyperparameter; every combination (3 x 3 = 9) is tried
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30]
}

# Exhaustive search scored with 5-fold cross-validation; n_jobs=-1 runs the
# fits in parallel and verbose=2 logs progress
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training data only, so the test set stays unseen
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Parameters: {'max_depth': 10, 'n_estimators': 100}
Best Score: 1.0


In [15]:
# Step 12: Use Randomized Search for Tuning

import math

from sklearn.model_selection import RandomizedSearchCV

# Define the parameter distribution
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30]
}

# When every entry is a finite list the search space is finite (3 x 3 = 9
# here). Requesting more iterations than there are combinations makes
# scikit-learn emit a warning and silently run fewer iterations, so cap
# n_iter at the size of the space. If any entry is replaced by a distribution
# (an object with an `rvs` method) the space is unbounded and the requested
# number of iterations is used as is.
requested_iterations = 10
all_lists = all(not hasattr(values, 'rvs') for values in param_dist.values())
if all_lists:
    search_space_size = math.prod(len(values) for values in param_dist.values())
    n_iter = min(requested_iterations, search_space_size)
else:
    n_iter = requested_iterations

# Initialize Random Search (5-fold cross-validation, fixed seed so the sampled
# candidates are reproducible, all CPU cores)
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=n_iter,
    cv=5,
    random_state=42,
    n_jobs=-1,
)

# Fit the random search model on the training data only
random_search.fit(X_train, y_train)

# Display the best parameters and the best score
print(f"Best Parameters (Randomized Search): {random_search.best_params_}")
print(f"Best Score (Randomized Search): {random_search.best_score_}")



Best Parameters (Randomized Search): {'n_estimators': 100, 'max_depth': 10}
Best Score (Randomized Search): 1.0


In [16]:
# Step 13: Build and Test the Final Model

# Train the final model with the best parameters found by the grid search.
# Unpacking best_params_ keeps this cell in sync with whatever hyperparameters
# the grid contains, and random_state=42 matches the base model so the final
# results are reproducible from run to run.
final_model = RandomForestClassifier(random_state=42, **grid_search.best_params_)
final_model.fit(X_train, y_train)

# Test the final model on the held-out test set
final_predictions = final_model.predict(X_test)

# Evaluate the final model
print("Final Model Classification Report:\n", classification_report(y_test, final_predictions))
print("Final Model Confusion Matrix:\n", confusion_matrix(y_test, final_predictions))

Final Model Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       843
           1       1.00      1.00      1.00       782

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625

Final Model Confusion Matrix:
 [[843   0]
 [  0 782]]
