In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

In [2]:
# Read the mushroom CSV into a DataFrame (relative path, resolved against the working directory)
data = pd.read_csv(filepath_or_buffer='mushroom_dataset.csv')

In [3]:
# Preview the first five rows as a quick sanity check of the load
data.head(n=5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
# Flag every cell whose string representation contains a '?' character.
# Column-wise `.str.contains` is used instead of `DataFrame.applymap`, which
# pandas deprecated in 2.1; `regex=False` treats '?' as a literal character
# rather than a regex quantifier.
question_marks = data.astype(str).apply(lambda col: col.str.contains('?', regex=False))

# Collapse to one boolean per column: True when any cell in it holds a '?'
columns_with_question_marks = question_marks.any()

# Print only the columns where a '?' was found
print("Columns with Question Marks:")
print(columns_with_question_marks[columns_with_question_marks])

Columns with Question Marks:
stalk-root    True
dtype: bool


In [5]:
# Convert the '?' placeholder into a real missing value (NaN) across all columns,
# so that pandas' missing-data tools (isna, fillna, ...) can see it
data.replace(to_replace='?', value=np.nan, inplace=True)

In [6]:
# Count NaN entries per column
missing_values = data.isnull().sum()
# Report only the columns that actually have gaps, together with their counts
print("Columns with Missing Values:")
print(missing_values.loc[missing_values > 0])

Columns with Missing Values:
stalk-root    2480
dtype: int64


In [7]:
# Impute missing "stalk-root" entries with the column's mode (most frequent value).
# `.mode()` may return several values on a tie; index 0 takes the first of them.
mode_value = data['stalk-root'].mode()[0]
# Assign the filled column back instead of calling `fillna(..., inplace=True)` on
# `data['stalk-root']`: that chained in-place form operates on an intermediate
# Series, raises a FutureWarning in pandas 2.x, and leaves `data` unchanged once
# Copy-on-Write is enabled.
data['stalk-root'] = data['stalk-root'].fillna(mode_value)


In [8]:
# Turn the target column 'class' into integer codes.
# The fitted encoder is kept around so the codes can be mapped back to the
# original labels later via `label_encoder.classes_` / `inverse_transform`.
label_encoder = LabelEncoder().fit(data['class'])
data['class'] = label_encoder.transform(data['class'])

In [9]:
# Expand every remaining object-dtype column into indicator (dummy) columns.
# drop_first=True omits the first level of each feature, so a feature with
# k levels yields k-1 indicator columns.
categorical_columns = data.select_dtypes(include='object').columns
data = pd.get_dummies(data=data, columns=categorical_columns, drop_first=True)

In [10]:
# Separate the target from the predictors
y = data['class']               # label column
X = data.drop(columns='class')  # every other column is a feature

In [11]:

# Handle class imbalance using SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [12]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [13]:
# Base estimator whose hyperparameters are tuned below
knn = KNeighborsClassifier()

# Search space: neighbourhood size x vote weighting x Minkowski exponent
param_grid = dict(
    n_neighbors=[3, 5, 7],            # how many neighbours take part in the vote
    weights=['uniform', 'distance'],  # equal votes vs. votes weighted by inverse distance
    p=[1, 2],                         # 1 = Manhattan distance, 2 = Euclidean distance
)

# Exhaustive search over the grid, scored by accuracy with 5-fold cross-validation;
# n_jobs=-1 uses all available CPU cores.
# NOTE(review): X_train contains SMOTE-generated rows, so validation folds are not
# fully independent of the training folds - consider resampling inside each fold
# (e.g. an imblearn Pipeline) if the CV score itself is to be reported.
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Hyperparameter combination with the best mean cross-validated accuracy
best_params = grid_search.best_params_
print(f"Optimal Hyperparameters: {best_params}")

Optimal Hyperparameters: {'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}


In [14]:
# Fit a fresh KNN on the training data using the winning hyperparameters
# (`fit` returns the estimator itself, so construction and fitting can be chained)
best_knn = KNeighborsClassifier(**best_params).fit(X_train, y_train)

# Predict labels for the held-out test set
y_pred = best_knn.predict(X_test)

# Overall accuracy; note the two-decimal format rounds e.g. 0.996 up to 1.00
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision / recall / F1 and support
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

Accuracy: 1.00
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       836
           1       1.00      1.00      1.00       848

    accuracy                           1.00      1684
   macro avg       1.00      1.00      1.00      1684
weighted avg       1.00      1.00      1.00      1684

