In [4]:
#check for the respective required libraries
!pip install numpy pandas scikit-learn matplotlib seaborn




In [5]:
import pandas as pd

# Loading dataset
data = pd.read_csv('mushroom_mixed_50000.csv')

# Display sample data (first 5 rows)
data.head()


Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,e,9.39,f,?,n,t,?,,w,9.04,...,b,?,w,u,w,t,g,?,d,u
1,p,15.42,f,k,n,f,d,c,y,6.15,...,?,k,n,?,,f,f,?,d,u
2,p,6.07,s,?,e,t,d,c,b,6.8,...,?,,n,?,,f,f,?,d,a
3,e,4.64,x,t,n,f,a,d,y,8.37,...,?,k,n,?,,f,f,?,d,s
4,p,17.87,f,h,e,f,e,?,w,19.03,...,s,y,w,u,w,t,g,?,d,u


In [6]:
#Data has both numerical and categorial values for respective columns and each missing data has 
#to be estimated respective to the values in the column so we deal with both numerical and categorial columns seperately to get the missing data predicted 

#numeric columns 
#Filling missing numerical values with median
numeric_cols = data.select_dtypes(include=['number']).columns
data.loc[:, numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mode())

#categorial columns (columns which have values other than numerical values)
# Fill missing values for categorical columns with mode
categorical_cols = data.select_dtypes(include=['object', 'category']).columns
for col in categorical_cols:
    data.loc[:, col] = data[col].fillna(data[col].mode()[0])



In [7]:
#Thers's no missing data now but entire data is not numeric
#next step is to bring the entire data into numeric
from sklearn.preprocessing import LabelEncoder

#we are label encoding the entire categorial values
encoder = LabelEncoder()
for col in data.columns:
    # convert each categorial values to integers
    data[col] = encoder.fit_transform(data[col])

data.head()


Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,894,2,0,5,1,0,1,10,792,...,1,0,11,1,4,1,3,0,0,2
1,1,1496,2,6,5,0,2,1,11,503,...,0,5,6,0,4,0,2,0,0,2
2,1,562,5,0,1,1,2,1,0,568,...,0,6,6,0,4,0,2,0,0,0
3,0,419,6,9,5,0,1,2,11,725,...,0,5,6,0,4,0,2,0,0,1
4,1,1737,2,4,1,0,3,0,10,1753,...,5,8,11,1,4,1,3,0,0,2


In [8]:
#Now that there is no missing data in the data set and the entire data is encoded or in numeric the data is ready 
#Splitting data for Implementing Cross Validation 
from sklearn.model_selection import train_test_split

#target column class
X = data.drop('class', axis=1)
y = data['class']

#splitting and training ata into 80-20
#random state is set to 54

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=54)


In [9]:
#Algotithm 1 Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [10, 20, 30],  # Limits tree depth to avoid overfitting
    'min_samples_split': [2, 5, 10],  # Minimum samples to split a node
    'min_samples_leaf': [1, 2, 4]  # Minimum samples required at a leaf node
}

#Decision tree Classifier
grid = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5) #cv=5 Cross Fold Validation

#Fitting the Grid search to training data
grid.fit(X_train, y_train)

# Best model and parameters
dt_model = grid.best_estimator_

#Printing the best hyperparameters
print("Best DecisionTree parameters:", grid.best_params_)


Best DecisionTree parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [10]:
from sklearn.model_selection import cross_val_score
import numpy as np

#Performing cross validation 
scores = cross_val_score(dt_model, X_train, y_train, cv=5)

#Mean Accuracy and SD
print(f"DecisionTree Mean Accuracy: {np.mean(scores):.4f}")
print(f"DecisionTree Std Deviation: {np.std(scores):.4f}")


DecisionTree Mean Accuracy: 0.9970
DecisionTree Std Deviation: 0.0010


In [11]:
#Algorithm 2 Random Forest
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [50, 100],  # Number of trees to balance accuracy and execution time
    'max_depth': [10, 15],  # Reduces complexity and overfitting
    'min_samples_split': [5, 10],  # Ensures nodes have enough samples before splitting
    'min_samples_leaf': [2, 4],  # Avoids overfitting on small splits
    'max_features': ['sqrt', 'log2']  # Reduces dimensionality impact
}

#Random Forest Classifier
grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
#Fitting the Grid search to training data
grid.fit(X_train, y_train)

# Best model and parameters
rf_model = grid.best_estimator_

#Printing the best hyperparameters
print("Best RandomForest parameters:", grid.best_params_)


Best RandomForest parameters: {'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}


In [12]:
#Performing cross validation 
scores = cross_val_score(rf_model, X_train, y_train, cv=5)

#Mean Accuracy and SD
print(f"RandomForest Mean Accuracy: {np.mean(scores):.4f}")
print(f"RandomForest Std Deviation: {np.std(scores):.4f}")


RandomForest Mean Accuracy: 0.9996
RandomForest Std Deviation: 0.0003


In [13]:
#Algorithm 3 KNN 
from sklearn.neighbors import KNeighborsClassifier

param_grid = {
    'n_neighbors': [3, 5, 7],  # Optimal neighborhood size for complex patterns
    'weights': ['uniform', 'distance'],  # Distance-based weighting improves accuracy
}

#KNN Classifier
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
#Fitting the Grid search to training data
grid.fit(X_train, y_train)

# Best model and parameters
knn_model = grid.best_estimator_
#Printing the best hyperparameters
print("Best KNN parameters:", grid.best_params_)


Best KNN parameters: {'n_neighbors': 7, 'weights': 'distance'}


In [14]:
#Performing cross validation 
scores = cross_val_score(knn_model, X_train, y_train, cv=5)

#Mean Accuracy and SD
print(f"KNN Mean Accuracy: {np.mean(scores):.4f}")
print(f"KNN Std Deviation: {np.std(scores):.4f}")


KNN Mean Accuracy: 0.8204
KNN Std Deviation: 0.0030


In [15]:
import pickle

# Decision Tree Pickle file
with open('decision_tree_model.pkl', 'wb') as f: # 'wb': write binary mode
    pickle.dump(dt_model, f)

# Random Forest Pickle file
with open('random_forest_model.pkl', 'wb') as f: # 'wb': write binary mode
    pickle.dump(rf_model, f)

# KNN Pickle file
with open('knn_model.pkl', 'wb') as f: # 'wb': write binary mode
    pickle.dump(knn_model, f)


In [16]:
best_model = rf_model  # best performed model
with open('proj1_chosen_model.pkl', 'wb') as f: # 'wb': write binary mode
    pickle.dump(best_model, f)
