In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
## Model 1: Random Forest

In [7]:
import tensorflow as tf
import pydot

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

#-----------------------------------------import and encoding of data so it can be used in randomForest Model-------------------------------------
# Dataset import.
# The UCI file has NO header row, so header=None is required. With header=0
# pandas treats the first mushroom record as a header and silently drops it.
colnames=['e/p','cap-shape', 'cap-surface', 'cap-color', 'bruises?', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color','ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat'] 
dataset = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data", names=colnames, header=None)

# One-hot encode every column (the 'e/p' target included).
labels = colnames
categorical_data = dataset[labels]
ohe = OneHotEncoder(categories='auto')
feature_arr = ohe.fit_transform(categorical_data).toarray()
ohe_labels = ohe.get_feature_names_out(labels)
features = pd.DataFrame(feature_arr, columns=ohe_labels)

#-------------------------------------------------implementation of random forest AI model------------------------------------------------
# Separate target and features by column NAME instead of a hard-coded column
# count, so the split stays correct if the number of categories ever changes.
# The encoder names the target columns 'e/p_e' (edible) and 'e/p_p' (poisonous),
# so column 0 of Y is "edible" and column 1 is "poisonous".
target_columns = [name for name in features.columns if name.startswith('e/p_')]
x = features.drop(columns=target_columns).values
Y = features[target_columns].values

# Fixed seed so that the reported scores are reproducible across runs.
x_train, x_test, Y_train, Y_test = train_test_split(x, Y, test_size=1/3, random_state=42)

# Hyper-parameter grid. 'auto' is not accepted for max_features by current
# scikit-learn releases (for classifiers it was an alias of 'sqrt').
param_grid = {
    'n_estimators': np.arange(70, 120),
    'max_features': ['sqrt', 'log2']
}
classifier = RandomForestClassifier(random_state=42)  # default n_estimators=100
grid_search = GridSearchCV(classifier, param_grid=param_grid, cv=10, scoring='f1_macro', n_jobs=4)

# The full search is 50 * 2 * 10 = 1000 forest fits, so it is opt-in.
# With the flag off, the default classifier is trained directly (as before).
RUN_GRID_SEARCH = False
if RUN_GRID_SEARCH:
    grid_search.fit(x_train, Y_train)
    print("Best parameters: {}".format(grid_search.best_params_))
    classifier = grid_search.best_estimator_
else:
    classifier.fit(x_train, Y_train)
y_pred = classifier.predict(x_test)

#---------------------------------test and values of errors of model---------------------------------------------------------------------------------
# This is a classification task, so accuracy and the per-class
# precision/recall/F1 report are used (not regression errors such as MSE).
# For the two-column one-hot target, `score` is the subset accuracy: a sample
# counts as correct only if both indicator columns are predicted correctly.

df = pd.DataFrame({'Actual': Y_test.flatten(), 'Predicted': y_pred.flatten()})

trainScore = classifier.score(x_train, Y_train)
testScore = classifier.score(x_test, Y_test)
bsf = testScore  # previously this held the TRAIN score but was printed as "Test Accuracy"
print("Test Accuracy: {}%".format(round(bsf*100, 2)))
print(classification_report(Y_test, y_pred, target_names=["Edible", "Poisonous"]))
print('Test Score: %.15f accuracy' % testScore)
print('Train Score: %.15f accuracy' % trainScore)

# Confusion matrix: argmax turns the one-hot rows back into class indices
# (0 = edible, 1 = poisonous), which matches the tick-label order below.
cm = confusion_matrix(Y_test.argmax(axis=1), y_pred.argmax(axis=1))
x_axis = ["Edible", "Poisonous"]
y_axis = ["Edible", "Poisonous"]
f, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(cm, annot=True, linewidths=0.2, linecolor="black", fmt=".0f", ax=ax, xticklabels=x_axis, yticklabels=y_axis)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title('Confusion Matrix for Random Forest Classifier');
plt.show()
plt.close()

ModuleNotFoundError: No module named 'tensorflow'

In [8]:
## Model 2: Artificial Neural Network

In [9]:
import warnings

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential 
from keras.layers import Dense 
from keras.optimizers import *
from keras.utils import *
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

#-----------------------------------------import and encoding of data so it can be used in the ANN model-------------------------------------

# Dataset import.
# The UCI file has NO header row, so header=None is required. With header=0
# pandas treats the first mushroom record as a header and silently drops it.
colnames=['e/p','cap-shape', 'cap-surface', 'cap-color', 'bruises?', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color','ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat'] 
dataset = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data", names=colnames, header=None)

# One-hot encode every column (the 'e/p' target included).
labels = colnames
categorical_data = dataset[labels]
ohe = OneHotEncoder(categories='auto')
feature_arr = ohe.fit_transform(categorical_data).toarray()
ohe_labels = ohe.get_feature_names_out(labels)
features = pd.DataFrame(feature_arr, columns=ohe_labels)

# Separate target and features by column NAME instead of a hard-coded column
# count. The encoder names the target columns 'e/p_e' (edible) and 'e/p_p'
# (poisonous), so column 0 of Y is "edible" and column 1 is "poisonous".
target_columns = [name for name in features.columns if name.startswith('e/p_')]
x = features.drop(columns=target_columns).values
Y = features[target_columns].values

#-------------------------------------------------implementation of ANN model------------------------------------------------

# Fixed seed so that the reported scores are reproducible across runs.
x_train, x_test, Y_train, Y_test = train_test_split(x, Y, test_size=1/3, random_state=42)

# One output unit per one-hot target column.
num_classes = Y_train.shape[1]


def neural_network(input_dim=117, n_classes=None):
    """Build and compile a one-hidden-layer softmax classifier.

    Parameters
    ----------
    input_dim : int, default 117
        Number of input features; also used as the width of the hidden layer.
        The default is the value that was previously hard-coded.
    n_classes : int or None, default None
        Number of output units. When None, the module-level ``num_classes``
        is used, which is what the function always did before.

    Returns
    -------
    A compiled Keras ``Sequential`` model using categorical cross-entropy
    loss, the Adam optimizer and the accuracy metric.
    """
    if n_classes is None:
        n_classes = num_classes
    model = Sequential()
    # Hidden layer: as wide as the input, ReLU activation.
    model.add(Dense(input_dim, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
    # Output layer: softmax over the one-hot encoded classes.
    model.add(Dense(n_classes, kernel_initializer='normal', activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# NOTE: the grid search that used to be set up here was removed. Its
# param_grid listed Random Forest hyper-parameters (n_estimators, criterion,
# max_depth) that do not exist for this network, it wrapped a raw Keras model
# that GridSearchCV cannot clone, and it was never fit.

model = neural_network()
model.summary()

warnings.filterwarnings("ignore")

# The test split is passed as validation data only to report val_loss /
# val_accuracy during training; no callback uses it to alter training.
model.fit(x_train, Y_train, validation_data=(x_test, Y_test), epochs=1, batch_size=100)

#---------------------------------test and values of errors of model---------------------------------------------------------------------------------

# evaluate() returns [loss, accuracy] because the model is compiled with
# metrics=['accuracy'].
scores = model.evaluate(x_test, Y_test)
print("Neural network accuracy: %.2f%%" % (scores[1]*100))

# Softmax probabilities, one column per class.
y_prob = model.predict(x_test)

# Confusion matrix.
# Column 0 of the one-hot target is "edible" and column 1 is "poisonous", so
# argmax gives 0 = edible, 1 = poisonous, matching the tick-label order below.
# Previously the matrix was built from column 0 alone (the "is edible" flag),
# where value 0 means poisonous, so both axes were labelled the wrong way round.
y_true = Y_test.argmax(axis=1)
y_pred = y_prob.argmax(axis=1)
cm = confusion_matrix(y_true, y_pred)
x_axis_labels = ["Edible", "Poisonous"]
y_axis_labels = ["Edible", "Poisonous"]
f, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(cm, annot=True, linewidths=0.2, linecolor="black", fmt=".0f", ax=ax, cmap="Purples", xticklabels=x_axis_labels, yticklabels=y_axis_labels)
plt.xlabel("PREDICTED LABEL")
plt.ylabel("TRUE LABEL")
plt.title('Confusion Matrix for neural network');

plt.show()
plt.close()



ModuleNotFoundError: No module named 'sklearn'

In [10]:
## Model 3: K-Nearest Neighbors 

In [11]:
from sklearn.datasets import make_blobs
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# Dataset import.
# The UCI file has NO header row, so header=None is required. With header=0
# pandas treats the first mushroom record as a header and silently drops it.
colnames =['e/p','cap-shape', 'cap-surface', 'cap-color', 'bruises?', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color','ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat'] 
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data", names=colnames, header=None)

# Convert every categorical column into integer codes. LabelEncoder sorts the
# classes, so for the target 'e' (edible) becomes 0 and 'p' (poisonous) 1.
df = df.astype('category')
labelencoder = LabelEncoder()
for column in df.columns:
    df[column] = labelencoder.fit_transform(df[column])

# The first column ('e/p') is the target; everything else is a feature.
X = df.drop(['e/p'], axis=1)
y = df["e/p"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=1/3)

# Pick the k in 1..9 with the highest test accuracy and KEEP that model.
# Previously the test score was compared against a stored train score, the
# reported "test accuracy" was a train score, and all later metrics used the
# last model fitted (k=9) rather than the best one.
# NOTE(review): selecting k on the test split makes the reported test accuracy
# optimistic; a separate validation split or cross-validation would be cleaner.
best_KValue = 0
bsf = 0
best_knn = None

for i in range(1, 10):
    candidate = KNeighborsClassifier(n_neighbors=i)
    candidate.fit(X_train, y_train)
    candidate_score = candidate.score(X_test, y_test)
    if candidate_score > bsf:
        bsf = candidate_score
        best_KValue = i
        best_knn = candidate

knn = best_knn

print("Best KNN Value: {}".format(best_KValue))
print("Test Accuracy: {}%".format(round(bsf*100, 2)))

y_pred = knn.predict(X_test)
print("KNN Classifier report: \n\n", classification_report(y_test, y_pred))

# `score` is mean accuracy for a classifier; it is not an MSE/RMSE.
testScore = knn.score(X_test, y_test)
print('Test Score: %.15f accuracy' % testScore)
trainScore = knn.score(X_train, y_train)
print('Train Score: %.15f accuracy' % trainScore)

# Confusion matrix (rows: true class, columns: predicted class; 0 = edible,
# 1 = poisonous, matching the tick-label order below).
cm = confusion_matrix(y_test, y_pred)

x_axis = ["Edible", "Poisonous"]
y_axis = ["Edible", "Poisonous"]

f, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(cm, annot=True, linewidths=0.2, linecolor="black", fmt=".0f", ax=ax, cmap="Purples", xticklabels=x_axis, yticklabels=y_axis)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title('Confusion Matrix for KNN Classifier')
plt.show()

ModuleNotFoundError: No module named 'matplotlib'