Data Preprocessing

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the mushroom records from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/src/dataset/mushrooms.csv')

In [3]:
# Show the column labels of the loaded frame to see which fields are available.
print(df.columns)

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')


In [4]:
# veil-type has only one value, so it carries no information for the models.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
df = df.drop(columns=['veil-type'], errors='ignore')

In [5]:
# Count the rows that pandas flags as duplicates of an earlier row.
print(df.duplicated().sum())

0


In [6]:
# Count missing (NaN/None) entries in each column.
# NOTE(review): isna() only detects real NaN/None values; if this dataset marks
# missing entries with a placeholder string (e.g. '?'), they are not counted
# here -- confirm against the dataset description.
print(df.isna().sum())

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64


In [7]:
# Feature matrix: every column except the target column 'class'.
X = df.drop(labels=['class'], axis='columns')

In [8]:
# Target vector: the 'class' column on its own.
Y = df.loc[:, 'class']

In [9]:
# Encode the class labels as integers for the classifiers.
# Fit on df['class'] rather than on Y: this cell overwrites Y, so re-running it
# would otherwise refit the encoder on already-encoded integers and le.classes_
# would no longer hold the original labels.
le = LabelEncoder()
Y = le.fit_transform(df['class'])
print(Y)

[1 0 0 ... 0 1 0]


In [10]:
# One-hot encode the feature columns so the models receive indicator columns.
X = pd.get_dummies(data=X)

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)

XGBoost Training

In [12]:
from xgboost import XGBClassifier

In [13]:
# Gradient-boosted classifier: 500 estimators, binary logistic objective, fixed seed.
XGBoostModel = XGBClassifier(
    objective='binary:logistic',
    n_estimators=500,
    random_state=42,
)

In [14]:
# Train the XGBoost model on the training split.
XGBoostModel.fit(X=X_train, y=Y_train)

In [15]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [16]:
# Predict class labels for the held-out test rows.
XGBoostPredictions = XGBoostModel.predict(X=X_test)

In [17]:
# Score the XGBoost predictions against the held-out labels.
confusion = confusion_matrix(y_true=Y_test, y_pred=XGBoostPredictions)
report = classification_report(y_true=Y_test, y_pred=XGBoostPredictions)
accuracy = accuracy_score(y_true=Y_test, y_pred=XGBoostPredictions)

# Print the overall accuracy first, then the confusion matrix, then the per-class report.
print("Accuracy:\n", accuracy)
print("Confusion Matrix:\n", confusion)
print("Relatory:\n", report)

Accuracy:
 1.0
Confusion Matrix:
 [[843   0]
 [  0 782]]
Relatory:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       843
           1       1.00      1.00      1.00       782

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



RandomForest Training

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
# Random forest with 500 trees and a fixed seed.
RandomForestModel = RandomForestClassifier(
    random_state=42,
    n_estimators=500,
)

In [20]:
# Train the random forest on the training split.
RandomForestModel.fit(X=X_train, y=Y_train)

In [21]:
# Predict class labels for the held-out test rows.
RandomForestPredictions = RandomForestModel.predict(X=X_test)

In [22]:
# Score the random forest predictions against the held-out labels.
confusion = confusion_matrix(y_true=Y_test, y_pred=RandomForestPredictions)
report = classification_report(y_true=Y_test, y_pred=RandomForestPredictions)
accuracy = accuracy_score(y_true=Y_test, y_pred=RandomForestPredictions)

# Print the overall accuracy first, then the confusion matrix, then the per-class report.
print("Accuracy:\n", accuracy)
print("Confusion Matrix:\n", confusion)
print("Relatory:\n", report)

Accuracy:
 1.0
Confusion Matrix:
 [[843   0]
 [  0 782]]
Relatory:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       843
           1       1.00      1.00      1.00       782

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



Decision Tree Training

In [23]:
from sklearn.tree import DecisionTreeClassifier

In [24]:
# Single decision tree with library defaults; only the seed is fixed.
DecisionTreeModel = DecisionTreeClassifier(
    random_state=42,
)

In [25]:
# Train the decision tree on the training split.
DecisionTreeModel.fit(X=X_train, y=Y_train)

In [26]:
# Predict class labels for the held-out test rows.
DecisionTreePredictions = DecisionTreeModel.predict(X=X_test)

In [27]:
# Score the decision tree predictions against the held-out labels.
confusion = confusion_matrix(y_true=Y_test, y_pred=DecisionTreePredictions)
report = classification_report(y_true=Y_test, y_pred=DecisionTreePredictions)
accuracy = accuracy_score(y_true=Y_test, y_pred=DecisionTreePredictions)

# Print the overall accuracy first, then the confusion matrix, then the per-class report.
print("Accuracy:\n", accuracy)
print("Confusion Matrix:\n", confusion)
print("Relatory:\n", report)

Accuracy:
 1.0
Confusion Matrix:
 [[843   0]
 [  0 782]]
Relatory:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       843
           1       1.00      1.00      1.00       782

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



Saving The Models

In [28]:
import joblib

In [29]:
# Persist the trained XGBoost model using its own save_model method (JSON file).
XGBoostModel.save_model(fname="/src/models/XGBoostModel.json")

In [None]:
# Persist the trained random forest with joblib, using compression level 3.
joblib.dump(
    value=RandomForestModel,
    filename="/src/models/RandomForestModel.joblib",
    compress=3,
)

In [None]:
# Persist the trained decision tree with joblib, using compression level 3.
joblib.dump(
    value=DecisionTreeModel,
    filename="/src/models/DecisionTreeModel.joblib",
    compress=3,
)