Importing the Libraries.
Fetching the mushroom dataset from the UCI Machine Learning Repository.

In [36]:
# Importing libraries
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import numpy as np
from sklearn import preprocessing, model_selection, svm

# fetch dataset from UCI Machine Learning Repository
mushroom = fetch_ucirepo(id=73) 

# data as pandas dataframes
mushroom_df = pd.DataFrame.from_dict(mushroom.data.original)

not sure if this helps but i dropped the rows with null elements 

In [37]:

# number of rows before 
elements = mushroom_df.shape[0]
print(f"Number of elements before: {elements}")

# checking for NULL elements
null_elements = mushroom_df.isnull().sum().sum()
print(f"Number of null elements before: {null_elements}")

# removing rows with null elements
mushroom_df = mushroom_df.dropna()

# number of rows after
elements = mushroom_df.shape[0]
print(f"Number of elements remaining: {elements}")

# check for null elements after removing
null_elements = mushroom_df.isnull().sum().sum()
print(f"Number of null elements after: {null_elements}")

print(mushroom_df.head())


Number of elements before: 8124
Number of null elements before: 2480
Number of elements remaining: 5644
Number of null elements after: 0
  cap-shape cap-surface cap-color bruises odor gill-attachment gill-spacing  \
0         x           s         n       t    p               f            c   
1         x           s         y       t    a               f            c   
2         b           s         w       t    l               f            c   
3         x           y         w       t    p               f            c   
4         x           s         g       f    n               f            w   

  gill-size gill-color stalk-shape  ... stalk-color-above-ring  \
0         n          k           e  ...                      w   
1         b          k           e  ...                      w   
2         b          n           e  ...                      w   
3         n          n           e  ...                      w   
4         b          k           t  ...                   

Got to look at the dataset and figuring out the features which are significant to determine if a mushroom is poisonous

In [28]:
# get unique values for each column of data in the X dataframe

for col in mushroom_df.columns:
    print(col, mushroom_df[col].unique())


cap-shape ['x' 'b' 's' 'f' 'k' 'c']
cap-surface ['s' 'y' 'f' 'g']
cap-color ['n' 'y' 'w' 'g' 'e' 'p' 'b' 'c']
bruises ['t' 'f']
odor ['p' 'a' 'l' 'n' 'f' 'c' 'm']
gill-attachment ['f' 'a']
gill-spacing ['c' 'w']
gill-size ['n' 'b']
gill-color ['k' 'n' 'g' 'p' 'w' 'h' 'u' 'r' 'y']
stalk-shape ['e' 't']
stalk-root ['e' 'c' 'b' 'r']
stalk-surface-above-ring ['s' 'f' 'k' 'y']
stalk-surface-below-ring ['s' 'f' 'y' 'k']
stalk-color-above-ring ['w' 'g' 'p' 'n' 'b' 'c' 'y']
stalk-color-below-ring ['w' 'p' 'g' 'b' 'n' 'c' 'y']
veil-type ['p']
veil-color ['w' 'y']
ring-number ['o' 't' 'n']
ring-type ['p' 'e' 'l' 'n']
spore-print-color ['k' 'n' 'u' 'h' 'r' 'w']
population ['s' 'n' 'a' 'v' 'y' 'c']
habitat ['u' 'g' 'm' 'd' 'p' 'l']
poisonous ['p' 'e']


With a bit of searching i found that "Amanita" Mushrooms are mostly poisonous and has some features that are significant to determine it. 

1. color
2. cap shape
3. Bruises
4. cap texture
5. cap surface
6. Stalk Root
7. Stalk ring
8. Gill
9. Gill color
10. Spore color
11. Odor ( most edible mushrooms have musky earthy smell while poisonous ones have bad odor)

so the rest can be safely discarded 


In [38]:
mushroom_df = mushroom_df.drop(
    [
        "gill-attachment",
        "gill-spacing",
        "stalk-surface-above-ring",
        "stalk-surface-below-ring",
        "stalk-color-above-ring",
        "stalk-color-below-ring",
        "veil-type",
        "population"
    ], axis=1
)

print(mushroom_df.head())

  cap-shape cap-surface cap-color bruises odor gill-size gill-color  \
0         x           s         n       t    p         n          k   
1         x           s         y       t    a         b          k   
2         b           s         w       t    l         b          n   
3         x           y         w       t    p         n          n   
4         x           s         g       f    n         b          k   

  stalk-shape stalk-root veil-color ring-number ring-type spore-print-color  \
0           e          e          w           o         p                 k   
1           e          c          w           o         p                 n   
2           e          c          w           o         p                 n   
3           e          e          w           o         p                 k   
4           t          e          w           o         e                 n   

  habitat poisonous  
0       u         p  
1       g         e  
2       m         e  
3       u 

In [39]:
categories = [
    "cap-shape",
    "cap-surface",
    "cap-color",
    "odor",
    "gill-size",
    "gill-color",
    "stalk-shape",
    "stalk-root",
    "veil-color",
    "ring-number",
    "ring-type",
    "spore-print-color",
    "habitat"    
    ]

mushroom_df[categories] = mushroom_df[categories].astype("category")


# map column bruises t as True and f as False boolean value
mushroom_df["bruises"] = mushroom_df["bruises"].map({"t": True, "f": False})

# map column poisonous  p as True and e as False boolean value
mushroom_df["poisonous"] = mushroom_df["poisonous"].map({"p": True, "e": False})


# print(mushroom_df.dtypes)
print(mushroom_df.head())

  cap-shape cap-surface cap-color  bruises odor gill-size gill-color  \
0         x           s         n     True    p         n          k   
1         x           s         y     True    a         b          k   
2         b           s         w     True    l         b          n   
3         x           y         w     True    p         n          n   
4         x           s         g    False    n         b          k   

  stalk-shape stalk-root veil-color ring-number ring-type spore-print-color  \
0           e          e          w           o         p                 k   
1           e          c          w           o         p                 n   
2           e          c          w           o         p                 n   
3           e          e          w           o         p                 k   
4           t          e          w           o         e                 n   

  habitat  poisonous  
0       u       True  
1       g      False  
2       m      False  


In [40]:
# get dummies for categorical data
mushroom_df = pd.get_dummies(mushroom_df)

# print the first few lines of the dataframe
print(mushroom_df.head())

   bruises  poisonous  cap-shape_b  cap-shape_c  cap-shape_f  cap-shape_k  \
0     True       True            0            0            0            0   
1     True      False            0            0            0            0   
2     True      False            1            0            0            0   
3     True       True            0            0            0            0   
4    False      False            0            0            0            0   

   cap-shape_s  cap-shape_x  cap-surface_f  cap-surface_g  ...  \
0            0            1              0              0  ...   
1            0            1              0              0  ...   
2            0            0              0              0  ...   
3            0            1              0              0  ...   
4            0            1              0              0  ...   

   spore-print-color_n  spore-print-color_r  spore-print-color_u  \
0                    0                    0                    0   
1   

Seperated the Features form the Targets from the dataset

In [32]:
# creating X and y dataframes
X =  mushroom_df.drop("poisonous", axis=1)
y =  mushroom_df["poisonous"]


In [33]:
# split the dataset into training and testing sets
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)


In [34]:
# create the model
clf = svm.SVC()
# train the model
clf.fit(X_train, y_train)
# get the accuracy of the model
accuracy = clf.score(X_test, y_test)
# print the accuracy of the model
print(f"Accuracy: {accuracy}")

Accuracy: 1.0


In [35]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss

# create a linear regression model
model = LogisticRegression()
# train the model
model.fit(X_train, y_train)

# make predictions on both train and test sets
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Calculate accuracy of test and train
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)

# Calculate log loss of test and train
log_loss_train = log_loss(y_train, y_pred_train)
log_loss_test = log_loss(y_test, y_pred_test)

# print the train and test accuracy and log loss of the model
print(f"Train Accuracy: {accuracy_train}")
print(f"Test Accuracy: {accuracy_test}")
print(f"Train Log Loss: {log_loss_train}")
print(f"Test Log Loss: {log_loss_test}")





Train Accuracy: 1.0
Test Accuracy: 1.0
Train Log Loss: 2.220446049250313e-16
Test Log Loss: 2.220446049250313e-16
