In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Use a larger default font size for every matplotlib figure in this notebook.
plt.rcParams["font.size"] = 16

from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Read the training and test splits from CSV files in the working directory.
mushroom_traindf, mushroom_testdf = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

In [5]:
# Non-null entries per column; a value below the row count means that column has missing data.
mushroom_traindf.count()

id                      3116945
class                   3116945
cap-diameter            3116941
cap-shape               3116905
cap-surface             2445922
cap-color               3116933
does-bruise-or-bleed    3116937
gill-attachment         2593009
gill-spacing            1858510
gill-color              3116888
stem-height             3116945
stem-width              3116945
stem-root                359922
stem-surface            1136084
stem-color              3116907
veil-type                159452
veil-color               375998
has-ring                3116921
ring-type               2988065
spore-print-color        267263
habitat                 3116900
season                  3116945
dtype: int64

In [7]:
# Storage dtype of each column (object columns hold the string-coded features).
mushroom_traindf.dtypes

id                        int64
class                    object
cap-diameter            float64
cap-shape                object
cap-surface              object
cap-color                object
does-bruise-or-bleed     object
gill-attachment          object
gill-spacing             object
gill-color               object
stem-height             float64
stem-width              float64
stem-root                object
stem-surface             object
stem-color               object
veil-type                object
veil-color               object
has-ring                 object
ring-type                object
spore-print-color        object
habitat                  object
season                   object
dtype: object

class: binary\
cap-diameter: numeric\
cap-shape: categorical\
cap-surface: categorical\
cap-color: categorical\
does-bruise-or-bleed: binary\
gill-attachment: categorical\
gill-spacing: categorical\
gill-color: categorical\
stem-height: numeric\
stem-width: numeric\
stem-root: categorical\
stem-surface: categorical\
stem-color: categorical\
veil-type: categorical\
veil-color: categorical\
has-ring: binary\
ring-type: categorical\
spore-print-color: categorical\
habitat: categorical\
season: categorical

In [10]:
# Display the percentage of values that are missing in each training column
# (isnull().mean() is the fraction of NaN entries; * 100 converts it to a percentage).
missing_percentage_train = mushroom_traindf.isnull().mean() * 100

missing_percentage_train

id                       0.000000
class                    0.000000
cap-diameter             0.000128
cap-shape                0.001283
cap-surface             21.528227
cap-color                0.000385
does-bruise-or-bleed     0.000257
gill-attachment         16.809280
gill-spacing            40.373988
gill-color               0.001829
stem-height              0.000000
stem-width               0.000000
stem-root               88.452732
stem-surface            63.551362
stem-color               0.001219
veil-type               94.884350
veil-color              87.936970
has-ring                 0.000770
ring-type                4.134818
spore-print-color       91.425482
habitat                  0.001444
season                   0.000000
dtype: float64

In [12]:
# Same per-column missing-value percentage, computed on the test set for comparison.
missing_percentage_test = mushroom_testdf.isnull().mean() * 100

missing_percentage_test

id                       0.000000
cap-diameter             0.000337
cap-shape                0.001492
cap-surface             21.506821
cap-color                0.000626
does-bruise-or-bleed     0.000481
gill-attachment         16.834796
gill-spacing            40.404694
gill-color               0.002358
stem-height              0.000048
stem-width               0.000000
stem-root               88.452543
stem-surface            63.595327
stem-color               0.001011
veil-type               94.878689
veil-color              87.880445
has-ring                 0.000914
ring-type                4.148051
spore-print-color       91.417224
habitat                  0.001203
season                   0.000000
dtype: float64

In [13]:
# Remove every column where 50% or more of the values are missing.
# The selection is made on the training data only and then applied to the test
# set, so both frames are guaranteed to keep the same feature columns. Columns
# that exist only in the training data (the 'class' target) are skipped for test.
MISSING_THRESHOLD_PCT = 50

columns_to_keep = missing_percentage_train.index[missing_percentage_train < MISSING_THRESHOLD_PCT]

train_cleaned = mushroom_traindf.loc[:, columns_to_keep]
test_cleaned = mushroom_testdf.loc[:, columns_to_keep[columns_to_keep.isin(mushroom_testdf.columns)]]

# display() renders each frame as a table instead of the text repr of a tuple.
display(train_cleaned.head(), test_cleaned.head())

(   id class  cap-diameter cap-shape cap-surface cap-color  \
 0   0     e          8.80         f           s         u   
 1   1     p          4.51         x           h         o   
 2   2     e          6.94         f           s         b   
 3   3     e          3.88         f           y         g   
 4   4     e          5.85         x           l         w   
 
   does-bruise-or-bleed gill-attachment gill-spacing gill-color  stem-height  \
 0                    f               a            c          w         4.51   
 1                    f               a            c          n         4.79   
 2                    f               x            c          w         6.85   
 3                    f               s          NaN          g         4.16   
 4                    f               d          NaN          w         3.37   
 
    stem-width stem-color has-ring ring-type habitat season  
 0       15.39          w        f         f       d      a  
 1        6.48      

In [None]:
# Imputation and binary encoding of the columns kept for modelling.
# All imputers are fitted on the training data only and then applied to the
# test data, so no test-set statistics leak into the fill values.
columns_to_use = ['cap-diameter', 'cap-shape', 'cap-surface', 'cap-color',
                  'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color',
                  'stem-height', 'stem-width', 'stem-color', 'has-ring', 'ring-type', 'habitat', 'season']

train_operations = mushroom_traindf[columns_to_use].copy()
test_operations = mushroom_testdf[columns_to_use].copy()

binary_cols = ['does-bruise-or-bleed', 'has-ring']
numeric_cols = ['cap-diameter', 'stem-height', 'stem-width']
categorical_cols = ['cap-shape', 'cap-surface', 'cap-color', 'gill-attachment', 'gill-spacing',
                    'gill-color', 'stem-color', 'ring-type', 'habitat', 'season']

# Numeric columns: fill gaps with the training median.
numeric_imputer = SimpleImputer(strategy='median')
train_operations.loc[:, numeric_cols] = numeric_imputer.fit_transform(train_operations[numeric_cols])
test_operations.loc[:, numeric_cols] = numeric_imputer.transform(test_operations[numeric_cols])

# Categorical columns: fill gaps with the most frequent training category.
categorical_imputer = SimpleImputer(strategy='most_frequent')
train_operations.loc[:, categorical_cols] = categorical_imputer.fit_transform(train_operations[categorical_cols])
test_operations.loc[:, categorical_cols] = categorical_imputer.transform(test_operations[categorical_cols])

# Binary columns: encode 'f'/'t' as 0.0/1.0, then impute.
# .map() turns missing entries, and any value other than 'f' or 't', into NaN,
# so the imputation has to happen after the mapping; otherwise those NaNs would
# survive into the final frames.
binary_mapping = {'f': 0, 't': 1}


def encode_binary(frame):
    """Return the binary columns of `frame` as float64 0.0/1.0, NaN where the value is not 'f' or 't'."""
    return frame[binary_cols].apply(lambda col: col.map(binary_mapping)).astype('float64')


binary_imputer = SimpleImputer(strategy='most_frequent')
train_operations[binary_cols] = binary_imputer.fit_transform(encode_binary(train_operations))
test_operations[binary_cols] = binary_imputer.transform(encode_binary(test_operations))

# Every selected column has now been imputed, so no missing values may remain.
assert not train_operations.isna().any().any(), "training features still contain NaN"
assert not test_operations.isna().any().any(), "test features still contain NaN"

In [None]:
# Baseline: a dummy classifier that always predicts the most frequent class.
# 'id' is a row identifier rather than a feature, so it is dropped from the
# training matrix as well as the test matrix; previously only the test matrix
# dropped it, leaving the two with different columns.
X_train = train_cleaned.drop(columns=['id', 'class'])
y_train = train_cleaned['class']
X_test = test_cleaned.drop(columns=['id'])

assert list(X_train.columns) == list(X_test.columns), "train/test feature columns differ"

dummy = DummyClassifier(strategy="most_frequent")
dummy.fit(X_train, y_train)

test_predictions = dummy.predict(X_test)

In [None]:
# Pair each test id with its predicted class, write the table to
# submission.csv (without the index), and preview the first rows.
submission = mushroom_testdf[['id']].assign(**{'class': test_predictions})

submission.to_csv('submission.csv', index=False)

submission.head()