In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

Data is from https://archive.ics.uci.edu/dataset/848/secondary+mushroom+dataset

In [3]:
# Load the raw mushroom data; the file uses ';' as its field separator.
raw_csv_path = "./MushroomDataset/secondary_data.csv"
df = pd.read_csv(raw_csv_path, sep=";")
df.head()

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,p,15.26,x,g,o,f,e,,w,16.95,...,s,y,w,u,w,t,g,,d,w
1,p,16.6,x,g,o,f,e,,w,17.99,...,s,y,w,u,w,t,g,,d,u
2,p,14.07,x,g,o,f,e,,w,17.8,...,s,y,w,u,w,t,g,,d,w
3,p,14.17,f,h,e,f,e,,w,15.77,...,s,y,w,u,w,t,p,,d,w
4,p,14.64,x,h,o,f,e,,w,16.53,...,s,y,w,u,w,t,p,,d,w


In [4]:
# List every column that has at least one missing value, together with the
# number of non-null entries it contains.
non_null_counts = df.count()
incomplete = non_null_counts[non_null_counts != len(df)]
for name, present in incomplete.items():
    print(f"{name}: {present}")

cap-surface: 46949
gill-attachment: 51185
gill-spacing: 36006
stem-root: 9531
stem-surface: 22945
veil-type: 3177
veil-color: 7413
ring-type: 58598
spore-print-color: 6354


Many of the features are missing values for a large share of the rows, so I will remove any feature that has NaNs in more than 20% of the data: filling in that many values could make the model inaccurate or distort the true proportions of the categorical data. Columns whose missing values make up less than 20% of the data will have their NaNs replaced with the mode if categorical or the mean if numerical.

In [6]:
# List the columns that have some missing values but are still more than 80%
# populated, together with their non-null counts.
n_rows = len(df)
for name, present in df.count().items():
    has_gaps = present != n_rows
    mostly_present = present / n_rows > .80
    if has_gaps and mostly_present:
        print(f"{name}: {present}")

gill-attachment: 51185
ring-type: 58598


In [7]:
# Fill the gaps in the two mostly-complete categorical columns with each
# column's most frequent value.
# NOTE(review): the mode is computed on the full frame, i.e. before the
# train/validation/test split performed later in the notebook.
for column in ("ring-type", "gill-attachment"):
    most_frequent = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent)

In [8]:
# Keep only the columns whose non-null share exceeds 80%.
coverage = df.count() / len(df)
keep_cols = [name for name in df.columns if coverage[name] > .80]
df = df[keep_cols].copy()

# Encode the target as integers: 'e' -> 1 and 'p' -> 0.
class_codes = df['class'].replace('e', "1").replace('p', "0")
df['class'] = class_codes.astype(int)
df.head()

Unnamed: 0,class,cap-diameter,cap-shape,cap-color,does-bruise-or-bleed,gill-attachment,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,0,15.26,x,o,f,e,w,16.95,17.09,w,t,g,d,w
1,0,16.6,x,o,f,e,w,17.99,18.19,w,t,g,d,u
2,0,14.07,x,o,f,e,w,17.8,17.74,w,t,g,d,w
3,0,14.17,f,e,f,e,w,15.77,15.98,w,t,p,d,w
4,0,14.64,x,o,f,e,w,16.53,17.2,w,t,p,d,w


In [9]:
# 70% of the rows go to training; the remaining 30% are halved into
# validation and test sets. Both splits use the same fixed seed.
SPLIT_SEED = 12
train_df, temp = train_test_split(
    df,
    test_size=0.3,
    random_state=SPLIT_SEED,
)
val_df, test_df = train_test_split(
    temp,
    test_size=0.5,
    random_state=SPLIT_SEED,
)

In [10]:
# Separate each split into its feature columns (X_*) and the 'class' label (y_*).
X_train, y_train = train_df.drop(columns=['class']), train_df['class']
X_val, y_val = val_df.drop(columns=['class']), val_df['class']
X_test, y_test = test_df.drop(columns=['class']), test_df['class']

In [11]:
# Display the training labels (the 'class' column of the training split).
y_train

25093    1
15626    0
12074    0
11098    0
13362    0
        ..
40177    0
19709    0
58758    0
38555    0
14155    1
Name: class, Length: 42748, dtype: int32

In [12]:
# Encode and scale the features. Both transformers are fitted on the training
# split only and then applied unchanged to the validation and test splits.

# One-hot encode the object-typed (categorical) columns.
categorical_features = X_train.select_dtypes(include=['object']).columns
# handle_unknown='ignore': a category that occurs only in the validation or
# test split is encoded as all zeros for that feature instead of making
# transform() raise, which is what the default handle_unknown='error' does.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_c = encoder.fit_transform(X_train[categorical_features])
X_test_c = encoder.transform(X_test[categorical_features])
X_val_c = encoder.transform(X_val[categorical_features])

# Standardise the float64 (numerical) columns.
numerical_features = X_train.select_dtypes(include=['float64']).columns
scaler = StandardScaler()
X_train_n = scaler.fit_transform(X_train[numerical_features])
X_test_n = scaler.transform(X_test[numerical_features])
X_val_n = scaler.transform(X_val[numerical_features])

# Final feature matrices: scaled numerical columns first, one-hot columns after.
X_train_processed = np.hstack([X_train_n, X_train_c])
X_test_processed = np.hstack([X_test_n, X_test_c])
X_val_processed = np.hstack([X_val_n, X_val_c])

In [13]:
# Rebuild one frame per split with the label in the first column, followed by
# the processed feature columns. The frames get default integer column labels.
# NOTE(review): the stacked array has a single dtype, so the integer labels are
# presumably stored as floats alongside the scaled features - confirm.
train_df = pd.DataFrame(np.column_stack([y_train.values, X_train_processed]))
val_df = pd.DataFrame(np.column_stack([y_val.values, X_val_processed]))
test_df = pd.DataFrame(np.column_stack([y_test.values, X_test_processed]))

In [14]:
# Write each processed split to its own CSV file, without the index column.
output_frames = {
    'proj3_cleaned_train_data.csv': train_df,
    'proj3_cleaned_validation_data.csv': val_df,
    'proj3_cleaned_test_data.csv': test_df,
}
for filename, frame in output_frames.items():
    frame.to_csv(filename, index=False)