In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
import numpy as np

import warnings
warnings.filterwarnings("ignore")

# Read the raw mushroom table from disk.
df = pd.read_csv("mushroom_unpro.csv")

# Name of the label column; every other column is treated as a feature.
target_col = 'target'
feature_names = df.columns[df.columns != target_col].tolist()

# Report which classes occur and how often (computed before any rows are dropped).
target_series = df[target_col]
possible_outcomes = list(target_series.unique())
print("Possible outcomes:", possible_outcomes)
print(target_series.value_counts())

# Discard every row that has at least one missing value.
pro_data = df.dropna()

# Partition the feature columns by dtype in a single pass: object ("O") columns
# are handled as categorical, everything else is considered numeric.
feature_frame = pro_data.drop(columns=[target_col])
non_num_cols, num_cols = [], []
for column, dtype in zip(feature_frame.columns, feature_frame.dtypes):
    (non_num_cols if dtype == "O" else num_cols).append(column)

num_data = pro_data[num_cols]
cat_data = pro_data[non_num_cols]

Possible outcomes: ['p', 'e']
e    4208
p    3916
Name: target, dtype: int64


In [2]:
# Preview the object-dtype (categorical) feature columns selected from pro_data.
cat_data

Unnamed: 0,cap_shape,cap_surface,cap_colour,bruises,odour,gill_attachment,gill_spacing,gill_size,gill_colour,stalk_shape,...,stake_surface_below_ring,stalk_colour_above_ring,stalk_colour_below_ring,veil_type,veil_colour,ring_number,ring_type,spore_print_colour,population,habitat
0,x,s,n,t,p,f,c,n,k,e,...,s,w,w,p,w,o,p,k,s,u
1,x,s,y,t,a,f,c,b,k,e,...,s,w,w,p,w,o,p,n,n,g
2,b,s,w,t,l,f,c,b,n,e,...,s,w,w,p,w,o,p,n,n,m
3,x,y,w,t,p,f,c,n,n,e,...,s,w,w,p,w,o,p,k,s,u
4,x,s,g,f,n,f,w,b,k,t,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,k,s,n,f,n,a,c,b,y,e,...,s,o,o,p,o,o,p,b,c,l
8120,x,s,n,f,n,a,c,b,y,e,...,s,o,o,p,n,o,p,b,v,l
8121,f,s,n,f,n,a,c,b,n,e,...,s,o,o,p,o,o,p,b,c,l
8122,k,y,n,f,y,f,c,n,b,t,...,k,w,w,p,w,o,e,w,v,l


In [3]:
# One-hot encode every categorical feature column.
# The encoder's default output is already a SciPy sparse matrix, so the explicit
# `sparse=True` flag is dropped: that keyword was renamed to `sparse_output` in
# newer scikit-learn releases and later removed, which made this cell fail there.
encoder = OneHotEncoder()
enc_data = encoder.fit_transform(cat_data)

# `get_feature_names` was superseded by `get_feature_names_out` and has since been
# removed; prefer the new accessor and fall back only on older installations.
if hasattr(encoder, "get_feature_names_out"):
    encoded_col_names = encoder.get_feature_names_out(cat_data.columns)
else:
    encoded_col_names = encoder.get_feature_names(cat_data.columns)

# Wrap the sparse matrix in a DataFrame whose columns are named
# "<feature>_<category>". No index is passed, so rows are numbered from 0 in the
# same order as the rows of cat_data.
enc_df = pd.DataFrame.sparse.from_spmatrix(enc_data, columns=encoded_col_names)
enc_df

Unnamed: 0,cap_shape_b,cap_shape_c,cap_shape_f,cap_shape_k,cap_shape_s,cap_shape_x,cap_surface_f,cap_surface_g,cap_surface_s,cap_surface_y,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8120,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8121,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8122,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [4]:
# Turn the two class labels into 0/1 integers.
binarizer = LabelBinarizer()

# fit_transform returns a 2-D array; flatten it to 1-D so it can be assigned as a
# single column. The values are attached by position, in the row order of pro_data.
target_data = binarizer.fit_transform(pro_data[target_col]).ravel()
enc_df['target'] = target_data

In [5]:
# Inspect the one-hot encoded frame, which now ends with the binary 'target' column.
enc_df

Unnamed: 0,cap_shape_b,cap_shape_c,cap_shape_f,cap_shape_k,cap_shape_s,cap_shape_x,cap_surface_f,cap_surface_g,cap_surface_s,cap_surface_y,...,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w,target
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
8120,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
8121,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
8122,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1


In [6]:
# Persist the encoded features together with the target column.
# index=False keeps the row index out of the CSV.
enc_df.to_csv(path_or_buf="mushroom.csv", index=False)

In [7]:
import json

# Describe the encoded dataset's columns for later consumers.
# NOTE: this rebinds `num_cols` (computed in the first cell) to None, so the JSON
# records "continuous" as null; only the one-hot columns are listed as "discrete".
num_cols = None
one_hot_cols = list(enc_df.columns.drop('target'))

col_dict = {"continuous": num_cols, "discrete": one_hot_cols}

# Serialise and write inside a context manager so the file handle is always
# closed, even if the write raises.
form = json.dumps(col_dict)
with open("col_dict.json", "w") as f:
    f.write(form)

# Read the file back to confirm it round-trips; the `with` block closes the
# handle on exit, so no explicit close() is needed.
with open('col_dict.json') as file:
    col_dict = json.load(file)
col_dict

{'continuous': None,
 'discrete': ['cap_shape_b',
  'cap_shape_c',
  'cap_shape_f',
  'cap_shape_k',
  'cap_shape_s',
  'cap_shape_x',
  'cap_surface_f',
  'cap_surface_g',
  'cap_surface_s',
  'cap_surface_y',
  'cap_colour_b',
  'cap_colour_c',
  'cap_colour_e',
  'cap_colour_g',
  'cap_colour_n',
  'cap_colour_p',
  'cap_colour_r',
  'cap_colour_u',
  'cap_colour_w',
  'cap_colour_y',
  'bruises_f',
  'bruises_t',
  'odour_a',
  'odour_c',
  'odour_f',
  'odour_l',
  'odour_m',
  'odour_n',
  'odour_p',
  'odour_s',
  'odour_y',
  'gill_attachment_a',
  'gill_attachment_f',
  'gill_spacing_c',
  'gill_spacing_w',
  'gill_size_b',
  'gill_size_n',
  'gill_colour_b',
  'gill_colour_e',
  'gill_colour_g',
  'gill_colour_h',
  'gill_colour_k',
  'gill_colour_n',
  'gill_colour_o',
  'gill_colour_p',
  'gill_colour_r',
  'gill_colour_u',
  'gill_colour_w',
  'gill_colour_y',
  'stalk_shape_e',
  'stalk_shape_t',
  'stalk_root_?',
  'stalk_root_b',
  'stalk_root_c',
  'stalk_root_e',
  's