In [44]:
#Installing and importing essential packages
!pip install ucimlrepo
!pip install pandas
!pip install numpy
!pip install scikit-learn
!pip install torch

import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch as torch



In [24]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
mushroom = fetch_ucirepo(id=73) 
  
# data (as pandas dataframes) 
X = mushroom.data.features 
y = mushroom.data.targets 
  
# metadata 
print(mushroom.metadata) 
  
# variable information 
print(mushroom.variables) 

{'uci_id': 73, 'name': 'Mushroom', 'repository_url': 'https://archive.ics.uci.edu/dataset/73/mushroom', 'data_url': 'https://archive.ics.uci.edu/static/public/73/data.csv', 'abstract': 'From Audobon Society Field Guide; mushrooms described in terms of physical characteristics; classification: poisonous or edible', 'area': 'Biology', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 8124, 'num_features': 22, 'feature_types': ['Categorical'], 'demographics': [], 'target_col': ['poisonous'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1981, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5959T', 'creators': [], 'intro_paper': None, 'additional_info': {'summary': "This data set includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota Family (pp. 500-525).  Each species is identified as definitely edible, definitely po

In [25]:
#These datasets are already in pandas df form

mushroom['data']

{'ids': None,
 'features':      cap-shape cap-surface cap-color bruises odor gill-attachment  \
 0            x           s         n       t    p               f   
 1            x           s         y       t    a               f   
 2            b           s         w       t    l               f   
 3            x           y         w       t    p               f   
 4            x           s         g       f    n               f   
 ...        ...         ...       ...     ...  ...             ...   
 8119         k           s         n       f    n               a   
 8120         x           s         n       f    n               a   
 8121         f           s         n       f    n               a   
 8122         k           y         n       f    y               f   
 8123         x           s         n       f    n               a   
 
      gill-spacing gill-size gill-color stalk-shape  ...  \
 0               c         n          k           e  ...   
 1          

In [26]:
mushroom_df = mushroom['data']['original']

In [31]:
mushroom_df

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,poisonous
0,x,s,n,t,p,f,c,n,k,e,...,w,w,p,w,o,p,k,s,u,p
1,x,s,y,t,a,f,c,b,k,e,...,w,w,p,w,o,p,n,n,g,e
2,b,s,w,t,l,f,c,b,n,e,...,w,w,p,w,o,p,n,n,m,e
3,x,y,w,t,p,f,c,n,n,e,...,w,w,p,w,o,p,k,s,u,p
4,x,s,g,f,n,f,w,b,k,t,...,w,w,p,w,o,e,n,a,g,e
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,k,s,n,f,n,a,c,b,y,e,...,o,o,p,o,o,p,b,c,l,e
8120,x,s,n,f,n,a,c,b,y,e,...,o,o,p,n,o,p,b,v,l,e
8121,f,s,n,f,n,a,c,b,n,e,...,o,o,p,o,o,p,b,c,l,e
8122,k,y,n,f,y,f,c,n,b,t,...,w,w,p,w,o,e,w,v,l,p


In [35]:
Y = mushroom['data']['targets']
Y

Unnamed: 0,poisonous
0,p
1,e
2,e
3,p
4,e
...,...
8119,e
8120,e
8121,e
8122,p


In [36]:
variables = mushroom['variables']
variables

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,poisonous,Target,Categorical,,,,no
1,cap-shape,Feature,Categorical,,"bell=b,conical=c,convex=x,flat=f, knobbed=k,su...",,no
2,cap-surface,Feature,Categorical,,"fibrous=f,grooves=g,scaly=y,smooth=s",,no
3,cap-color,Feature,Binary,,"brown=n,buff=b,cinnamon=c,gray=g,green=r, pink...",,no
4,bruises,Feature,Categorical,,"bruises=t,no=f",,no
5,odor,Feature,Categorical,,"almond=a,anise=l,creosote=c,fishy=y,foul=f, mu...",,no
6,gill-attachment,Feature,Categorical,,"attached=a,descending=d,free=f,notched=n",,no
7,gill-spacing,Feature,Categorical,,"close=c,crowded=w,distant=d",,no
8,gill-size,Feature,Categorical,,"broad=b,narrow=n",,no
9,gill-color,Feature,Categorical,,"black=k,brown=n,buff=b,chocolate=h,gray=g, gre...",,no


In [37]:
#investigating our descriptions of variables

variables.loc[variables["name"] == "poisonous"]

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,poisonous,Target,Categorical,,,,no


In [38]:
X = mushroom_df.drop(columns=["poisonous"],  axis = 1)
X

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,s,n,t,p,f,c,n,k,e,...,s,w,w,p,w,o,p,k,s,u
1,x,s,y,t,a,f,c,b,k,e,...,s,w,w,p,w,o,p,n,n,g
2,b,s,w,t,l,f,c,b,n,e,...,s,w,w,p,w,o,p,n,n,m
3,x,y,w,t,p,f,c,n,n,e,...,s,w,w,p,w,o,p,k,s,u
4,x,s,g,f,n,f,w,b,k,t,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,k,s,n,f,n,a,c,b,y,e,...,s,o,o,p,o,o,p,b,c,l
8120,x,s,n,f,n,a,c,b,y,e,...,s,o,o,p,n,o,p,b,v,l
8121,f,s,n,f,n,a,c,b,n,e,...,s,o,o,p,o,o,p,b,c,l
8122,k,y,n,f,y,f,c,n,b,t,...,k,w,w,p,w,o,e,w,v,l


In [39]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size= 0.2, random_state = 99)

In [40]:
x_train

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
5720,k,s,e,t,n,f,c,b,w,e,...,s,w,w,p,w,t,e,w,c,w
645,x,s,y,t,l,f,w,n,n,t,...,s,w,w,p,w,o,p,u,v,d
5752,b,s,p,t,n,f,c,b,r,e,...,s,w,w,p,w,t,p,r,v,g
4062,f,y,e,t,n,f,c,b,p,t,...,s,w,w,p,w,o,p,k,y,d
405,x,y,y,t,l,f,c,b,n,e,...,s,w,w,p,w,o,p,k,s,m
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1737,x,f,n,t,n,f,c,b,u,t,...,s,g,p,p,w,o,p,k,v,d
3240,f,y,g,t,n,f,c,b,p,t,...,s,p,w,p,w,o,p,k,y,d
5305,x,y,n,f,s,f,c,n,b,t,...,s,w,w,p,w,o,e,w,v,l
7203,k,s,e,f,y,f,c,n,b,t,...,k,p,w,p,w,o,e,w,v,l


In [46]:
#Here we want to create a label encoder to encode our categorical variables with numbers

le = LabelEncoder()

In [50]:
#Testing this out on one of our columns from our x dataframe


le.fit_transform(x_train["cap-color"])

array([2, 9, 5, ..., 4, 2, 3], shape=(6499,))

In [51]:
x_train.columns

Index(['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [58]:
#Now we want to do this for all the columns and create a separate dataframe. We are using x training dataframe here

transformed_df = pd.DataFrame()

for col in x_train.columns:
    transformed_df[f'{col}'] = le.fit_transform(x_train[f'{col}'])


transformed_df

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,3,2,2,1,5,1,0,0,10,0,...,2,7,7,0,2,2,0,7,1,6
1,5,2,9,1,3,1,1,1,5,1,...,2,7,7,0,2,1,4,6,4,0
2,0,2,5,1,5,1,0,0,8,0,...,2,7,7,0,2,2,4,5,4,1
3,2,3,2,1,5,1,0,0,7,1,...,2,7,7,0,2,1,4,2,5,0
4,5,3,9,1,3,1,0,0,5,0,...,2,7,7,0,2,1,4,2,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6494,5,0,4,1,5,1,0,0,9,1,...,2,3,6,0,2,1,4,2,4,0
6495,2,3,3,1,5,1,0,0,7,1,...,2,6,7,0,2,1,4,2,5,0
6496,5,3,4,0,7,1,0,1,0,1,...,2,7,7,0,2,1,0,7,4,2
6497,3,2,2,0,8,1,0,1,0,1,...,1,6,7,0,2,1,0,7,4,2


In [56]:
rand_df = pd.DataFrame()

rand_df['Col1'] = [1,2,3]

In [57]:
rand_df

Unnamed: 0,Col1
0,1
1,2
2,3
