## Outline

* Importing the required libraries.
* Data Loading.
* Log transformation of the `cap-diameter` feature.
* Numerical features scaling.
* Categorical features encoding.

---------------

### Importing the required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import category_encoders as ce
import joblib

### Loading the dataset

In [2]:
mushroom_train_df = pd.read_csv("train_clean.csv")
mushroom_train_df.head()

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-color,does-bruise-or-bleed,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,0,e,8.8,f,u,f,w,4.51,15.39,w,f,f,d,a
1,1,p,4.51,x,o,f,n,4.79,6.48,o,t,z,d,w
2,2,e,6.94,f,b,f,w,6.85,9.93,n,f,f,l,w
3,3,e,3.88,f,g,f,g,4.16,6.53,w,f,f,d,u
4,4,e,5.85,x,w,f,w,3.37,8.36,w,f,f,g,a


In [3]:
mushroom_test_df = pd.read_csv("test_clean.csv")
mushroom_test_df.head()

Unnamed: 0,id,cap-diameter,cap-shape,cap-color,does-bruise-or-bleed,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,3116945,8.64,x,n,t,w,11.13,17.12,w,t,g,d,a
1,3116946,6.9,o,o,f,y,1.27,10.75,n,f,f,d,a
2,3116947,2.0,b,n,f,n,6.18,3.14,n,f,f,d,s
3,3116948,3.47,x,n,f,n,4.98,8.51,w,t,z,d,u
4,3116949,6.17,x,y,f,y,6.73,13.7,y,t,f,d,u


In [4]:
train_ids = mushroom_train_df["id"]
test_ids = mushroom_test_df["id"]

In [5]:
mushroom_train_df.drop('id', axis = 1, inplace = True)
mushroom_test_df.drop('id', axis = 1, inplace = True)

In [6]:
numCols = mushroom_train_df.describe(include = np.number).columns.to_list()
catCols = mushroom_train_df.describe(include = np.object_).drop("class", axis = 1).columns.to_list()
targetCol = "class"

---------------

### Log transformation of the `cap-diameter` feature

In [7]:
mushroom_train_df["cap-diameter"] = np.log1p(mushroom_train_df["cap-diameter"])
mushroom_test_df["cap-diameter"] = np.log1p(mushroom_test_df["cap-diameter"])

In [8]:
mushroom_train_df.head()

Unnamed: 0,class,cap-diameter,cap-shape,cap-color,does-bruise-or-bleed,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,e,2.282382,f,u,f,w,4.51,15.39,w,f,f,d,a
1,p,1.706565,x,o,f,n,4.79,6.48,o,t,z,d,w
2,e,2.071913,f,b,f,w,6.85,9.93,n,f,f,l,w
3,e,1.585145,f,g,f,g,4.16,6.53,w,f,f,d,u
4,e,1.924249,x,w,f,w,3.37,8.36,w,f,f,g,a


In [9]:
mushroom_test_df.head()

Unnamed: 0,cap-diameter,cap-shape,cap-color,does-bruise-or-bleed,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,2.265921,x,n,t,w,11.13,17.12,w,t,g,d,a
1,2.066863,o,o,f,y,1.27,10.75,n,f,f,d,a
2,1.098612,b,n,f,n,6.18,3.14,n,f,f,d,s
3,1.497388,x,n,f,n,4.98,8.51,w,t,z,d,u
4,1.969906,x,y,f,y,6.73,13.7,y,t,f,d,u


In [10]:
print(mushroom_train_df["cap-diameter"].skew())
print(mushroom_test_df["cap-diameter"].skew())

-0.117664453720681
-0.11364799575602647


---------------

### Feature scaling

In [11]:
scaler = StandardScaler()
mushroom_train_df[numCols] = scaler.fit_transform(mushroom_train_df[numCols])
mushroom_test_df[numCols] = scaler.fit_transform(mushroom_test_df[numCols])

In [12]:
mushroom_train_df.head()

Unnamed: 0,class,cap-diameter,cap-shape,cap-color,does-bruise-or-bleed,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,e,0.800897,f,u,f,w,-0.68092,0.523308,w,f,f,d,a
1,p,-0.227703,x,o,f,n,-0.5772,-0.577318,o,t,z,d,w
2,e,0.42493,f,b,f,w,0.185883,-0.15115,n,f,f,l,w
3,e,-0.444599,f,g,f,g,-0.81057,-0.571142,w,f,f,d,u
4,e,0.161152,x,w,f,w,-1.103208,-0.345087,w,f,f,g,a


In [13]:
mushroom_test_df.head()

Unnamed: 0,cap-diameter,cap-shape,cap-color,does-bruise-or-bleed,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,0.772567,x,n,t,w,1.77264,0.737267,w,t,g,d,a
1,0.417168,o,o,f,y,-1.881089,-0.049156,n,f,f,d,a
2,-1.311549,b,n,f,n,-0.061636,-0.988667,n,f,f,d,s
3,-0.599573,x,n,f,n,-0.506309,-0.325701,w,t,z,d,u
4,0.24406,x,y,f,y,0.142173,0.315043,y,t,f,d,u


------------

### Categorical features encoding

In [14]:
mushroom_train_df[targetCol] = mushroom_train_df[targetCol].map({'p':0, 'e':1})

In [15]:
mushroom_train_df_encoded = mushroom_train_df.copy()
mushroom_test_df_encoded = mushroom_test_df.copy()

In [16]:
te = ce.TargetEncoder(cols = catCols, return_df = True).fit(mushroom_train_df_encoded[catCols], mushroom_train_df_encoded[targetCol])

In [17]:
mushroom_train_df_encoded[catCols] = te.transform(mushroom_train_df_encoded[catCols])
mushroom_test_df_encoded[catCols] = te.transform(mushroom_test_df_encoded[catCols])

In [18]:
mushroom_train_df.head()

Unnamed: 0,class,cap-diameter,cap-shape,cap-color,does-bruise-or-bleed,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,1,0.800897,f,u,f,w,-0.68092,0.523308,w,f,f,d,a
1,0,-0.227703,x,o,f,n,-0.5772,-0.577318,o,t,z,d,w
2,1,0.42493,f,b,f,w,0.185883,-0.15115,n,f,f,l,w
3,1,-0.444599,f,g,f,g,-0.81057,-0.571142,w,f,f,d,u
4,1,0.161152,x,w,f,w,-1.103208,-0.345087,w,f,f,g,a


In [19]:
mushroom_train_df_encoded.head()

Unnamed: 0,class,cap-diameter,cap-shape,cap-color,does-bruise-or-bleed,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,1,0.800897,0.488037,0.460691,0.444231,0.571559,-0.68092,0.523308,0.582287,0.466735,0.460745,0.471157,0.428394
1,0,-0.227703,0.489837,0.282747,0.444231,0.285108,-0.5772,-0.577318,0.403348,0.408902,0.003842,0.471157,0.653996
2,1,0.42493,0.488037,0.873944,0.444231,0.571559,0.185883,-0.15115,0.389062,0.466735,0.460745,0.605072,0.653996
3,1,-0.444599,0.488037,0.588087,0.444231,0.51899,-0.81057,-0.571142,0.582287,0.466735,0.460745,0.471157,0.417541
4,1,0.161152,0.489837,0.502929,0.444231,0.571559,-1.103208,-0.345087,0.582287,0.466735,0.460745,0.325389,0.428394


In [20]:
mushroom_test_df

Unnamed: 0,cap-diameter,cap-shape,cap-color,does-bruise-or-bleed,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,0.772567,x,n,t,w,1.772640,0.737267,w,t,g,d,a
1,0.417168,o,o,f,y,-1.881089,-0.049156,n,f,f,d,a
2,-1.311549,b,n,f,n,-0.061636,-0.988667,n,f,f,d,s
3,-0.599573,x,n,f,n,-0.506309,-0.325701,w,t,z,d,u
4,0.244060,x,y,f,y,0.142173,0.315043,y,t,f,d,u
...,...,...,...,...,...,...,...,...,...,...,...,...
2077593,-2.145940,x,w,f,w,-1.362304,-1.209656,e,f,f,d,u
2077594,-0.745146,x,w,f,w,-1.354893,-0.465208,w,f,f,g,a
2077595,0.130990,x,e,f,w,-0.069047,-0.173848,y,t,z,d,a
2077596,-0.065098,b,n,f,g,-0.128337,-0.949161,g,f,f,d,a


In [21]:
mushroom_test_df_encoded

Unnamed: 0,cap-diameter,cap-shape,cap-color,does-bruise-or-bleed,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,0.772567,0.489837,0.509077,0.493370,0.571559,1.772640,0.737267,0.582287,0.408902,0.578547,0.471157,0.428394
1,0.417168,0.325177,0.282747,0.444231,0.420942,-1.881089,-0.049156,0.389062,0.466735,0.460745,0.471157,0.428394
2,-1.311549,0.226965,0.509077,0.444231,0.285108,-0.061636,-0.988667,0.389062,0.466735,0.460745,0.471157,0.611712
3,-0.599573,0.489837,0.509077,0.444231,0.285108,-0.506309,-0.325701,0.582287,0.408902,0.003842,0.471157,0.417541
4,0.244060,0.489837,0.377041,0.444231,0.420942,0.142173,0.315043,0.306232,0.408902,0.460745,0.471157,0.417541
...,...,...,...,...,...,...,...,...,...,...,...,...
2077593,-2.145940,0.489837,0.502929,0.444231,0.571559,-1.362304,-1.209656,0.298708,0.466735,0.460745,0.471157,0.417541
2077594,-0.745146,0.489837,0.502929,0.444231,0.571559,-1.354893,-0.465208,0.582287,0.466735,0.460745,0.325389,0.428394
2077595,0.130990,0.489837,0.206738,0.444231,0.571559,-0.069047,-0.173848,0.306232,0.408902,0.003842,0.471157,0.428394
2077596,-0.065098,0.226965,0.509077,0.444231,0.518990,-0.128337,-0.949161,0.608568,0.466735,0.460745,0.471157,0.428394


In [22]:
all_mappings = {}
for col in catCols:
    col_mappings = dict(zip(mushroom_train_df[col].unique(), te.mapping[col][:-2].tolist()))
    all_mappings[col] = col_mappings

-----------------

### Saving datasets and mappings

In [23]:
mushroom_train_df_encoded.to_csv("train_processed.csv", index = False)
mushroom_test_df_encoded.to_csv("test_processed.csv", index = False)

In [24]:
#joblib.dump(all_mappings, "mappings.pkl")