# Feature Engineering

## Import necessary libraries

In [28]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder

## Get data from csv files

In [43]:
# Read the training and test splits from disk, in that order.
train_dataset_df, test_dataset_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Mushroom_datasets/mushroom_train.csv',
        'Mushroom_datasets/mushroom_test.csv',
    )
)
# Plain-text preview of the training frame.
print(train_dataset_df)

       cap-diameter cap-shape cap-surface cap-color does-bruise-or-bleed  \
0              4.98         c           i         y                    f   
1              2.84         x           y         y                    f   
2             11.44         x           y         y                    f   
3              8.77         s           t         r                    t   
4              7.55         x           d         n                    t   
...             ...       ...         ...       ...                  ...   
42743          3.28         f           y         p                    f   
42744          8.91         x           w         p                    f   
42745         45.84         o           y         y                    f   
42746         10.91         f           y         n                    f   
42747          2.41         f           t         w                    f   

      gill-attachment gill-spacing gill-color  stem-height  stem-width  \
0            

In [46]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train_dataset_df.describe()

Unnamed: 0,cap-diameter,stem-height,stem-width
count,42748.0,42748.0,42748.0
mean,6.714149,6.583224,12.117692
std,5.220008,3.368333,10.004874
min,0.38,0.0,0.0
25%,3.48,4.64,5.18
50%,5.865,5.96,10.2
75%,8.53,7.75,16.54
max,62.34,33.92,103.91


In [47]:
# Number of training rows for each value of the 'class' column.
train_dataset_df['class'].value_counts()

p    23595
e    19153
Name: class, dtype: int64

In [48]:
# Number of test rows for each value of the 'class' column.
test_dataset_df['class'].value_counts()

p    10293
e     8028
Name: class, dtype: int64

## Feature Engineering
Most of the features of the dataset are categorical (such as cap-shape: bell, conical, convex or flat) and cannot be directly used as inputs to machine learning models (since they are not numerical). We can use such features to create extra features on both training and test datasets. The new features can reflect statistics of the original numerical features and can potentially detect patterns of poisonous or edible mushrooms and simplify the classification task.

We can use the categorical features to group all the data points with the same categorical feature value (i.e., all the mushrooms with orange cap color) and calculate statistics of the numerical data corresponding to each group (i.e., average cap-diameter of all the mushrooms with orange cap color). Then in the new feature, all data points of this group (i.e., mushrooms with orange cap-color) are assigned that calculated statistic. This could be used as an alternative to one-hot encoding of the feature.

In [37]:
def _map_group_statistics(frame, group_stats):
    """Map per-category training statistics onto the rows of `frame`.

    `group_stats` is a nested dict: categorical feature -> statistic name ->
    DataFrame indexed by category value, with one column per numeric feature.
    For every (feature, statistic, numeric column) combination a Series named
    "<feature>-<numeric column>-<statistic>" is produced, aligned with `frame`.
    Rows whose category is absent from the statistics table map to NaN.
    """
    mapped = []
    for feature, stats in group_stats.items():
        for statistic, table in stats.items():
            for num_feature in table:
                feature_name = feature + '-' + num_feature + '-' + statistic
                mapped.append(
                    frame[feature].map(table[num_feature]).rename(feature_name)
                )
    return mapped


def featureEngineering(train_df=None, test_df=None):
    """Build engineered feature matrices and numeric labels for train and test.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame, optional
        Raw training / test frames. When omitted, the notebook-level
        `train_dataset_df` / `test_dataset_df` are used, so the original
        zero-argument call keeps working.

    Returns
    -------
    xdata_train, ydata_train, xdata_test, ydata_test
        Feature DataFrames (base columns followed by the group-statistic
        columns) and 1-D integer label arrays for each split.

    Notes
    -----
    Every fitted object (the ordinal encoder, the group statistics and the
    label encoder) is fitted on the training frame only and then applied to
    the test frame.
    """
    if train_df is None:
        train_df = train_dataset_df
    if test_df is None:
        test_df = test_dataset_df

    # The two "t"/"f" columns; the encoder sorts categories, so "f" -> 0.0, "t" -> 1.0.
    binary_cols = ["does-bruise-or-bleed", "has-ring"]
    # Columns carried over unchanged (positions 0, 4, 8, 9, 11 of the raw frame).
    base_cols = ["cap-diameter", "does-bruise-or-bleed", "stem-height", "stem-width", "has-ring"]
    # Categorical columns used as grouping keys for the statistic features.
    group_list = ["cap-shape", "cap-surface", "cap-color", "gill-attachment", "gill-spacing",
                  "gill-color", "stem-color", "ring-type", "habitat", "season"]

    # Work on copies so the raw frames are never overwritten.
    enc = OrdinalEncoder()
    encode_data_train = train_df.copy()
    encode_data_train[binary_cols] = enc.fit_transform(encode_data_train[binary_cols])

    # Reuse the encoder fitted on the training data: transform only, never refit on test.
    encode_data_test = test_df.copy()
    encode_data_test[binary_cols] = enc.transform(encode_data_test[binary_cols])

    # Per-category statistics of the numeric columns, computed from the raw
    # training frame so the encoded binary columns are not treated as numeric.
    group_stats = {}
    for feature in group_list:
        grouped = train_df.groupby([feature], as_index=True)
        group_stats[feature] = {
            "average": grouped.mean(numeric_only=True),
            "min": grouped.min(numeric_only=True),
            "max": grouped.max(numeric_only=True),
            "median": grouped.median(numeric_only=True),
        }

    # The same training statistics are mapped onto both splits.
    new_features = _map_group_statistics(train_df, group_stats)
    new_features_test = _map_group_statistics(test_df, group_stats)

    # Base columns followed by the new statistic columns.
    xdata_train = pd.concat([encode_data_train[base_cols].copy()] + new_features, axis=1)
    print("New training dataset is :")
    print(xdata_train)
    print(f"The shape of training data is {xdata_train.shape}")

    xdata_test = pd.concat([encode_data_test[base_cols].copy()] + new_features_test, axis=1)
    print("New testing dataset is :")
    print(xdata_test)
    print(f"The shape of testing data is {xdata_test.shape}")

    # Labels are taken from the last column and encoded with an encoder fitted
    # on the training labels; with classes "e"/"p" this gives 0 = edible, 1 = poisonous.
    labelencoder = preprocessing.LabelEncoder()
    ydata_train = labelencoder.fit_transform(train_df.iloc[:, -1].to_numpy())
    ydata_test = labelencoder.transform(test_df.iloc[:, -1].to_numpy())

    return xdata_train, ydata_train, xdata_test, ydata_test

In [38]:
# Build the engineered feature frames and encoded label arrays for both splits.
xdata_train, ydata_train, xdata_test, ydata_test = featureEngineering()

New training dataset is :
       cap-diameter  does-bruise-or-bleed  stem-height  stem-width  has-ring  \
0              4.98                   0.0         6.04        6.21       0.0   
1              2.84                   0.0         5.66        3.55       1.0   
2             11.44                   0.0         7.03       25.29       1.0   
3              8.77                   1.0         4.44       13.61       0.0   
4              7.55                   1.0         8.41       18.44       0.0   
...             ...                   ...          ...         ...       ...   
42743          3.28                   0.0         4.96        3.51       1.0   
42744          8.91                   0.0         4.61       11.12       0.0   
42745         45.84                   0.0         5.75       26.36       0.0   
42746         10.91                   0.0         7.55       24.38       1.0   
42747          2.41                   0.0         3.52        3.71       0.0   

       cap-sh

## Save the new data to csv file
Save the original expanded data and the data after feature selection

In [39]:
def _append_labels(features, labels):
    """Return a float array holding `features` with `labels` as an extra last column."""
    n_rows, n_cols = features.shape
    combined = np.zeros((n_rows, n_cols + 1))
    combined[:, :n_cols] = np.copy(features)
    combined[:, -1] = np.copy(labels)
    return combined


# Training features with the training labels as the last column
data_train = _append_labels(xdata_train, ydata_train)

# Test features with the test labels as the last column
data_test = _append_labels(xdata_test, ydata_test)

print(data_train.shape)
print(data_test.shape)

# Write both arrays to csv; the frames are built from bare arrays, so the
# header row holds integer column positions rather than feature names.
df_train = pd.DataFrame(data_train)
df_test = pd.DataFrame(data_test)
df_train.to_csv('mushroom_train_encode.csv', index=False)
df_test.to_csv('mushroom_test_encode.csv', index=False)

(42748, 126)
(18321, 126)
