In [17]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [18]:
# Read the complete mushroom dataset from its CSV file into a DataFrame.
pathFile = "./Data/mushroom_data_all.csv"
full_data = pd.read_csv(filepath_or_buffer=pathFile)

In [19]:
# Work on a copy so that full_data keeps exactly what was read from disk.
dataCopy = full_data.copy()

# Target: the 'class_edible' column.
# Features: every remaining column except 'veil-type', which is left out of
# the feature set.
y = dataCopy["class_edible"]
X = dataCopy.drop(columns=["class_edible", "veil-type"])

In [20]:
# Keep 70% of the rows for training and hold out 30% for validation; the fixed
# random_state makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=42,
)

In [21]:
# Classify the categorical (object-dtype) feature columns before encoding.
# No imputation happens here: this cell only sorts the columns into groups.
#
# good_columns:        every category present in the validation split also
#                      occurs in the training split, so an encoder fitted on
#                      the training data can transform the validation data.
# problematic_columns: categorical columns whose validation split contains at
#                      least one category that the training split lacks.
#
# NOTE(review): the encoding cells further down only touch good_columns, so a
# problematic column would reach the model as raw strings - drop or handle such
# columns before fitting if this list is ever non-empty.

categorical_columns = [col for col in X_train.columns if X_train[col].dtype == 'object']

# Subset test rather than equality: extra categories that appear only in the
# training split are harmless for an encoder fitted on that split.
good_columns = [
    col for col in categorical_columns
    if set(X_valid[col]).issubset(set(X_train[col]))
]

# Derived from categorical_columns (not from all columns) so that
# non-categorical features are never reported as problematic; the
# comprehension also keeps the original column order.
problematic_columns = [col for col in categorical_columns if col not in good_columns]

In [22]:
# Count the distinct categories of every categorical column in the training
# split and display the (column, count) pairs in ascending order of count.
# Note: 'veil-type' does not show up here; it was already dropped from the
# features when X was built.
val_unique = [X_train[col].nunique() for col in categorical_columns]
d = {col: count for col, count in zip(categorical_columns, val_unique)}
sorted(d.items(), key=lambda pair: pair[1])

[('bruises', 2),
 ('gill-attachment', 2),
 ('gill-spacing', 2),
 ('gill-size', 2),
 ('stalk-shape', 2),
 ('ring-number', 3),
 ('cap-surface', 4),
 ('stalk-surface-above-ring', 4),
 ('stalk-surface-below-ring', 4),
 ('veil-color', 4),
 ('stalk-root', 5),
 ('ring-type', 5),
 ('cap-shape', 6),
 ('population', 6),
 ('habitat', 7),
 ('odor', 9),
 ('stalk-color-above-ring', 9),
 ('stalk-color-below-ring', 9),
 ('spore-print-color', 9),
 ('cap-color', 10),
 ('gill-color', 12)]

In [23]:
# Split the encodable categorical columns by cardinality: columns with at most
# MAX_ONE_HOT_CARDINALITY distinct training values are one-hot encoded, the
# remaining ones are label encoded.
MAX_ONE_HOT_CARDINALITY = 9

low_cardinality_columns = [
    col for col in good_columns
    if X_train[col].nunique() <= MAX_ONE_HOT_CARDINALITY
]

# A comprehension instead of a set difference keeps the columns in
# good_columns order, so the list is identical on every run.
high_cardinality_columns = [
    col for col in good_columns
    if col not in low_cardinality_columns
]

In [24]:
##One Hot Encoding for the low-cardinality categorical columns##
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and later
# releases removed the old name, so try the new keyword first and fall back to
# the old one on installations that do not know it yet.
try:
    OHEncoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OHEncoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training split only. With handle_unknown='ignore', a category
# that was not seen during fitting is encoded as all zeros for that feature.
OH_train_values = OHEncoder.fit_transform(X_train[low_cardinality_columns])
OH_valid_values = OHEncoder.transform(X_valid[low_cardinality_columns])

# Use the encoder's own feature names for the new columns. Default integer
# column names would be mixed with the string names of the remaining columns,
# which recent scikit-learn versions reject when the model is fitted.
if hasattr(OHEncoder, 'get_feature_names_out'):
    OH_feature_names = list(OHEncoder.get_feature_names_out(low_cardinality_columns))
else:
    # Older scikit-learn releases only provide get_feature_names.
    OH_feature_names = list(OHEncoder.get_feature_names(low_cardinality_columns))

# The encoder returns plain arrays, so re-attach the original row indexes to
# keep the encoded rows aligned with X_train / X_valid during the concat.
OH_columns_train = pd.DataFrame(OH_train_values, index=X_train.index, columns=OH_feature_names)
OH_columns_valid = pd.DataFrame(OH_valid_values, index=X_valid.index, columns=OH_feature_names)

# Append the encoded columns and remove the raw columns they replace.
OHEncoded_X_train = pd.concat([X_train, OH_columns_train], axis=1).drop(columns=low_cardinality_columns)
OHEncoded_X_valid = pd.concat([X_valid, OH_columns_valid], axis=1).drop(columns=low_cardinality_columns)



In [26]:
##Label Encoding for the other categorical columns (higher cardinality)##
# Copy the one-hot encoded frames: plain assignment would only alias them, and
# the integer codes written below would then also overwrite OHEncoded_X_train
# and OHEncoded_X_valid.
labelled_X_train = OHEncoded_X_train.copy()
labelled_X_valid = OHEncoded_X_valid.copy()

# One fitted encoder per column is kept in label_encoders so that the
# category <-> code mapping of every column stays available afterwards
# (e.g. label_encoders[col].classes_).
label_encoders = {}

# Defined up front so the name exists even when there is nothing to encode.
label_encoder = LabelEncoder()

for col in high_cardinality_columns:
    # Fit on the training split only, then reuse that mapping for validation.
    label_encoder = LabelEncoder()
    labelled_X_train[col] = label_encoder.fit_transform(labelled_X_train[col])
    labelled_X_valid[col] = label_encoder.transform(labelled_X_valid[col])
    label_encoders[col] = label_encoder

In [27]:
# Train a random forest (seeded, so repeated runs give the same model) on the
# encoded training split, then predict the labels of the validation split.
my_model = RandomForestClassifier(random_state=1).fit(labelled_X_train, y_train)
predictions = my_model.predict(labelled_X_valid)

In [28]:
# Score the fitted model on both splits and print the results; each score is
# stored in a one-element list.
training_accuracy = [my_model.score(labelled_X_train, y_train)]
validation_accuracy = [my_model.score(labelled_X_valid, y_valid)]
print('Training accuracy: ', training_accuracy)
print('Validation accuracy: ', validation_accuracy)

Training accuracy:  [1.0]
Validation accuracy:  [1.0]
