# Poisonous Mushroom 
    - using fake random forest 

In [1]:
# data: https://www.kaggle.com/datasets/uciml/mushroom-classification/data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator


### Read data

In [2]:
data = pd.read_csv('./data/mushrooms.csv')
print(data.head(2))

  class cap-shape cap-surface cap-color bruises odor gill-attachment  \
0     p         x           s         n       t    p               f   
1     e         x           s         y       t    a               f   

  gill-spacing gill-size gill-color  ... stalk-surface-below-ring  \
0            c         n          k  ...                        s   
1            c         b          k  ...                        s   

  stalk-color-above-ring stalk-color-below-ring veil-type veil-color  \
0                      w                      w         p          w   
1                      w                      w         p          w   

  ring-number ring-type spore-print-color population habitat  
0           o         p                 k          s       u  
1           o         p                 n          n       g  

[2 rows x 23 columns]


### Transform data

In [3]:
def generate_dummy_columns(data: pd.DataFrame, columns : list) -> pd.DataFrame :
    # data = data.dropna()
    for col in columns:        
        new_column_suffix = data[col].apply(lambda x: str(x)).unique()
        new_cols = [ col + '_' + suffix for suffix in new_column_suffix ]
        new_data =pd.get_dummies(data[col], prefix=col)[new_cols]
        data = pd.concat([data, new_data], axis=1) 
        del data[col]
    return data

In [4]:
# fill missing and prepare dummy valiables
# print(len(data))
data.fillna('missing', inplace=True)
data_x = data.iloc[:,1:]
X = generate_dummy_columns(data_x, data_x.columns)
Y = data['class'].apply(lambda x: +1 if x == 'p' else -1)

print(X.columns)
print(X.head(2))





Index(['cap-shape_x', 'cap-shape_b', 'cap-shape_s', 'cap-shape_f',
       'cap-shape_k', 'cap-shape_c', 'cap-surface_s', 'cap-surface_y',
       'cap-surface_f', 'cap-surface_g',
       ...
       'population_v', 'population_y', 'population_c', 'habitat_u',
       'habitat_g', 'habitat_m', 'habitat_d', 'habitat_p', 'habitat_w',
       'habitat_l'],
      dtype='object', length=117)
   cap-shape_x  cap-shape_b  cap-shape_s  cap-shape_f  cap-shape_k  \
0         True        False        False        False        False   
1         True        False        False        False        False   

   cap-shape_c  cap-surface_s  cap-surface_y  cap-surface_f  cap-surface_g  \
0        False           True          False          False          False   
1        False           True          False          False          False   

   ...  population_v  population_y  population_c  habitat_u  habitat_g  \
0  ...         False         False         False       True      False   
1  ...         False 

### Compare sk learn models

In [5]:
models = [LogisticRegression(), DecisionTreeClassifier(), RandomForestClassifier(n_estimators=10)]

for model in models:
    score = cross_val_score(model,X,Y, cv=8)
    print(model.__class__.__name__, score.mean())

LogisticRegression 0.9260029188161825
DecisionTreeClassifier 0.9308194503704279
RandomForestClassifier 0.9542182227221597


### Build fake random forest


In [26]:
class FakeRanndomForest(BaseEstimator):
    def __init__(self, M = 10):
        self.M = M
        self.models = []
        self.features = []

    def fit(self, X, Y, n_features=None):
        if n_features is None:
            n_features = int(np.sqrt(X.shape[1]))
        N = len(X)

        for i in range(self.M):
            model = DecisionTreeClassifier(max_depth=10)

            # select features randomly
            selected_features_indexes = np.random.choice(X.shape[1], n_features, replace=False)            
            selected_sample_indexes = np.random.choice(N, N, replace=False)
            train_x = X.iloc[selected_sample_indexes, selected_features_indexes]
            train_y = Y.iloc[selected_sample_indexes]

            model.fit(train_x, train_y)
            self.models.append(model)
            self.features.append(selected_features_indexes)
        
    def predict(self, X):
        predictions = np.zeros(len(X))
        for i, model in enumerate(self.models):
            x_p = X.iloc[:, self.features[i]]
            predictions += model.predict(x_p)
        return np.sign(predictions/self.M)
    
    def score(self, X, y):
        return accuracy_score(y, self.predict(X))

In [27]:
class BaggedTree(BaseEstimator):
    def __init__(self, M): 
        self.M = M  
        self.models = []

    def fit(self, X, Y):
        N = len(X)
        
        for i in range(self.M):
            indexes = np.random.choice(N, size=N, replace=True)  
            X_train = X.iloc[indexes,:]
            Y_train = Y.iloc[indexes]  
            model = DecisionTreeClassifier(max_depth=2)
            model.fit(X_train, Y_train)
            self.models.append(model)

    def predict(self, X):
        predictions = np.zeros(len(X))
        for i, model in enumerate(self.models):
            predictions += model.predict(X)
        return np.round(predictions/self.M)
    
    def score(self, X, Y):
        y_pred = self.predict(X)
        return accuracy_score(Y, y_pred)      
    
model = BaggedTree(10)
model.fit(X, Y)
y_pred = model.predict(X)

### Evaluate model

In [28]:
models = [BaggedTree(M=100), FakeRanndomForest(M=100) ]

for model in models:
    # model.fit(X, Y)
    # r = model.predict(X)
    score = cross_val_score(model,X,Y, cv=4)
    print(model.__class__.__name__, score.mean())    

BaggedTree 0.8892171344165436
FakeRanndomForest 0.9236829148202855


### Predit on test dataset


### Output

In [7]:

# df = pd.DataFrame()
# df.to_csv('submission.csv',index = False, header=True)