# Dataset: Mushroom Classification
Link to a dataset: https://www.kaggle.com/datasets/uciml/mushroom-classification?resource=download

In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import make_scorer, accuracy_score, f1_score

from sklearn import model_selection
from sklearn.model_selection import (
    StratifiedKFold,
    cross_validate,
    GridSearchCV,
)

Standard models used:

In [2]:
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import ComplementNB

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

**fcalc** library files are located in **FCALC** folder

**FCALC** folder is located in the same directory as this notebook.

In [3]:
import FCALC.fcalc as fcalc

# Loading data

In [4]:
data_mush = pd.read_csv('mushrooms.csv', sep=',')

data_mush['class'] = data_mush['class'].replace({'p': True, 'e': False})
data_mush.rename(columns={'class': 'poisonous'}, inplace=True)

data_mush.dropna(inplace=True)
data_mush.drop_duplicates(inplace=True)

data_mush.head()

  data_mush['class'] = data_mush['class'].replace({'p': True, 'e': False})


Unnamed: 0,poisonous,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,True,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,False,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,False,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,True,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,False,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [5]:
print(data_mush['poisonous'].value_counts())


poisonous
False    4208
True     3916
Name: count, dtype: int64


In [6]:
data_mush_encoded = pd.get_dummies(data_mush, prefix_sep='_', dtype=float)
print(data_mush_encoded.corr()['poisonous'].sort_values(ascending=False))
print(data_mush_encoded.head())


poisonous                     1.000000
odor_f                        0.623842
stalk-surface-above-ring_k    0.587658
stalk-surface-below-ring_k    0.573524
gill-size_n                   0.540024
                                ...   
bruises_t                    -0.501530
gill-size_b                  -0.540024
ring-type_p                  -0.540469
odor_n                       -0.785557
veil-type_p                        NaN
Name: poisonous, Length: 118, dtype: float64
   poisonous  cap-shape_b  cap-shape_c  cap-shape_f  cap-shape_k  cap-shape_s  \
0       True          0.0          0.0          0.0          0.0          0.0   
1      False          0.0          0.0          0.0          0.0          0.0   
2      False          1.0          0.0          0.0          0.0          0.0   
3       True          0.0          0.0          0.0          0.0          0.0   
4      False          0.0          0.0          0.0          0.0          0.0   

   cap-shape_x  cap-surface_f  cap-surf

In [7]:
# Calculate correlations
correlations = data_mush_encoded.corr()['poisonous'].abs().sort_values(ascending=False)

# Set threshold for dropping features
threshold = 0.15
correlated_features = correlations[correlations > threshold].index.tolist()

# Remove the target column itself from the list
correlated_features.remove('poisonous')

# Drop these features
data_mush_encoded = data_mush_encoded.drop(columns=correlated_features)
data_mush_encoded.head()

Unnamed: 0,poisonous,cap-shape_c,cap-shape_f,cap-shape_s,cap-shape_x,cap-surface_g,cap-surface_s,cap-surface_y,cap-color_b,cap-color_c,...,spore-print-color_b,spore-print-color_o,spore-print-color_r,spore-print-color_u,spore-print-color_y,population_c,population_y,habitat_d,habitat_m,habitat_u
0,True,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,False,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,False,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,True,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,False,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
print(data_mush_encoded['veil-type_p'].value_counts(dropna=False))
data_mush_encoded = data_mush_encoded.drop(columns=['veil-type_p'])

veil-type_p
1.0    8124
Name: count, dtype: int64


In [9]:
from sklearn.utils import resample

majority = data_mush_encoded[data_mush_encoded['poisonous'] == 0]
minority = data_mush_encoded[data_mush_encoded['poisonous'] == 1]

majority_downsampled = resample(majority, 
                                replace=False,     # Sample without replacement
                                n_samples=len(minority), # Sample the same number as minority
                                random_state=42)

# Combine minority class with downsampled majority class
balanced_df = pd.concat([minority, majority_downsampled])

# Trim the dataset to 400 instances
if len(balanced_df) > 300:
    balanced_df = balanced_df.sample(n=300, random_state=42)


In [10]:
df = balanced_df
df.shape

(300, 58)

In [11]:
features = df.columns[1:-1]
target = df.columns[0]

In [12]:
assert target not in features, "Target column is included in features!"


In [13]:
X = df[features]
y = df[target]

# Testing models

In [14]:
best_parameters = { 
    'LogisticRegression': [],
    'KNeighborsClassifier': [],
    'MultinomialNB': [],
    'GaussianNB': [],
    'ComplementNB': [],
    'DecisionTreeClassifier': [],
    'RandomForestClassifier': [],
    'BinarizedBinaryClassifier': [],
    'PatternBinaryClassifier': [],
}

In [15]:
best_metrics = {
    'LogisticRegression': [],
    'KNeighborsClassifier': [],
    'MultinomialNB': [],
    'GaussianNB': [],
    'ComplementNB': [],
    'DecisionTreeClassifier': [],
    'RandomForestClassifier': [],
    'BinarizedBinaryClassifier': [],
    'PatternBinaryClassifier': [],  
}

In [16]:
scoring = {'accuracy' : make_scorer(accuracy_score),
           'f1_macro' : make_scorer(f1_score, average='macro'),
           'f1_binary' : make_scorer(f1_score),
          }

kfold = StratifiedKFold(n_splits=10, random_state=49, shuffle=True)

In [17]:
def count_metrics(results):
    acc = np.round(np.mean(results['test_accuracy']), 4) 
    f1_m = np.round(np.mean(results['test_f1_macro']), 4)
    f1_b = np.round(np.mean(results['test_f1_binary']), 4)
    return(acc, f1_m, f1_b)  

In [18]:
def print_results(results):
    acc, f1_m, f1_b = count_metrics(results)
    print(f'Accuracy = {acc:0.4f}, F1_binary = {f1_b:0.4f}, F1_macro = {f1_m:0.4f}')  

In [19]:
def fill_best_metrics(results, method):
    acc, f1_m, f1_b = count_metrics(results)
    best_metrics[method] = {'Accuracy': acc, 'F1_binary': f1_b, 'F1_macro': f1_m}

# Standard models

### Logistic regression

In [20]:
parameters = {'C' : np.linspace(1,5,101)}

model = LogisticRegression()

best = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    cv=kfold,
                    verbose=1, 
                    scoring=scoring,
                    refit='f1_binary'
                   )
best.fit(X, y)
best_parameters['LogisticRegression'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 101 candidates, totalling 1010 fits


{'C': np.float64(4.32)}

In [21]:
model = LogisticRegression(C=best.best_params_['C'])
results = cross_validate(estimator=model,
                         X=X, y=y,
                         cv=kfold,
                         scoring=scoring
                         )
fill_best_metrics(results, 'LogisticRegression')
print_results(results)

Accuracy = 0.8167, F1_binary = 0.8293, F1_macro = 0.8149


# K-Nearest Neighbours

In [22]:
parameters = {'n_neighbors' : range(5,78,4)}

model = KNeighborsClassifier()

best = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    cv=kfold,
                    verbose=1, 
                    scoring=scoring,
                    refit='f1_binary'
                   )
best.fit(X, y)
best_parameters['KNeighborsClassifier'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 19 candidates, totalling 190 fits


{'n_neighbors': 5}

In [23]:
model = KNeighborsClassifier(n_neighbors=best.best_params_['n_neighbors'])
results = cross_validate(estimator=model,
                         X=X,
                         y=y,
                         cv=kfold,
                         scoring=scoring
                         )
fill_best_metrics(results, 'KNeighborsClassifier')
print_results(results)

Accuracy = 0.9000, F1_binary = 0.9105, F1_macro = 0.8983


# Naive Bayes

##### MULTINOMIAL NB

In [24]:
parameters = {'alpha' : np.linspace(0.001,100.001,1001)}

model = MultinomialNB()

best = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    cv=kfold,
                    verbose=1, 
                    scoring=scoring,
                    refit='f1_binary'
                   )
best.fit(X, y)
best_parameters['MultinomialNB'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 1001 candidates, totalling 10010 fits


{'alpha': np.float64(0.30100000000000005)}

In [25]:
model = MultinomialNB(alpha=best.best_params_['alpha'])

results = cross_validate(estimator=model,
                         X=X,
                         y=y,
                         cv=kfold,
                         scoring=scoring
                         )
fill_best_metrics(results, 'MultinomialNB')
print_results(results)

Accuracy = 0.7600, F1_binary = 0.7828, F1_macro = 0.7562


##### GAUSSIAN NB

In [26]:
parameters = {'var_smoothing': np.logspace(0,-9, num=100)}

model = GaussianNB()

best = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    cv=kfold,
                    verbose=1, 
                    scoring=scoring,
                    refit='f1_binary'
                   )
best.fit(X, y)
best_parameters['GaussianNB'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


{'var_smoothing': np.float64(0.533669923120631)}

In [27]:
model = GaussianNB(var_smoothing=best.best_params_['var_smoothing'])

results = cross_validate(estimator=model,
                         X=X,
                         y=y,
                         cv=kfold,
                         scoring=scoring
                         )
fill_best_metrics(results, 'GaussianNB')
print_results(results)

Accuracy = 0.6633, F1_binary = 0.7579, F1_macro = 0.5977


##### COMPLEMENT NB

In [28]:
parameters = {'alpha' : np.linspace(0.001,100.001,1001)}

model =  ComplementNB()

best = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    cv=kfold,
                    verbose=1, 
                    scoring=scoring,
                    refit='f1_binary'
                   )
best.fit(X, y)
best_parameters['ComplementNB'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 1001 candidates, totalling 10010 fits


{'alpha': np.float64(0.201)}

In [29]:
model = ComplementNB(alpha=best.best_params_['alpha'])
results = cross_validate(estimator=model,
                         X=X,
                         y=y,
                         cv=kfold,
                         scoring=scoring
                         )
fill_best_metrics(results, 'ComplementNB')
print_results(results) 

Accuracy = 0.7567, F1_binary = 0.7736, F1_macro = 0.7548


### Decision tree

In [30]:
parameters = {'min_samples_split' : range(2,23,2),
              'max_depth' : range(2,21,2),
              'criterion' : ['gini', 'entropy']
             }

model = DecisionTreeClassifier(random_state=49)

best = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    cv=kfold,
                    verbose=1, 
                    scoring=scoring,
                    refit='f1_binary'
                   )
best.fit(X, y)
best_parameters['DecisionTreeClassifier'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 220 candidates, totalling 2200 fits


  _data = np.array(data, dtype=dtype, copy=copy,


{'criterion': 'entropy', 'max_depth': 8, 'min_samples_split': 8}

In [31]:
model = DecisionTreeClassifier(
    max_depth=best.best_params_['max_depth'], 
    min_samples_split=best.best_params_['min_samples_split'],
    criterion=best.best_params_['criterion'],
    random_state=49
)
results = cross_validate(estimator=model,
                         X=X,
                         y=y,
                         cv=kfold,
                         scoring=scoring
                         )
fill_best_metrics(results, 'DecisionTreeClassifier')
print_results(results)

Accuracy = 0.9667, F1_binary = 0.9691, F1_macro = 0.9664


### RandomForest

In [32]:
parameters = {'n_estimators' : range(40,121,10),
              'min_samples_split' : range(2,13,2),
              'max_depth' : range(2,21,2),
              'criterion' : ['gini', 'entropy']
             }
model = RandomForestClassifier(random_state=49)

best = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    cv=kfold,
                    verbose=1, 
                    scoring=scoring,
                    refit='f1_binary'
                   )
best.fit(X, y)
best_parameters['RandomForestClassifier'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 1080 candidates, totalling 10800 fits


{'criterion': 'gini',
 'max_depth': 16,
 'min_samples_split': 6,
 'n_estimators': 100}

In [33]:
model = RandomForestClassifier(
    n_estimators=best.best_params_['n_estimators'],
    min_samples_split=best.best_params_['min_samples_split'],
    max_depth=best.best_params_['max_depth'],
    criterion=best.best_params_['criterion'],
    random_state=49
)
results = cross_validate(estimator=model,
                         X=X,
                         y=y,
                         cv=kfold,
                         scoring=scoring
                         )
fill_best_metrics(results, 'RandomForestClassifier')
print_results(results)

Accuracy = 0.9600, F1_binary = 0.9636, F1_macro = 0.9596


# Lazy FCA

Due to the multilabel output of **BinarizedBinaryClassifier** (**1** for **True**, **0** for **false** and **-1** for **undefined**) we cannot utilize **f1_score** with **average='binary'**. Therefore, a function that interprets **undefined** as misclassification was implemented.

In [34]:
def compare_with_binary_f1_old(y_true, y_pred):
    y_tmp = np.concatenate(
        (np.array(y_true)[:,None],np.array(y_pred)[:,None]),
        axis=1
    )
    df_tmp = pd.DataFrame(y_tmp, columns=['y_true','y_pred'])
    df_tmp.y_true = df_tmp.y_true.astype(bool)
    df_tmp['y_new']= ~df_tmp.loc[df_tmp.y_pred==-1]['y_true']
    df_tmp.loc[df_tmp.y_pred!=-1, 'y_new'] = df_tmp.loc[df_tmp.y_pred!=-1,'y_pred'].astype(bool) 
    df_tmp.y_new = df_tmp.y_new.astype(bool)
    return(f1_score(df_tmp.y_true, df_tmp.y_new))

In [35]:
def results_fca(y_true, y_pred):
    f1 = f1_score(y_true, y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)
    f1_binary = compare_with_binary_f1(y_true, y_pred)
    return (accuracy, f1, f1_binary)  

In [36]:
def compare_with_binary_f1(y_true, y_pred):
    y_tmp = np.copy(y_pred)
    undef = y_pred == -1
    y_tmp[undef] = (y_true[undef] - np.ones(shape=y_tmp[undef].shape) * 2) // -2
    return (f1_score(y_true, y_tmp))

(0 - 2) // (-2) = 1

(1 - 2) // (-2) = 0

## BinarizedBinaryClassifier

**BinarizedBinaryClassifier** is not a **scikit** model, so **GridSearchCV** and **cross_validate** from **scikit** do not support it. Thus, we check desired parameters in **for**  loops.

In [37]:
# Ensure all data is numeric or binary
X = X.astype(float)

# If still problematic, consider a data preprocessing step to convert types
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
X = X.apply(le.fit_transform)

In [38]:
parameters = {'alpha' : np.linspace(0, 1, 21),
              'method': ['standard','standard-support','ratio-support'],
             }

n = kfold.get_n_splits(X)

f1_best = 0
alpha_best = 0.
method_best = 'standard'

for method in parameters['method']:
    for alpha in parameters['alpha']:
        
        f1 = [0 for i in range(n)]
        for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
            bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
                X.iloc[train_index].values, 
                y.iloc[train_index].to_numpy(),
                method=method,
                alpha=alpha
            )
            bin_cls.predict(X.iloc[test_index].values)
            f1[i] = f1_score(y.iloc[test_index], bin_cls.predictions, average='macro')
            print(f"[CV {i+1}/{n}] method={method}, alpha={alpha:0.2f}, f1_macro={f1[i]:0.4f}") 
                    
        f1_mean = np.mean(f1)
        print(f'f1_mean = {f1_mean:0.4f}\n')
        if(f1_mean > f1_best):
            f1_best = f1_mean
            alpha_best = alpha
            method_best = method

best_parameters['BinarizedBinaryClassifier'] = {'method': method_best, 'alpha': alpha_best}              
print(f"f1_best={f1_best:0.4f}, method={method_best}, alpha={alpha_best:0.2f}")

[CV 1/10] method=standard, alpha=0.00, f1_macro=0.0000
[CV 2/10] method=standard, alpha=0.00, f1_macro=0.0769
[CV 3/10] method=standard, alpha=0.00, f1_macro=0.0513
[CV 4/10] method=standard, alpha=0.00, f1_macro=0.0870
[CV 5/10] method=standard, alpha=0.00, f1_macro=0.0606
[CV 6/10] method=standard, alpha=0.00, f1_macro=0.1111
[CV 7/10] method=standard, alpha=0.00, f1_macro=0.0351
[CV 8/10] method=standard, alpha=0.00, f1_macro=0.0000
[CV 9/10] method=standard, alpha=0.00, f1_macro=0.0303
[CV 10/10] method=standard, alpha=0.00, f1_macro=0.0000
f1_mean = 0.0452

[CV 1/10] method=standard, alpha=0.05, f1_macro=0.0000
[CV 2/10] method=standard, alpha=0.05, f1_macro=0.0769
[CV 3/10] method=standard, alpha=0.05, f1_macro=0.0513
[CV 4/10] method=standard, alpha=0.05, f1_macro=0.0870
[CV 5/10] method=standard, alpha=0.05, f1_macro=0.0606
[CV 6/10] method=standard, alpha=0.05, f1_macro=0.1111
[CV 7/10] method=standard, alpha=0.05, f1_macro=0.0351
[CV 8/10] method=standard, alpha=0.05, f1_macr

Extra for ratio-support

In [39]:
parameters = {'alpha' : np.linspace(1, 10, 19),
              'method': ['ratio-support'],
             }

n = kfold.get_n_splits(X)

for method in parameters['method']:
    for alpha in parameters['alpha']:
        
        f1 = [0 for i in range(n)]
        for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
            bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
                X.iloc[train_index].values, 
                y.iloc[train_index].to_numpy(),
                method=method,
                alpha=alpha
            )
            bin_cls.predict(X.iloc[test_index].values)
            f1[i] = f1_score(y.iloc[test_index], bin_cls.predictions, average='macro')
            print(f"[CV {i+1}/{n}] method={method}, alpha={alpha:0.2f}, f1_macro={f1[i]:0.4f}") 
                    
        f1_mean = np.mean(f1)
        print(f'f1_mean = {f1_mean:0.4f}\n')
        if(f1_mean > f1_best):
            f1_best = f1_mean
            alpha_best = alpha
            method_best = method

best_parameters['BinarizedBinaryClassifier'] = {'method': method_best, 'alpha': alpha_best}            
print(f"f1_best={f1_best:0.4f}, method={method_best}, alpha={alpha_best:0.2f}")

[CV 1/10] method=ratio-support, alpha=1.00, f1_macro=0.1754
[CV 2/10] method=ratio-support, alpha=1.00, f1_macro=0.1944
[CV 3/10] method=ratio-support, alpha=1.00, f1_macro=0.2222
[CV 4/10] method=ratio-support, alpha=1.00, f1_macro=0.1270
[CV 5/10] method=ratio-support, alpha=1.00, f1_macro=0.1333
[CV 6/10] method=ratio-support, alpha=1.00, f1_macro=0.1212
[CV 7/10] method=ratio-support, alpha=1.00, f1_macro=0.0784
[CV 8/10] method=ratio-support, alpha=1.00, f1_macro=0.2609
[CV 9/10] method=ratio-support, alpha=1.00, f1_macro=0.1667
[CV 10/10] method=ratio-support, alpha=1.00, f1_macro=0.1667
f1_mean = 0.1646

[CV 1/10] method=ratio-support, alpha=1.50, f1_macro=0.0000
[CV 2/10] method=ratio-support, alpha=1.50, f1_macro=0.0000
[CV 3/10] method=ratio-support, alpha=1.50, f1_macro=0.0000
[CV 4/10] method=ratio-support, alpha=1.50, f1_macro=0.0000
[CV 5/10] method=ratio-support, alpha=1.50, f1_macro=0.0000
[CV 6/10] method=ratio-support, alpha=1.50, f1_macro=0.0000
[CV 7/10] method=rati

In [40]:
f1 = [0 for i in range(n)]
accuracy = [0 for i in range(n)]
f1_binary = [0 for i in range(n)]

for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
    bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
        X.iloc[train_index].values, 
        y.iloc[train_index].to_numpy(),
        method=method_best,
        alpha=alpha_best
    )
    bin_cls.predict(X.iloc[test_index].values)
    
    accuracy[i], f1[i], f1_binary[i] = results_fca(y.iloc[test_index], bin_cls.predictions)
    
    print(f"[CV {i+1}/{n}] method={method_best}, alpha={alpha_best:0.2f}", 
          f"accuracy={accuracy[i]:0.4f}, f1_binary={f1_binary[i]:0.4f}, f1_macro={f1[i]:0.4f}")

f1_mean = np.mean(f1)
accuracy_mean = np.mean(accuracy)
f1_binary_mean = np.mean(f1_binary)

best_metrics['BinarizedBinaryClassifier'] = {'Accuracy': accuracy_mean, 'F1_binary': f1_binary_mean, 'F1_macro': f1_mean}
print(f"\n accuracy={accuracy_mean:0.4f}, f1_binary={f1_binary_mean:0.4f}, f1_macro={f1_mean:0.4f}")

[CV 1/10] method=standard-support, alpha=1.00 accuracy=0.1667, f1_binary=0.0000, f1_macro=0.1754
[CV 2/10] method=standard-support, alpha=1.00 accuracy=0.2333, f1_binary=0.0000, f1_macro=0.1944
[CV 3/10] method=standard-support, alpha=1.00 accuracy=0.2667, f1_binary=0.0000, f1_macro=0.2222
[CV 4/10] method=standard-support, alpha=1.00 accuracy=0.1333, f1_binary=0.0000, f1_macro=0.1270
[CV 5/10] method=standard-support, alpha=1.00 accuracy=0.1333, f1_binary=0.0000, f1_macro=0.1333
[CV 6/10] method=standard-support, alpha=1.00 accuracy=0.1333, f1_binary=0.0000, f1_macro=0.1212
[CV 7/10] method=standard-support, alpha=1.00 accuracy=0.0667, f1_binary=0.0000, f1_macro=0.0784
[CV 8/10] method=standard-support, alpha=1.00 accuracy=0.3000, f1_binary=0.0000, f1_macro=0.2609
[CV 9/10] method=standard-support, alpha=1.00 accuracy=0.1667, f1_binary=0.0000, f1_macro=0.1667
[CV 10/10] method=standard-support, alpha=1.00 accuracy=0.1667, f1_binary=0.0000, f1_macro=0.1667

 accuracy=0.1767, f1_binary=

## PatternBinaryClassifier

In [41]:
parameters = {'alpha' : np.linspace(0, 1, 21),
              'method': ['standard','standard-support','ratio-support'],
             }

n = kfold.get_n_splits(X)

f1_best_p = 0
alpha_best_p = 0.
method_best_p = 'standard'

for method in parameters['method']:
    for alpha in parameters['alpha']:
        
        f1 = [0 for i in range(n)]
        for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
            bin_cls = fcalc.classifier.PatternBinaryClassifier(
                X.iloc[train_index].values, 
                y.iloc[train_index].to_numpy(),
                method=method,
                alpha=alpha
            )
            bin_cls.predict(X.iloc[test_index].values)
            f1[i] = f1_score(y.iloc[test_index], bin_cls.predictions, average='macro')
            print(f"[CV {i+1}/{n}] method={method}, alpha={alpha:0.2f}, f1_macro={f1[i]:0.4f}") 
                    
        f1_mean = np.mean(f1)
        print(f'f1_mean = {f1_mean:0.4f}\n')
        if(f1_mean > f1_best_p):
            f1_best_p = f1_mean
            alpha_best_p = alpha
            method_best_p = method

best_parameters['PatternBinaryClassifier'] = {'method': method_best_p, 'alpha': alpha_best_p}              
print(f"f1_best={f1_best_p:0.4f}, method={method_best_p}, alpha={alpha_best_p:0.2f}")

[CV 1/10] method=standard, alpha=0.00, f1_macro=0.9663
[CV 2/10] method=standard, alpha=0.00, f1_macro=0.9663
[CV 3/10] method=standard, alpha=0.00, f1_macro=0.9321
[CV 4/10] method=standard, alpha=0.00, f1_macro=0.8971
[CV 5/10] method=standard, alpha=0.00, f1_macro=0.9321
[CV 6/10] method=standard, alpha=0.00, f1_macro=0.8643
[CV 7/10] method=standard, alpha=0.00, f1_macro=0.8611
[CV 8/10] method=standard, alpha=0.00, f1_macro=0.8643
[CV 9/10] method=standard, alpha=0.00, f1_macro=0.8971
[CV 10/10] method=standard, alpha=0.00, f1_macro=0.8286
f1_mean = 0.9009

[CV 1/10] method=standard, alpha=0.05, f1_macro=0.7222
[CV 2/10] method=standard, alpha=0.05, f1_macro=0.7600
[CV 3/10] method=standard, alpha=0.05, f1_macro=0.6411
[CV 4/10] method=standard, alpha=0.05, f1_macro=0.6032
[CV 5/10] method=standard, alpha=0.05, f1_macro=0.7129
[CV 6/10] method=standard, alpha=0.05, f1_macro=0.6250
[CV 7/10] method=standard, alpha=0.05, f1_macro=0.6534
[CV 8/10] method=standard, alpha=0.05, f1_macr

Extra for ratio-support

In [42]:
parameters = {'alpha' : np.linspace(0, 10, 21),
              'method': ['ratio-support'],
             }

n = kfold.get_n_splits(X)

for method in parameters['method']:
    for alpha in parameters['alpha']:
        
        f1 = [0 for i in range(n)]
        for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
            bin_cls = fcalc.classifier.PatternBinaryClassifier(
                X.iloc[train_index].values, 
                y.iloc[train_index].to_numpy(),
                method=method,
                alpha=alpha
            )
            bin_cls.predict(X.iloc[test_index].values)
            f1[i] = f1_score(y.iloc[test_index], bin_cls.predictions, average='macro')
            print(f"[CV {i+1}/{n}] method={method}, alpha={alpha:0.2f}, f1_macro={f1[i]:0.4f}") 
                    
        f1_mean = np.mean(f1)
        print(f'f1_mean = {f1_mean:0.4f}\n')
        if(f1_mean > f1_best_p):
            f1_best_p = f1_mean
            alpha_best_p = alpha
            method_best_p = method

best_parameters['PatternBinaryClassifier'] = {'method': method_best_p, 'alpha': alpha_best_p}            
print(f"f1_best={f1_best_p:0.4f}, method={method_best_p}, alpha={alpha_best_p:0.2f}")

[CV 1/10] method=ratio-support, alpha=0.00, f1_macro=0.7847
[CV 2/10] method=ratio-support, alpha=0.00, f1_macro=0.7847
[CV 3/10] method=ratio-support, alpha=0.00, f1_macro=0.6250
[CV 4/10] method=ratio-support, alpha=0.00, f1_macro=0.7000
[CV 5/10] method=ratio-support, alpha=0.00, f1_macro=0.7000
[CV 6/10] method=ratio-support, alpha=0.00, f1_macro=0.7436
[CV 7/10] method=ratio-support, alpha=0.00, f1_macro=0.7000
[CV 8/10] method=ratio-support, alpha=0.00, f1_macro=0.8237
[CV 9/10] method=ratio-support, alpha=0.00, f1_macro=0.7000
[CV 10/10] method=ratio-support, alpha=0.00, f1_macro=0.5764
f1_mean = 0.7138

[CV 1/10] method=ratio-support, alpha=0.50, f1_macro=0.8237
[CV 2/10] method=ratio-support, alpha=0.50, f1_macro=0.8237
[CV 3/10] method=ratio-support, alpha=0.50, f1_macro=0.6411
[CV 4/10] method=ratio-support, alpha=0.50, f1_macro=0.7129
[CV 5/10] method=ratio-support, alpha=0.50, f1_macro=0.7436
[CV 6/10] method=ratio-support, alpha=0.50, f1_macro=0.6703
[CV 7/10] method=rati

In [43]:
f1 = [0 for i in range(n)]
accuracy = [0 for i in range(n)]
f1_binary = [0 for i in range(n)]

for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
    bin_cls = fcalc.classifier.PatternBinaryClassifier(
        X.iloc[train_index].values, 
        y.iloc[train_index].to_numpy(),
        method=method_best_p,
        alpha=alpha_best_p
    )
    bin_cls.predict(X.iloc[test_index].values)
    
    accuracy[i], f1[i], f1_binary[i] = results_fca(y.iloc[test_index], bin_cls.predictions)
    
    print(f"[CV {i+1}/{n}] method={method_best_p}, alpha={alpha_best_p:0.2f}", 
          f"accuracy={accuracy[i]:0.4f}, f1_binary={f1_binary[i]:0.4f}, f1_macro={f1[i]:0.4f}")

f1_mean = np.mean(f1)
accuracy_mean = np.mean(accuracy)
f1_binary_mean = np.mean(f1_binary)

best_metrics['PatternBinaryClassifier'] = {'Accuracy': accuracy_mean, 'F1_binary': f1_binary_mean, 'F1_macro': f1_mean}
print(f"\n accuracy={accuracy_mean:0.4f}, f1_binary={f1_binary_mean:0.4f}, f1_macro={f1_mean:0.4f}")

[CV 1/10] method=standard, alpha=0.00 accuracy=0.9667, f1_binary=0.9697, f1_macro=0.9663
[CV 2/10] method=standard, alpha=0.00 accuracy=0.9667, f1_binary=0.9697, f1_macro=0.9663
[CV 3/10] method=standard, alpha=0.00 accuracy=0.9333, f1_binary=0.9412, f1_macro=0.9321
[CV 4/10] method=standard, alpha=0.00 accuracy=0.9000, f1_binary=0.9143, f1_macro=0.8971
[CV 5/10] method=standard, alpha=0.00 accuracy=0.9333, f1_binary=0.9412, f1_macro=0.9321
[CV 6/10] method=standard, alpha=0.00 accuracy=0.8667, f1_binary=0.8824, f1_macro=0.8643
[CV 7/10] method=standard, alpha=0.00 accuracy=0.8667, f1_binary=0.8889, f1_macro=0.8611
[CV 8/10] method=standard, alpha=0.00 accuracy=0.8667, f1_binary=0.8824, f1_macro=0.8643
[CV 9/10] method=standard, alpha=0.00 accuracy=0.9000, f1_binary=0.9143, f1_macro=0.8971
[CV 10/10] method=standard, alpha=0.00 accuracy=0.8333, f1_binary=0.8571, f1_macro=0.8286

 accuracy=0.9033, f1_binary=0.9161, f1_macro=0.9009


# Overall

In [44]:
for i in best_parameters:
    print(i)
    print(best_parameters[i])
    print()

LogisticRegression
{'C': np.float64(4.32)}

KNeighborsClassifier
{'n_neighbors': 5}

MultinomialNB
{'alpha': np.float64(0.30100000000000005)}

GaussianNB
{'var_smoothing': np.float64(0.533669923120631)}

ComplementNB
{'alpha': np.float64(0.201)}

DecisionTreeClassifier
{'criterion': 'entropy', 'max_depth': 8, 'min_samples_split': 8}

RandomForestClassifier
{'criterion': 'gini', 'max_depth': 16, 'min_samples_split': 6, 'n_estimators': 100}

BinarizedBinaryClassifier
{'method': 'standard-support', 'alpha': np.float64(1.0)}

PatternBinaryClassifier
{'method': 'standard', 'alpha': np.float64(0.0)}



In [45]:
for i in best_metrics:
    print(i)
    print(best_metrics[i])
    print()

LogisticRegression
{'Accuracy': np.float64(0.8167), 'F1_binary': np.float64(0.8293), 'F1_macro': np.float64(0.8149)}

KNeighborsClassifier
{'Accuracy': np.float64(0.9), 'F1_binary': np.float64(0.9105), 'F1_macro': np.float64(0.8983)}

MultinomialNB
{'Accuracy': np.float64(0.76), 'F1_binary': np.float64(0.7828), 'F1_macro': np.float64(0.7562)}

GaussianNB
{'Accuracy': np.float64(0.6633), 'F1_binary': np.float64(0.7579), 'F1_macro': np.float64(0.5977)}

ComplementNB
{'Accuracy': np.float64(0.7567), 'F1_binary': np.float64(0.7736), 'F1_macro': np.float64(0.7548)}

DecisionTreeClassifier
{'Accuracy': np.float64(0.9667), 'F1_binary': np.float64(0.9691), 'F1_macro': np.float64(0.9664)}

RandomForestClassifier
{'Accuracy': np.float64(0.96), 'F1_binary': np.float64(0.9636), 'F1_macro': np.float64(0.9596)}

BinarizedBinaryClassifier
{'Accuracy': np.float64(0.1766666666666667), 'F1_binary': np.float64(0.0), 'F1_macro': np.float64(0.16462691157872206)}

PatternBinaryClassifier
{'Accuracy': np.flo

# Extra: Expandad table for BinarizedBinaryClassifier

In [46]:
X_expanded = ~X
anticolumns = X_expanded.columns.to_list()
for i in range(len(anticolumns)):
    anticolumns[i] = 'NOT_' + anticolumns[i]
X_expanded.columns = anticolumns
X_expanded = pd.concat([X, X_expanded],axis=1)
X_expanded.head()

Unnamed: 0,cap-shape_c,cap-shape_f,cap-shape_s,cap-shape_x,cap-surface_g,cap-surface_s,cap-surface_y,cap-color_b,cap-color_c,cap-color_e,...,NOT_ring-type_n,NOT_spore-print-color_b,NOT_spore-print-color_o,NOT_spore-print-color_r,NOT_spore-print-color_u,NOT_spore-print-color_y,NOT_population_c,NOT_population_y,NOT_habitat_d,NOT_habitat_m
4531,0,1,0,0,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-2,-1,-1
2173,0,0,0,1,0,0,1,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-2,-1
6284,0,1,0,0,0,1,0,0,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
5138,0,1,0,0,0,0,1,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-2,-1,-1
2109,0,1,0,0,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-2,-1


In [47]:
parameters = {'alpha' : np.linspace(0, 1, 21),
              'method': ['standard','standard-support','ratio-support'],
             }

n = kfold.get_n_splits(X_expanded)

f1_best_e = 0
alpha_best_e = 0.
method_best_e = 'standard'

for method in parameters['method']:
    for alpha in parameters['alpha']:
        
        f1 = [0 for i in range(n)]
        for i, (train_index, test_index) in enumerate(kfold.split(X_expanded, y)):
            bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
                X_expanded.iloc[train_index].values, 
                y.iloc[train_index].to_numpy(),
                method=method,
                alpha=alpha
            )
            bin_cls.predict(X_expanded.iloc[test_index].values)
            f1[i] = f1_score(y.iloc[test_index], bin_cls.predictions, average='macro')
            print(f"[CV {i+1}/{n}] method={method}, alpha={alpha:0.2f}, f1_macro={f1[i]:0.4f}") 
                    
        f1_mean = np.mean(f1)
        print(f'f1_mean = {f1_mean:0.4f}\n')
        if(f1_mean > f1_best_e):
            f1_best_e = f1_mean
            alpha_best_e = alpha
            method_best_e = method

best_parameters_expanded = {'method': method_best_e, 'alpha': alpha_best_e}             
print(f"f1_best={f1_best_e:0.4f}, method={method_best_e}, alpha={alpha_best_e:0.2f}")

[CV 1/10] method=standard, alpha=0.00, f1_macro=0.0000
[CV 2/10] method=standard, alpha=0.00, f1_macro=0.0000
[CV 3/10] method=standard, alpha=0.00, f1_macro=0.0000
[CV 4/10] method=standard, alpha=0.00, f1_macro=0.0000
[CV 5/10] method=standard, alpha=0.00, f1_macro=0.0000
[CV 6/10] method=standard, alpha=0.00, f1_macro=0.0000
[CV 7/10] method=standard, alpha=0.00, f1_macro=0.0000
[CV 8/10] method=standard, alpha=0.00, f1_macro=0.0000
[CV 9/10] method=standard, alpha=0.00, f1_macro=0.0000
[CV 10/10] method=standard, alpha=0.00, f1_macro=0.0000
f1_mean = 0.0000

[CV 1/10] method=standard, alpha=0.05, f1_macro=0.0000
[CV 2/10] method=standard, alpha=0.05, f1_macro=0.0000
[CV 3/10] method=standard, alpha=0.05, f1_macro=0.0000
[CV 4/10] method=standard, alpha=0.05, f1_macro=0.0000
[CV 5/10] method=standard, alpha=0.05, f1_macro=0.0000
[CV 6/10] method=standard, alpha=0.05, f1_macro=0.0000
[CV 7/10] method=standard, alpha=0.05, f1_macro=0.0000
[CV 8/10] method=standard, alpha=0.05, f1_macr

In [48]:
parameters = {'alpha' : np.linspace(1, 10, 19),
              'method': ['ratio-support'],
             }

n = kfold.get_n_splits(X_expanded)

for method in parameters['method']:
    for alpha in parameters['alpha']:
        
        f1 = [0 for i in range(n)]
        for i, (train_index, test_index) in enumerate(kfold.split(X_expanded, y)):
            bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
                X_expanded.iloc[train_index].values, 
                y.iloc[train_index].to_numpy(),
                method=method,
                alpha=alpha
            )
            bin_cls.predict(X_expanded.iloc[test_index].values)
            f1[i] = f1_score(y.iloc[test_index], bin_cls.predictions, average='macro')
            print(f"[CV {i+1}/{n}] method={method}, alpha={alpha:0.2f}, f1_macro={f1[i]:0.4f}") 
                    
        f1_mean = np.mean(f1)
        print(f'f1_mean = {f1_mean:0.4f}\n')
        if(f1_mean > f1_best_e):
            f1_best_e = f1_mean
            alpha_best_e = alpha
            method_best_e = method

best_parameters_expanded = {'method': method_best_e, 'alpha': alpha_best_e}            
print(f"f1_best={f1_best_e:0.4f}, method={method_best_e}, alpha={alpha_best_e:0.2f}")

[CV 1/10] method=ratio-support, alpha=1.00, f1_macro=0.0000
[CV 2/10] method=ratio-support, alpha=1.00, f1_macro=0.0000
[CV 3/10] method=ratio-support, alpha=1.00, f1_macro=0.0000
[CV 4/10] method=ratio-support, alpha=1.00, f1_macro=0.0000
[CV 5/10] method=ratio-support, alpha=1.00, f1_macro=0.0000
[CV 6/10] method=ratio-support, alpha=1.00, f1_macro=0.0000
[CV 7/10] method=ratio-support, alpha=1.00, f1_macro=0.0000
[CV 8/10] method=ratio-support, alpha=1.00, f1_macro=0.0000
[CV 9/10] method=ratio-support, alpha=1.00, f1_macro=0.0000
[CV 10/10] method=ratio-support, alpha=1.00, f1_macro=0.0000
f1_mean = 0.0000

[CV 1/10] method=ratio-support, alpha=1.50, f1_macro=0.0000
[CV 2/10] method=ratio-support, alpha=1.50, f1_macro=0.0000
[CV 3/10] method=ratio-support, alpha=1.50, f1_macro=0.0000
[CV 4/10] method=ratio-support, alpha=1.50, f1_macro=0.0000
[CV 5/10] method=ratio-support, alpha=1.50, f1_macro=0.0000
[CV 6/10] method=ratio-support, alpha=1.50, f1_macro=0.0000
[CV 7/10] method=rati

In [49]:
f1 = [0 for i in range(n)]
accuracy = [0 for i in range(n)]
f1_binary = [0 for i in range(n)]

for i, (train_index, test_index) in enumerate(kfold.split(X_expanded, y)):
    bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
        X_expanded.iloc[train_index].values, 
        y.iloc[train_index].to_numpy(),
        method=method_best_e,
        alpha=alpha_best_e
    )
    bin_cls.predict(X_expanded.iloc[test_index].values)
    
    accuracy[i], f1[i], f1_binary[i] = results_fca(y.iloc[test_index], bin_cls.predictions)
    
    print(f"[CV {i+1}/{n}] method={method_best_e}, alpha={alpha_best_e:0.2f}", 
          f"accuracy={accuracy[i]:0.4f}, f1_macro={f1[i]:0.4f}, f1_binary={f1_binary[i]:0.4f}")

f1_mean = np.mean(f1)
accuracy_mean = np.mean(accuracy)
f1_binary_mean = np.mean(f1_binary)

best_metrics_expanded = {'Accuracy': accuracy_mean, 'F1_binary': f1_binary_mean, 'F1_macro': f1_mean}
print(f"\n accuracy={accuracy_mean:0.4f}, f1_binary={f1_binary_mean:0.4f}, f1_macro={f1_mean:0.4f}")

[CV 1/10] method=standard, alpha=0.00 accuracy=0.0000, f1_macro=0.0000, f1_binary=0.0000
[CV 2/10] method=standard, alpha=0.00 accuracy=0.0000, f1_macro=0.0000, f1_binary=0.0000
[CV 3/10] method=standard, alpha=0.00 accuracy=0.0000, f1_macro=0.0000, f1_binary=0.0000
[CV 4/10] method=standard, alpha=0.00 accuracy=0.0000, f1_macro=0.0000, f1_binary=0.0000
[CV 5/10] method=standard, alpha=0.00 accuracy=0.0000, f1_macro=0.0000, f1_binary=0.0000
[CV 6/10] method=standard, alpha=0.00 accuracy=0.0000, f1_macro=0.0000, f1_binary=0.0000
[CV 7/10] method=standard, alpha=0.00 accuracy=0.0000, f1_macro=0.0000, f1_binary=0.0000
[CV 8/10] method=standard, alpha=0.00 accuracy=0.0000, f1_macro=0.0000, f1_binary=0.0000
[CV 9/10] method=standard, alpha=0.00 accuracy=0.0000, f1_macro=0.0000, f1_binary=0.0000
[CV 10/10] method=standard, alpha=0.00 accuracy=0.0000, f1_macro=0.0000, f1_binary=0.0000

 accuracy=0.0000, f1_binary=0.0000, f1_macro=0.0000


In [50]:
best_parameters_expanded

{'method': 'standard', 'alpha': 0.0}

In [51]:
best_metrics_expanded

{'Accuracy': np.float64(0.0),
 'F1_binary': np.float64(0.0),
 'F1_macro': np.float64(0.0)}