# Dataset: COVID, FLU, COLD Symptoms
Link to the dataset: https://www.kaggle.com/datasets/walterconway/covid-flu-cold-symptoms

In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import make_scorer, accuracy_score, f1_score

from sklearn import model_selection
from sklearn.model_selection import (
    StratifiedKFold,
    cross_validate,
    GridSearchCV,
)

Standard models used:

In [2]:
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import ComplementNB

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

The **fcalc** library files are located in the **FCALC** folder.

The **FCALC** folder must be located in the same directory as this notebook.

In [3]:
import FCALC.fcalc as fcalc

# Loading data

In [4]:
# Read the symptoms table from the local CSV file (comma is read_csv's default separator).
data_cfc = pd.read_csv('Datasets/covid_flu_cold.csv')
data_cfc

Unnamed: 0,COUGH,MUSCLE_ACHES,TIREDNESS,SORE_THROAT,RUNNY_NOSE,STUFFY_NOSE,FEVER,NAUSEA,VOMITING,DIARRHEA,...,DIFFICULTY_BREATHING,LOSS_OF_TASTE,LOSS_OF_SMELL,ITCHY_NOSE,ITCHY_EYES,ITCHY_MOUTH,ITCHY_INNER_EAR,SNEEZING,PINK_EYE,TYPE
0,0,0,1,0,1,0,0,0,0,0,...,0,1,0,1,0,0,1,0,1,ALLERGY
1,0,0,1,0,0,0,0,0,0,0,...,0,1,0,1,0,1,1,1,1,ALLERGY
2,0,1,1,1,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,ALLERGY
3,0,0,0,1,1,0,0,0,0,0,...,0,1,1,0,0,1,0,1,1,ALLERGY
4,0,0,1,0,1,0,0,0,0,0,...,0,1,1,0,1,0,1,1,1,ALLERGY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44448,1,0,0,1,0,0,1,0,1,1,...,0,0,0,0,0,0,0,1,0,FLU
44449,1,1,0,1,1,0,1,1,0,1,...,0,1,1,0,0,0,0,1,0,FLU
44450,0,0,1,0,1,0,0,0,1,1,...,1,1,0,0,0,0,0,1,0,FLU
44451,0,0,0,1,1,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,FLU


In [5]:
# One indicator column (0/1 integers) per distinct value of TYPE.
one_hot = pd.get_dummies(data_cfc['TYPE'], dtype=int)
one_hot.head()

Unnamed: 0,ALLERGY,COLD,COVID,FLU
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0


In [6]:
one_hot.value_counts()

ALLERGY  COLD  COVID  FLU
0        0     0      1      25000
1        0     0      0      16381
0        0     1      0       2048
         1     0      0       1024
Name: count, dtype: int64

In [7]:
# Build the binary target (COVID indicator) and keep only the COLD / COVID rows.
# Every step returns a new frame: nothing is modified in place on a filtered
# slice, which is the pattern that triggers pandas' SettingWithCopyWarning.
is_cold_or_covid = (one_hot.COLD == 1) | (one_hot.COVID == 1)
data_cfc = (
    data_cfc
    .assign(COVID=one_hot['COVID'])   # target becomes the last column
    .drop(columns=['TYPE'])           # the original multi-class label is no longer needed
    .loc[is_cold_or_covid]
    .drop_duplicates()
)
data_cfc

Unnamed: 0,COUGH,MUSCLE_ACHES,TIREDNESS,SORE_THROAT,RUNNY_NOSE,STUFFY_NOSE,FEVER,NAUSEA,VOMITING,DIARRHEA,...,DIFFICULTY_BREATHING,LOSS_OF_TASTE,LOSS_OF_SMELL,ITCHY_NOSE,ITCHY_EYES,ITCHY_MOUTH,ITCHY_INNER_EAR,SNEEZING,PINK_EYE,COVID
16381,0,1,1,0,1,0,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
16382,1,1,0,1,1,0,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
16383,0,1,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,1,0,0
16384,1,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
16385,1,0,0,1,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19448,0,1,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1
19449,0,0,1,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,1,0,1
19450,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1
19451,0,1,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,1


In [8]:
data_cfc.shape

(3072, 21)

In [9]:
data_cfc.COVID.value_counts(), data_cfc.COVID.value_counts(normalize=True)

(COVID
 1    2048
 0    1024
 Name: count, dtype: int64,
 COVID
 1    0.666667
 0    0.333333
 Name: proportion, dtype: float64)

In [10]:
# Cast every column, including the COVID target, to bool.
data_cfc = data_cfc.astype(bool)
data_cfc

Unnamed: 0,COUGH,MUSCLE_ACHES,TIREDNESS,SORE_THROAT,RUNNY_NOSE,STUFFY_NOSE,FEVER,NAUSEA,VOMITING,DIARRHEA,...,DIFFICULTY_BREATHING,LOSS_OF_TASTE,LOSS_OF_SMELL,ITCHY_NOSE,ITCHY_EYES,ITCHY_MOUTH,ITCHY_INNER_EAR,SNEEZING,PINK_EYE,COVID
16381,False,True,True,False,True,False,True,False,False,False,...,False,True,False,False,False,False,False,True,False,False
16382,True,True,False,True,True,False,True,False,False,False,...,False,True,False,False,False,False,False,True,False,False
16383,False,True,False,False,False,False,False,False,False,False,...,False,True,True,False,False,False,False,True,False,False
16384,True,True,True,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
16385,True,False,False,True,False,False,True,False,False,False,...,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19448,False,True,False,False,False,False,True,True,False,False,...,False,False,False,False,False,False,False,False,False,True
19449,False,False,True,False,False,False,False,False,True,False,...,True,False,False,False,False,False,False,True,False,True
19450,False,False,False,False,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,True
19451,False,True,False,True,False,False,False,False,False,False,...,True,False,False,False,False,False,False,True,False,True


In [11]:
# Draw the same fraction (500/3072) from each COVID class, then shuffle the
# selected rows; both steps are seeded for reproducibility.
df = (
    data_cfc
    .groupby('COVID')
    .sample(frac=500/3072, random_state=49)
    .sample(frac=1, random_state=49)
)
df.head()

Unnamed: 0,COUGH,MUSCLE_ACHES,TIREDNESS,SORE_THROAT,RUNNY_NOSE,STUFFY_NOSE,FEVER,NAUSEA,VOMITING,DIARRHEA,...,DIFFICULTY_BREATHING,LOSS_OF_TASTE,LOSS_OF_SMELL,ITCHY_NOSE,ITCHY_EYES,ITCHY_MOUTH,ITCHY_INNER_EAR,SNEEZING,PINK_EYE,COVID
18280,True,False,True,True,False,False,False,True,True,False,...,False,False,False,False,False,False,False,True,False,True
18704,False,True,True,False,False,False,True,False,False,False,...,True,False,False,False,False,False,False,False,False,True
18441,False,True,True,True,False,False,True,True,True,True,...,False,False,False,False,False,False,False,False,False,True
18651,True,False,True,True,False,False,True,False,False,True,...,False,False,False,False,False,False,False,False,False,True
17038,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [12]:
df.shape

(500, 21)

In [13]:
df.COVID.value_counts(normalize=True)

COVID
True     0.666
False    0.334
Name: proportion, dtype: float64

In [14]:
# The last column is the target; every column before it is a feature.
features, target = df.columns[:-1], df.columns[-1]

In [15]:
# Feature matrix and target vector of the sampled subset.
X, y = df[features], df[target]

# Testing models

In [16]:
# Placeholder (an independent empty list) per model; each entry is overwritten
# with the tuned hyper-parameters once the corresponding search has run.
best_parameters = {
    model_name: []
    for model_name in (
        'LogisticRegression',
        'KNeighborsClassifier',
        'MultinomialNB',
        'GaussianNB',
        'ComplementNB',
        'DecisionTreeClassifier',
        'RandomForestClassifier',
        'BinarizedBinaryClassifier',
        'PatternBinaryClassifier',
    )
}

In [17]:
# Placeholder (an independent empty list) per model; each entry is overwritten
# with that model's cross-validated metrics later in the notebook.
best_metrics = {
    model_name: []
    for model_name in (
        'LogisticRegression',
        'KNeighborsClassifier',
        'MultinomialNB',
        'GaussianNB',
        'ComplementNB',
        'DecisionTreeClassifier',
        'RandomForestClassifier',
        'BinarizedBinaryClassifier',
        'PatternBinaryClassifier',
    )
}

In [18]:
# Metrics computed on every fold: accuracy, macro-averaged F1 and F1 with
# f1_score's default averaging.
scoring = dict(
    accuracy=make_scorer(accuracy_score),
    f1_macro=make_scorer(f1_score, average='macro'),
    f1_binary=make_scorer(f1_score),
)

# One shared, seeded, shuffled 10-fold stratified splitter used by all evaluations.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=49)

In [19]:
def count_metrics(results):
    """Average the per-fold test scores and round each mean to 4 decimals.

    Parameters
    ----------
    results : mapping
        Must provide the keys 'test_accuracy', 'test_f1_macro' and
        'test_f1_binary', each holding the per-fold scores.

    Returns
    -------
    tuple
        (accuracy, f1_macro, f1_binary) means, in that order.
    """
    score_keys = ('test_accuracy', 'test_f1_macro', 'test_f1_binary')
    return tuple(np.round(np.mean(results[key]), 4) for key in score_keys)

In [20]:
def print_results(results):
    """Print the mean accuracy, binary F1 and macro F1 of a cross-validation run.

    `results` is passed unchanged to `count_metrics`.
    """
    acc, f1_m, f1_b = count_metrics(results)
    template = 'Accuracy = {:0.4f}, F1_binary = {:0.4f}, F1_macro = {:0.4f}'
    print(template.format(acc, f1_b, f1_m))

In [21]:
def fill_best_metrics(results, method):
    """Store the averaged metrics of `results` in `best_metrics[method]`.

    The stored value is a dict with the keys 'Accuracy', 'F1_binary' and
    'F1_macro', computed by `count_metrics`.
    """
    acc, f1_m, f1_b = count_metrics(results)
    best_metrics[method] = dict(Accuracy=acc, F1_binary=f1_b, F1_macro=f1_m)

# Standard models

### Logistic regression

In [22]:
# Search C over 101 evenly spaced values in [1, 5]; the winner is picked by binary F1.
model = LogisticRegression()
parameters = {'C': np.linspace(1, 5, 101)}

best = GridSearchCV(
    model,
    parameters,
    scoring=scoring,
    refit='f1_binary',
    cv=kfold,
    verbose=1,
).fit(X, y)

best_parameters['LogisticRegression'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 101 candidates, totalling 1010 fits


{'C': 1.0}

In [23]:
# Cross-validate a fresh model built with the tuned C and record its metrics.
model = LogisticRegression(**best.best_params_)
results = cross_validate(model, X, y, scoring=scoring, cv=kfold)
fill_best_metrics(results, 'LogisticRegression')
print_results(results)

Accuracy = 0.9800, F1_binary = 0.9852, F1_macro = 0.9771


In [24]:
# Cross-validate the same model on the rows that were not sampled into `df`.
holdout = data_cfc[~data_cfc.index.isin(df.index)]
results = cross_validate(
    model,
    holdout.iloc[:, :-1],   # features: every column but the last
    holdout.iloc[:, -1],    # target: the last column
    scoring=scoring,
    cv=kfold,
)
print_results(results)

Accuracy = 0.9743, F1_binary = 0.9809, F1_macro = 0.9709


### K-Nearest Neighbours

In [25]:
# Search n_neighbors over 4, 7, ..., 61; the winner is picked by binary F1.
model = KNeighborsClassifier()
parameters = {'n_neighbors': range(4, 64, 3)}

best = GridSearchCV(
    model,
    parameters,
    scoring=scoring,
    refit='f1_binary',
    cv=kfold,
    verbose=1,
).fit(X, y)

best_parameters['KNeighborsClassifier'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 20 candidates, totalling 200 fits


{'n_neighbors': 61}

In [26]:
# Cross-validate a fresh model built with the tuned n_neighbors and record its metrics.
model = KNeighborsClassifier(**best.best_params_)
results = cross_validate(model, X, y, scoring=scoring, cv=kfold)
fill_best_metrics(results, 'KNeighborsClassifier')
print_results(results)

Accuracy = 0.9860, F1_binary = 0.9898, F1_macro = 0.9838


In [27]:
# Cross-validate the same model on the rows that were not sampled into `df`.
holdout = data_cfc[~data_cfc.index.isin(df.index)]
results = cross_validate(
    model,
    holdout.iloc[:, :-1],   # features: every column but the last
    holdout.iloc[:, -1],    # target: the last column
    scoring=scoring,
    cv=kfold,
)
print_results(results)

Accuracy = 0.9763, F1_binary = 0.9824, F1_macro = 0.9730


### Naive Bayes

##### MULTINOMIAL NB

In [28]:
# Search alpha over 1001 evenly spaced values in [0.001, 100.001]; the winner is picked by binary F1.
model = MultinomialNB()
parameters = {'alpha': np.linspace(0.001, 100.001, 1001)}

best = GridSearchCV(
    model,
    parameters,
    scoring=scoring,
    refit='f1_binary',
    cv=kfold,
    verbose=1,
).fit(X, y)

best_parameters['MultinomialNB'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 1001 candidates, totalling 10010 fits


{'alpha': 0.001}

In [29]:
# Cross-validate a fresh model built with the tuned alpha and record its metrics.
model = MultinomialNB(**best.best_params_)
results = cross_validate(model, X, y, scoring=scoring, cv=kfold)
fill_best_metrics(results, 'MultinomialNB')
print_results(results)

Accuracy = 0.9800, F1_binary = 0.9854, F1_macro = 0.9769


In [30]:
# Cross-validate the same model on the rows that were not sampled into `df`.
holdout = data_cfc[~data_cfc.index.isin(df.index)]
results = cross_validate(
    model,
    holdout.iloc[:, :-1],   # features: every column but the last
    holdout.iloc[:, -1],    # target: the last column
    scoring=scoring,
    cv=kfold,
)
print_results(results)

Accuracy = 0.9790, F1_binary = 0.9845, F1_macro = 0.9759


##### GAUSSIAN NB

In [31]:
# Search var_smoothing over 100 log-spaced values from 1e0 down to 1e-9;
# the winner is picked by binary F1.
model = GaussianNB()
parameters = {'var_smoothing': np.logspace(0, -9, num=100)}

best = GridSearchCV(
    model,
    parameters,
    scoring=scoring,
    refit='f1_binary',
    cv=kfold,
    verbose=1,
).fit(X, y)

best_parameters['GaussianNB'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


{'var_smoothing': 1.0}

In [32]:
# Cross-validate a fresh model built with the tuned var_smoothing and record its metrics.
model = GaussianNB(**best.best_params_)
results = cross_validate(model, X, y, scoring=scoring, cv=kfold)
fill_best_metrics(results, 'GaussianNB')
print_results(results)

Accuracy = 0.9840, F1_binary = 0.9882, F1_macro = 0.9816


In [33]:
# Cross-validate the same model on the rows that were not sampled into `df`.
holdout = data_cfc[~data_cfc.index.isin(df.index)]
results = cross_validate(
    model,
    holdout.iloc[:, :-1],   # features: every column but the last
    holdout.iloc[:, -1],    # target: the last column
    scoring=scoring,
    cv=kfold,
)
print_results(results)

Accuracy = 0.9790, F1_binary = 0.9845, F1_macro = 0.9759


##### COMPLEMENT NB

In [34]:
# Search alpha over 1001 evenly spaced values in [0.001, 100.001]; the winner is picked by binary F1.
model = ComplementNB()
parameters = {'alpha': np.linspace(0.001, 100.001, 1001)}

best = GridSearchCV(
    model,
    parameters,
    scoring=scoring,
    refit='f1_binary',
    cv=kfold,
    verbose=1,
).fit(X, y)

best_parameters['ComplementNB'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 1001 candidates, totalling 10010 fits


{'alpha': 25.301000000000002}

In [35]:
# Cross-validate a fresh model built with the tuned alpha and record its metrics.
model = ComplementNB(**best.best_params_)
results = cross_validate(model, X, y, scoring=scoring, cv=kfold)
fill_best_metrics(results, 'ComplementNB')
print_results(results)

Accuracy = 0.9880, F1_binary = 0.9913, F1_macro = 0.9861


In [36]:
# Cross-validate the same model on the rows that were not sampled into `df`.
holdout = data_cfc[~data_cfc.index.isin(df.index)]
results = cross_validate(
    model,
    holdout.iloc[:, :-1],   # features: every column but the last
    holdout.iloc[:, -1],    # target: the last column
    scoring=scoring,
    cv=kfold,
)
print_results(results)

Accuracy = 0.9794, F1_binary = 0.9843, F1_macro = 0.9772


### Decision tree

In [37]:
# Search split size, depth and split criterion jointly; the winner is picked by binary F1.
model = DecisionTreeClassifier(random_state=49)
parameters = {
    'min_samples_split': range(2, 21, 2),
    'max_depth': range(2, 21, 2),
    'criterion': ['gini', 'entropy'],
}

best = GridSearchCV(
    model,
    parameters,
    scoring=scoring,
    refit='f1_binary',
    cv=kfold,
    verbose=1,
).fit(X, y)

best_parameters['DecisionTreeClassifier'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 200 candidates, totalling 2000 fits


{'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 10}

In [38]:
# Cross-validate a fresh, seeded tree built with the tuned max_depth,
# min_samples_split and criterion, and record its metrics.
model = DecisionTreeClassifier(random_state=49, **best.best_params_)
results = cross_validate(model, X, y, scoring=scoring, cv=kfold)
fill_best_metrics(results, 'DecisionTreeClassifier')
print_results(results)

Accuracy = 0.9860, F1_binary = 0.9894, F1_macro = 0.9843


In [39]:
# Cross-validate the same model on the rows that were not sampled into `df`.
holdout = data_cfc[~data_cfc.index.isin(df.index)]
results = cross_validate(
    model,
    holdout.iloc[:, :-1],   # features: every column but the last
    holdout.iloc[:, -1],    # target: the last column
    scoring=scoring,
    cv=kfold,
)
print_results(results)

Accuracy = 0.9751, F1_binary = 0.9814, F1_macro = 0.9720


### RandomForest

In [40]:
# Search forest size, split size, depth and split criterion jointly;
# the winner is picked by binary F1.
model = RandomForestClassifier(random_state=49)
parameters = {
    'n_estimators': range(60, 121, 10),
    'min_samples_split': range(2, 13, 2),
    'max_depth': range(8, 21, 2),
    'criterion': ['gini', 'entropy'],
}

best = GridSearchCV(
    model,
    parameters,
    scoring=scoring,
    refit='f1_binary',
    cv=kfold,
    verbose=1,
).fit(X, y)

best_parameters['RandomForestClassifier'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 588 candidates, totalling 5880 fits


{'criterion': 'gini',
 'max_depth': 10,
 'min_samples_split': 12,
 'n_estimators': 90}

In [41]:
# Cross-validate a fresh, seeded forest built with the tuned n_estimators,
# min_samples_split, max_depth and criterion, and record its metrics.
model = RandomForestClassifier(random_state=49, **best.best_params_)
results = cross_validate(model, X, y, scoring=scoring, cv=kfold)
fill_best_metrics(results, 'RandomForestClassifier')
print_results(results)

Accuracy = 0.9820, F1_binary = 0.9869, F1_macro = 0.9791


In [42]:
# Cross-validate the same model on the rows that were not sampled into `df`.
holdout = data_cfc[~data_cfc.index.isin(df.index)]
results = cross_validate(
    model,
    holdout.iloc[:, :-1],   # features: every column but the last
    holdout.iloc[:, -1],    # target: the last column
    scoring=scoring,
    cv=kfold,
)
print_results(results)

Accuracy = 0.9778, F1_binary = 0.9837, F1_macro = 0.9746


# Lazy FCA

Because **BinarizedBinaryClassifier** outputs three possible labels (**1** for **True**, **0** for **False** and **-1** for **undefined**), **f1_score** with **average='binary'** cannot be applied to its predictions directly. Therefore, a function that treats **undefined** as a misclassification was implemented.

In [43]:
def compare_with_binary_f1_old(y_true, y_pred):
    """Binary F1 score in which every undefined (-1) prediction counts as wrong.

    Pandas-based variant. Judging by the ``_old`` suffix it is presumably the
    earlier version of ``compare_with_binary_f1`` -- confirm before removing.

    Parameters
    ----------
    y_true : array-like
        True labels; cast to bool inside the function.
    y_pred : array-like
        Predicted labels, where -1 marks an undefined prediction.

    Returns
    -------
    The value of ``f1_score(y_true, y_new)``, where ``y_new`` is ``y_pred``
    cast to bool with each -1 entry replaced by the negation of its true label.
    """
    # Put both label vectors side by side as the two columns of one array.
    y_tmp = np.concatenate(
        (np.array(y_true)[:,None],np.array(y_pred)[:,None]),
        axis=1
    )
    df_tmp = pd.DataFrame(y_tmp, columns=['y_true','y_pred'])
    df_tmp.y_true = df_tmp.y_true.astype(bool)
    # Undefined rows get the opposite of the true label (a guaranteed error);
    # all other rows of y_new are left missing by the index alignment.
    df_tmp['y_new']= ~df_tmp.loc[df_tmp.y_pred==-1]['y_true']
    # Defined rows keep their own prediction, cast to bool.
    df_tmp.loc[df_tmp.y_pred!=-1, 'y_new'] = df_tmp.loc[df_tmp.y_pred!=-1,'y_pred'].astype(bool) 
    df_tmp.y_new = df_tmp.y_new.astype(bool)
    return(f1_score(df_tmp.y_true, df_tmp.y_new))

In [44]:
def results_fca(y_true, y_pred):
    """Score one set of predictions.

    Returns
    -------
    tuple
        (accuracy, macro F1, binary F1), where the binary F1 comes from
        `compare_with_binary_f1`.
    """
    return (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='macro'),
        compare_with_binary_f1(y_true, y_pred),
    )

In [45]:
def compare_with_binary_f1(y_true, y_pred):
    """Binary F1 score in which every undefined (-1) prediction counts as wrong.

    Parameters
    ----------
    y_true : array-like supporting boolean-mask indexing
        True labels.
    y_pred : array supporting element-wise comparison
        Predicted labels, where -1 marks an undefined prediction.

    Returns
    -------
    The value of ``f1_score`` computed on ``y_true`` and the predictions with
    every -1 replaced by the opposite of the corresponding true label.
    """
    undefined = y_pred == -1
    resolved = np.copy(y_pred)
    twos = np.full(resolved[undefined].shape, 2.0)
    # Flip the true label: (0 - 2) // -2 == 1 and (1 - 2) // -2 == 0.
    resolved[undefined] = (y_true[undefined] - twos) // -2
    return f1_score(y_true, resolved)

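For an undefined prediction the true label is flipped, so that the prediction always counts as an error:

(0 - 2) // (-2) = 1

(1 - 2) // (-2) = 0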

## BinarizedBinaryClassifier

**BinarizedBinaryClassifier** is not a **scikit-learn** estimator, so **GridSearchCV** and **cross_validate** from **scikit-learn** do not support it. Thus, we iterate over the candidate parameters in explicit **for** loops.

In [46]:
# Manual grid search over (method, alpha): each combination is scored by its
# mean macro F1 across the folds of `kfold`, and the best combination is kept.
parameters = {
    'alpha': np.linspace(0, 1, 21),
    'method': ['standard', 'standard-support', 'ratio-support'],
}

n = kfold.get_n_splits(X)

f1_best, alpha_best, method_best = 0, 0., 'standard'

for method in parameters['method']:
    for alpha in parameters['alpha']:

        f1 = [0] * n
        for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
            bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
                X.iloc[train_index].values,
                y.iloc[train_index].to_numpy(),
                method=method,
                alpha=alpha,
            )
            bin_cls.predict(X.iloc[test_index].values)
            f1[i] = f1_score(y.iloc[test_index], bin_cls.predictions, average='macro')
            print(f"[CV {i+1}/{n}] method={method}, alpha={alpha:0.2f}, f1_macro={f1[i]:0.4f}")

        f1_mean = np.mean(f1)
        print(f'f1_mean = {f1_mean:0.4f}\n')
        # Strict comparison: on ties the combination tried first is retained.
        if f1_mean > f1_best:
            f1_best, alpha_best, method_best = f1_mean, alpha, method

best_parameters['BinarizedBinaryClassifier'] = dict(method=method_best, alpha=alpha_best)
print(f"f1_best={f1_best:0.4f}, method={method_best}, alpha={alpha_best:0.2f}")

[CV 1/10] method=standard, alpha=0.00, f1_macro=0.6509
[CV 2/10] method=standard, alpha=0.00, f1_macro=0.6405
[CV 3/10] method=standard, alpha=0.00, f1_macro=0.6509
[CV 4/10] method=standard, alpha=0.00, f1_macro=0.6615
[CV 5/10] method=standard, alpha=0.00, f1_macro=0.6293
[CV 6/10] method=standard, alpha=0.00, f1_macro=0.6615
[CV 7/10] method=standard, alpha=0.00, f1_macro=0.6566
[CV 8/10] method=standard, alpha=0.00, f1_macro=0.6566
[CV 9/10] method=standard, alpha=0.00, f1_macro=0.6407
[CV 10/10] method=standard, alpha=0.00, f1_macro=0.6562
f1_mean = 0.6505

[CV 1/10] method=standard, alpha=0.05, f1_macro=0.6509
[CV 2/10] method=standard, alpha=0.05, f1_macro=0.6405
[CV 3/10] method=standard, alpha=0.05, f1_macro=0.6509
[CV 4/10] method=standard, alpha=0.05, f1_macro=0.6615
[CV 5/10] method=standard, alpha=0.05, f1_macro=0.6407
[CV 6/10] method=standard, alpha=0.05, f1_macro=0.6615
[CV 7/10] method=standard, alpha=0.05, f1_macro=0.6566
[CV 8/10] method=standard, alpha=0.05, f1_macr

[CV 6/10] method=standard, alpha=0.70, f1_macro=0.9554
[CV 7/10] method=standard, alpha=0.70, f1_macro=1.0000
[CV 8/10] method=standard, alpha=0.70, f1_macro=0.9774
[CV 9/10] method=standard, alpha=0.70, f1_macro=1.0000
[CV 10/10] method=standard, alpha=0.70, f1_macro=0.9566
f1_mean = 0.9751

[CV 1/10] method=standard, alpha=0.75, f1_macro=0.9774
[CV 2/10] method=standard, alpha=0.75, f1_macro=0.9540
[CV 3/10] method=standard, alpha=0.75, f1_macro=0.9524
[CV 4/10] method=standard, alpha=0.75, f1_macro=1.0000
[CV 5/10] method=standard, alpha=0.75, f1_macro=0.9774
[CV 6/10] method=standard, alpha=0.75, f1_macro=0.9554
[CV 7/10] method=standard, alpha=0.75, f1_macro=1.0000
[CV 8/10] method=standard, alpha=0.75, f1_macro=0.9774
[CV 9/10] method=standard, alpha=0.75, f1_macro=1.0000
[CV 10/10] method=standard, alpha=0.75, f1_macro=0.9566
f1_mean = 0.9751

[CV 1/10] method=standard, alpha=0.80, f1_macro=0.9774
[CV 2/10] method=standard, alpha=0.80, f1_macro=0.9540
[CV 3/10] method=standard, 

[CV 1/10] method=standard-support, alpha=0.35, f1_macro=0.6608
[CV 2/10] method=standard-support, alpha=0.35, f1_macro=0.8417
[CV 3/10] method=standard-support, alpha=0.35, f1_macro=0.7512
[CV 4/10] method=standard-support, alpha=0.35, f1_macro=0.4743
[CV 5/10] method=standard-support, alpha=0.35, f1_macro=0.4868
[CV 6/10] method=standard-support, alpha=0.35, f1_macro=0.3973
[CV 7/10] method=standard-support, alpha=0.35, f1_macro=0.7029
[CV 8/10] method=standard-support, alpha=0.35, f1_macro=0.7512
[CV 9/10] method=standard-support, alpha=0.35, f1_macro=0.7126
[CV 10/10] method=standard-support, alpha=0.35, f1_macro=0.7313
f1_mean = 0.6510

[CV 1/10] method=standard-support, alpha=0.40, f1_macro=0.6608
[CV 2/10] method=standard-support, alpha=0.40, f1_macro=0.8417
[CV 3/10] method=standard-support, alpha=0.40, f1_macro=0.7512
[CV 4/10] method=standard-support, alpha=0.40, f1_macro=0.4743
[CV 5/10] method=standard-support, alpha=0.40, f1_macro=0.4868
[CV 6/10] method=standard-support, a

[CV 8/10] method=standard-support, alpha=0.95, f1_macro=0.0909
[CV 9/10] method=standard-support, alpha=0.95, f1_macro=0.1397
[CV 10/10] method=standard-support, alpha=0.95, f1_macro=0.1600
f1_mean = 0.1306

[CV 1/10] method=standard-support, alpha=1.00, f1_macro=0.0196
[CV 2/10] method=standard-support, alpha=1.00, f1_macro=0.0400
[CV 3/10] method=standard-support, alpha=1.00, f1_macro=0.0000
[CV 4/10] method=standard-support, alpha=1.00, f1_macro=0.0196
[CV 5/10] method=standard-support, alpha=1.00, f1_macro=0.0385
[CV 6/10] method=standard-support, alpha=1.00, f1_macro=0.0000
[CV 7/10] method=standard-support, alpha=1.00, f1_macro=0.0000
[CV 8/10] method=standard-support, alpha=1.00, f1_macro=0.0196
[CV 9/10] method=standard-support, alpha=1.00, f1_macro=0.0000
[CV 10/10] method=standard-support, alpha=1.00, f1_macro=0.0385
f1_mean = 0.0176

[CV 1/10] method=ratio-support, alpha=0.00, f1_macro=0.9766
[CV 2/10] method=ratio-support, alpha=0.00, f1_macro=1.0000
[CV 3/10] method=ratio-

[CV 10/10] method=ratio-support, alpha=0.55, f1_macro=1.0000
f1_mean = 0.9816

[CV 1/10] method=ratio-support, alpha=0.60, f1_macro=0.9766
[CV 2/10] method=ratio-support, alpha=0.60, f1_macro=1.0000
[CV 3/10] method=ratio-support, alpha=0.60, f1_macro=0.9766
[CV 4/10] method=ratio-support, alpha=0.60, f1_macro=1.0000
[CV 5/10] method=ratio-support, alpha=0.60, f1_macro=0.9299
[CV 6/10] method=ratio-support, alpha=0.60, f1_macro=1.0000
[CV 7/10] method=ratio-support, alpha=0.60, f1_macro=0.9774
[CV 8/10] method=ratio-support, alpha=0.60, f1_macro=1.0000
[CV 9/10] method=ratio-support, alpha=0.60, f1_macro=0.9554
[CV 10/10] method=ratio-support, alpha=0.60, f1_macro=1.0000
f1_mean = 0.9816

[CV 1/10] method=ratio-support, alpha=0.65, f1_macro=0.9766
[CV 2/10] method=ratio-support, alpha=0.65, f1_macro=1.0000
[CV 3/10] method=ratio-support, alpha=0.65, f1_macro=0.9766
[CV 4/10] method=ratio-support, alpha=0.65, f1_macro=1.0000
[CV 5/10] method=ratio-support, alpha=0.65, f1_macro=0.9299
[C

**Extra search: larger alpha values (1 to 10) for ratio-support**

In [47]:
# Continue the previous search with alpha in [1, 10] for 'ratio-support' only.
# f1_best / alpha_best / method_best carry over from the earlier search and
# are replaced only if a strictly better mean macro F1 is found here.
parameters = {
    'alpha': np.linspace(1, 10, 19),
    'method': ['ratio-support'],
}

n = kfold.get_n_splits(X)

for method in parameters['method']:
    for alpha in parameters['alpha']:

        f1 = [0] * n
        for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
            bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
                X.iloc[train_index].values,
                y.iloc[train_index].to_numpy(),
                method=method,
                alpha=alpha,
            )
            bin_cls.predict(X.iloc[test_index].values)
            f1[i] = f1_score(y.iloc[test_index], bin_cls.predictions, average='macro')
            print(f"[CV {i+1}/{n}] method={method}, alpha={alpha:0.2f}, f1_macro={f1[i]:0.4f}")

        f1_mean = np.mean(f1)
        print(f'f1_mean = {f1_mean:0.4f}\n')
        if f1_mean > f1_best:
            f1_best, alpha_best, method_best = f1_mean, alpha, method

best_parameters['BinarizedBinaryClassifier'] = dict(method=method_best, alpha=alpha_best)
print(f"f1_best={f1_best:0.4f}, method={method_best}, alpha={alpha_best:0.2f}")

[CV 1/10] method=ratio-support, alpha=1.00, f1_macro=0.9766
[CV 2/10] method=ratio-support, alpha=1.00, f1_macro=0.9540
[CV 3/10] method=ratio-support, alpha=1.00, f1_macro=0.9766
[CV 4/10] method=ratio-support, alpha=1.00, f1_macro=1.0000
[CV 5/10] method=ratio-support, alpha=1.00, f1_macro=0.9299
[CV 6/10] method=ratio-support, alpha=1.00, f1_macro=1.0000
[CV 7/10] method=ratio-support, alpha=1.00, f1_macro=0.9774
[CV 8/10] method=ratio-support, alpha=1.00, f1_macro=0.9774
[CV 9/10] method=ratio-support, alpha=1.00, f1_macro=0.9554
[CV 10/10] method=ratio-support, alpha=1.00, f1_macro=0.9780
f1_mean = 0.9725

[CV 1/10] method=ratio-support, alpha=1.50, f1_macro=0.6509
[CV 2/10] method=ratio-support, alpha=1.50, f1_macro=0.6405
[CV 3/10] method=ratio-support, alpha=1.50, f1_macro=0.6509
[CV 4/10] method=ratio-support, alpha=1.50, f1_macro=0.6615
[CV 5/10] method=ratio-support, alpha=1.50, f1_macro=0.6407
[CV 6/10] method=ratio-support, alpha=1.50, f1_macro=0.6615
[CV 7/10] method=rati

[CV 4/10] method=ratio-support, alpha=7.50, f1_macro=0.6615
[CV 5/10] method=ratio-support, alpha=7.50, f1_macro=0.6293
[CV 6/10] method=ratio-support, alpha=7.50, f1_macro=0.6615
[CV 7/10] method=ratio-support, alpha=7.50, f1_macro=0.6566
[CV 8/10] method=ratio-support, alpha=7.50, f1_macro=0.6566
[CV 9/10] method=ratio-support, alpha=7.50, f1_macro=0.6407
[CV 10/10] method=ratio-support, alpha=7.50, f1_macro=0.6562
f1_mean = 0.6505

[CV 1/10] method=ratio-support, alpha=8.00, f1_macro=0.6509
[CV 2/10] method=ratio-support, alpha=8.00, f1_macro=0.6405
[CV 3/10] method=ratio-support, alpha=8.00, f1_macro=0.6509
[CV 4/10] method=ratio-support, alpha=8.00, f1_macro=0.6615
[CV 5/10] method=ratio-support, alpha=8.00, f1_macro=0.6293
[CV 6/10] method=ratio-support, alpha=8.00, f1_macro=0.6615
[CV 7/10] method=ratio-support, alpha=8.00, f1_macro=0.6566
[CV 8/10] method=ratio-support, alpha=8.00, f1_macro=0.6566
[CV 9/10] method=ratio-support, alpha=8.00, f1_macro=0.6407
[CV 10/10] method=rat

In [48]:
# Score the selected (method_best, alpha_best) on every fold, collecting
# accuracy, macro F1 and binary F1, then store and print the fold averages.
accuracy = [0] * n
f1 = [0] * n
f1_binary = [0] * n

for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
    bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
        X.iloc[train_index].values,
        y.iloc[train_index].to_numpy(),
        method=method_best,
        alpha=alpha_best,
    )
    bin_cls.predict(X.iloc[test_index].values)

    accuracy[i], f1[i], f1_binary[i] = results_fca(y.iloc[test_index], bin_cls.predictions)

    print(f"[CV {i+1}/{n}] method={method_best}, alpha={alpha_best:0.2f}",
          f"accuracy={accuracy[i]:0.4f}, f1_macro={f1[i]:0.4f}, f1_binary={f1_binary[i]:0.4f}")

accuracy_mean, f1_mean, f1_binary_mean = (
    np.mean(fold_scores) for fold_scores in (accuracy, f1, f1_binary)
)

best_metrics['BinarizedBinaryClassifier'] = dict(
    Accuracy=accuracy_mean,
    F1_binary=f1_binary_mean,
    F1_macro=f1_mean,
)
print(f"\n accuracy={accuracy_mean:0.4f}, f1_binary={f1_binary_mean:0.4f}, f1_macro={f1_mean:0.4f}")

[CV 1/10] method=ratio-support, alpha=0.00 accuracy=0.9800, f1_macro=0.9766, f1_binary=0.9855
[CV 2/10] method=ratio-support, alpha=0.00 accuracy=1.0000, f1_macro=1.0000, f1_binary=1.0000
[CV 3/10] method=ratio-support, alpha=0.00 accuracy=0.9800, f1_macro=0.9766, f1_binary=0.9855
[CV 4/10] method=ratio-support, alpha=0.00 accuracy=1.0000, f1_macro=1.0000, f1_binary=1.0000
[CV 5/10] method=ratio-support, alpha=0.00 accuracy=0.9400, f1_macro=0.9299, f1_binary=0.9565
[CV 6/10] method=ratio-support, alpha=0.00 accuracy=1.0000, f1_macro=1.0000, f1_binary=1.0000
[CV 7/10] method=ratio-support, alpha=0.00 accuracy=0.9800, f1_macro=0.9774, f1_binary=0.9851
[CV 8/10] method=ratio-support, alpha=0.00 accuracy=1.0000, f1_macro=1.0000, f1_binary=1.0000
[CV 9/10] method=ratio-support, alpha=0.00 accuracy=0.9600, f1_macro=0.9554, f1_binary=0.9697
[CV 10/10] method=ratio-support, alpha=0.00 accuracy=1.0000, f1_macro=1.0000, f1_binary=1.0000

 accuracy=0.9840, f1_binary=0.9882, f1_macro=0.9816


## PatternBinaryClassifier

In [49]:
# Manual grid search over (method, alpha) for PatternBinaryClassifier: each
# combination is scored by its mean macro F1 across the folds of `kfold`.
parameters = {
    'alpha': np.linspace(0, 1, 21),
    'method': ['standard', 'standard-support', 'ratio-support'],
}

n = kfold.get_n_splits(X)

f1_best_p, alpha_best_p, method_best_p = 0, 0., 'standard'

for method in parameters['method']:
    for alpha in parameters['alpha']:

        f1 = [0] * n
        for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
            bin_cls = fcalc.classifier.PatternBinaryClassifier(
                X.iloc[train_index].values,
                y.iloc[train_index].to_numpy(),
                method=method,
                alpha=alpha,
            )
            bin_cls.predict(X.iloc[test_index].values)
            f1[i] = f1_score(y.iloc[test_index], bin_cls.predictions, average='macro')
            print(f"[CV {i+1}/{n}] method={method}, alpha={alpha:0.2f}, f1_macro={f1[i]:0.4f}")

        f1_mean = np.mean(f1)
        print(f'f1_mean = {f1_mean:0.4f}\n')
        # Strict comparison: on ties the combination tried first is retained.
        if f1_mean > f1_best_p:
            f1_best_p, alpha_best_p, method_best_p = f1_mean, alpha, method

best_parameters['PatternBinaryClassifier'] = dict(method=method_best_p, alpha=alpha_best_p)
print(f"f1_best={f1_best_p:0.4f}, method={method_best_p}, alpha={alpha_best_p:0.2f}")

[CV 1/10] method=standard, alpha=0.00, f1_macro=0.9774
[CV 2/10] method=standard, alpha=0.00, f1_macro=0.9540
[CV 3/10] method=standard, alpha=0.00, f1_macro=0.9540
[CV 4/10] method=standard, alpha=0.00, f1_macro=1.0000
[CV 5/10] method=standard, alpha=0.00, f1_macro=0.9540
[CV 6/10] method=standard, alpha=0.00, f1_macro=1.0000
[CV 7/10] method=standard, alpha=0.00, f1_macro=1.0000
[CV 8/10] method=standard, alpha=0.00, f1_macro=0.9774
[CV 9/10] method=standard, alpha=0.00, f1_macro=0.9322
[CV 10/10] method=standard, alpha=0.00, f1_macro=0.9566
f1_mean = 0.9706

[CV 1/10] method=standard, alpha=0.05, f1_macro=0.9554
[CV 2/10] method=standard, alpha=0.05, f1_macro=0.8529
[CV 3/10] method=standard, alpha=0.05, f1_macro=0.9554
[CV 4/10] method=standard, alpha=0.05, f1_macro=0.6323
[CV 5/10] method=standard, alpha=0.05, f1_macro=0.6293
[CV 6/10] method=standard, alpha=0.05, f1_macro=0.9356
[CV 7/10] method=standard, alpha=0.05, f1_macro=0.9356
[CV 8/10] method=standard, alpha=0.05, f1_macr

[CV 6/10] method=standard, alpha=0.70, f1_macro=0.1381
[CV 7/10] method=standard, alpha=0.70, f1_macro=0.1196
[CV 8/10] method=standard, alpha=0.70, f1_macro=0.0370
[CV 9/10] method=standard, alpha=0.70, f1_macro=0.1091
[CV 10/10] method=standard, alpha=0.70, f1_macro=0.0000
f1_mean = 0.0649

[CV 1/10] method=standard, alpha=0.75, f1_macro=0.0583
[CV 2/10] method=standard, alpha=0.75, f1_macro=0.0392
[CV 3/10] method=standard, alpha=0.75, f1_macro=0.0000
[CV 4/10] method=standard, alpha=0.75, f1_macro=0.0721
[CV 5/10] method=standard, alpha=0.75, f1_macro=0.0751
[CV 6/10] method=standard, alpha=0.75, f1_macro=0.1381
[CV 7/10] method=standard, alpha=0.75, f1_macro=0.1196
[CV 8/10] method=standard, alpha=0.75, f1_macro=0.0370
[CV 9/10] method=standard, alpha=0.75, f1_macro=0.1091
[CV 10/10] method=standard, alpha=0.75, f1_macro=0.0000
f1_mean = 0.0649

[CV 1/10] method=standard, alpha=0.80, f1_macro=0.0583
[CV 2/10] method=standard, alpha=0.80, f1_macro=0.0392
[CV 3/10] method=standard, 

[CV 1/10] method=standard-support, alpha=0.35, f1_macro=0.2784
[CV 2/10] method=standard-support, alpha=0.35, f1_macro=0.2921
[CV 3/10] method=standard-support, alpha=0.35, f1_macro=0.2424
[CV 4/10] method=standard-support, alpha=0.35, f1_macro=0.3779
[CV 5/10] method=standard-support, alpha=0.35, f1_macro=0.3779
[CV 6/10] method=standard-support, alpha=0.35, f1_macro=0.3187
[CV 7/10] method=standard-support, alpha=0.35, f1_macro=0.2747
[CV 8/10] method=standard-support, alpha=0.35, f1_macro=0.2870
[CV 9/10] method=standard-support, alpha=0.35, f1_macro=0.3490
[CV 10/10] method=standard-support, alpha=0.35, f1_macro=0.2308
f1_mean = 0.3029

[CV 1/10] method=standard-support, alpha=0.40, f1_macro=0.2784
[CV 2/10] method=standard-support, alpha=0.40, f1_macro=0.2921
[CV 3/10] method=standard-support, alpha=0.40, f1_macro=0.2424
[CV 4/10] method=standard-support, alpha=0.40, f1_macro=0.3779
[CV 5/10] method=standard-support, alpha=0.40, f1_macro=0.3779
[CV 6/10] method=standard-support, a

[CV 8/10] method=standard-support, alpha=0.95, f1_macro=0.2870
[CV 9/10] method=standard-support, alpha=0.95, f1_macro=0.2537
[CV 10/10] method=standard-support, alpha=0.95, f1_macro=0.2308
f1_mean = 0.2559

[CV 1/10] method=standard-support, alpha=1.00, f1_macro=0.2492
[CV 2/10] method=standard-support, alpha=1.00, f1_macro=0.2921
[CV 3/10] method=standard-support, alpha=1.00, f1_macro=0.2424
[CV 4/10] method=standard-support, alpha=1.00, f1_macro=0.2537
[CV 5/10] method=standard-support, alpha=1.00, f1_macro=0.2537
[CV 6/10] method=standard-support, alpha=1.00, f1_macro=0.2537
[CV 7/10] method=standard-support, alpha=1.00, f1_macro=0.2424
[CV 8/10] method=standard-support, alpha=1.00, f1_macro=0.2870
[CV 9/10] method=standard-support, alpha=1.00, f1_macro=0.2537
[CV 10/10] method=standard-support, alpha=1.00, f1_macro=0.2308
f1_mean = 0.2559

[CV 1/10] method=ratio-support, alpha=0.00, f1_macro=0.9774
[CV 2/10] method=ratio-support, alpha=0.00, f1_macro=0.9132
[CV 3/10] method=ratio-

[CV 10/10] method=ratio-support, alpha=0.55, f1_macro=0.8750
f1_mean = 0.9610

[CV 1/10] method=ratio-support, alpha=0.60, f1_macro=0.9774
[CV 2/10] method=ratio-support, alpha=0.60, f1_macro=0.9132
[CV 3/10] method=ratio-support, alpha=0.60, f1_macro=0.9540
[CV 4/10] method=ratio-support, alpha=0.60, f1_macro=0.9780
[CV 5/10] method=ratio-support, alpha=0.60, f1_macro=1.0000
[CV 6/10] method=ratio-support, alpha=0.60, f1_macro=0.9780
[CV 7/10] method=ratio-support, alpha=0.60, f1_macro=1.0000
[CV 8/10] method=ratio-support, alpha=0.60, f1_macro=0.9566
[CV 9/10] method=ratio-support, alpha=0.60, f1_macro=0.9780
[CV 10/10] method=ratio-support, alpha=0.60, f1_macro=0.8750
f1_mean = 0.9610

[CV 1/10] method=ratio-support, alpha=0.65, f1_macro=0.9774
[CV 2/10] method=ratio-support, alpha=0.65, f1_macro=0.9341
[CV 3/10] method=ratio-support, alpha=0.65, f1_macro=0.9540
[CV 4/10] method=ratio-support, alpha=0.65, f1_macro=0.9780
[CV 5/10] method=ratio-support, alpha=0.65, f1_macro=1.0000
[C

**Extra: wider alpha range (0 to 10) for the ratio-support method**

In [50]:
# Wider alpha grid (0 to 10 in steps of 0.5) for the ratio-support method only.
parameters = {
    'alpha': np.linspace(0, 10, 21),
    'method': ['ratio-support'],
}

n = kfold.get_n_splits(X)

# NOTE: f1_best_p, alpha_best_p and method_best_p are read and updated here
# without being initialised in this cell, so they must already exist when it runs.
for method in parameters['method']:
    for alpha in parameters['alpha']:

        f1 = [0] * n  # one macro-F1 value per fold
        for i, (train_rows, test_rows) in enumerate(kfold.split(X, y)):
            train_objects = X.iloc[train_rows].values
            train_labels = y.iloc[train_rows].to_numpy()
            bin_cls = fcalc.classifier.PatternBinaryClassifier(
                train_objects,
                train_labels,
                method=method,
                alpha=alpha
            )
            # Predicted labels are read from bin_cls.predictions after predict().
            bin_cls.predict(X.iloc[test_rows].values)
            f1[i] = f1_score(y.iloc[test_rows], bin_cls.predictions, average='macro')
            print(f"[CV {i+1}/{n}] method={method}, alpha={alpha:0.2f}, f1_macro={f1[i]:0.4f}")

        f1_mean = np.mean(f1)
        print(f'f1_mean = {f1_mean:0.4f}\n')
        # Only a strictly better mean macro-F1 replaces the current best.
        if f1_mean > f1_best_p:
            f1_best_p, alpha_best_p, method_best_p = f1_mean, alpha, method

best_parameters['PatternBinaryClassifier'] = {'method': method_best_p, 'alpha': alpha_best_p}
print(f"f1_best={f1_best_p:0.4f}, method={method_best_p}, alpha={alpha_best_p:0.2f}")

[CV 1/10] method=ratio-support, alpha=0.00, f1_macro=0.9774
[CV 2/10] method=ratio-support, alpha=0.00, f1_macro=0.9132
[CV 3/10] method=ratio-support, alpha=0.00, f1_macro=0.9540
[CV 4/10] method=ratio-support, alpha=0.00, f1_macro=0.9780
[CV 5/10] method=ratio-support, alpha=0.00, f1_macro=1.0000
[CV 6/10] method=ratio-support, alpha=0.00, f1_macro=0.9780
[CV 7/10] method=ratio-support, alpha=0.00, f1_macro=1.0000
[CV 8/10] method=ratio-support, alpha=0.00, f1_macro=0.9566
[CV 9/10] method=ratio-support, alpha=0.00, f1_macro=0.9780
[CV 10/10] method=ratio-support, alpha=0.00, f1_macro=0.8750
f1_mean = 0.9610

[CV 1/10] method=ratio-support, alpha=0.50, f1_macro=0.9774
[CV 2/10] method=ratio-support, alpha=0.50, f1_macro=0.9132
[CV 3/10] method=ratio-support, alpha=0.50, f1_macro=0.9540
[CV 4/10] method=ratio-support, alpha=0.50, f1_macro=0.9780
[CV 5/10] method=ratio-support, alpha=0.50, f1_macro=1.0000
[CV 6/10] method=ratio-support, alpha=0.50, f1_macro=0.9780
[CV 7/10] method=rati

[CV 4/10] method=ratio-support, alpha=6.50, f1_macro=0.9540
[CV 5/10] method=ratio-support, alpha=6.50, f1_macro=0.9299
[CV 6/10] method=ratio-support, alpha=6.50, f1_macro=0.9540
[CV 7/10] method=ratio-support, alpha=6.50, f1_macro=0.9322
[CV 8/10] method=ratio-support, alpha=6.50, f1_macro=0.9780
[CV 9/10] method=ratio-support, alpha=6.50, f1_macro=0.9322
[CV 10/10] method=ratio-support, alpha=6.50, f1_macro=0.9109
f1_mean = 0.9428

[CV 1/10] method=ratio-support, alpha=7.00, f1_macro=0.9774
[CV 2/10] method=ratio-support, alpha=7.00, f1_macro=0.9299
[CV 3/10] method=ratio-support, alpha=7.00, f1_macro=0.9299
[CV 4/10] method=ratio-support, alpha=7.00, f1_macro=0.9540
[CV 5/10] method=ratio-support, alpha=7.00, f1_macro=0.9048
[CV 6/10] method=ratio-support, alpha=7.00, f1_macro=0.9299
[CV 7/10] method=ratio-support, alpha=7.00, f1_macro=0.9081
[CV 8/10] method=ratio-support, alpha=7.00, f1_macro=0.9554
[CV 9/10] method=ratio-support, alpha=7.00, f1_macro=0.8831
[CV 10/10] method=rat

In [51]:
# Evaluate PatternBinaryClassifier with the selected (method_best_p, alpha_best_p)
# on every fold, collecting the three values returned by results_fca per fold.
f1 = [0] * n
accuracy = [0] * n
f1_binary = [0] * n

for i, (train_rows, test_rows) in enumerate(kfold.split(X, y)):
    train_objects = X.iloc[train_rows].values
    train_labels = y.iloc[train_rows].to_numpy()
    bin_cls = fcalc.classifier.PatternBinaryClassifier(
        train_objects,
        train_labels,
        method=method_best_p,
        alpha=alpha_best_p
    )
    # Predicted labels are read from bin_cls.predictions after predict().
    bin_cls.predict(X.iloc[test_rows].values)

    fold_scores = results_fca(y.iloc[test_rows], bin_cls.predictions)
    accuracy[i], f1[i], f1_binary[i] = fold_scores

    print(f"[CV {i+1}/{n}] method={method_best_p}, alpha={alpha_best_p:0.2f}",
          f"accuracy={accuracy[i]:0.4f}, f1_binary={f1_binary[i]:0.4f}, f1_macro={f1[i]:0.4f}")

# Fold averages are what gets stored in the overall comparison table.
f1_mean = np.mean(f1)
accuracy_mean = np.mean(accuracy)
f1_binary_mean = np.mean(f1_binary)

best_metrics['PatternBinaryClassifier'] = {'Accuracy': accuracy_mean, 'F1_binary': f1_binary_mean, 'F1_macro': f1_mean}
print(f"\n accuracy={accuracy_mean:0.4f}, f1_binary={f1_binary_mean:0.4f}, f1_macro={f1_mean:0.4f}")

[CV 1/10] method=ratio-support, alpha=3.00 accuracy=0.9800, f1_binary=0.9851, f1_macro=0.9774
[CV 2/10] method=ratio-support, alpha=3.00 accuracy=0.9800, f1_binary=0.9851, f1_macro=0.9774
[CV 3/10] method=ratio-support, alpha=3.00 accuracy=0.9600, f1_binary=0.9706, f1_macro=0.9540
[CV 4/10] method=ratio-support, alpha=3.00 accuracy=0.9800, f1_binary=0.9846, f1_macro=0.9780
[CV 5/10] method=ratio-support, alpha=3.00 accuracy=0.9800, f1_binary=0.9851, f1_macro=0.9774
[CV 6/10] method=ratio-support, alpha=3.00 accuracy=1.0000, f1_binary=1.0000, f1_macro=1.0000
[CV 7/10] method=ratio-support, alpha=3.00 accuracy=1.0000, f1_binary=1.0000, f1_macro=1.0000
[CV 8/10] method=ratio-support, alpha=3.00 accuracy=1.0000, f1_binary=1.0000, f1_macro=1.0000
[CV 9/10] method=ratio-support, alpha=3.00 accuracy=0.9800, f1_binary=0.9846, f1_macro=0.9780
[CV 10/10] method=ratio-support, alpha=3.00 accuracy=0.9600, f1_binary=0.9688, f1_macro=0.9566

 accuracy=0.9820, f1_binary=0.9864, f1_macro=0.9799


# Overall

In [52]:
# Print every model name followed by its selected hyper-parameters.
for model_name, model_params in best_parameters.items():
    print(model_name)
    print(model_params)
    print()

LogisticRegression
{'C': 1.0}

KNeighborsClassifier
{'n_neighbors': 61}

MultinomialNB
{'alpha': 0.001}

GaussianNB
{'var_smoothing': 1.0}

ComplementNB
{'alpha': 25.301000000000002}

DecisionTreeClassifier
{'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 10}

RandomForestClassifier
{'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 12, 'n_estimators': 90}

BinarizedBinaryClassifier
{'method': 'ratio-support', 'alpha': 0.0}

PatternBinaryClassifier
{'method': 'ratio-support', 'alpha': 3.0}



In [53]:
# Print every model name followed by its stored metrics.
for model_name, model_metrics in best_metrics.items():
    print(model_name)
    print(model_metrics)
    print()

LogisticRegression
{'Accuracy': 0.98, 'F1_binary': 0.9852, 'F1_macro': 0.9771}

KNeighborsClassifier
{'Accuracy': 0.986, 'F1_binary': 0.9898, 'F1_macro': 0.9838}

MultinomialNB
{'Accuracy': 0.98, 'F1_binary': 0.9854, 'F1_macro': 0.9769}

GaussianNB
{'Accuracy': 0.984, 'F1_binary': 0.9882, 'F1_macro': 0.9816}

ComplementNB
{'Accuracy': 0.988, 'F1_binary': 0.9913, 'F1_macro': 0.9861}

DecisionTreeClassifier
{'Accuracy': 0.986, 'F1_binary': 0.9894, 'F1_macro': 0.9843}

RandomForestClassifier
{'Accuracy': 0.982, 'F1_binary': 0.9869, 'F1_macro': 0.9791}

BinarizedBinaryClassifier
{'Accuracy': 0.984, 'F1_binary': 0.9882307828446699, 'F1_macro': 0.9815945473075629}

PatternBinaryClassifier
{'Accuracy': 0.982, 'F1_binary': 0.9863792885121901, 'F1_macro': 0.9798842690757198}



# Extra: Expanded table for BinarizedBinaryClassifier

In [54]:
# Build the expanded table: the original columns of X followed by their
# element-wise negations, the latter renamed with a 'NOT_' prefix.
anticolumns = ['NOT_' + column for column in X.columns.to_list()]

X_negated = ~X
X_negated.columns = anticolumns

X_expanded = pd.concat([X, X_negated], axis=1)
X_expanded.head()

Unnamed: 0,COUGH,MUSCLE_ACHES,TIREDNESS,SORE_THROAT,RUNNY_NOSE,STUFFY_NOSE,FEVER,NAUSEA,VOMITING,DIARRHEA,...,NOT_SHORTNESS_OF_BREATH,NOT_DIFFICULTY_BREATHING,NOT_LOSS_OF_TASTE,NOT_LOSS_OF_SMELL,NOT_ITCHY_NOSE,NOT_ITCHY_EYES,NOT_ITCHY_MOUTH,NOT_ITCHY_INNER_EAR,NOT_SNEEZING,NOT_PINK_EYE
18280,True,False,True,True,False,False,False,True,True,False,...,True,True,True,True,True,True,True,True,False,True
18704,False,True,True,False,False,False,True,False,False,False,...,True,False,True,True,True,True,True,True,True,True
18441,False,True,True,True,False,False,True,True,True,True,...,False,True,True,True,True,True,True,True,True,True
18651,True,False,True,True,False,False,True,False,False,True,...,False,True,True,True,True,True,True,True,True,True
17038,False,False,False,False,False,True,False,False,False,False,...,True,True,True,True,True,True,True,True,False,True


In [55]:
# Grid for BinarizedBinaryClassifier on the expanded table: 21 evenly spaced
# alpha values in [0, 1] combined with each of the three decision methods.
parameters = {
    'alpha': np.linspace(0, 1, 21),
    'method': ['standard', 'standard-support', 'ratio-support'],
}

n = kfold.get_n_splits(X_expanded)

# Best configuration seen so far, selected by mean macro-F1 over the folds.
f1_best_e, alpha_best_e, method_best_e = 0, 0., 'standard'

for method in parameters['method']:
    for alpha in parameters['alpha']:

        f1 = [0] * n  # one macro-F1 value per fold
        for i, (train_rows, test_rows) in enumerate(kfold.split(X_expanded, y)):
            train_objects = X_expanded.iloc[train_rows].values
            train_labels = y.iloc[train_rows].to_numpy()
            bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
                train_objects,
                train_labels,
                method=method,
                alpha=alpha
            )
            # Predicted labels are read from bin_cls.predictions after predict().
            bin_cls.predict(X_expanded.iloc[test_rows].values)
            f1[i] = f1_score(y.iloc[test_rows], bin_cls.predictions, average='macro')
            print(f"[CV {i+1}/{n}] method={method}, alpha={alpha:0.2f}, f1_macro={f1[i]:0.4f}")

        f1_mean = np.mean(f1)
        print(f'f1_mean = {f1_mean:0.4f}\n')
        # Strict '>' means the earliest configuration wins on ties.
        if f1_mean > f1_best_e:
            f1_best_e, alpha_best_e, method_best_e = f1_mean, alpha, method

best_parameters_expanded = {'method': method_best_e, 'alpha': alpha_best_e}
print(f"f1_best={f1_best_e:0.4f}, method={method_best_e}, alpha={alpha_best_e:0.2f}")

[CV 1/10] method=standard, alpha=0.00, f1_macro=0.9774
[CV 2/10] method=standard, alpha=0.00, f1_macro=0.9540
[CV 3/10] method=standard, alpha=0.00, f1_macro=0.9540
[CV 4/10] method=standard, alpha=0.00, f1_macro=1.0000
[CV 5/10] method=standard, alpha=0.00, f1_macro=0.9540
[CV 6/10] method=standard, alpha=0.00, f1_macro=1.0000
[CV 7/10] method=standard, alpha=0.00, f1_macro=1.0000
[CV 8/10] method=standard, alpha=0.00, f1_macro=0.9774
[CV 9/10] method=standard, alpha=0.00, f1_macro=0.9322
[CV 10/10] method=standard, alpha=0.00, f1_macro=0.9566
f1_mean = 0.9706

[CV 1/10] method=standard, alpha=0.05, f1_macro=0.9554
[CV 2/10] method=standard, alpha=0.05, f1_macro=0.8529
[CV 3/10] method=standard, alpha=0.05, f1_macro=0.9554
[CV 4/10] method=standard, alpha=0.05, f1_macro=0.6323
[CV 5/10] method=standard, alpha=0.05, f1_macro=0.6293
[CV 6/10] method=standard, alpha=0.05, f1_macro=0.9356
[CV 7/10] method=standard, alpha=0.05, f1_macro=0.9356
[CV 8/10] method=standard, alpha=0.05, f1_macr

[CV 7/10] method=standard, alpha=0.70, f1_macro=0.1196
[CV 8/10] method=standard, alpha=0.70, f1_macro=0.0370
[CV 9/10] method=standard, alpha=0.70, f1_macro=0.1091
[CV 10/10] method=standard, alpha=0.70, f1_macro=0.0000
f1_mean = 0.0649

[CV 1/10] method=standard, alpha=0.75, f1_macro=0.0583
[CV 2/10] method=standard, alpha=0.75, f1_macro=0.0392
[CV 3/10] method=standard, alpha=0.75, f1_macro=0.0000
[CV 4/10] method=standard, alpha=0.75, f1_macro=0.0721
[CV 5/10] method=standard, alpha=0.75, f1_macro=0.0751
[CV 6/10] method=standard, alpha=0.75, f1_macro=0.1381
[CV 7/10] method=standard, alpha=0.75, f1_macro=0.1196
[CV 8/10] method=standard, alpha=0.75, f1_macro=0.0370
[CV 9/10] method=standard, alpha=0.75, f1_macro=0.1091
[CV 10/10] method=standard, alpha=0.75, f1_macro=0.0000
f1_mean = 0.0649

[CV 1/10] method=standard, alpha=0.80, f1_macro=0.0583
[CV 2/10] method=standard, alpha=0.80, f1_macro=0.0392
[CV 3/10] method=standard, alpha=0.80, f1_macro=0.0000
[CV 4/10] method=standard, 

[CV 1/10] method=standard-support, alpha=0.35, f1_macro=0.2784
[CV 2/10] method=standard-support, alpha=0.35, f1_macro=0.2921
[CV 3/10] method=standard-support, alpha=0.35, f1_macro=0.2424
[CV 4/10] method=standard-support, alpha=0.35, f1_macro=0.3779
[CV 5/10] method=standard-support, alpha=0.35, f1_macro=0.3779
[CV 6/10] method=standard-support, alpha=0.35, f1_macro=0.3187
[CV 7/10] method=standard-support, alpha=0.35, f1_macro=0.2747
[CV 8/10] method=standard-support, alpha=0.35, f1_macro=0.2870
[CV 9/10] method=standard-support, alpha=0.35, f1_macro=0.3490
[CV 10/10] method=standard-support, alpha=0.35, f1_macro=0.2308
f1_mean = 0.3029

[CV 1/10] method=standard-support, alpha=0.40, f1_macro=0.2784
[CV 2/10] method=standard-support, alpha=0.40, f1_macro=0.2921
[CV 3/10] method=standard-support, alpha=0.40, f1_macro=0.2424
[CV 4/10] method=standard-support, alpha=0.40, f1_macro=0.3779
[CV 5/10] method=standard-support, alpha=0.40, f1_macro=0.3779
[CV 6/10] method=standard-support, a

[CV 9/10] method=standard-support, alpha=0.95, f1_macro=0.2537
[CV 10/10] method=standard-support, alpha=0.95, f1_macro=0.2308
f1_mean = 0.2559

[CV 1/10] method=standard-support, alpha=1.00, f1_macro=0.2492
[CV 2/10] method=standard-support, alpha=1.00, f1_macro=0.2921
[CV 3/10] method=standard-support, alpha=1.00, f1_macro=0.2424
[CV 4/10] method=standard-support, alpha=1.00, f1_macro=0.2537
[CV 5/10] method=standard-support, alpha=1.00, f1_macro=0.2537
[CV 6/10] method=standard-support, alpha=1.00, f1_macro=0.2537
[CV 7/10] method=standard-support, alpha=1.00, f1_macro=0.2424
[CV 8/10] method=standard-support, alpha=1.00, f1_macro=0.2870
[CV 9/10] method=standard-support, alpha=1.00, f1_macro=0.2537
[CV 10/10] method=standard-support, alpha=1.00, f1_macro=0.2308
f1_mean = 0.2559

[CV 1/10] method=ratio-support, alpha=0.00, f1_macro=0.9774
[CV 2/10] method=ratio-support, alpha=0.00, f1_macro=0.9132
[CV 3/10] method=ratio-support, alpha=0.00, f1_macro=0.9540
[CV 4/10] method=ratio-sup

[CV 1/10] method=ratio-support, alpha=0.60, f1_macro=0.9774
[CV 2/10] method=ratio-support, alpha=0.60, f1_macro=0.9132
[CV 3/10] method=ratio-support, alpha=0.60, f1_macro=0.9540
[CV 4/10] method=ratio-support, alpha=0.60, f1_macro=0.9780
[CV 5/10] method=ratio-support, alpha=0.60, f1_macro=1.0000
[CV 6/10] method=ratio-support, alpha=0.60, f1_macro=0.9780
[CV 7/10] method=ratio-support, alpha=0.60, f1_macro=1.0000
[CV 8/10] method=ratio-support, alpha=0.60, f1_macro=0.9566
[CV 9/10] method=ratio-support, alpha=0.60, f1_macro=0.9780
[CV 10/10] method=ratio-support, alpha=0.60, f1_macro=0.8750
f1_mean = 0.9610

[CV 1/10] method=ratio-support, alpha=0.65, f1_macro=0.9774
[CV 2/10] method=ratio-support, alpha=0.65, f1_macro=0.9341
[CV 3/10] method=ratio-support, alpha=0.65, f1_macro=0.9540
[CV 4/10] method=ratio-support, alpha=0.65, f1_macro=0.9780
[CV 5/10] method=ratio-support, alpha=0.65, f1_macro=1.0000
[CV 6/10] method=ratio-support, alpha=0.65, f1_macro=0.9780
[CV 7/10] method=rati

In [56]:
# Wider alpha grid (1 to 10 in steps of 0.5) for the ratio-support method only,
# evaluated on the expanded table.
parameters = {
    'alpha': np.linspace(1, 10, 19),
    'method': ['ratio-support'],
}

n = kfold.get_n_splits(X_expanded)

# NOTE: f1_best_e, alpha_best_e and method_best_e are read and updated here
# without being initialised in this cell, so they must already exist when it runs.
for method in parameters['method']:
    for alpha in parameters['alpha']:

        f1 = [0] * n  # one macro-F1 value per fold
        for i, (train_rows, test_rows) in enumerate(kfold.split(X_expanded, y)):
            train_objects = X_expanded.iloc[train_rows].values
            train_labels = y.iloc[train_rows].to_numpy()
            bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
                train_objects,
                train_labels,
                method=method,
                alpha=alpha
            )
            # Predicted labels are read from bin_cls.predictions after predict().
            bin_cls.predict(X_expanded.iloc[test_rows].values)
            f1[i] = f1_score(y.iloc[test_rows], bin_cls.predictions, average='macro')
            print(f"[CV {i+1}/{n}] method={method}, alpha={alpha:0.2f}, f1_macro={f1[i]:0.4f}")

        f1_mean = np.mean(f1)
        print(f'f1_mean = {f1_mean:0.4f}\n')
        # Only a strictly better mean macro-F1 replaces the current best.
        if f1_mean > f1_best_e:
            f1_best_e, alpha_best_e, method_best_e = f1_mean, alpha, method

best_parameters_expanded = {'method': method_best_e, 'alpha': alpha_best_e}
print(f"f1_best={f1_best_e:0.4f}, method={method_best_e}, alpha={alpha_best_e:0.2f}")

[CV 1/10] method=ratio-support, alpha=1.00, f1_macro=0.9774
[CV 2/10] method=ratio-support, alpha=1.00, f1_macro=0.9554
[CV 3/10] method=ratio-support, alpha=1.00, f1_macro=0.9540
[CV 4/10] method=ratio-support, alpha=1.00, f1_macro=0.9780
[CV 5/10] method=ratio-support, alpha=1.00, f1_macro=1.0000
[CV 6/10] method=ratio-support, alpha=1.00, f1_macro=0.9780
[CV 7/10] method=ratio-support, alpha=1.00, f1_macro=1.0000
[CV 8/10] method=ratio-support, alpha=1.00, f1_macro=0.9566
[CV 9/10] method=ratio-support, alpha=1.00, f1_macro=0.9780
[CV 10/10] method=ratio-support, alpha=1.00, f1_macro=0.8750
f1_mean = 0.9653

[CV 1/10] method=ratio-support, alpha=1.50, f1_macro=0.9774
[CV 2/10] method=ratio-support, alpha=1.50, f1_macro=0.9774
[CV 3/10] method=ratio-support, alpha=1.50, f1_macro=0.9540
[CV 4/10] method=ratio-support, alpha=1.50, f1_macro=0.9780
[CV 5/10] method=ratio-support, alpha=1.50, f1_macro=1.0000
[CV 6/10] method=ratio-support, alpha=1.50, f1_macro=0.9780
[CV 7/10] method=rati

[CV 5/10] method=ratio-support, alpha=7.50, f1_macro=0.8786
[CV 6/10] method=ratio-support, alpha=7.50, f1_macro=0.9048
[CV 7/10] method=ratio-support, alpha=7.50, f1_macro=0.8831
[CV 8/10] method=ratio-support, alpha=7.50, f1_macro=0.9554
[CV 9/10] method=ratio-support, alpha=7.50, f1_macro=0.8831
[CV 10/10] method=ratio-support, alpha=7.50, f1_macro=0.9109
f1_mean = 0.9163

[CV 1/10] method=ratio-support, alpha=8.00, f1_macro=0.9554
[CV 2/10] method=ratio-support, alpha=8.00, f1_macro=0.8831
[CV 3/10] method=ratio-support, alpha=8.00, f1_macro=0.9081
[CV 4/10] method=ratio-support, alpha=8.00, f1_macro=0.9540
[CV 5/10] method=ratio-support, alpha=8.00, f1_macro=0.8512
[CV 6/10] method=ratio-support, alpha=8.00, f1_macro=0.9048
[CV 7/10] method=ratio-support, alpha=8.00, f1_macro=0.8831
[CV 8/10] method=ratio-support, alpha=8.00, f1_macro=0.9554
[CV 9/10] method=ratio-support, alpha=8.00, f1_macro=0.8831
[CV 10/10] method=ratio-support, alpha=8.00, f1_macro=0.9109
f1_mean = 0.9089

[C

In [57]:
# Evaluate BinarizedBinaryClassifier on the expanded table with the selected
# (method_best_e, alpha_best_e), collecting the values returned by results_fca
# for every fold.
f1 = [0] * n
accuracy = [0] * n
f1_binary = [0] * n

for i, (train_rows, test_rows) in enumerate(kfold.split(X_expanded, y)):
    train_objects = X_expanded.iloc[train_rows].values
    train_labels = y.iloc[train_rows].to_numpy()
    bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
        train_objects,
        train_labels,
        method=method_best_e,
        alpha=alpha_best_e
    )
    # Predicted labels are read from bin_cls.predictions after predict().
    bin_cls.predict(X_expanded.iloc[test_rows].values)

    fold_scores = results_fca(y.iloc[test_rows], bin_cls.predictions)
    accuracy[i], f1[i], f1_binary[i] = fold_scores

    print(f"[CV {i+1}/{n}] method={method_best_e}, alpha={alpha_best_e:0.2f}",
          f"accuracy={accuracy[i]:0.4f}, f1_macro={f1[i]:0.4f}, f1_binary={f1_binary[i]:0.4f}")

# Fold averages summarise the expanded-table experiment.
f1_mean = np.mean(f1)
accuracy_mean = np.mean(accuracy)
f1_binary_mean = np.mean(f1_binary)

best_metrics_expanded = {'Accuracy': accuracy_mean, 'F1_binary': f1_binary_mean, 'F1_macro': f1_mean}
print(f"\n accuracy={accuracy_mean:0.4f}, f1_binary={f1_binary_mean:0.4f}, f1_macro={f1_mean:0.4f}")

[CV 1/10] method=ratio-support, alpha=3.00 accuracy=0.9800, f1_macro=0.9774, f1_binary=0.9851
[CV 2/10] method=ratio-support, alpha=3.00 accuracy=0.9800, f1_macro=0.9774, f1_binary=0.9851
[CV 3/10] method=ratio-support, alpha=3.00 accuracy=0.9600, f1_macro=0.9540, f1_binary=0.9706
[CV 4/10] method=ratio-support, alpha=3.00 accuracy=0.9800, f1_macro=0.9780, f1_binary=0.9846
[CV 5/10] method=ratio-support, alpha=3.00 accuracy=0.9800, f1_macro=0.9774, f1_binary=0.9851
[CV 6/10] method=ratio-support, alpha=3.00 accuracy=1.0000, f1_macro=1.0000, f1_binary=1.0000
[CV 7/10] method=ratio-support, alpha=3.00 accuracy=1.0000, f1_macro=1.0000, f1_binary=1.0000
[CV 8/10] method=ratio-support, alpha=3.00 accuracy=1.0000, f1_macro=1.0000, f1_binary=1.0000
[CV 9/10] method=ratio-support, alpha=3.00 accuracy=0.9800, f1_macro=0.9780, f1_binary=0.9846
[CV 10/10] method=ratio-support, alpha=3.00 accuracy=0.9600, f1_macro=0.9566, f1_binary=0.9688

 accuracy=0.9820, f1_binary=0.9864, f1_macro=0.9799


In [58]:
# Display the method/alpha pair selected for the expanded table.
best_parameters_expanded

{'method': 'ratio-support', 'alpha': 3.0}

In [59]:
# Display the fold-averaged Accuracy / F1_binary / F1_macro for the expanded table.
best_metrics_expanded

{'Accuracy': 0.982,
 'F1_binary': 0.9863792885121901,
 'F1_macro': 0.9798842690757198}