# Dataset: Symptoms and COVID Presence (May 2020 data)
Link to a dataset: https://www.kaggle.com/datasets/hemanthhari/symptoms-and-covid-presence

In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import make_scorer, accuracy_score, f1_score

from sklearn import model_selection
from sklearn.model_selection import (
    StratifiedKFold,
    cross_validate,
    GridSearchCV,
)

Standard models used:

In [2]:
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import ComplementNB

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

**fcalc** library files are located in **FCALC** folder

**FCALC** folder is located in the same directory as this notebook.

In [3]:
import FCALC.fcalc as fcalc

# Loading data

In [4]:
data_cov = pd.read_csv("Datasets/covid_dataset.csv", 
                       sep=',',
                       true_values=['Yes','yes'],
                       false_values=['No','no']
                      )
data_cov.drop_duplicates(inplace=True)
data_cov.reset_index(inplace=True, drop=True)
data_cov

Unnamed: 0,Breathing Problem,Fever,Dry Cough,Sore throat,Running Nose,Asthma,Chronic Lung Disease,Headache,Heart Disease,Diabetes,...,Fatigue,Gastrointestinal,Abroad travel,Contact with COVID Patient,Attended Large Gathering,Visited Public Exposed Places,Family working in Public Exposed Places,Wearing Masks,Sanitization from Market,COVID-19
0,True,True,True,True,True,False,False,False,False,True,...,True,True,False,True,False,True,True,False,False,True
1,True,True,True,True,False,True,True,True,False,False,...,True,False,False,False,True,True,False,False,False,True
2,True,True,True,True,True,True,True,True,False,True,...,True,True,True,False,False,False,False,False,False,True
3,True,True,True,False,False,True,False,False,True,True,...,False,False,True,False,True,True,False,False,False,True
4,True,True,True,True,True,False,True,True,True,True,...,False,True,False,True,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461,True,False,False,False,False,False,False,True,True,False,...,False,False,False,True,True,True,False,False,False,True
462,True,False,False,False,False,False,False,True,True,False,...,True,False,False,True,True,True,False,False,False,True
463,False,False,True,False,False,True,True,False,True,True,...,False,True,False,True,True,True,False,False,False,True
464,True,False,False,False,True,False,True,True,False,True,...,True,True,False,True,True,True,False,False,False,True


In [5]:
data_cov.shape

(466, 21)

In [6]:
df = data_cov
df

Unnamed: 0,Breathing Problem,Fever,Dry Cough,Sore throat,Running Nose,Asthma,Chronic Lung Disease,Headache,Heart Disease,Diabetes,...,Fatigue,Gastrointestinal,Abroad travel,Contact with COVID Patient,Attended Large Gathering,Visited Public Exposed Places,Family working in Public Exposed Places,Wearing Masks,Sanitization from Market,COVID-19
0,True,True,True,True,True,False,False,False,False,True,...,True,True,False,True,False,True,True,False,False,True
1,True,True,True,True,False,True,True,True,False,False,...,True,False,False,False,True,True,False,False,False,True
2,True,True,True,True,True,True,True,True,False,True,...,True,True,True,False,False,False,False,False,False,True
3,True,True,True,False,False,True,False,False,True,True,...,False,False,True,False,True,True,False,False,False,True
4,True,True,True,True,True,False,True,True,True,True,...,False,True,False,True,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461,True,False,False,False,False,False,False,True,True,False,...,False,False,False,True,True,True,False,False,False,True
462,True,False,False,False,False,False,False,True,True,False,...,True,False,False,True,True,True,False,False,False,True
463,False,False,True,False,False,True,True,False,True,True,...,False,True,False,True,True,True,False,False,False,True
464,True,False,False,False,True,False,True,True,False,True,...,True,True,False,True,True,True,False,False,False,True


In [7]:
df['COVID-19'].value_counts(normalize=True)

COVID-19
True     0.82618
False    0.17382
Name: proportion, dtype: float64

In [8]:
features = df.columns[0:-1]
target = df.columns[-1]

In [9]:
X = df[features]
y = df[target]

# Testing models

In [10]:
best_parameters = { 
    'LogisticRegression': [],
    'KNeighborsClassifier': [],
    'MultinomialNB': [],
    'GaussianNB': [],
    'ComplementNB': [],
    'DecisionTreeClassifier': [],
    'RandomForestClassifier': [],
    'BinarizedBinaryClassifier': [],
    'PatternBinaryClassifier': [],
}

In [11]:
best_metrics = {
    'LogisticRegression': [],
    'KNeighborsClassifier': [],
    'MultinomialNB': [],
    'GaussianNB': [],
    'ComplementNB': [],
    'DecisionTreeClassifier': [],
    'RandomForestClassifier': [],
    'BinarizedBinaryClassifier': [],
    'PatternBinaryClassifier': [],  
}

In [12]:
scoring = {'accuracy' : make_scorer(accuracy_score),
           'f1_macro' : make_scorer(f1_score, average='macro'),
           'f1_binary' : make_scorer(f1_score),
          }

kfold = StratifiedKFold(n_splits=10, random_state=49, shuffle=True)

In [13]:
def count_metrics(results):
    acc = np.round(np.mean(results['test_accuracy']), 4) 
    f1_m = np.round(np.mean(results['test_f1_macro']), 4)
    f1_b = np.round(np.mean(results['test_f1_binary']), 4)
    return(acc, f1_m, f1_b)  

In [14]:
def print_results(results):
    acc, f1_m, f1_b = count_metrics(results)
    print(f'Accuracy = {acc:0.4f}, F1_binary = {f1_b:0.4f}, F1_macro = {f1_m:0.4f}')  

In [15]:
def fill_best_metrics(results, method):
    acc, f1_m, f1_b = count_metrics(results)
    best_metrics[method] = {'Accuracy': acc, 'F1_binary': f1_b, 'F1_macro': f1_m}

# Standard models

### Logistic regression

In [16]:
parameters = {'C' : np.linspace(1,5,101)}

model = LogisticRegression()

best = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    cv=kfold,
                    verbose=1, 
                    scoring=scoring,
                    refit='f1_binary'
                   )
best.fit(X, y)
best_parameters['LogisticRegression'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 101 candidates, totalling 1010 fits


{'C': 1.0}

In [17]:
model = LogisticRegression(C=best.best_params_['C'])
results = cross_validate(estimator=model,
                         X=X, y=y,
                         cv=kfold,
                         scoring=scoring
                         )
fill_best_metrics(results, 'LogisticRegression')
print_results(results)

Accuracy = 0.9633, F1_binary = 0.9784, F1_macro = 0.9281


### K-Nearest Neighbours

In [18]:
parameters = {'n_neighbors' : range(4,65,4)}

model = KNeighborsClassifier()

best = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    cv=kfold,
                    verbose=1, 
                    scoring=scoring,
                    refit='f1_binary'
                   )
best.fit(X, y)
best_parameters['KNeighborsClassifier'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 16 candidates, totalling 160 fits


{'n_neighbors': 12}

In [19]:
model = KNeighborsClassifier(n_neighbors=best.best_params_['n_neighbors'])
results = cross_validate(estimator=model,
                         X=X,
                         y=y,
                         cv=kfold,
                         scoring=scoring
                         )
fill_best_metrics(results, 'KNeighborsClassifier')
print_results(results)

Accuracy = 0.9376, F1_binary = 0.9637, F1_macro = 0.8679


### Naive Bayes

##### MULTINOMIAL NB

In [20]:
parameters = {'alpha' : np.linspace(0.001,100.001,1001)}

model = MultinomialNB()

best = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    cv=kfold,
                    verbose=1, 
                    scoring=scoring,
                    refit='f1_binary'
                   )
best.fit(X, y)
best_parameters['MultinomialNB'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 1001 candidates, totalling 10010 fits


{'alpha': 0.501}

In [21]:
model = MultinomialNB(alpha=best.best_params_['alpha'])

results = cross_validate(estimator=model,
                         X=X,
                         y=y,
                         cv=kfold,
                         scoring=scoring
                         )
fill_best_metrics(results, 'MultinomialNB')
print_results(results)

Accuracy = 0.8668, F1_binary = 0.9255, F1_macro = 0.6488


##### GAUSSIAN NB

In [22]:
parameters = {'var_smoothing': np.logspace(0,-9, num=100)}

model = GaussianNB()

best = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    cv=kfold,
                    verbose=1, 
                    scoring=scoring,
                    refit='f1_binary'
                   )
best.fit(X, y)
best_parameters['GaussianNB'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


{'var_smoothing': 0.1873817422860384}

In [23]:
model = GaussianNB(var_smoothing=best.best_params_['var_smoothing'])
results = cross_validate(estimator=model,
                         X=X,
                         y=y,
                         cv=kfold,
                         scoring=scoring
                         )
fill_best_metrics(results, 'GaussianNB')
print_results(results)

Accuracy = 0.9634, F1_binary = 0.9782, F1_macro = 0.9313


##### COMPLEMENT NB

In [24]:
parameters = {'alpha' : np.linspace(0.001,100.001,1001)}

model =  ComplementNB()

best = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    cv=kfold,
                    verbose=1, 
                    scoring=scoring,
                    refit='f1_binary'
                   )
best.fit(X, y)
best_parameters['ComplementNB'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 1001 candidates, totalling 10010 fits


{'alpha': 60.801}

In [25]:
model = ComplementNB(alpha=best.best_params_['alpha'])
results = cross_validate(estimator=model,
                         X=X,
                         y=y,
                         cv=kfold,
                         scoring=scoring
                         )
fill_best_metrics(results, 'ComplementNB')
print_results(results) 

Accuracy = 0.9269, F1_binary = 0.9576, F1_macro = 0.8430


### Decision tree

In [26]:
parameters = {'min_samples_split' : range(2,21,2),
              'max_depth' : range(2,17,2),
              'criterion' : ['gini', 'entropy']
             }

model = DecisionTreeClassifier(random_state=49)

best = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    cv=kfold,
                    verbose=1, 
                    scoring=scoring,
                    refit='f1_binary'
                   )
best.fit(X, y)
best_parameters['DecisionTreeClassifier'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 160 candidates, totalling 1600 fits


{'criterion': 'gini', 'max_depth': 6, 'min_samples_split': 10}

In [27]:
model = DecisionTreeClassifier(
    max_depth=best.best_params_['max_depth'], 
    min_samples_split=best.best_params_['min_samples_split'],
    criterion=best.best_params_['criterion'],
    random_state=49
)
results = cross_validate(estimator=model,
                         X=X,
                         y=y,
                         cv=kfold,
                         scoring=scoring
                         )
fill_best_metrics(results, 'DecisionTreeClassifier')
print_results(results)

Accuracy = 0.9505, F1_binary = 0.9706, F1_macro = 0.9048


### RandomForest

In [28]:
parameters = {'n_estimators' : range(40,151,10),
              'min_samples_split' : range(2,13,2),
              'max_depth' : range(4,21,2),
              'criterion' : ['gini', 'entropy']
             }
model = RandomForestClassifier(random_state=49)

best = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    cv=kfold,
                    verbose=1, 
                    scoring=scoring,
                    refit='f1_binary'
                   )
best.fit(X, y)
best_parameters['RandomForestClassifier'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 1296 candidates, totalling 12960 fits


{'criterion': 'entropy',
 'max_depth': 6,
 'min_samples_split': 4,
 'n_estimators': 70}

In [29]:
model = RandomForestClassifier(
    n_estimators=best.best_params_['n_estimators'],
    min_samples_split=best.best_params_['min_samples_split'],
    max_depth=best.best_params_['max_depth'],
    criterion=best.best_params_['criterion'],
    random_state=49
)
results = cross_validate(estimator=model,
                         X=X,
                         y=y,
                         cv=kfold,
                         scoring=scoring
                         )
fill_best_metrics(results, 'RandomForestClassifier')
print_results(results)

Accuracy = 0.9548, F1_binary = 0.9736, F1_macro = 0.9070


# Lazy FCA

Due to the multilabel output of **BinarizedBinaryClassifier** (**1** for **True**, **0** for **false** and **-1** for **undefined**) we cannot utilize **f1_score** with **average='binary'**. Therefore, a function that interprets **undefined** as misclassification was implemented.

In [30]:
def compare_with_binary_f1_old(y_true, y_pred):
    y_tmp = np.concatenate(
        (np.array(y_true)[:,None],np.array(y_pred)[:,None]),
        axis=1
    )
    df_tmp = pd.DataFrame(y_tmp, columns=['y_true','y_pred'])
    df_tmp.y_true = df_tmp.y_true.astype(bool)
    df_tmp['y_new']= ~df_tmp.loc[df_tmp.y_pred==-1]['y_true']
    df_tmp.loc[df_tmp.y_pred!=-1, 'y_new'] = df_tmp.loc[df_tmp.y_pred!=-1,'y_pred'].astype(bool) 
    df_tmp.y_new = df_tmp.y_new.astype(bool)
    return(f1_score(df_tmp.y_true, df_tmp.y_new))

In [31]:
def results_fca(y_true, y_pred):
    f1 = f1_score(y_true, y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)
    f1_binary = compare_with_binary_f1(y_true, y_pred)
    return (accuracy, f1, f1_binary)  

In [32]:
def compare_with_binary_f1(y_true, y_pred):
    y_tmp = np.copy(y_pred)
    undef = y_pred == -1
    y_tmp[undef] = (y_true[undef] - np.ones(shape=y_tmp[undef].shape) * 2) // -2
    return (f1_score(y_true, y_tmp))

(0 - 2) // (-2) = 1

(1 - 2) // (-2) = 0

## BinarizedBinaryClassifier

**BinarizedBinaryClassifier** is not a **scikit** model, so **GridSearchCV** and **cross_validate** from **scikit** do not support it. Thus, we check desired parameters in **for**  loops.

In [33]:
parameters = {'alpha' : np.linspace(0, 1, 21),
              'method': ['standard','standard-support','ratio-support'],
             }

n = kfold.get_n_splits(X)

f1_best = 0
alpha_best = 0.
method_best = 'standard'

for method in parameters['method']:
    for alpha in parameters['alpha']:
        
        f1 = [0 for i in range(n)]
        for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
            bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
                X.iloc[train_index].values, 
                y.iloc[train_index].to_numpy(),
                method=method,
                alpha=alpha
            )
            bin_cls.predict(X.iloc[test_index].values)
            f1[i] = f1_score(y.iloc[test_index], bin_cls.predictions, average='macro')
            print(f"[CV {i+1}/{n}] method={method}, alpha={alpha:0.2f}, f1_macro={f1[i]:0.4f}") 
                    
        f1_mean = np.mean(f1)
        print(f'f1_mean = {f1_mean:0.4f}\n')
        if(f1_mean > f1_best):
            f1_best = f1_mean
            alpha_best = alpha
            method_best = method

best_parameters['BinarizedBinaryClassifier'] = {'method': method_best, 'alpha': alpha_best}             
print(f"f1_best={f1_best:0.4f}, method={method_best}, alpha={alpha_best:0.2f}")

[CV 1/10] method=standard, alpha=0.00, f1_macro=0.2991
[CV 2/10] method=standard, alpha=0.00, f1_macro=0.3038
[CV 3/10] method=standard, alpha=0.00, f1_macro=0.3122
[CV 4/10] method=standard, alpha=0.00, f1_macro=0.3052
[CV 5/10] method=standard, alpha=0.00, f1_macro=0.3008
[CV 6/10] method=standard, alpha=0.00, f1_macro=0.2991
[CV 7/10] method=standard, alpha=0.00, f1_macro=0.3045
[CV 8/10] method=standard, alpha=0.00, f1_macro=0.2906
[CV 9/10] method=standard, alpha=0.00, f1_macro=0.3083
[CV 10/10] method=standard, alpha=0.00, f1_macro=0.3045
f1_mean = 0.3028

[CV 1/10] method=standard, alpha=0.05, f1_macro=0.3059
[CV 2/10] method=standard, alpha=0.05, f1_macro=0.3836
[CV 3/10] method=standard, alpha=0.05, f1_macro=0.4535
[CV 4/10] method=standard, alpha=0.05, f1_macro=0.3059
[CV 5/10] method=standard, alpha=0.05, f1_macro=0.4535
[CV 6/10] method=standard, alpha=0.05, f1_macro=0.4471
[CV 7/10] method=standard, alpha=0.05, f1_macro=0.4524
[CV 8/10] method=standard, alpha=0.05, f1_macr

[CV 6/10] method=standard, alpha=0.70, f1_macro=0.3263
[CV 7/10] method=standard, alpha=0.70, f1_macro=0.4480
[CV 8/10] method=standard, alpha=0.70, f1_macro=0.2415
[CV 9/10] method=standard, alpha=0.70, f1_macro=0.2465
[CV 10/10] method=standard, alpha=0.70, f1_macro=0.3810
f1_mean = 0.3669

[CV 1/10] method=standard, alpha=0.75, f1_macro=0.4411
[CV 2/10] method=standard, alpha=0.75, f1_macro=0.2898
[CV 3/10] method=standard, alpha=0.75, f1_macro=0.4055
[CV 4/10] method=standard, alpha=0.75, f1_macro=0.4128
[CV 5/10] method=standard, alpha=0.75, f1_macro=0.4618
[CV 6/10] method=standard, alpha=0.75, f1_macro=0.3263
[CV 7/10] method=standard, alpha=0.75, f1_macro=0.4408
[CV 8/10] method=standard, alpha=0.75, f1_macro=0.2353
[CV 9/10] method=standard, alpha=0.75, f1_macro=0.2465
[CV 10/10] method=standard, alpha=0.75, f1_macro=0.3749
f1_mean = 0.3635

[CV 1/10] method=standard, alpha=0.80, f1_macro=0.4171
[CV 2/10] method=standard, alpha=0.80, f1_macro=0.3385
[CV 3/10] method=standard, 

[CV 1/10] method=standard-support, alpha=0.35, f1_macro=0.5554
[CV 2/10] method=standard-support, alpha=0.35, f1_macro=0.4532
[CV 3/10] method=standard-support, alpha=0.35, f1_macro=0.3505
[CV 4/10] method=standard-support, alpha=0.35, f1_macro=0.4125
[CV 5/10] method=standard-support, alpha=0.35, f1_macro=0.4964
[CV 6/10] method=standard-support, alpha=0.35, f1_macro=0.4683
[CV 7/10] method=standard-support, alpha=0.35, f1_macro=0.4363
[CV 8/10] method=standard-support, alpha=0.35, f1_macro=0.3799
[CV 9/10] method=standard-support, alpha=0.35, f1_macro=0.3432
[CV 10/10] method=standard-support, alpha=0.35, f1_macro=0.3684
f1_mean = 0.4264

[CV 1/10] method=standard-support, alpha=0.40, f1_macro=0.4683
[CV 2/10] method=standard-support, alpha=0.40, f1_macro=0.3896
[CV 3/10] method=standard-support, alpha=0.40, f1_macro=0.2893
[CV 4/10] method=standard-support, alpha=0.40, f1_macro=0.2627
[CV 5/10] method=standard-support, alpha=0.40, f1_macro=0.4544
[CV 6/10] method=standard-support, a

[CV 9/10] method=standard-support, alpha=0.95, f1_macro=0.1049
[CV 10/10] method=standard-support, alpha=0.95, f1_macro=0.1518
f1_mean = 0.1618

[CV 1/10] method=standard-support, alpha=1.00, f1_macro=0.1060
[CV 2/10] method=standard-support, alpha=1.00, f1_macro=0.1060
[CV 3/10] method=standard-support, alpha=1.00, f1_macro=0.1132
[CV 4/10] method=standard-support, alpha=1.00, f1_macro=0.1261
[CV 5/10] method=standard-support, alpha=1.00, f1_macro=0.1261
[CV 6/10] method=standard-support, alpha=1.00, f1_macro=0.1642
[CV 7/10] method=standard-support, alpha=1.00, f1_macro=0.0980
[CV 8/10] method=standard-support, alpha=1.00, f1_macro=0.1083
[CV 9/10] method=standard-support, alpha=1.00, f1_macro=0.1154
[CV 10/10] method=standard-support, alpha=1.00, f1_macro=0.1288
f1_mean = 0.1192

[CV 1/10] method=ratio-support, alpha=0.00, f1_macro=0.5699
[CV 2/10] method=ratio-support, alpha=0.00, f1_macro=0.5699
[CV 3/10] method=ratio-support, alpha=0.00, f1_macro=0.4471
[CV 4/10] method=ratio-sup

[CV 1/10] method=ratio-support, alpha=0.60, f1_macro=0.5699
[CV 2/10] method=ratio-support, alpha=0.60, f1_macro=0.5699
[CV 3/10] method=ratio-support, alpha=0.60, f1_macro=0.4535
[CV 4/10] method=ratio-support, alpha=0.60, f1_macro=0.6643
[CV 5/10] method=ratio-support, alpha=0.60, f1_macro=0.4535
[CV 6/10] method=ratio-support, alpha=0.60, f1_macro=0.4471
[CV 7/10] method=ratio-support, alpha=0.60, f1_macro=0.4524
[CV 8/10] method=ratio-support, alpha=0.60, f1_macro=0.4524
[CV 9/10] method=ratio-support, alpha=0.60, f1_macro=0.4524
[CV 10/10] method=ratio-support, alpha=0.60, f1_macro=0.4524
f1_mean = 0.4968

[CV 1/10] method=ratio-support, alpha=0.65, f1_macro=0.5699
[CV 2/10] method=ratio-support, alpha=0.65, f1_macro=0.5699
[CV 3/10] method=ratio-support, alpha=0.65, f1_macro=0.4535
[CV 4/10] method=ratio-support, alpha=0.65, f1_macro=0.6643
[CV 5/10] method=ratio-support, alpha=0.65, f1_macro=0.4535
[CV 6/10] method=ratio-support, alpha=0.65, f1_macro=0.4471
[CV 7/10] method=rati

In [34]:
parameters = {'alpha' : np.linspace(1, 10, 19),
              'method': ['ratio-support'],
             }

n = kfold.get_n_splits(X)

for method in parameters['method']:
    for alpha in parameters['alpha']:
        
        f1 = [0 for i in range(n)]
        for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
            bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
                X.iloc[train_index].values, 
                y.iloc[train_index].to_numpy(),
                method=method,
                alpha=alpha
            )
            bin_cls.predict(X.iloc[test_index].values)
            f1[i] = f1_score(y.iloc[test_index], bin_cls.predictions, average='macro')
            print(f"[CV {i+1}/{n}] method={method}, alpha={alpha:0.2f}, f1_macro={f1[i]:0.4f}") 
                    
        f1_mean = np.mean(f1)
        print(f'f1_mean = {f1_mean:0.4f}\n')
        if(f1_mean > f1_best):
            f1_best = f1_mean
            alpha_best = alpha
            method_best = method

best_parameters['BinarizedBinaryClassifier'] = {'method': method_best, 'alpha': alpha_best}            
print(f"f1_best={f1_best:0.4f}, method={method_best}, alpha={alpha_best:0.2f}")

[CV 1/10] method=ratio-support, alpha=1.00, f1_macro=0.5699
[CV 2/10] method=ratio-support, alpha=1.00, f1_macro=0.5699
[CV 3/10] method=ratio-support, alpha=1.00, f1_macro=0.4535
[CV 4/10] method=ratio-support, alpha=1.00, f1_macro=0.5699
[CV 5/10] method=ratio-support, alpha=1.00, f1_macro=0.4535
[CV 6/10] method=ratio-support, alpha=1.00, f1_macro=0.4471
[CV 7/10] method=ratio-support, alpha=1.00, f1_macro=0.4524
[CV 8/10] method=ratio-support, alpha=1.00, f1_macro=0.4524
[CV 9/10] method=ratio-support, alpha=1.00, f1_macro=0.4524
[CV 10/10] method=ratio-support, alpha=1.00, f1_macro=0.4524
f1_mean = 0.4873

[CV 1/10] method=ratio-support, alpha=1.50, f1_macro=0.4535
[CV 2/10] method=ratio-support, alpha=1.50, f1_macro=0.5699
[CV 3/10] method=ratio-support, alpha=1.50, f1_macro=0.4471
[CV 4/10] method=ratio-support, alpha=1.50, f1_macro=0.5699
[CV 5/10] method=ratio-support, alpha=1.50, f1_macro=0.4535
[CV 6/10] method=ratio-support, alpha=1.50, f1_macro=0.5524
[CV 7/10] method=rati

[CV 4/10] method=ratio-support, alpha=7.50, f1_macro=0.2963
[CV 5/10] method=ratio-support, alpha=7.50, f1_macro=0.3793
[CV 6/10] method=ratio-support, alpha=7.50, f1_macro=0.2857
[CV 7/10] method=ratio-support, alpha=7.50, f1_macro=0.4524
[CV 8/10] method=ratio-support, alpha=7.50, f1_macro=0.2906
[CV 9/10] method=ratio-support, alpha=7.50, f1_macro=0.2857
[CV 10/10] method=ratio-support, alpha=7.50, f1_macro=0.2917
f1_mean = 0.3235

[CV 1/10] method=ratio-support, alpha=8.00, f1_macro=0.3370
[CV 2/10] method=ratio-support, alpha=8.00, f1_macro=0.3083
[CV 3/10] method=ratio-support, alpha=8.00, f1_macro=0.3083
[CV 4/10] method=ratio-support, alpha=8.00, f1_macro=0.2963
[CV 5/10] method=ratio-support, alpha=8.00, f1_macro=0.3793
[CV 6/10] method=ratio-support, alpha=8.00, f1_macro=0.2857
[CV 7/10] method=ratio-support, alpha=8.00, f1_macro=0.4524
[CV 8/10] method=ratio-support, alpha=8.00, f1_macro=0.2906
[CV 9/10] method=ratio-support, alpha=8.00, f1_macro=0.2906
[CV 10/10] method=rat

In [35]:
f1 = [0 for i in range(n)]
accuracy = [0 for i in range(n)]
f1_binary = [0 for i in range(n)]

for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
    bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
        X.iloc[train_index].values, 
        y.iloc[train_index].to_numpy(),
        method=method_best,
        alpha=alpha_best
    )
    bin_cls.predict(X.iloc[test_index].values)
    
    accuracy[i], f1[i], f1_binary[i] = results_fca(y.iloc[test_index], bin_cls.predictions)
    
    print(f"[CV {i+1}/{n}] method={method_best}, alpha={alpha_best:0.2f}", 
          f"accuracy={accuracy[i]:0.4f}, f1_macro={f1[i]:0.4f}, f1_binary={f1_binary[i]:0.4f}")

f1_mean = np.mean(f1)
accuracy_mean = np.mean(accuracy)
f1_binary_mean = np.mean(f1_binary)

best_metrics['BinarizedBinaryClassifier'] = {'Accuracy': accuracy_mean, 'F1_binary': f1_binary_mean, 'F1_macro': f1_mean}
print(f"\n accuracy={accuracy_mean:0.4f}, f1_binary={f1_binary_mean:0.4f}, f1_macro={f1_mean:0.4f}")

[CV 1/10] method=standard, alpha=0.15 accuracy=0.8723, f1_macro=0.6643, f1_binary=0.9286
[CV 2/10] method=standard, alpha=0.15 accuracy=0.8511, f1_macro=0.5699, f1_binary=0.9176
[CV 3/10] method=standard, alpha=0.15 accuracy=0.8511, f1_macro=0.5699, f1_binary=0.9176
[CV 4/10] method=standard, alpha=0.15 accuracy=0.8511, f1_macro=0.5699, f1_binary=0.9176
[CV 5/10] method=standard, alpha=0.15 accuracy=0.8511, f1_macro=0.5699, f1_binary=0.9176
[CV 6/10] method=standard, alpha=0.15 accuracy=0.8298, f1_macro=0.5524, f1_binary=0.9048
[CV 7/10] method=standard, alpha=0.15 accuracy=0.8478, f1_macro=0.5689, f1_binary=0.9157
[CV 8/10] method=standard, alpha=0.15 accuracy=0.8261, f1_macro=0.4524, f1_binary=0.9048
[CV 9/10] method=standard, alpha=0.15 accuracy=0.8261, f1_macro=0.4524, f1_binary=0.9048
[CV 10/10] method=standard, alpha=0.15 accuracy=0.8261, f1_macro=0.4524, f1_binary=0.9048

 accuracy=0.8432, f1_macro=0.5422, f1_binary=0.9134


## PatternBinaryClassifier

In [36]:
parameters = {'alpha' : np.linspace(0, 1, 21),
              'method': ['standard','standard-support','ratio-support'],
             }

n = kfold.get_n_splits(X)

f1_best_p = 0
alpha_best_p = 0.
method_best_p = 'standard'

for method in parameters['method']:
    for alpha in parameters['alpha']:
        
        f1 = [0 for i in range(n)]
        for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
            bin_cls = fcalc.classifier.PatternBinaryClassifier(
                X.iloc[train_index].values, 
                y.iloc[train_index].to_numpy(),
                method=method,
                alpha=alpha
            )
            bin_cls.predict(X.iloc[test_index].values)
            f1[i] = f1_score(y.iloc[test_index], bin_cls.predictions, average='macro')
            print(f"[CV {i+1}/{n}] method={method}, alpha={alpha:0.2f}, f1_macro={f1[i]:0.4f}") 
                    
        f1_mean = np.mean(f1)
        print(f'f1_mean = {f1_mean:0.4f}\n')
        if(f1_mean > f1_best_p):
            f1_best_p = f1_mean
            alpha_best_p = alpha
            method_best_p = method

best_parameters['PatternBinaryClassifier'] = {'method': method_best_p, 'alpha': alpha_best_p}              
print(f"f1_best={f1_best_p:0.4f}, method={method_best_p}, alpha={alpha_best_p:0.2f}")

[CV 1/10] method=standard, alpha=0.00, f1_macro=0.9313
[CV 2/10] method=standard, alpha=0.00, f1_macro=0.8626
[CV 3/10] method=standard, alpha=0.00, f1_macro=0.9641
[CV 4/10] method=standard, alpha=0.00, f1_macro=0.9247
[CV 5/10] method=standard, alpha=0.00, f1_macro=0.9161
[CV 6/10] method=standard, alpha=0.00, f1_macro=0.9080
[CV 7/10] method=standard, alpha=0.00, f1_macro=0.8315
[CV 8/10] method=standard, alpha=0.00, f1_macro=0.8315
[CV 9/10] method=standard, alpha=0.00, f1_macro=0.8656
[CV 10/10] method=standard, alpha=0.00, f1_macro=0.7760
f1_mean = 0.8811

[CV 1/10] method=standard, alpha=0.05, f1_macro=0.4071
[CV 2/10] method=standard, alpha=0.05, f1_macro=0.3681
[CV 3/10] method=standard, alpha=0.05, f1_macro=0.4861
[CV 4/10] method=standard, alpha=0.05, f1_macro=0.4703
[CV 5/10] method=standard, alpha=0.05, f1_macro=0.6390
[CV 6/10] method=standard, alpha=0.05, f1_macro=0.4581
[CV 7/10] method=standard, alpha=0.05, f1_macro=0.3677
[CV 8/10] method=standard, alpha=0.05, f1_macr

[CV 6/10] method=standard, alpha=0.70, f1_macro=0.0000
[CV 7/10] method=standard, alpha=0.70, f1_macro=0.0000
[CV 8/10] method=standard, alpha=0.70, f1_macro=0.0000
[CV 9/10] method=standard, alpha=0.70, f1_macro=0.0000
[CV 10/10] method=standard, alpha=0.70, f1_macro=0.0000
f1_mean = 0.0000

[CV 1/10] method=standard, alpha=0.75, f1_macro=0.0000
[CV 2/10] method=standard, alpha=0.75, f1_macro=0.0000
[CV 3/10] method=standard, alpha=0.75, f1_macro=0.0000
[CV 4/10] method=standard, alpha=0.75, f1_macro=0.0000
[CV 5/10] method=standard, alpha=0.75, f1_macro=0.0000
[CV 6/10] method=standard, alpha=0.75, f1_macro=0.0000
[CV 7/10] method=standard, alpha=0.75, f1_macro=0.0000
[CV 8/10] method=standard, alpha=0.75, f1_macro=0.0000
[CV 9/10] method=standard, alpha=0.75, f1_macro=0.0000
[CV 10/10] method=standard, alpha=0.75, f1_macro=0.0000
f1_mean = 0.0000

[CV 1/10] method=standard, alpha=0.80, f1_macro=0.0000
[CV 2/10] method=standard, alpha=0.80, f1_macro=0.0000
[CV 3/10] method=standard, 

[CV 1/10] method=standard-support, alpha=0.35, f1_macro=0.1455
[CV 2/10] method=standard-support, alpha=0.35, f1_macro=0.1455
[CV 3/10] method=standard-support, alpha=0.35, f1_macro=0.1455
[CV 4/10] method=standard-support, alpha=0.35, f1_macro=0.1455
[CV 5/10] method=standard-support, alpha=0.35, f1_macro=0.1455
[CV 6/10] method=standard-support, alpha=0.35, f1_macro=0.1607
[CV 7/10] method=standard-support, alpha=0.35, f1_macro=0.1481
[CV 8/10] method=standard-support, alpha=0.35, f1_macro=0.1481
[CV 9/10] method=standard-support, alpha=0.35, f1_macro=0.1481
[CV 10/10] method=standard-support, alpha=0.35, f1_macro=0.1481
f1_mean = 0.1481

[CV 1/10] method=standard-support, alpha=0.40, f1_macro=0.1455
[CV 2/10] method=standard-support, alpha=0.40, f1_macro=0.1455
[CV 3/10] method=standard-support, alpha=0.40, f1_macro=0.1455
[CV 4/10] method=standard-support, alpha=0.40, f1_macro=0.1455
[CV 5/10] method=standard-support, alpha=0.40, f1_macro=0.1455
[CV 6/10] method=standard-support, a

[CV 8/10] method=standard-support, alpha=0.95, f1_macro=0.1481
[CV 9/10] method=standard-support, alpha=0.95, f1_macro=0.1481
[CV 10/10] method=standard-support, alpha=0.95, f1_macro=0.1481
f1_mean = 0.1481

[CV 1/10] method=standard-support, alpha=1.00, f1_macro=0.1455
[CV 2/10] method=standard-support, alpha=1.00, f1_macro=0.1455
[CV 3/10] method=standard-support, alpha=1.00, f1_macro=0.1455
[CV 4/10] method=standard-support, alpha=1.00, f1_macro=0.1455
[CV 5/10] method=standard-support, alpha=1.00, f1_macro=0.1455
[CV 6/10] method=standard-support, alpha=1.00, f1_macro=0.1607
[CV 7/10] method=standard-support, alpha=1.00, f1_macro=0.1481
[CV 8/10] method=standard-support, alpha=1.00, f1_macro=0.1481
[CV 9/10] method=standard-support, alpha=1.00, f1_macro=0.1481
[CV 10/10] method=standard-support, alpha=1.00, f1_macro=0.1481
f1_mean = 0.1481

[CV 1/10] method=ratio-support, alpha=0.00, f1_macro=0.5320
[CV 2/10] method=ratio-support, alpha=0.00, f1_macro=0.5853
[CV 3/10] method=ratio-

[CV 10/10] method=ratio-support, alpha=0.55, f1_macro=0.6538
f1_mean = 0.6318

[CV 1/10] method=ratio-support, alpha=0.60, f1_macro=0.5676
[CV 2/10] method=ratio-support, alpha=0.60, f1_macro=0.6083
[CV 3/10] method=ratio-support, alpha=0.60, f1_macro=0.6948
[CV 4/10] method=ratio-support, alpha=0.60, f1_macro=0.6759
[CV 5/10] method=ratio-support, alpha=0.60, f1_macro=0.6573
[CV 6/10] method=ratio-support, alpha=0.60, f1_macro=0.6871
[CV 7/10] method=ratio-support, alpha=0.60, f1_macro=0.5865
[CV 8/10] method=ratio-support, alpha=0.60, f1_macro=0.5881
[CV 9/10] method=ratio-support, alpha=0.60, f1_macro=0.6351
[CV 10/10] method=ratio-support, alpha=0.60, f1_macro=0.6538
f1_mean = 0.6354

[CV 1/10] method=ratio-support, alpha=0.65, f1_macro=0.5853
[CV 2/10] method=ratio-support, alpha=0.65, f1_macro=0.6439
[CV 3/10] method=ratio-support, alpha=0.65, f1_macro=0.7548
[CV 4/10] method=ratio-support, alpha=0.65, f1_macro=0.6622
[CV 5/10] method=ratio-support, alpha=0.65, f1_macro=0.6573
[C

In [37]:
parameters = {'alpha' : np.linspace(0, 10, 21),
              'method': ['ratio-support'],
             }

n = kfold.get_n_splits(X)

for method in parameters['method']:
    for alpha in parameters['alpha']:
        
        f1 = [0 for i in range(n)]
        for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
            bin_cls = fcalc.classifier.PatternBinaryClassifier(
                X.iloc[train_index].values, 
                y.iloc[train_index].to_numpy(),
                method=method,
                alpha=alpha
            )
            bin_cls.predict(X.iloc[test_index].values)
            f1[i] = f1_score(y.iloc[test_index], bin_cls.predictions, average='macro')
            print(f"[CV {i+1}/{n}] method={method}, alpha={alpha:0.2f}, f1_macro={f1[i]:0.4f}") 
                    
        f1_mean = np.mean(f1)
        print(f'f1_mean = {f1_mean:0.4f}\n')
        if(f1_mean > f1_best_p):
            f1_best_p = f1_mean
            alpha_best_p = alpha
            method_best_p = method

best_parameters['PatternBinaryClassifier'] = {'method': method_best_p, 'alpha': alpha_best_p}            
print(f"f1_best={f1_best_p:0.4f}, method={method_best_p}, alpha={alpha_best_p:0.2f}")

[CV 1/10] method=ratio-support, alpha=0.00, f1_macro=0.5320
[CV 2/10] method=ratio-support, alpha=0.00, f1_macro=0.5853
[CV 3/10] method=ratio-support, alpha=0.00, f1_macro=0.6210
[CV 4/10] method=ratio-support, alpha=0.00, f1_macro=0.5853
[CV 5/10] method=ratio-support, alpha=0.00, f1_macro=0.5853
[CV 6/10] method=ratio-support, alpha=0.00, f1_macro=0.6314
[CV 7/10] method=ratio-support, alpha=0.00, f1_macro=0.5437
[CV 8/10] method=ratio-support, alpha=0.00, f1_macro=0.4868
[CV 9/10] method=ratio-support, alpha=0.00, f1_macro=0.5984
[CV 10/10] method=ratio-support, alpha=0.00, f1_macro=0.5984
f1_mean = 0.5767

[CV 1/10] method=ratio-support, alpha=0.50, f1_macro=0.5676
[CV 2/10] method=ratio-support, alpha=0.50, f1_macro=0.6083
[CV 3/10] method=ratio-support, alpha=0.50, f1_macro=0.6948
[CV 4/10] method=ratio-support, alpha=0.50, f1_macro=0.6759
[CV 5/10] method=ratio-support, alpha=0.50, f1_macro=0.6210
[CV 6/10] method=ratio-support, alpha=0.50, f1_macro=0.7063
[CV 7/10] method=rati

[CV 4/10] method=ratio-support, alpha=6.50, f1_macro=0.4471
[CV 5/10] method=ratio-support, alpha=6.50, f1_macro=0.4471
[CV 6/10] method=ratio-support, alpha=6.50, f1_macro=0.4268
[CV 7/10] method=ratio-support, alpha=6.50, f1_macro=0.4458
[CV 8/10] method=ratio-support, alpha=6.50, f1_macro=0.4458
[CV 9/10] method=ratio-support, alpha=6.50, f1_macro=0.4524
[CV 10/10] method=ratio-support, alpha=6.50, f1_macro=0.5512
f1_mean = 0.4605

[CV 1/10] method=ratio-support, alpha=7.00, f1_macro=0.5367
[CV 2/10] method=ratio-support, alpha=7.00, f1_macro=0.4268
[CV 3/10] method=ratio-support, alpha=7.00, f1_macro=0.4471
[CV 4/10] method=ratio-support, alpha=7.00, f1_macro=0.4471
[CV 5/10] method=ratio-support, alpha=7.00, f1_macro=0.4471
[CV 6/10] method=ratio-support, alpha=7.00, f1_macro=0.4268
[CV 7/10] method=ratio-support, alpha=7.00, f1_macro=0.4458
[CV 8/10] method=ratio-support, alpha=7.00, f1_macro=0.4458
[CV 9/10] method=ratio-support, alpha=7.00, f1_macro=0.4524
[CV 10/10] method=rat

In [38]:
f1 = [0 for i in range(n)]
accuracy = [0 for i in range(n)]
f1_binary = [0 for i in range(n)]

for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
    bin_cls = fcalc.classifier.PatternBinaryClassifier(
        X.iloc[train_index].values, 
        y.iloc[train_index].to_numpy(),
        method=method_best_p,
        alpha=alpha_best_p
    )
    bin_cls.predict(X.iloc[test_index].values)
    
    accuracy[i], f1[i], f1_binary[i] = results_fca(y.iloc[test_index], bin_cls.predictions)
    
    print(f"[CV {i+1}/{n}] method={method_best_p}, alpha={alpha_best_p:0.2f}", 
          f"accuracy={accuracy[i]:0.4f}, f1_macro={f1[i]:0.4f}, f1_binary={f1_binary[i]:0.4f}")

f1_mean = np.mean(f1)
accuracy_mean = np.mean(accuracy)
f1_binary_mean = np.mean(f1_binary)

best_metrics['PatternBinaryClassifier'] = {'Accuracy': accuracy_mean, 'F1_binary': f1_binary_mean, 'F1_macro': f1_mean}
print(f"\n accuracy={accuracy_mean:0.4f}, f1_binary={f1_binary_mean:0.4f}, f1_macro={f1_mean:0.4f}")

[CV 1/10] method=standard, alpha=0.00 accuracy=0.9574, f1_macro=0.9313, f1_binary=0.9737
[CV 2/10] method=standard, alpha=0.00 accuracy=0.9149, f1_macro=0.8626, f1_binary=0.9474
[CV 3/10] method=standard, alpha=0.00 accuracy=0.9787, f1_macro=0.9641, f1_binary=0.9870
[CV 4/10] method=standard, alpha=0.00 accuracy=0.9574, f1_macro=0.9247, f1_binary=0.9744
[CV 5/10] method=standard, alpha=0.00 accuracy=0.9574, f1_macro=0.9161, f1_binary=0.9750
[CV 6/10] method=standard, alpha=0.00 accuracy=0.9362, f1_macro=0.9080, f1_binary=0.9589
[CV 7/10] method=standard, alpha=0.00 accuracy=0.9130, f1_macro=0.8315, f1_binary=0.9487
[CV 8/10] method=standard, alpha=0.00 accuracy=0.9130, f1_macro=0.8315, f1_binary=0.9487
[CV 9/10] method=standard, alpha=0.00 accuracy=0.9348, f1_macro=0.8656, f1_binary=0.9620
[CV 10/10] method=standard, alpha=0.00 accuracy=0.8913, f1_macro=0.7760, f1_binary=0.9367

 accuracy=0.9354, f1_macro=0.8811, f1_binary=0.9612


# Overall

In [39]:
for i in best_parameters:
    print(i)
    print(best_parameters[i])
    print()

LogisticRegression
{'C': 1.0}

KNeighborsClassifier
{'n_neighbors': 12}

MultinomialNB
{'alpha': 0.501}

GaussianNB
{'var_smoothing': 0.1873817422860384}

ComplementNB
{'alpha': 60.801}

DecisionTreeClassifier
{'criterion': 'gini', 'max_depth': 6, 'min_samples_split': 10}

RandomForestClassifier
{'criterion': 'entropy', 'max_depth': 6, 'min_samples_split': 4, 'n_estimators': 70}

BinarizedBinaryClassifier
{'method': 'standard', 'alpha': 0.15000000000000002}

PatternBinaryClassifier
{'method': 'standard', 'alpha': 0.0}



In [40]:
for i in best_metrics:
    print(i)
    print(best_metrics[i])
    print()

LogisticRegression
{'Accuracy': 0.9633, 'F1_binary': 0.9784, 'F1_macro': 0.9281}

KNeighborsClassifier
{'Accuracy': 0.9376, 'F1_binary': 0.9637, 'F1_macro': 0.8679}

MultinomialNB
{'Accuracy': 0.8668, 'F1_binary': 0.9255, 'F1_macro': 0.6488}

GaussianNB
{'Accuracy': 0.9634, 'F1_binary': 0.9782, 'F1_macro': 0.9313}

ComplementNB
{'Accuracy': 0.9269, 'F1_binary': 0.9576, 'F1_macro': 0.843}

DecisionTreeClassifier
{'Accuracy': 0.9505, 'F1_binary': 0.9706, 'F1_macro': 0.9048}

RandomForestClassifier
{'Accuracy': 0.9548, 'F1_binary': 0.9736, 'F1_macro': 0.907}

BinarizedBinaryClassifier
{'Accuracy': 0.8432469935245143, 'F1_binary': 0.9133869933515575, 'F1_macro': 0.5422490522313342}

PatternBinaryClassifier
{'Accuracy': 0.9354301572617946, 'F1_binary': 0.9612498777191038, 'F1_macro': 0.8811407220959232}



# Extra: Expandad table for BinarizedBinaryClassifier

In [49]:
X_expanded = ~X
anticolumns = X_expanded.columns.to_list()
for i in range(len(anticolumns)):
    anticolumns[i] = 'NOT_' + anticolumns[i]
X_expanded.columns = anticolumns
X_expanded = pd.concat([X, X_expanded],axis=1)
X_expanded.head()

Unnamed: 0,Breathing Problem,Fever,Dry Cough,Sore throat,Running Nose,Asthma,Chronic Lung Disease,Headache,Heart Disease,Diabetes,...,NOT_Hyper Tension,NOT_Fatigue,NOT_Gastrointestinal,NOT_Abroad travel,NOT_Contact with COVID Patient,NOT_Attended Large Gathering,NOT_Visited Public Exposed Places,NOT_Family working in Public Exposed Places,NOT_Wearing Masks,NOT_Sanitization from Market
0,True,True,True,True,True,False,False,False,False,True,...,False,False,False,True,False,True,False,False,True,True
1,True,True,True,True,False,True,True,True,False,False,...,True,False,True,True,True,False,False,True,True,True
2,True,True,True,True,True,True,True,True,False,True,...,True,False,False,False,True,True,True,True,True,True
3,True,True,True,False,False,True,False,False,True,True,...,True,True,True,False,True,False,False,True,True,True
4,True,True,True,True,True,False,True,True,True,True,...,False,True,False,True,False,True,False,True,True,True


In [50]:
parameters = {'alpha' : np.linspace(0, 1, 21),
              'method': ['standard','standard-support','ratio-support'],
             }

n = kfold.get_n_splits(X_expanded)

f1_best_e = 0
alpha_best_e = 0.
method_best_e = 'standard'

for method in parameters['method']:
    for alpha in parameters['alpha']:
        
        f1 = [0 for i in range(n)]
        for i, (train_index, test_index) in enumerate(kfold.split(X_expanded, y)):
            bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
                X_expanded.iloc[train_index].values, 
                y.iloc[train_index].to_numpy(),
                method=method,
                alpha=alpha
            )
            bin_cls.predict(X_expanded.iloc[test_index].values)
            f1[i] = f1_score(y.iloc[test_index], bin_cls.predictions, average='macro')
            print(f"[CV {i+1}/{n}] method={method}, alpha={alpha:0.2f}, f1_macro={f1[i]:0.4f}") 
                    
        f1_mean = np.mean(f1)
        print(f'f1_mean = {f1_mean:0.4f}\n')
        if(f1_mean > f1_best_e):
            f1_best_e = f1_mean
            alpha_best_e = alpha
            method_best_e = method

best_parameters_expanded = {'method': method_best_e, 'alpha': alpha_best_e}             
print(f"f1_best={f1_best_e:0.4f}, method={method_best_e}, alpha={alpha_best_e:0.2f}")

[CV 1/10] method=standard, alpha=0.00, f1_macro=0.9313
[CV 2/10] method=standard, alpha=0.00, f1_macro=0.8626
[CV 3/10] method=standard, alpha=0.00, f1_macro=0.9641
[CV 4/10] method=standard, alpha=0.00, f1_macro=0.9247
[CV 5/10] method=standard, alpha=0.00, f1_macro=0.9161
[CV 6/10] method=standard, alpha=0.00, f1_macro=0.9080
[CV 7/10] method=standard, alpha=0.00, f1_macro=0.8315
[CV 8/10] method=standard, alpha=0.00, f1_macro=0.8315
[CV 9/10] method=standard, alpha=0.00, f1_macro=0.8656
[CV 10/10] method=standard, alpha=0.00, f1_macro=0.7760
f1_mean = 0.8811

[CV 1/10] method=standard, alpha=0.05, f1_macro=0.4071
[CV 2/10] method=standard, alpha=0.05, f1_macro=0.3681
[CV 3/10] method=standard, alpha=0.05, f1_macro=0.4861
[CV 4/10] method=standard, alpha=0.05, f1_macro=0.4703
[CV 5/10] method=standard, alpha=0.05, f1_macro=0.6390
[CV 6/10] method=standard, alpha=0.05, f1_macro=0.4581
[CV 7/10] method=standard, alpha=0.05, f1_macro=0.3677
[CV 8/10] method=standard, alpha=0.05, f1_macr

[CV 6/10] method=standard, alpha=0.70, f1_macro=0.0000
[CV 7/10] method=standard, alpha=0.70, f1_macro=0.0000
[CV 8/10] method=standard, alpha=0.70, f1_macro=0.0000
[CV 9/10] method=standard, alpha=0.70, f1_macro=0.0000
[CV 10/10] method=standard, alpha=0.70, f1_macro=0.0000
f1_mean = 0.0000

[CV 1/10] method=standard, alpha=0.75, f1_macro=0.0000
[CV 2/10] method=standard, alpha=0.75, f1_macro=0.0000
[CV 3/10] method=standard, alpha=0.75, f1_macro=0.0000
[CV 4/10] method=standard, alpha=0.75, f1_macro=0.0000
[CV 5/10] method=standard, alpha=0.75, f1_macro=0.0000
[CV 6/10] method=standard, alpha=0.75, f1_macro=0.0000
[CV 7/10] method=standard, alpha=0.75, f1_macro=0.0000
[CV 8/10] method=standard, alpha=0.75, f1_macro=0.0000
[CV 9/10] method=standard, alpha=0.75, f1_macro=0.0000
[CV 10/10] method=standard, alpha=0.75, f1_macro=0.0000
f1_mean = 0.0000

[CV 1/10] method=standard, alpha=0.80, f1_macro=0.0000
[CV 2/10] method=standard, alpha=0.80, f1_macro=0.0000
[CV 3/10] method=standard, 

[CV 1/10] method=standard-support, alpha=0.35, f1_macro=0.1455
[CV 2/10] method=standard-support, alpha=0.35, f1_macro=0.1455
[CV 3/10] method=standard-support, alpha=0.35, f1_macro=0.1455
[CV 4/10] method=standard-support, alpha=0.35, f1_macro=0.1455
[CV 5/10] method=standard-support, alpha=0.35, f1_macro=0.1455
[CV 6/10] method=standard-support, alpha=0.35, f1_macro=0.1607
[CV 7/10] method=standard-support, alpha=0.35, f1_macro=0.1481
[CV 8/10] method=standard-support, alpha=0.35, f1_macro=0.1481
[CV 9/10] method=standard-support, alpha=0.35, f1_macro=0.1481
[CV 10/10] method=standard-support, alpha=0.35, f1_macro=0.1481
f1_mean = 0.1481

[CV 1/10] method=standard-support, alpha=0.40, f1_macro=0.1455
[CV 2/10] method=standard-support, alpha=0.40, f1_macro=0.1455
[CV 3/10] method=standard-support, alpha=0.40, f1_macro=0.1455
[CV 4/10] method=standard-support, alpha=0.40, f1_macro=0.1455
[CV 5/10] method=standard-support, alpha=0.40, f1_macro=0.1455
[CV 6/10] method=standard-support, a

[CV 9/10] method=standard-support, alpha=0.95, f1_macro=0.1481
[CV 10/10] method=standard-support, alpha=0.95, f1_macro=0.1481
f1_mean = 0.1481

[CV 1/10] method=standard-support, alpha=1.00, f1_macro=0.1455
[CV 2/10] method=standard-support, alpha=1.00, f1_macro=0.1455
[CV 3/10] method=standard-support, alpha=1.00, f1_macro=0.1455
[CV 4/10] method=standard-support, alpha=1.00, f1_macro=0.1455
[CV 5/10] method=standard-support, alpha=1.00, f1_macro=0.1455
[CV 6/10] method=standard-support, alpha=1.00, f1_macro=0.1607
[CV 7/10] method=standard-support, alpha=1.00, f1_macro=0.1481
[CV 8/10] method=standard-support, alpha=1.00, f1_macro=0.1481
[CV 9/10] method=standard-support, alpha=1.00, f1_macro=0.1481
[CV 10/10] method=standard-support, alpha=1.00, f1_macro=0.1481
f1_mean = 0.1481

[CV 1/10] method=ratio-support, alpha=0.00, f1_macro=0.5320
[CV 2/10] method=ratio-support, alpha=0.00, f1_macro=0.5853
[CV 3/10] method=ratio-support, alpha=0.00, f1_macro=0.6210
[CV 4/10] method=ratio-sup

[CV 1/10] method=ratio-support, alpha=0.60, f1_macro=0.5676
[CV 2/10] method=ratio-support, alpha=0.60, f1_macro=0.6083
[CV 3/10] method=ratio-support, alpha=0.60, f1_macro=0.6948
[CV 4/10] method=ratio-support, alpha=0.60, f1_macro=0.6759
[CV 5/10] method=ratio-support, alpha=0.60, f1_macro=0.6573
[CV 6/10] method=ratio-support, alpha=0.60, f1_macro=0.6871
[CV 7/10] method=ratio-support, alpha=0.60, f1_macro=0.5865
[CV 8/10] method=ratio-support, alpha=0.60, f1_macro=0.5881
[CV 9/10] method=ratio-support, alpha=0.60, f1_macro=0.6351
[CV 10/10] method=ratio-support, alpha=0.60, f1_macro=0.6538
f1_mean = 0.6354

[CV 1/10] method=ratio-support, alpha=0.65, f1_macro=0.5853
[CV 2/10] method=ratio-support, alpha=0.65, f1_macro=0.6439
[CV 3/10] method=ratio-support, alpha=0.65, f1_macro=0.7548
[CV 4/10] method=ratio-support, alpha=0.65, f1_macro=0.6622
[CV 5/10] method=ratio-support, alpha=0.65, f1_macro=0.6573
[CV 6/10] method=ratio-support, alpha=0.65, f1_macro=0.7063
[CV 7/10] method=rati

In [51]:
parameters = {'alpha' : np.linspace(1, 10, 19),
              'method': ['ratio-support'],
             }

n = kfold.get_n_splits(X_expanded)

for method in parameters['method']:
    for alpha in parameters['alpha']:
        
        f1 = [0 for i in range(n)]
        for i, (train_index, test_index) in enumerate(kfold.split(X_expanded, y)):
            bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
                X_expanded.iloc[train_index].values, 
                y.iloc[train_index].to_numpy(),
                method=method,
                alpha=alpha
            )
            bin_cls.predict(X_expanded.iloc[test_index].values)
            f1[i] = f1_score(y.iloc[test_index], bin_cls.predictions, average='macro')
            print(f"[CV {i+1}/{n}] method={method}, alpha={alpha:0.2f}, f1_macro={f1[i]:0.4f}") 
                    
        f1_mean = np.mean(f1)
        print(f'f1_mean = {f1_mean:0.4f}\n')
        if(f1_mean > f1_best_e):
            f1_best_e = f1_mean
            alpha_best_e = alpha
            method_best_e = method

best_parameters_expanded = {'method': method_best_e, 'alpha': alpha_best_e}            
print(f"f1_best={f1_best_e:0.4f}, method={method_best_e}, alpha={alpha_best_e:0.2f}")

[CV 1/10] method=ratio-support, alpha=1.00, f1_macro=0.6210
[CV 2/10] method=ratio-support, alpha=1.00, f1_macro=0.6439
[CV 3/10] method=ratio-support, alpha=1.00, f1_macro=0.8220
[CV 4/10] method=ratio-support, alpha=1.00, f1_macro=0.7202
[CV 5/10] method=ratio-support, alpha=1.00, f1_macro=0.6759
[CV 6/10] method=ratio-support, alpha=1.00, f1_macro=0.7457
[CV 7/10] method=ratio-support, alpha=1.00, f1_macro=0.6593
[CV 8/10] method=ratio-support, alpha=1.00, f1_macro=0.6054
[CV 9/10] method=ratio-support, alpha=1.00, f1_macro=0.6593
[CV 10/10] method=ratio-support, alpha=1.00, f1_macro=0.6979
f1_mean = 0.6850

[CV 1/10] method=ratio-support, alpha=1.50, f1_macro=0.7762
[CV 2/10] method=ratio-support, alpha=1.50, f1_macro=0.7032
[CV 3/10] method=ratio-support, alpha=1.50, f1_macro=0.6987
[CV 4/10] method=ratio-support, alpha=1.50, f1_macro=0.8095
[CV 5/10] method=ratio-support, alpha=1.50, f1_macro=0.6458
[CV 6/10] method=ratio-support, alpha=1.50, f1_macro=0.8321
[CV 7/10] method=rati

[CV 4/10] method=ratio-support, alpha=7.50, f1_macro=0.5524
[CV 5/10] method=ratio-support, alpha=7.50, f1_macro=0.4535
[CV 6/10] method=ratio-support, alpha=7.50, f1_macro=0.4268
[CV 7/10] method=ratio-support, alpha=7.50, f1_macro=0.4458
[CV 8/10] method=ratio-support, alpha=7.50, f1_macro=0.4458
[CV 9/10] method=ratio-support, alpha=7.50, f1_macro=0.4524
[CV 10/10] method=ratio-support, alpha=7.50, f1_macro=0.4458
f1_mean = 0.4633

[CV 1/10] method=ratio-support, alpha=8.00, f1_macro=0.5367
[CV 2/10] method=ratio-support, alpha=8.00, f1_macro=0.4268
[CV 3/10] method=ratio-support, alpha=8.00, f1_macro=0.4471
[CV 4/10] method=ratio-support, alpha=8.00, f1_macro=0.4471
[CV 5/10] method=ratio-support, alpha=8.00, f1_macro=0.4535
[CV 6/10] method=ratio-support, alpha=8.00, f1_macro=0.4268
[CV 7/10] method=ratio-support, alpha=8.00, f1_macro=0.4458
[CV 8/10] method=ratio-support, alpha=8.00, f1_macro=0.4458
[CV 9/10] method=ratio-support, alpha=8.00, f1_macro=0.4524
[CV 10/10] method=rat

In [52]:
f1 = [0 for i in range(n)]
accuracy = [0 for i in range(n)]
f1_binary = [0 for i in range(n)]

for i, (train_index, test_index) in enumerate(kfold.split(X_expanded, y)):
    bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
        X_expanded.iloc[train_index].values, 
        y.iloc[train_index].to_numpy(),
        method=method_best_e,
        alpha=alpha_best_e
    )
    bin_cls.predict(X_expanded.iloc[test_index].values)
    
    accuracy[i], f1[i], f1_binary[i] = results_fca(y.iloc[test_index], bin_cls.predictions)
    
    print(f"[CV {i+1}/{n}] method={method_best_e}, alpha={alpha_best_e:0.2f}", 
          f"accuracy={accuracy[i]:0.4f}, f1_macro={f1[i]:0.4f}, f1_binary={f1_binary[i]:0.4f}")

f1_mean = np.mean(f1)
accuracy_mean = np.mean(accuracy)
f1_binary_mean = np.mean(f1_binary)

best_metrics_expanded = {'Accuracy': accuracy_mean, 'F1_binary': f1_binary_mean, 'F1_macro': f1_mean}
print(f"\n accuracy={accuracy_mean:0.4f}, f1_binary={f1_binary_mean:0.4f}, f1_macro={f1_mean:0.4f}")

[CV 1/10] method=standard, alpha=0.00 accuracy=0.9574, f1_macro=0.9313, f1_binary=0.9737
[CV 2/10] method=standard, alpha=0.00 accuracy=0.9149, f1_macro=0.8626, f1_binary=0.9474
[CV 3/10] method=standard, alpha=0.00 accuracy=0.9787, f1_macro=0.9641, f1_binary=0.9870
[CV 4/10] method=standard, alpha=0.00 accuracy=0.9574, f1_macro=0.9247, f1_binary=0.9744
[CV 5/10] method=standard, alpha=0.00 accuracy=0.9574, f1_macro=0.9161, f1_binary=0.9750
[CV 6/10] method=standard, alpha=0.00 accuracy=0.9362, f1_macro=0.9080, f1_binary=0.9589
[CV 7/10] method=standard, alpha=0.00 accuracy=0.9130, f1_macro=0.8315, f1_binary=0.9487
[CV 8/10] method=standard, alpha=0.00 accuracy=0.9130, f1_macro=0.8315, f1_binary=0.9487
[CV 9/10] method=standard, alpha=0.00 accuracy=0.9348, f1_macro=0.8656, f1_binary=0.9620
[CV 10/10] method=standard, alpha=0.00 accuracy=0.8913, f1_macro=0.7760, f1_binary=0.9367

 accuracy=0.9354, f1_binary=0.9612, f1_macro=0.8811


In [53]:
best_parameters_expanded

{'method': 'standard', 'alpha': 0.0}

In [54]:
best_metrics_expanded

{'Accuracy': 0.9354301572617946,
 'F1_binary': 0.9612498777191038,
 'F1_macro': 0.8811407220959232}