# Dataset: Israel's Covid-19 Dataset for Year 2020 & 2021
Link to a dataset: https://www.kaggle.com/datasets/mykeysid10/covid19-dataset-for-year-2020

In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import make_scorer, accuracy_score, f1_score

from sklearn import model_selection
from sklearn.model_selection import (
    StratifiedKFold,
    cross_validate,
    GridSearchCV,
)

Standard models used:

In [2]:
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import ComplementNB

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

**fcalc** library files are located in **FCALC** folder

**FCALC** folder is located in the same directory as this notebook.

In [3]:
import FCALC.fcalc as fcalc

# Loading data

In [4]:
data_isr = pd.read_csv('Datasets/covid_israel.csv', 
                       sep=',',
                       true_values=['Yes','yes', 'Positive'],
                       false_values=['No','no', 'Negative']
                      )
data_isr.dropna(inplace = True)
data_isr.drop_duplicates(inplace=True)

data_isr.head()

Unnamed: 0.1,Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,test_indication
0,0,0,0,0,0,0,False,True,Other
1,2,0,0,0,0,0,False,False,Other
2,218,0,0,0,0,0,True,False,Other
3,234,0,0,0,0,0,True,True,Other
4,239,0,0,0,0,0,True,False,Contact with confirmed


In [5]:
onehot = pd.get_dummies(data_isr.test_indication)
onehot.drop(['Other'], axis=1, inplace=True)

df = pd.concat([onehot, data_isr], axis=1)
df.drop(['test_indication'], axis=1, inplace=True)
df.reset_index(inplace=True, drop=True)
df = df.astype(bool)

# changing the order of columns
new_order =[
     'Abroad','Contact with confirmed',
     'cough', 'fever', 'sore_throat',
     'shortness_of_breath', 'head_ache', 
     'age_60_and_above', 'corona_result'
]
df = df[new_order]

df

Unnamed: 0,Abroad,Contact with confirmed,cough,fever,sore_throat,shortness_of_breath,head_ache,age_60_and_above,corona_result
0,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False,True,True
4,False,True,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...
361,True,False,False,True,False,True,True,True,True
362,True,False,True,True,False,True,True,True,False
363,True,False,False,False,True,True,True,False,True
364,True,False,False,True,True,True,True,False,True


In [6]:
df.shape

(366, 9)

In [7]:
df.corona_result.value_counts(normalize=True)

corona_result
False    0.5
True     0.5
Name: proportion, dtype: float64

In [8]:
features = df.columns[0:-1]
target = df.columns[-1]

In [9]:
X = df[features]
y = df[target]

# Testing models

In [10]:
best_parameters = { 
    'LogisticRegression': [],
    'KNeighborsClassifier': [],
    'MultinomialNB': [],
    'GaussianNB': [],
    'ComplementNB': [],
    'DecisionTreeClassifier': [],
    'RandomForestClassifier': [],
    'BinarizedBinaryClassifier': [],
    'PatternBinaryClassifier': [],
}

In [11]:
best_metrics = {
    'LogisticRegression': [],
    'KNeighborsClassifier': [],
    'MultinomialNB': [],
    'GaussianNB': [],
    'ComplementNB': [],
    'DecisionTreeClassifier': [],
    'RandomForestClassifier': [],
    'BinarizedBinaryClassifier': [],
    'PatternBinaryClassifier': [],  
}

In [12]:
scoring = {'accuracy' : make_scorer(accuracy_score),
           'f1_macro' : make_scorer(f1_score, average='macro'),
           'f1_binary' : make_scorer(f1_score),
          }

kfold = StratifiedKFold(n_splits=10, random_state=49, shuffle=True)

In [13]:
def count_metrics(results):
    acc = np.round(np.mean(results['test_accuracy']), 4) 
    f1_m = np.round(np.mean(results['test_f1_macro']), 4)
    f1_b = np.round(np.mean(results['test_f1_binary']), 4)
    return(acc, f1_m, f1_b)  

In [14]:
def print_results(results):
    acc, f1_m, f1_b = count_metrics(results)
    print(f'Accuracy = {acc:0.4f}, F1_binary = {f1_b:0.4f}, F1_macro = {f1_m:0.4f}')  

In [15]:
def fill_best_metrics(results, method):
    acc, f1_m, f1_b = count_metrics(results)
    best_metrics[method] = {'Accuracy': acc, 'F1_binary': f1_b, 'F1_macro': f1_m}

# Standard models

### Logistic regression

In [16]:
parameters = {'C' : np.linspace(1,5,101)}

model = LogisticRegression()

best = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    cv=kfold,
                    verbose=1, 
                    scoring=scoring,
                    refit='f1_binary'
                   )
best.fit(X, y)
best_parameters['LogisticRegression'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 101 candidates, totalling 1010 fits


{'C': 1.88}

In [17]:
model = LogisticRegression(C=best.best_params_['C'])
results = cross_validate(estimator=model,
                         X=X, y=y,
                         cv=kfold,
                         scoring=scoring
                         )
fill_best_metrics(results, 'LogisticRegression')
print_results(results)

Accuracy = 0.3629, F1_binary = 0.3485, F1_macro = 0.3595


# K-Nearest Neighbours

In [18]:
parameters = {'n_neighbors' : range(5,78,4)}

model = KNeighborsClassifier()

best = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    cv=kfold,
                    verbose=1, 
                    scoring=scoring,
                    refit='f1_binary'
                   )
best.fit(X, y)
best_parameters['KNeighborsClassifier'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 19 candidates, totalling 190 fits


{'n_neighbors': 45}

In [19]:
model = KNeighborsClassifier(n_neighbors=best.best_params_['n_neighbors'])
results = cross_validate(estimator=model,
                         X=X,
                         y=y,
                         cv=kfold,
                         scoring=scoring
                         )
fill_best_metrics(results, 'KNeighborsClassifier')
print_results(results)

Accuracy = 0.4751, F1_binary = 0.5811, F1_macro = 0.4367


# Naive Bayes

##### MULTINOMIAL NB

In [20]:
parameters = {'alpha' : np.linspace(0.001,100.001,1001)}

model = MultinomialNB()

best = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    cv=kfold,
                    verbose=1, 
                    scoring=scoring,
                    refit='f1_binary'
                   )
best.fit(X, y)
best_parameters['MultinomialNB'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 1001 candidates, totalling 10010 fits


{'alpha': 26.901000000000003}

In [21]:
model = MultinomialNB(alpha=best.best_params_['alpha'])

results = cross_validate(estimator=model,
                         X=X,
                         y=y,
                         cv=kfold,
                         scoring=scoring
                         )
fill_best_metrics(results, 'MultinomialNB')
print_results(results)

Accuracy = 0.3631, F1_binary = 0.3545, F1_macro = 0.3564


##### GAUSSIAN NB

In [22]:
parameters = {'var_smoothing': np.logspace(0,-9, num=100)}

model = GaussianNB()

best = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    cv=kfold,
                    verbose=1, 
                    scoring=scoring,
                    refit='f1_binary'
                   )
best.fit(X, y)
best_parameters['GaussianNB'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


{'var_smoothing': 1.0}

In [23]:
model = GaussianNB(var_smoothing=best.best_params_['var_smoothing'])

results = cross_validate(estimator=model,
                         X=X,
                         y=y,
                         cv=kfold,
                         scoring=scoring
                         )
fill_best_metrics(results, 'GaussianNB')
print_results(results)

Accuracy = 0.3770, F1_binary = 0.3785, F1_macro = 0.3738


##### COMPLEMENT NB

In [24]:
parameters = {'alpha' : np.linspace(0.001,100.001,1001)}

model =  ComplementNB()

best = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    cv=kfold,
                    verbose=1, 
                    scoring=scoring,
                    refit='f1_binary'
                   )
best.fit(X, y)
best_parameters['ComplementNB'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 1001 candidates, totalling 10010 fits


{'alpha': 91.001}

In [25]:
model = ComplementNB(alpha=best.best_params_['alpha'])
results = cross_validate(estimator=model,
                         X=X,
                         y=y,
                         cv=kfold,
                         scoring=scoring
                         )
fill_best_metrics(results, 'ComplementNB')
print_results(results) 

Accuracy = 0.3577, F1_binary = 0.3562, F1_macro = 0.3542


### Decision tree

In [26]:
parameters = {'min_samples_split' : range(2,23,2),
              'max_depth' : range(2,21,2),
              'criterion' : ['gini', 'entropy']
             }

model = DecisionTreeClassifier(random_state=49)

best = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    cv=kfold,
                    verbose=1, 
                    scoring=scoring,
                    refit='f1_binary'
                   )
best.fit(X, y)
best_parameters['DecisionTreeClassifier'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 220 candidates, totalling 2200 fits


{'criterion': 'gini', 'max_depth': 2, 'min_samples_split': 2}

In [27]:
model = DecisionTreeClassifier(
    max_depth=best.best_params_['max_depth'], 
    min_samples_split=best.best_params_['min_samples_split'],
    criterion=best.best_params_['criterion'],
    random_state=49
)
results = cross_validate(estimator=model,
                         X=X,
                         y=y,
                         cv=kfold,
                         scoring=scoring
                         )
fill_best_metrics(results, 'DecisionTreeClassifier')
print_results(results)

Accuracy = 0.3493, F1_binary = 0.2666, F1_macro = 0.3343


### RandomForest

In [28]:
parameters = {'n_estimators' : range(40,121,10),
              'min_samples_split' : range(2,13,2),
              'max_depth' : range(2,21,2),
              'criterion' : ['gini', 'entropy']
             }
model = RandomForestClassifier(random_state=49)

best = GridSearchCV(estimator=model, 
                    param_grid=parameters, 
                    cv=kfold,
                    verbose=1, 
                    scoring=scoring,
                    refit='f1_binary'
                   )
best.fit(X, y)
best_parameters['RandomForestClassifier'] = best.best_params_
best.best_params_

Fitting 10 folds for each of 1080 candidates, totalling 10800 fits


{'criterion': 'gini',
 'max_depth': 2,
 'min_samples_split': 2,
 'n_estimators': 70}

In [29]:
model = RandomForestClassifier(
    n_estimators=best.best_params_['n_estimators'],
    min_samples_split=best.best_params_['min_samples_split'],
    max_depth=best.best_params_['max_depth'],
    criterion=best.best_params_['criterion'],
    random_state=49
)
results = cross_validate(estimator=model,
                         X=X,
                         y=y,
                         cv=kfold,
                         scoring=scoring
                         )
fill_best_metrics(results, 'RandomForestClassifier')
print_results(results)

Accuracy = 0.3633, F1_binary = 0.3834, F1_macro = 0.3533


# Lazy FCA

Due to the multilabel output of **BinarizedBinaryClassifier** (**1** for **True**, **0** for **false** and **-1** for **undefined**) we cannot utilize **f1_score** with **average='binary'**. Therefore, a function that interprets **undefined** as misclassification was implemented.

In [30]:
def compare_with_binary_f1_old(y_true, y_pred):
    y_tmp = np.concatenate(
        (np.array(y_true)[:,None],np.array(y_pred)[:,None]),
        axis=1
    )
    df_tmp = pd.DataFrame(y_tmp, columns=['y_true','y_pred'])
    df_tmp.y_true = df_tmp.y_true.astype(bool)
    df_tmp['y_new']= ~df_tmp.loc[df_tmp.y_pred==-1]['y_true']
    df_tmp.loc[df_tmp.y_pred!=-1, 'y_new'] = df_tmp.loc[df_tmp.y_pred!=-1,'y_pred'].astype(bool) 
    df_tmp.y_new = df_tmp.y_new.astype(bool)
    return(f1_score(df_tmp.y_true, df_tmp.y_new))

In [31]:
def results_fca(y_true, y_pred):
    f1 = f1_score(y_true, y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)
    f1_binary = compare_with_binary_f1(y_true, y_pred)
    return (accuracy, f1, f1_binary)  

In [32]:
def compare_with_binary_f1(y_true, y_pred):
    y_tmp = np.copy(y_pred)
    undef = y_pred == -1
    y_tmp[undef] = (y_true[undef] - np.ones(shape=y_tmp[undef].shape) * 2) // -2
    return (f1_score(y_true, y_tmp))

(0 - 2) // (-2) = 1

(1 - 2) // (-2) = 0

## BinarizedBinaryClassifier

**BinarizedBinaryClassifier** is not a **scikit** model, so **GridSearchCV** and **cross_validate** from **scikit** do not support it. Thus, we check desired parameters in **for**  loops.

In [33]:
parameters = {'alpha' : np.linspace(0, 1, 21),
              'method': ['standard','standard-support','ratio-support'],
             }

n = kfold.get_n_splits(X)

f1_best = 0
alpha_best = 0.
method_best = 'standard'

for method in parameters['method']:
    for alpha in parameters['alpha']:
        
        f1 = [0 for i in range(n)]
        for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
            bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
                X.iloc[train_index].values, 
                y.iloc[train_index].to_numpy(),
                method=method,
                alpha=alpha
            )
            bin_cls.predict(X.iloc[test_index].values)
            f1[i] = f1_score(y.iloc[test_index], bin_cls.predictions, average='macro')
            print(f"[CV {i+1}/{n}] method={method}, alpha={alpha:0.2f}, f1_macro={f1[i]:0.4f}") 
                    
        f1_mean = np.mean(f1)
        print(f'f1_mean = {f1_mean:0.4f}\n')
        if(f1_mean > f1_best):
            f1_best = f1_mean
            alpha_best = alpha
            method_best = method

best_parameters['BinarizedBinaryClassifier'] = {'method': method_best, 'alpha': alpha_best}              
print(f"f1_best={f1_best:0.4f}, method={method_best}, alpha={alpha_best:0.2f}")

[CV 1/10] method=standard, alpha=0.00, f1_macro=0.0000
[CV 2/10] method=standard, alpha=0.00, f1_macro=0.0000
[CV 3/10] method=standard, alpha=0.00, f1_macro=0.0000
[CV 4/10] method=standard, alpha=0.00, f1_macro=0.0533
[CV 5/10] method=standard, alpha=0.00, f1_macro=0.0000
[CV 6/10] method=standard, alpha=0.00, f1_macro=0.0000
[CV 7/10] method=standard, alpha=0.00, f1_macro=0.0000
[CV 8/10] method=standard, alpha=0.00, f1_macro=0.0000
[CV 9/10] method=standard, alpha=0.00, f1_macro=0.0000
[CV 10/10] method=standard, alpha=0.00, f1_macro=0.0303
f1_mean = 0.0084

[CV 1/10] method=standard, alpha=0.05, f1_macro=0.1127
[CV 2/10] method=standard, alpha=0.05, f1_macro=0.1160
[CV 3/10] method=standard, alpha=0.05, f1_macro=0.2013
[CV 4/10] method=standard, alpha=0.05, f1_macro=0.1296
[CV 5/10] method=standard, alpha=0.05, f1_macro=0.0952
[CV 6/10] method=standard, alpha=0.05, f1_macro=0.1119
[CV 7/10] method=standard, alpha=0.05, f1_macro=0.1010
[CV 8/10] method=standard, alpha=0.05, f1_macr

[CV 8/10] method=standard, alpha=0.70, f1_macro=0.2292
[CV 9/10] method=standard, alpha=0.70, f1_macro=0.2113
[CV 10/10] method=standard, alpha=0.70, f1_macro=0.2851
f1_mean = 0.3451

[CV 1/10] method=standard, alpha=0.75, f1_macro=0.3079
[CV 2/10] method=standard, alpha=0.75, f1_macro=0.5281
[CV 3/10] method=standard, alpha=0.75, f1_macro=0.5180
[CV 4/10] method=standard, alpha=0.75, f1_macro=0.4559
[CV 5/10] method=standard, alpha=0.75, f1_macro=0.3833
[CV 6/10] method=standard, alpha=0.75, f1_macro=0.3479
[CV 7/10] method=standard, alpha=0.75, f1_macro=0.1844
[CV 8/10] method=standard, alpha=0.75, f1_macro=0.2292
[CV 9/10] method=standard, alpha=0.75, f1_macro=0.2113
[CV 10/10] method=standard, alpha=0.75, f1_macro=0.2851
f1_mean = 0.3451

[CV 1/10] method=standard, alpha=0.80, f1_macro=0.3079
[CV 2/10] method=standard, alpha=0.80, f1_macro=0.5281
[CV 3/10] method=standard, alpha=0.80, f1_macro=0.5180
[CV 4/10] method=standard, alpha=0.80, f1_macro=0.4559
[CV 5/10] method=standard, 

[CV 6/10] method=standard-support, alpha=0.35, f1_macro=0.2681
[CV 7/10] method=standard-support, alpha=0.35, f1_macro=0.2500
[CV 8/10] method=standard-support, alpha=0.35, f1_macro=0.3566
[CV 9/10] method=standard-support, alpha=0.35, f1_macro=0.1656
[CV 10/10] method=standard-support, alpha=0.35, f1_macro=0.1961
f1_mean = 0.2428

[CV 1/10] method=standard-support, alpha=0.40, f1_macro=0.2160
[CV 2/10] method=standard-support, alpha=0.40, f1_macro=0.3478
[CV 3/10] method=standard-support, alpha=0.40, f1_macro=0.2771
[CV 4/10] method=standard-support, alpha=0.40, f1_macro=0.2278
[CV 5/10] method=standard-support, alpha=0.40, f1_macro=0.1417
[CV 6/10] method=standard-support, alpha=0.40, f1_macro=0.2681
[CV 7/10] method=standard-support, alpha=0.40, f1_macro=0.2611
[CV 8/10] method=standard-support, alpha=0.40, f1_macro=0.3566
[CV 9/10] method=standard-support, alpha=0.40, f1_macro=0.1497
[CV 10/10] method=standard-support, alpha=0.40, f1_macro=0.1961
f1_mean = 0.2442

[CV 1/10] method=

[CV 8/10] method=standard-support, alpha=1.00, f1_macro=0.3566
[CV 9/10] method=standard-support, alpha=1.00, f1_macro=0.4286
[CV 10/10] method=standard-support, alpha=1.00, f1_macro=0.3050
f1_mean = 0.3823

[CV 1/10] method=ratio-support, alpha=0.00, f1_macro=0.2885
[CV 2/10] method=ratio-support, alpha=0.00, f1_macro=0.3478
[CV 3/10] method=ratio-support, alpha=0.00, f1_macro=0.3217
[CV 4/10] method=ratio-support, alpha=0.00, f1_macro=0.3273
[CV 5/10] method=ratio-support, alpha=0.00, f1_macro=0.3223
[CV 6/10] method=ratio-support, alpha=0.00, f1_macro=0.3471
[CV 7/10] method=ratio-support, alpha=0.00, f1_macro=0.3566
[CV 8/10] method=ratio-support, alpha=0.00, f1_macro=0.2340
[CV 9/10] method=ratio-support, alpha=0.00, f1_macro=0.3455
[CV 10/10] method=ratio-support, alpha=0.00, f1_macro=0.2138
f1_mean = 0.3104

[CV 1/10] method=ratio-support, alpha=0.05, f1_macro=0.2885
[CV 2/10] method=ratio-support, alpha=0.05, f1_macro=0.3478
[CV 3/10] method=ratio-support, alpha=0.05, f1_macro=

[CV 2/10] method=ratio-support, alpha=0.65, f1_macro=0.3478
[CV 3/10] method=ratio-support, alpha=0.65, f1_macro=0.3217
[CV 4/10] method=ratio-support, alpha=0.65, f1_macro=0.3273
[CV 5/10] method=ratio-support, alpha=0.65, f1_macro=0.3223
[CV 6/10] method=ratio-support, alpha=0.65, f1_macro=0.3471
[CV 7/10] method=ratio-support, alpha=0.65, f1_macro=0.3566
[CV 8/10] method=ratio-support, alpha=0.65, f1_macro=0.2340
[CV 9/10] method=ratio-support, alpha=0.65, f1_macro=0.3455
[CV 10/10] method=ratio-support, alpha=0.65, f1_macro=0.2138
f1_mean = 0.3104

[CV 1/10] method=ratio-support, alpha=0.70, f1_macro=0.2885
[CV 2/10] method=ratio-support, alpha=0.70, f1_macro=0.3478
[CV 3/10] method=ratio-support, alpha=0.70, f1_macro=0.3217
[CV 4/10] method=ratio-support, alpha=0.70, f1_macro=0.3273
[CV 5/10] method=ratio-support, alpha=0.70, f1_macro=0.3223
[CV 6/10] method=ratio-support, alpha=0.70, f1_macro=0.3710
[CV 7/10] method=ratio-support, alpha=0.70, f1_macro=0.3125
[CV 8/10] method=rati

Extra for ratio-support

In [34]:
parameters = {'alpha' : np.linspace(1, 10, 19),
              'method': ['ratio-support'],
             }

n = kfold.get_n_splits(X)

for method in parameters['method']:
    for alpha in parameters['alpha']:
        
        f1 = [0 for i in range(n)]
        for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
            bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
                X.iloc[train_index].values, 
                y.iloc[train_index].to_numpy(),
                method=method,
                alpha=alpha
            )
            bin_cls.predict(X.iloc[test_index].values)
            f1[i] = f1_score(y.iloc[test_index], bin_cls.predictions, average='macro')
            print(f"[CV {i+1}/{n}] method={method}, alpha={alpha:0.2f}, f1_macro={f1[i]:0.4f}") 
                    
        f1_mean = np.mean(f1)
        print(f'f1_mean = {f1_mean:0.4f}\n')
        if(f1_mean > f1_best):
            f1_best = f1_mean
            alpha_best = alpha
            method_best = method

best_parameters['BinarizedBinaryClassifier'] = {'method': method_best, 'alpha': alpha_best}            
print(f"f1_best={f1_best:0.4f}, method={method_best}, alpha={alpha_best:0.2f}")

[CV 1/10] method=ratio-support, alpha=1.00, f1_macro=0.2711
[CV 2/10] method=ratio-support, alpha=1.00, f1_macro=0.3478
[CV 3/10] method=ratio-support, alpha=1.00, f1_macro=0.3217
[CV 4/10] method=ratio-support, alpha=1.00, f1_macro=0.3273
[CV 5/10] method=ratio-support, alpha=1.00, f1_macro=0.2382
[CV 6/10] method=ratio-support, alpha=1.00, f1_macro=0.2531
[CV 7/10] method=ratio-support, alpha=1.00, f1_macro=0.3125
[CV 8/10] method=ratio-support, alpha=1.00, f1_macro=0.2593
[CV 9/10] method=ratio-support, alpha=1.00, f1_macro=0.3125
[CV 10/10] method=ratio-support, alpha=1.00, f1_macro=0.2138
f1_mean = 0.2857

[CV 1/10] method=ratio-support, alpha=1.50, f1_macro=0.0290
[CV 2/10] method=ratio-support, alpha=1.50, f1_macro=0.0278
[CV 3/10] method=ratio-support, alpha=1.50, f1_macro=0.0333
[CV 4/10] method=ratio-support, alpha=1.50, f1_macro=0.0460
[CV 5/10] method=ratio-support, alpha=1.50, f1_macro=0.0556
[CV 6/10] method=ratio-support, alpha=1.50, f1_macro=0.0290
[CV 7/10] method=rati

[CV 7/10] method=ratio-support, alpha=7.50, f1_macro=0.0000
[CV 8/10] method=ratio-support, alpha=7.50, f1_macro=0.0000
[CV 9/10] method=ratio-support, alpha=7.50, f1_macro=0.0000
[CV 10/10] method=ratio-support, alpha=7.50, f1_macro=0.0303
f1_mean = 0.0084

[CV 1/10] method=ratio-support, alpha=8.00, f1_macro=0.0000
[CV 2/10] method=ratio-support, alpha=8.00, f1_macro=0.0000
[CV 3/10] method=ratio-support, alpha=8.00, f1_macro=0.0000
[CV 4/10] method=ratio-support, alpha=8.00, f1_macro=0.0533
[CV 5/10] method=ratio-support, alpha=8.00, f1_macro=0.0000
[CV 6/10] method=ratio-support, alpha=8.00, f1_macro=0.0000
[CV 7/10] method=ratio-support, alpha=8.00, f1_macro=0.0000
[CV 8/10] method=ratio-support, alpha=8.00, f1_macro=0.0000
[CV 9/10] method=ratio-support, alpha=8.00, f1_macro=0.0000
[CV 10/10] method=ratio-support, alpha=8.00, f1_macro=0.0303
f1_mean = 0.0084

[CV 1/10] method=ratio-support, alpha=8.50, f1_macro=0.0000
[CV 2/10] method=ratio-support, alpha=8.50, f1_macro=0.0000
[C

In [35]:
f1 = [0 for i in range(n)]
accuracy = [0 for i in range(n)]
f1_binary = [0 for i in range(n)]

for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
    bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
        X.iloc[train_index].values, 
        y.iloc[train_index].to_numpy(),
        method=method_best,
        alpha=alpha_best
    )
    bin_cls.predict(X.iloc[test_index].values)
    
    accuracy[i], f1[i], f1_binary[i] = results_fca(y.iloc[test_index], bin_cls.predictions)
    
    print(f"[CV {i+1}/{n}] method={method_best}, alpha={alpha_best:0.2f}", 
          f"accuracy={accuracy[i]:0.4f}, f1_binary={f1_binary[i]:0.4f}, f1_macro={f1[i]:0.4f}")

f1_mean = np.mean(f1)
accuracy_mean = np.mean(accuracy)
f1_binary_mean = np.mean(f1_binary)

best_metrics['BinarizedBinaryClassifier'] = {'Accuracy': accuracy_mean, 'F1_binary': f1_binary_mean, 'F1_macro': f1_mean}
print(f"\n accuracy={accuracy_mean:0.4f}, f1_binary={f1_binary_mean:0.4f}, f1_macro={f1_mean:0.4f}")

[CV 1/10] method=standard-support, alpha=1.00 accuracy=0.3243, f1_binary=0.4186, f1_macro=0.2095
[CV 2/10] method=standard-support, alpha=1.00 accuracy=0.3514, f1_binary=0.3684, f1_macro=0.3509
[CV 3/10] method=standard-support, alpha=1.00 accuracy=0.3784, f1_binary=0.4651, f1_macro=0.3616
[CV 4/10] method=standard-support, alpha=1.00 accuracy=0.5405, f1_binary=0.5854, f1_macro=0.5351
[CV 5/10] method=standard-support, alpha=1.00 accuracy=0.5676, f1_binary=0.6800, f1_macro=0.5067
[CV 6/10] method=standard-support, alpha=1.00 accuracy=0.4595, f1_binary=0.4444, f1_macro=0.4591
[CV 7/10] method=standard-support, alpha=1.00 accuracy=0.5000, f1_binary=0.6087, f1_macro=0.3100
[CV 8/10] method=standard-support, alpha=1.00 accuracy=0.3611, f1_binary=0.4103, f1_macro=0.3566
[CV 9/10] method=standard-support, alpha=1.00 accuracy=0.4444, f1_binary=0.5238, f1_macro=0.4286
[CV 10/10] method=standard-support, alpha=1.00 accuracy=0.4444, f1_binary=0.4444, f1_macro=0.3050

 accuracy=0.4372, f1_binary=

## PatternBinaryClassifier

In [36]:
parameters = {'alpha' : np.linspace(0, 1, 21),
              'method': ['standard','standard-support','ratio-support'],
             }

n = kfold.get_n_splits(X)

f1_best_p = 0
alpha_best_p = 0.
method_best_p = 'standard'

for method in parameters['method']:
    for alpha in parameters['alpha']:
        
        f1 = [0 for i in range(n)]
        for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
            bin_cls = fcalc.classifier.PatternBinaryClassifier(
                X.iloc[train_index].values, 
                y.iloc[train_index].to_numpy(),
                method=method,
                alpha=alpha
            )
            bin_cls.predict(X.iloc[test_index].values)
            f1[i] = f1_score(y.iloc[test_index], bin_cls.predictions, average='macro')
            print(f"[CV {i+1}/{n}] method={method}, alpha={alpha:0.2f}, f1_macro={f1[i]:0.4f}") 
                    
        f1_mean = np.mean(f1)
        print(f'f1_mean = {f1_mean:0.4f}\n')
        if(f1_mean > f1_best_p):
            f1_best_p = f1_mean
            alpha_best_p = alpha
            method_best_p = method

best_parameters['PatternBinaryClassifier'] = {'method': method_best_p, 'alpha': alpha_best_p}              
print(f"f1_best={f1_best_p:0.4f}, method={method_best_p}, alpha={alpha_best_p:0.2f}")

[CV 1/10] method=standard, alpha=0.00, f1_macro=0.0804
[CV 2/10] method=standard, alpha=0.00, f1_macro=0.0185
[CV 3/10] method=standard, alpha=0.00, f1_macro=0.0804
[CV 4/10] method=standard, alpha=0.00, f1_macro=0.0351
[CV 5/10] method=standard, alpha=0.00, f1_macro=0.0541
[CV 6/10] method=standard, alpha=0.00, f1_macro=0.0185
[CV 7/10] method=standard, alpha=0.00, f1_macro=0.0566
[CV 8/10] method=standard, alpha=0.00, f1_macro=0.0000
[CV 9/10] method=standard, alpha=0.00, f1_macro=0.0185
[CV 10/10] method=standard, alpha=0.00, f1_macro=0.0556
f1_mean = 0.0418

[CV 1/10] method=standard, alpha=0.05, f1_macro=0.3243
[CV 2/10] method=standard, alpha=0.05, f1_macro=0.4850
[CV 3/10] method=standard, alpha=0.05, f1_macro=0.5392
[CV 4/10] method=standard, alpha=0.05, f1_macro=0.4324
[CV 5/10] method=standard, alpha=0.05, f1_macro=0.4591
[CV 6/10] method=standard, alpha=0.05, f1_macro=0.4050
[CV 7/10] method=standard, alpha=0.05, f1_macro=0.2302
[CV 8/10] method=standard, alpha=0.05, f1_macr

[CV 6/10] method=standard, alpha=0.70, f1_macro=0.1500
[CV 7/10] method=standard, alpha=0.70, f1_macro=0.1333
[CV 8/10] method=standard, alpha=0.70, f1_macro=0.0884
[CV 9/10] method=standard, alpha=0.70, f1_macro=0.2184
[CV 10/10] method=standard, alpha=0.70, f1_macro=0.1389
f1_mean = 0.2239

[CV 1/10] method=standard, alpha=0.75, f1_macro=0.2617
[CV 2/10] method=standard, alpha=0.75, f1_macro=0.2289
[CV 3/10] method=standard, alpha=0.75, f1_macro=0.1622
[CV 4/10] method=standard, alpha=0.75, f1_macro=0.2032
[CV 5/10] method=standard, alpha=0.75, f1_macro=0.2576
[CV 6/10] method=standard, alpha=0.75, f1_macro=0.1500
[CV 7/10] method=standard, alpha=0.75, f1_macro=0.0000
[CV 8/10] method=standard, alpha=0.75, f1_macro=0.0896
[CV 9/10] method=standard, alpha=0.75, f1_macro=0.1018
[CV 10/10] method=standard, alpha=0.75, f1_macro=0.0952
f1_mean = 0.1550

[CV 1/10] method=standard, alpha=0.80, f1_macro=0.2617
[CV 2/10] method=standard, alpha=0.80, f1_macro=0.2289
[CV 3/10] method=standard, 

[CV 1/10] method=standard-support, alpha=0.35, f1_macro=0.1950
[CV 2/10] method=standard-support, alpha=0.35, f1_macro=0.1886
[CV 3/10] method=standard-support, alpha=0.35, f1_macro=0.1950
[CV 4/10] method=standard-support, alpha=0.35, f1_macro=0.1345
[CV 5/10] method=standard-support, alpha=0.35, f1_macro=0.1597
[CV 6/10] method=standard-support, alpha=0.35, f1_macro=0.2382
[CV 7/10] method=standard-support, alpha=0.35, f1_macro=0.1938
[CV 8/10] method=standard-support, alpha=0.35, f1_macro=0.1641
[CV 9/10] method=standard-support, alpha=0.35, f1_macro=0.2198
[CV 10/10] method=standard-support, alpha=0.35, f1_macro=0.1628
f1_mean = 0.1852

[CV 1/10] method=standard-support, alpha=0.40, f1_macro=0.1055
[CV 2/10] method=standard-support, alpha=0.40, f1_macro=0.1523
[CV 3/10] method=standard-support, alpha=0.40, f1_macro=0.1597
[CV 4/10] method=standard-support, alpha=0.40, f1_macro=0.1597
[CV 5/10] method=standard-support, alpha=0.40, f1_macro=0.1055
[CV 6/10] method=standard-support, a

[CV 8/10] method=standard-support, alpha=0.95, f1_macro=0.1084
[CV 9/10] method=standard-support, alpha=0.95, f1_macro=0.2125
[CV 10/10] method=standard-support, alpha=0.95, f1_macro=0.1329
f1_mean = 0.1449

[CV 1/10] method=standard-support, alpha=1.00, f1_macro=0.1055
[CV 2/10] method=standard-support, alpha=1.00, f1_macro=0.1523
[CV 3/10] method=standard-support, alpha=1.00, f1_macro=0.2139
[CV 4/10] method=standard-support, alpha=1.00, f1_macro=0.1886
[CV 5/10] method=standard-support, alpha=1.00, f1_macro=0.1597
[CV 6/10] method=standard-support, alpha=1.00, f1_macro=0.1597
[CV 7/10] method=standard-support, alpha=1.00, f1_macro=0.2198
[CV 8/10] method=standard-support, alpha=1.00, f1_macro=0.2198
[CV 9/10] method=standard-support, alpha=1.00, f1_macro=0.1429
[CV 10/10] method=standard-support, alpha=1.00, f1_macro=0.1084
f1_mean = 0.1671

[CV 1/10] method=ratio-support, alpha=0.00, f1_macro=0.1622
[CV 2/10] method=ratio-support, alpha=0.00, f1_macro=0.1345
[CV 3/10] method=ratio-

[CV 10/10] method=ratio-support, alpha=0.55, f1_macro=0.1000
f1_mean = 0.1684

[CV 1/10] method=ratio-support, alpha=0.60, f1_macro=0.1886
[CV 2/10] method=ratio-support, alpha=0.60, f1_macro=0.1345
[CV 3/10] method=ratio-support, alpha=0.60, f1_macro=0.1741
[CV 4/10] method=ratio-support, alpha=0.60, f1_macro=0.1886
[CV 5/10] method=ratio-support, alpha=0.60, f1_macro=0.1622
[CV 6/10] method=ratio-support, alpha=0.60, f1_macro=0.2382
[CV 7/10] method=ratio-support, alpha=0.60, f1_macro=0.2198
[CV 8/10] method=ratio-support, alpha=0.60, f1_macro=0.1111
[CV 9/10] method=ratio-support, alpha=0.60, f1_macro=0.1667
[CV 10/10] method=ratio-support, alpha=0.60, f1_macro=0.1000
f1_mean = 0.1684

[CV 1/10] method=ratio-support, alpha=0.65, f1_macro=0.1886
[CV 2/10] method=ratio-support, alpha=0.65, f1_macro=0.1345
[CV 3/10] method=ratio-support, alpha=0.65, f1_macro=0.1741
[CV 4/10] method=ratio-support, alpha=0.65, f1_macro=0.1886
[CV 5/10] method=ratio-support, alpha=0.65, f1_macro=0.1622
[C

Extra for ratio-support

In [37]:
parameters = {'alpha' : np.linspace(0, 10, 21),
              'method': ['ratio-support'],
             }

n = kfold.get_n_splits(X)

for method in parameters['method']:
    for alpha in parameters['alpha']:
        
        f1 = [0 for i in range(n)]
        for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
            bin_cls = fcalc.classifier.PatternBinaryClassifier(
                X.iloc[train_index].values, 
                y.iloc[train_index].to_numpy(),
                method=method,
                alpha=alpha
            )
            bin_cls.predict(X.iloc[test_index].values)
            f1[i] = f1_score(y.iloc[test_index], bin_cls.predictions, average='macro')
            print(f"[CV {i+1}/{n}] method={method}, alpha={alpha:0.2f}, f1_macro={f1[i]:0.4f}") 
                    
        f1_mean = np.mean(f1)
        print(f'f1_mean = {f1_mean:0.4f}\n')
        if(f1_mean > f1_best_p):
            f1_best_p = f1_mean
            alpha_best_p = alpha
            method_best_p = method

best_parameters['PatternBinaryClassifier'] = {'method': method_best_p, 'alpha': alpha_best_p}            
print(f"f1_best={f1_best_p:0.4f}, method={method_best_p}, alpha={alpha_best_p:0.2f}")

[CV 1/10] method=ratio-support, alpha=0.00, f1_macro=0.1622
[CV 2/10] method=ratio-support, alpha=0.00, f1_macro=0.1345
[CV 3/10] method=ratio-support, alpha=0.00, f1_macro=0.1523
[CV 4/10] method=ratio-support, alpha=0.00, f1_macro=0.1886
[CV 5/10] method=ratio-support, alpha=0.00, f1_macro=0.1622
[CV 6/10] method=ratio-support, alpha=0.00, f1_macro=0.2382
[CV 7/10] method=ratio-support, alpha=0.00, f1_macro=0.2198
[CV 8/10] method=ratio-support, alpha=0.00, f1_macro=0.1111
[CV 9/10] method=ratio-support, alpha=0.00, f1_macro=0.1382
[CV 10/10] method=ratio-support, alpha=0.00, f1_macro=0.1000
f1_mean = 0.1607

[CV 1/10] method=ratio-support, alpha=0.50, f1_macro=0.1886
[CV 2/10] method=ratio-support, alpha=0.50, f1_macro=0.1345
[CV 3/10] method=ratio-support, alpha=0.50, f1_macro=0.1741
[CV 4/10] method=ratio-support, alpha=0.50, f1_macro=0.1886
[CV 5/10] method=ratio-support, alpha=0.50, f1_macro=0.1622
[CV 6/10] method=ratio-support, alpha=0.50, f1_macro=0.2382
[CV 7/10] method=rati

[CV 4/10] method=ratio-support, alpha=6.50, f1_macro=0.0351
[CV 5/10] method=ratio-support, alpha=6.50, f1_macro=0.0541
[CV 6/10] method=ratio-support, alpha=6.50, f1_macro=0.0185
[CV 7/10] method=ratio-support, alpha=6.50, f1_macro=0.0566
[CV 8/10] method=ratio-support, alpha=6.50, f1_macro=0.0185
[CV 9/10] method=ratio-support, alpha=6.50, f1_macro=0.0185
[CV 10/10] method=ratio-support, alpha=6.50, f1_macro=0.0556
f1_mean = 0.0436

[CV 1/10] method=ratio-support, alpha=7.00, f1_macro=0.0804
[CV 2/10] method=ratio-support, alpha=7.00, f1_macro=0.0185
[CV 3/10] method=ratio-support, alpha=7.00, f1_macro=0.0804
[CV 4/10] method=ratio-support, alpha=7.00, f1_macro=0.0351
[CV 5/10] method=ratio-support, alpha=7.00, f1_macro=0.0541
[CV 6/10] method=ratio-support, alpha=7.00, f1_macro=0.0185
[CV 7/10] method=ratio-support, alpha=7.00, f1_macro=0.0566
[CV 8/10] method=ratio-support, alpha=7.00, f1_macro=0.0185
[CV 9/10] method=ratio-support, alpha=7.00, f1_macro=0.0185
[CV 10/10] method=rat

In [38]:
f1 = [0 for i in range(n)]
accuracy = [0 for i in range(n)]
f1_binary = [0 for i in range(n)]

for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
    bin_cls = fcalc.classifier.PatternBinaryClassifier(
        X.iloc[train_index].values, 
        y.iloc[train_index].to_numpy(),
        method=method_best_p,
        alpha=alpha_best_p
    )
    bin_cls.predict(X.iloc[test_index].values)
    
    accuracy[i], f1[i], f1_binary[i] = results_fca(y.iloc[test_index], bin_cls.predictions)
    
    print(f"[CV {i+1}/{n}] method={method_best_p}, alpha={alpha_best_p:0.2f}", 
          f"accuracy={accuracy[i]:0.4f}, f1_binary={f1_binary[i]:0.4f}, f1_macro={f1[i]:0.4f}")

f1_mean = np.mean(f1)
accuracy_mean = np.mean(accuracy)
f1_binary_mean = np.mean(f1_binary)

best_metrics['PatternBinaryClassifier'] = {'Accuracy': accuracy_mean, 'F1_binary': f1_binary_mean, 'F1_macro': f1_mean}
print(f"\n accuracy={accuracy_mean:0.4f}, f1_binary={f1_binary_mean:0.4f}, f1_macro={f1_mean:0.4f}")

[CV 1/10] method=standard, alpha=0.10 accuracy=0.4595, f1_binary=0.4737, f1_macro=0.4591
[CV 2/10] method=standard, alpha=0.10 accuracy=0.4865, f1_binary=0.5366, f1_macro=0.4804
[CV 3/10] method=standard, alpha=0.10 accuracy=0.5405, f1_binary=0.6047, f1_macro=0.5281
[CV 4/10] method=standard, alpha=0.10 accuracy=0.4865, f1_binary=0.5128, f1_macro=0.4850
[CV 5/10] method=standard, alpha=0.10 accuracy=0.3784, f1_binary=0.3429, f1_macro=0.3766
[CV 6/10] method=standard, alpha=0.10 accuracy=0.4324, f1_binary=0.4000, f1_macro=0.4308
[CV 7/10] method=standard, alpha=0.10 accuracy=0.3611, f1_binary=0.3030, f1_macro=0.2517
[CV 8/10] method=standard, alpha=0.10 accuracy=0.4444, f1_binary=0.4118, f1_macro=0.3172
[CV 9/10] method=standard, alpha=0.10 accuracy=0.4444, f1_binary=0.4444, f1_macro=0.3140
[CV 10/10] method=standard, alpha=0.10 accuracy=0.3333, f1_binary=0.3333, f1_macro=0.2319

 accuracy=0.4367, f1_binary=0.4363, f1_macro=0.3875


# Overall

In [39]:
for i in best_parameters:
    print(i)
    print(best_parameters[i])
    print()

LogisticRegression
{'C': 1.88}

KNeighborsClassifier
{'n_neighbors': 45}

MultinomialNB
{'alpha': 26.901000000000003}

GaussianNB
{'var_smoothing': 1.0}

ComplementNB
{'alpha': 91.001}

DecisionTreeClassifier
{'criterion': 'gini', 'max_depth': 2, 'min_samples_split': 2}

RandomForestClassifier
{'criterion': 'gini', 'max_depth': 2, 'min_samples_split': 2, 'n_estimators': 70}

BinarizedBinaryClassifier
{'method': 'standard-support', 'alpha': 1.0}

PatternBinaryClassifier
{'method': 'standard', 'alpha': 0.1}



In [40]:
for i in best_metrics:
    print(i)
    print(best_metrics[i])
    print()

LogisticRegression
{'Accuracy': 0.3629, 'F1_binary': 0.3485, 'F1_macro': 0.3595}

KNeighborsClassifier
{'Accuracy': 0.4751, 'F1_binary': 0.5811, 'F1_macro': 0.4367}

MultinomialNB
{'Accuracy': 0.3631, 'F1_binary': 0.3545, 'F1_macro': 0.3564}

GaussianNB
{'Accuracy': 0.377, 'F1_binary': 0.3785, 'F1_macro': 0.3738}

ComplementNB
{'Accuracy': 0.3577, 'F1_binary': 0.3562, 'F1_macro': 0.3542}

DecisionTreeClassifier
{'Accuracy': 0.3493, 'F1_binary': 0.2666, 'F1_macro': 0.3343}

RandomForestClassifier
{'Accuracy': 0.3633, 'F1_binary': 0.3834, 'F1_macro': 0.3533}

BinarizedBinaryClassifier
{'Accuracy': 0.4371621621621622, 'F1_binary': 0.49491583116514093, 'F1_macro': 0.3823026751943705}

PatternBinaryClassifier
{'Accuracy': 0.43671171171171175, 'F1_binary': 0.43631711815387614, 'F1_macro': 0.3874723161895567}



# Extra: Expandad table for BinarizedBinaryClassifier

In [48]:
X_expanded = ~X
anticolumns = X_expanded.columns.to_list()
for i in range(len(anticolumns)):
    anticolumns[i] = 'NOT_' + anticolumns[i]
X_expanded.columns = anticolumns
X_expanded = pd.concat([X, X_expanded],axis=1)
X_expanded.head()

Unnamed: 0,Abroad,Contact with confirmed,cough,fever,sore_throat,shortness_of_breath,head_ache,age_60_and_above,NOT_Abroad,NOT_Contact with confirmed,NOT_cough,NOT_fever,NOT_sore_throat,NOT_shortness_of_breath,NOT_head_ache,NOT_age_60_and_above
0,False,False,False,False,False,False,False,True,True,True,True,True,True,True,True,False
1,False,False,False,False,False,False,False,False,True,True,True,True,True,True,True,True
2,False,False,False,False,False,False,False,False,True,True,True,True,True,True,True,True
3,False,False,False,False,False,False,False,True,True,True,True,True,True,True,True,False
4,False,True,False,False,False,False,False,False,True,False,True,True,True,True,True,True


In [49]:
parameters = {'alpha' : np.linspace(0, 1, 21),
              'method': ['standard','standard-support','ratio-support'],
             }

n = kfold.get_n_splits(X_expanded)

f1_best_e = 0
alpha_best_e = 0.
method_best_e = 'standard'

for method in parameters['method']:
    for alpha in parameters['alpha']:
        
        f1 = [0 for i in range(n)]
        for i, (train_index, test_index) in enumerate(kfold.split(X_expanded, y)):
            bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
                X_expanded.iloc[train_index].values, 
                y.iloc[train_index].to_numpy(),
                method=method,
                alpha=alpha
            )
            bin_cls.predict(X_expanded.iloc[test_index].values)
            f1[i] = f1_score(y.iloc[test_index], bin_cls.predictions, average='macro')
            print(f"[CV {i+1}/{n}] method={method}, alpha={alpha:0.2f}, f1_macro={f1[i]:0.4f}") 
                    
        f1_mean = np.mean(f1)
        print(f'f1_mean = {f1_mean:0.4f}\n')
        if(f1_mean > f1_best_e):
            f1_best_e = f1_mean
            alpha_best_e = alpha
            method_best_e = method

best_parameters_expanded = {'method': method_best_e, 'alpha': alpha_best_e}             
print(f"f1_best={f1_best_e:0.4f}, method={method_best_e}, alpha={alpha_best_e:0.2f}")

[CV 1/10] method=standard, alpha=0.00, f1_macro=0.0804
[CV 2/10] method=standard, alpha=0.00, f1_macro=0.0185
[CV 3/10] method=standard, alpha=0.00, f1_macro=0.0804
[CV 4/10] method=standard, alpha=0.00, f1_macro=0.0351
[CV 5/10] method=standard, alpha=0.00, f1_macro=0.0541
[CV 6/10] method=standard, alpha=0.00, f1_macro=0.0185
[CV 7/10] method=standard, alpha=0.00, f1_macro=0.0566
[CV 8/10] method=standard, alpha=0.00, f1_macro=0.0000
[CV 9/10] method=standard, alpha=0.00, f1_macro=0.0185
[CV 10/10] method=standard, alpha=0.00, f1_macro=0.0556
f1_mean = 0.0418

[CV 1/10] method=standard, alpha=0.05, f1_macro=0.3243
[CV 2/10] method=standard, alpha=0.05, f1_macro=0.4850
[CV 3/10] method=standard, alpha=0.05, f1_macro=0.5392
[CV 4/10] method=standard, alpha=0.05, f1_macro=0.4324
[CV 5/10] method=standard, alpha=0.05, f1_macro=0.4591
[CV 6/10] method=standard, alpha=0.05, f1_macro=0.4050
[CV 7/10] method=standard, alpha=0.05, f1_macro=0.2302
[CV 8/10] method=standard, alpha=0.05, f1_macr

[CV 7/10] method=standard, alpha=0.70, f1_macro=0.1333
[CV 8/10] method=standard, alpha=0.70, f1_macro=0.0884
[CV 9/10] method=standard, alpha=0.70, f1_macro=0.2184
[CV 10/10] method=standard, alpha=0.70, f1_macro=0.1389
f1_mean = 0.2239

[CV 1/10] method=standard, alpha=0.75, f1_macro=0.2617
[CV 2/10] method=standard, alpha=0.75, f1_macro=0.2289
[CV 3/10] method=standard, alpha=0.75, f1_macro=0.1622
[CV 4/10] method=standard, alpha=0.75, f1_macro=0.2032
[CV 5/10] method=standard, alpha=0.75, f1_macro=0.2576
[CV 6/10] method=standard, alpha=0.75, f1_macro=0.1500
[CV 7/10] method=standard, alpha=0.75, f1_macro=0.0000
[CV 8/10] method=standard, alpha=0.75, f1_macro=0.0896
[CV 9/10] method=standard, alpha=0.75, f1_macro=0.1018
[CV 10/10] method=standard, alpha=0.75, f1_macro=0.0952
f1_mean = 0.1550

[CV 1/10] method=standard, alpha=0.80, f1_macro=0.2617
[CV 2/10] method=standard, alpha=0.80, f1_macro=0.2289
[CV 3/10] method=standard, alpha=0.80, f1_macro=0.1622
[CV 4/10] method=standard, 

[CV 2/10] method=standard-support, alpha=0.35, f1_macro=0.1886
[CV 3/10] method=standard-support, alpha=0.35, f1_macro=0.1950
[CV 4/10] method=standard-support, alpha=0.35, f1_macro=0.1345
[CV 5/10] method=standard-support, alpha=0.35, f1_macro=0.1597
[CV 6/10] method=standard-support, alpha=0.35, f1_macro=0.2382
[CV 7/10] method=standard-support, alpha=0.35, f1_macro=0.1938
[CV 8/10] method=standard-support, alpha=0.35, f1_macro=0.1641
[CV 9/10] method=standard-support, alpha=0.35, f1_macro=0.2198
[CV 10/10] method=standard-support, alpha=0.35, f1_macro=0.1628
f1_mean = 0.1852

[CV 1/10] method=standard-support, alpha=0.40, f1_macro=0.1055
[CV 2/10] method=standard-support, alpha=0.40, f1_macro=0.1523
[CV 3/10] method=standard-support, alpha=0.40, f1_macro=0.1597
[CV 4/10] method=standard-support, alpha=0.40, f1_macro=0.1597
[CV 5/10] method=standard-support, alpha=0.40, f1_macro=0.1055
[CV 6/10] method=standard-support, alpha=0.40, f1_macro=0.1597
[CV 7/10] method=standard-support, a

[CV 9/10] method=standard-support, alpha=0.95, f1_macro=0.2125
[CV 10/10] method=standard-support, alpha=0.95, f1_macro=0.1329
f1_mean = 0.1449

[CV 1/10] method=standard-support, alpha=1.00, f1_macro=0.1055
[CV 2/10] method=standard-support, alpha=1.00, f1_macro=0.1523
[CV 3/10] method=standard-support, alpha=1.00, f1_macro=0.2139
[CV 4/10] method=standard-support, alpha=1.00, f1_macro=0.1886
[CV 5/10] method=standard-support, alpha=1.00, f1_macro=0.1597
[CV 6/10] method=standard-support, alpha=1.00, f1_macro=0.1597
[CV 7/10] method=standard-support, alpha=1.00, f1_macro=0.2198
[CV 8/10] method=standard-support, alpha=1.00, f1_macro=0.2198
[CV 9/10] method=standard-support, alpha=1.00, f1_macro=0.1429
[CV 10/10] method=standard-support, alpha=1.00, f1_macro=0.1084
f1_mean = 0.1671

[CV 1/10] method=ratio-support, alpha=0.00, f1_macro=0.1622
[CV 2/10] method=ratio-support, alpha=0.00, f1_macro=0.1345
[CV 3/10] method=ratio-support, alpha=0.00, f1_macro=0.1523
[CV 4/10] method=ratio-sup

[CV 1/10] method=ratio-support, alpha=0.60, f1_macro=0.1886
[CV 2/10] method=ratio-support, alpha=0.60, f1_macro=0.1345
[CV 3/10] method=ratio-support, alpha=0.60, f1_macro=0.1741
[CV 4/10] method=ratio-support, alpha=0.60, f1_macro=0.1886
[CV 5/10] method=ratio-support, alpha=0.60, f1_macro=0.1622
[CV 6/10] method=ratio-support, alpha=0.60, f1_macro=0.2382
[CV 7/10] method=ratio-support, alpha=0.60, f1_macro=0.2198
[CV 8/10] method=ratio-support, alpha=0.60, f1_macro=0.1111
[CV 9/10] method=ratio-support, alpha=0.60, f1_macro=0.1667
[CV 10/10] method=ratio-support, alpha=0.60, f1_macro=0.1000
f1_mean = 0.1684

[CV 1/10] method=ratio-support, alpha=0.65, f1_macro=0.1886
[CV 2/10] method=ratio-support, alpha=0.65, f1_macro=0.1345
[CV 3/10] method=ratio-support, alpha=0.65, f1_macro=0.1741
[CV 4/10] method=ratio-support, alpha=0.65, f1_macro=0.1886
[CV 5/10] method=ratio-support, alpha=0.65, f1_macro=0.1622
[CV 6/10] method=ratio-support, alpha=0.65, f1_macro=0.2382
[CV 7/10] method=rati

In [50]:
parameters = {'alpha' : np.linspace(1, 10, 19),
              'method': ['ratio-support'],
             }

n = kfold.get_n_splits(X_expanded)

for method in parameters['method']:
    for alpha in parameters['alpha']:
        
        f1 = [0 for i in range(n)]
        for i, (train_index, test_index) in enumerate(kfold.split(X_expanded, y)):
            bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
                X_expanded.iloc[train_index].values, 
                y.iloc[train_index].to_numpy(),
                method=method,
                alpha=alpha
            )
            bin_cls.predict(X_expanded.iloc[test_index].values)
            f1[i] = f1_score(y.iloc[test_index], bin_cls.predictions, average='macro')
            print(f"[CV {i+1}/{n}] method={method}, alpha={alpha:0.2f}, f1_macro={f1[i]:0.4f}") 
                    
        f1_mean = np.mean(f1)
        print(f'f1_mean = {f1_mean:0.4f}\n')
        if(f1_mean > f1_best_e):
            f1_best_e = f1_mean
            alpha_best_e = alpha
            method_best_e = method

best_parameters_expanded = {'method': method_best_e, 'alpha': alpha_best_e}            
print(f"f1_best={f1_best_e:0.4f}, method={method_best_e}, alpha={alpha_best_e:0.2f}")

[CV 1/10] method=ratio-support, alpha=1.00, f1_macro=0.1055
[CV 2/10] method=ratio-support, alpha=1.00, f1_macro=0.1597
[CV 3/10] method=ratio-support, alpha=1.00, f1_macro=0.1523
[CV 4/10] method=ratio-support, alpha=1.00, f1_macro=0.1345
[CV 5/10] method=ratio-support, alpha=1.00, f1_macro=0.0976
[CV 6/10] method=ratio-support, alpha=1.00, f1_macro=0.1778
[CV 7/10] method=ratio-support, alpha=1.00, f1_macro=0.1329
[CV 8/10] method=ratio-support, alpha=1.00, f1_macro=0.1111
[CV 9/10] method=ratio-support, alpha=1.00, f1_macro=0.1382
[CV 10/10] method=ratio-support, alpha=1.00, f1_macro=0.0526
f1_mean = 0.1262

[CV 1/10] method=ratio-support, alpha=1.50, f1_macro=0.0750
[CV 2/10] method=ratio-support, alpha=1.50, f1_macro=0.0976
[CV 3/10] method=ratio-support, alpha=1.50, f1_macro=0.0750
[CV 4/10] method=ratio-support, alpha=1.50, f1_macro=0.0804
[CV 5/10] method=ratio-support, alpha=1.50, f1_macro=0.0750
[CV 6/10] method=ratio-support, alpha=1.50, f1_macro=0.0804
[CV 7/10] method=rati

[CV 5/10] method=ratio-support, alpha=7.50, f1_macro=0.0541
[CV 6/10] method=ratio-support, alpha=7.50, f1_macro=0.0185
[CV 7/10] method=ratio-support, alpha=7.50, f1_macro=0.0566
[CV 8/10] method=ratio-support, alpha=7.50, f1_macro=0.0185
[CV 9/10] method=ratio-support, alpha=7.50, f1_macro=0.0185
[CV 10/10] method=ratio-support, alpha=7.50, f1_macro=0.0556
f1_mean = 0.0436

[CV 1/10] method=ratio-support, alpha=8.00, f1_macro=0.0804
[CV 2/10] method=ratio-support, alpha=8.00, f1_macro=0.0185
[CV 3/10] method=ratio-support, alpha=8.00, f1_macro=0.0804
[CV 4/10] method=ratio-support, alpha=8.00, f1_macro=0.0351
[CV 5/10] method=ratio-support, alpha=8.00, f1_macro=0.0541
[CV 6/10] method=ratio-support, alpha=8.00, f1_macro=0.0185
[CV 7/10] method=ratio-support, alpha=8.00, f1_macro=0.0566
[CV 8/10] method=ratio-support, alpha=8.00, f1_macro=0.0185
[CV 9/10] method=ratio-support, alpha=8.00, f1_macro=0.0185
[CV 10/10] method=ratio-support, alpha=8.00, f1_macro=0.0556
f1_mean = 0.0436

[C

In [51]:
f1 = [0 for i in range(n)]
accuracy = [0 for i in range(n)]
f1_binary = [0 for i in range(n)]

for i, (train_index, test_index) in enumerate(kfold.split(X_expanded, y)):
    bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
        X_expanded.iloc[train_index].values, 
        y.iloc[train_index].to_numpy(),
        method=method_best_e,
        alpha=alpha_best_e
    )
    bin_cls.predict(X_expanded.iloc[test_index].values)
    
    accuracy[i], f1[i], f1_binary[i] = results_fca(y.iloc[test_index], bin_cls.predictions)
    
    print(f"[CV {i+1}/{n}] method={method_best_e}, alpha={alpha_best_e:0.2f}", 
          f"accuracy={accuracy[i]:0.4f}, f1_macro={f1[i]:0.4f}, f1_binary={f1_binary[i]:0.4f}")

f1_mean = np.mean(f1)
accuracy_mean = np.mean(accuracy)
f1_binary_mean = np.mean(f1_binary)

best_metrics_expanded = {'Accuracy': accuracy_mean, 'F1_binary': f1_binary_mean, 'F1_macro': f1_mean}
print(f"\n accuracy={accuracy_mean:0.4f}, f1_binary={f1_binary_mean:0.4f}, f1_macro={f1_mean:0.4f}")

[CV 1/10] method=standard, alpha=0.10 accuracy=0.4595, f1_macro=0.4591, f1_binary=0.4737
[CV 2/10] method=standard, alpha=0.10 accuracy=0.4865, f1_macro=0.4804, f1_binary=0.5366
[CV 3/10] method=standard, alpha=0.10 accuracy=0.5405, f1_macro=0.5281, f1_binary=0.6047
[CV 4/10] method=standard, alpha=0.10 accuracy=0.4865, f1_macro=0.4850, f1_binary=0.5128
[CV 5/10] method=standard, alpha=0.10 accuracy=0.3784, f1_macro=0.3766, f1_binary=0.3429
[CV 6/10] method=standard, alpha=0.10 accuracy=0.4324, f1_macro=0.4308, f1_binary=0.4000
[CV 7/10] method=standard, alpha=0.10 accuracy=0.3611, f1_macro=0.2517, f1_binary=0.3030
[CV 8/10] method=standard, alpha=0.10 accuracy=0.4444, f1_macro=0.3172, f1_binary=0.4118
[CV 9/10] method=standard, alpha=0.10 accuracy=0.4444, f1_macro=0.3140, f1_binary=0.4444
[CV 10/10] method=standard, alpha=0.10 accuracy=0.3333, f1_macro=0.2319, f1_binary=0.3333

 accuracy=0.4367, f1_binary=0.4363, f1_macro=0.3875


In [52]:
best_parameters_expanded

{'method': 'standard', 'alpha': 0.1}

In [53]:
best_metrics_expanded

{'Accuracy': 0.43671171171171175,
 'F1_binary': 0.43631711815387614,
 'F1_macro': 0.3874723161895567}

# Extra: The reason of low scores - duplicates over features

In [62]:
df.iloc[:, :-1].duplicated(keep=False).value_counts()

True     358
False      8
Name: count, dtype: int64