In [2]:
import sys

In [3]:
from statsmodels.stats.proportion import proportion_confint


In [10]:
import pandas as pd
import ast
from scipy.stats import norm
import numpy as np

In [11]:
from confidenceinterval import accuracy_score, \
    precision_score, \
    recall_score, \
    f1_score

from confidenceinterval.utils import get_positive_negative_counts

import sklearn.metrics
import numpy as np

In [12]:
def wilson_score_interval(tp, fp, fn, metric, confidence=0.95):
    """Wilson score interval for precision or recall computed from raw counts.

    Parameters
    ----------
    tp, fp, fn : int
        True-positive, false-positive and false-negative counts.
    metric : str
        ``"precision"`` (trials = tp + fp) or ``"recall"`` (trials = tp + fn).
    confidence : float, default 0.95
        Two-sided confidence level; must lie strictly between 0 and 1.

    Returns
    -------
    tuple
        ``(estimate, (lower, upper))`` with every value rounded to 3 decimals.
        The sentinel ``(0, 0, 0)`` is returned when ``metric`` is not
        recognised, when there are no trials, or when there are no successes.

    Raises
    ------
    ValueError
        If ``confidence`` is not strictly between 0 and 1.
    """
    if not 0 < confidence < 1:
        raise ValueError(f"confidence must be in (0, 1), got {confidence}")

    if metric == "precision":
        n = tp + fp
    elif metric == "recall":
        n = tp + fn
    else:
        return (0, 0, 0)

    # Guard before dividing: with zero trials the proportion is undefined
    # (plain ints would raise ZeroDivisionError), and a zero estimate keeps
    # the sentinel return value callers already receive.
    if n == 0 or tp == 0:
        return (0, 0, 0)

    x = tp
    z = norm.ppf(1 - (1 - confidence) / 2)
    phat = x / n
    center = (x + z ** 2 / 2) / (n + z ** 2)

    interval = ((z * np.sqrt(n)) / (n + z ** 2)) * np.sqrt(phat * (1 - phat) + z ** 2 / (4 * n))

    lower_bound = center - interval
    upper_bound = center + interval

    return round(phat, 3), (round(lower_bound, 3), round(upper_bound, 3))

## Multi-class

In [45]:
# Toy three-class example: ground-truth labels and the matching predictions.
y_true = [2, 0, 0, 2, 1, 1, 1, 1, 2, 2]
y_pred = [2, 0, 1, 0, 1, 0, 1, 1, 2, 2]

# Count arrays and confusion matrix as returned by the confidenceinterval helper.
FP, FN, TP, TN, CM = get_positive_negative_counts(y_true, y_pred)

In [46]:
# Show the five values returned by get_positive_negative_counts.
FP, FN, TP, TN, CM

(array([2, 1, 0]),
 array([1, 1, 1]),
 array([1, 3, 3]),
 array([6, 5, 6]),
 array([[1, 1, 0],
        [1, 3, 0],
        [1, 0, 3]]))

In [47]:
# Total the TP / FP / FN count arrays over all classes.
tp_sum, fp_sum, fn_sum = TP.sum(), FP.sum(), FN.sum()
tp_sum, fp_sum, fn_sum

(7, 3, 3)

In [50]:
# Micro-averaged precision: sklearn's point estimate next to the
# confidenceinterval estimate and its confidence interval.
sklearn_result = sklearn.metrics.precision_score(y_true, y_pred, average='micro')
precision, ci = precision_score(y_true, y_pred, average='micro')
sklearn_result, precision, ci

(0.7, 0.7, (0.4159742349106746, 0.9840257650893254))

In [51]:
# Micro-averaged recall: sklearn's point estimate next to the
# confidenceinterval estimate and its confidence interval.
sklearn_result = sklearn.metrics.recall_score(y_true, y_pred, average='micro')
recall, ci = recall_score(y_true, y_pred, average='micro')
sklearn_result, recall, ci

(0.7, 0.7, (0.4159742349106746, 0.9840257650893254))

In [52]:
# Macro-averaged precision: sklearn's point estimate next to the
# confidenceinterval estimate and its confidence interval.
sklearn_result = sklearn.metrics.precision_score(y_true, y_pred, average='macro')
precision, ci = precision_score(y_true, y_pred, average='macro')
sklearn_result, precision, ci

(0.6944444444444443,
 0.6944444444444443,
 (0.5449946838878599, 0.8438942050010287))

In [53]:
# Macro-averaged recall: sklearn's point estimate next to the
# confidenceinterval estimate and its confidence interval.
sklearn_result = sklearn.metrics.recall_score(y_true, y_pred, average='macro')
recall, ci = recall_score(y_true, y_pred, average='macro')
sklearn_result, recall, ci

(0.6666666666666666,
 0.6666666666666666,
 (0.43597160658727707, 0.8973617267460562))

In [54]:
# Macro-averaged F1: sklearn's point estimate next to the
# confidenceinterval estimate and its confidence interval.
# NOTE(review): the recorded output shows an upper bound above 1.0 for this
# interval; consider clipping to [0, 1] before reporting it.
sklearn_result = sklearn.metrics.f1_score(y_true, y_pred, average='macro')
f1, ci = f1_score(y_true, y_pred, average='macro')
sklearn_result, f1, ci

(0.669047619047619,
 0.6690476091743198,
 (0.23963266346377893, 1.0984625548848608))

In [55]:
# Micro-averaged F1 with an explicit 95% confidence level.
micro_f1, ci = f1_score(y_true, y_pred, confidence_level=0.95, average='micro')
micro_f1, ci

(0.7, (0.4159742349106746, 0.9840257650893254))

## Binary case

In [24]:
# Binary toy example: ground-truth labels and the matching predictions.
y_true_binary = [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1]
y_pred_binary = [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0]
FP, FN, TP, TN, CM = get_positive_negative_counts(y_true_binary, y_pred_binary)

# Index 1 picks the counts for recognising label 1 (the positive class).
tp_sum, fp_sum, fn_sum = TP[1], FP[1], FN[1]
CM, tp_sum, fp_sum, fn_sum

(array([[6, 1],
        [2, 5]]),
 5,
 1,
 2)

In [25]:
# Print the full FP / FN arrays next to the values selected for label 1.
print("FP",FP, fp_sum) 
print("FN", FN, fn_sum)

FP [2 1] 1
FN [1 2] 2


In [26]:
# Reference check: Wilson interval for precision via statsmodels.stats.proportion.
count = tp_sum # number of successes (true positives)
nobs_precision = tp_sum + fp_sum # number of trials for precision (TP + FP)
proportion_confint(count, nobs_precision, alpha=0.05, method='wilson')

(0.43649717781352965, 0.9699466302516933)

In [27]:
# Same reference check for recall; reuses `count` from the precision check.
nobs_recall = tp_sum + fn_sum # number of trials for recall (TP + FN)
proportion_confint(count, nobs_recall, alpha=0.05, method='wilson')

(0.3589344518326191, 0.9177810759959432)

In [28]:
# Binary precision: compare sklearn's point estimate with the
# confidenceinterval estimate and its Wilson interval.
sklearn_result = sklearn.metrics.precision_score(y_true_binary, y_pred_binary, average='binary')
precision, ci = precision_score(y_true_binary, y_pred_binary, average='binary', method='wilson')
"P",sklearn_result, precision, ci

('P',
 0.8333333333333334,
 0.8333333194444447,
 (0.43649717781352987, 0.9699466302516935))

In [29]:
# Check the hand-written wilson_score_interval for precision on the same counts.
wilson_score_interval(tp_sum, fp_sum, fn_sum, metric = "precision")

5 1 2 precision


(0.833, (0.436, 0.97))

In [30]:
# Binary recall: compare sklearn's point estimate with the
# confidenceinterval estimate and its Wilson interval.
sklearn_result = sklearn.metrics.recall_score(y_true_binary, y_pred_binary, average='binary')
recall, ci = recall_score(y_true_binary, y_pred_binary, average='binary', method='wilson')
"R",sklearn_result, recall, ci

('R',
 0.7142857142857143,
 0.7142857040816327,
 (0.35893445183261935, 0.9177810759959433))

In [31]:
# Check the hand-written wilson_score_interval for recall on the same counts.
wilson_score_interval(tp_sum, fp_sum, fn_sum, metric = "recall")

5 1 2 recall


(0.714, (0.359, 0.918))

In [32]:
# Binary F1: sklearn's point estimate next to the confidenceinterval
# estimate and its 95% confidence interval.
# NOTE(review): the recorded output shows an upper bound above 1.0 for this
# interval; consider clipping to [0, 1] before reporting it.
sklearn_result = sklearn.metrics.f1_score(y_true_binary, y_pred_binary)
binary_f1, ci = f1_score(y_true_binary, y_pred_binary, confidence_level=0.95, average='binary')
"F1", sklearn_result, binary_f1, ci

('F1',
 0.7692307692307693,
 0.7692307692307693,
 (0.4899279111581648, 1.0485336273033736))

# Custom Classification Report

In [33]:
# Baseline: sklearn's plain-text classification report (no confidence intervals)
# on a small three-class example. Note that this rebinds y_true / y_pred.
from sklearn.metrics import classification_report
y_true = [0, 1, 2, 2, 2]
y_pred = [0, 0, 2, 2, 1]
target_names = ['class 0', 'class 1', 'class 2']
print(classification_report(y_true, y_pred, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.50      1.00      0.67         1
     class 1       0.00      0.00      0.00         1
     class 2       1.00      0.67      0.80         3

    accuracy                           0.60         5
   macro avg       0.50      0.56      0.49         5
weighted avg       0.70      0.60      0.61         5



In [34]:
# Class name -> numeric id.
label_to_numerical = dict([
    ('Remaining', 0),
    ('Non-systematic-review', 1),
    ('Human-non-RCT-non-drug-intervention', 2),
])

# Numeric id -> display string of the form "<id> - <name>".
numerical_to_label = dict(
    (idx, f"{idx} - {name}") for name, idx in label_to_numerical.items()
)
numerical_to_label

{0: '0 - Remaining',
 1: '1 - Non-systematic-review',
 2: '2 - Human-non-RCT-non-drug-intervention'}

In [35]:
# NOTE(review): y_true / y_pred are rebound here but the three calls below use
# y_true_binary / y_pred_binary, so the new values are not used in this cell.
y_true = [0, 1, 2, 2, 2]
y_pred = [0, 0, 2, 2, 1]

# NOTE(review): each call rebinds `ci`, so only the F1 interval survives;
# give the intervals distinct names if all three are needed.
precision, ci = precision_score(y_true_binary, y_pred_binary, average='binary', method='wilson')
recall, ci = recall_score(y_true_binary, y_pred_binary, average='binary', method='wilson')
binary_f1, ci = f1_score(y_true_binary, y_pred_binary, confidence_level=0.95, average='binary')


In [92]:
def round_tuple(t, decimals=3):
    """Return a tuple holding every element of ``t`` rounded to ``decimals`` places."""
    rounded = [round(value, decimals) for value in t]
    return tuple(rounded)

def _report_row(label, precision, recall, f1, precision_ci, recall_ci, f1_ci, support, round_ndigits):
    """Assemble one report row, rounding every estimate and CI bound to ``round_ndigits``."""
    return {
        'Class': label,
        'Precision': round(precision, round_ndigits),
        'Recall': round(recall, round_ndigits),
        'F1-Score': round(f1, round_ndigits),
        'Precision CI': round_tuple(precision_ci, round_ndigits),
        'Recall CI': round_tuple(recall_ci, round_ndigits),
        'F1-Score CI': round_tuple(f1_ci, round_ndigits),
        'Support': support,
    }


def classification_report_with_ci(y_true, y_pred, binary_method = 'wilson', round_ndigits=3, numerical_to_label_map = None):
    """Build a classification report with confidence intervals as a DataFrame.

    One row is produced per class found in ``y_true`` (each class is scored
    one-vs-rest as a binary problem), followed by a ``'micro'`` and a
    ``'macro'`` row computed on the original labels.

    Parameters
    ----------
    y_true, y_pred : sequence
        Ground-truth and predicted labels.
    binary_method : str, default 'wilson'
        ``method`` passed to ``precision_score`` / ``recall_score`` for the
        per-class rows. The per-class F1 call and the micro/macro calls do
        not receive it.
    round_ndigits : int, default 3
        Number of decimals kept for every estimate and interval bound.
    numerical_to_label_map : mapping, optional
        Maps a class value to the name shown in the ``'Class'`` column.
        Without it, rows are named ``'Class <value>'``.

    Returns
    -------
    pandas.DataFrame
        Columns: Class, Precision, Recall, F1-Score, Precision CI, Recall CI,
        F1-Score CI, Support. Support is the number of true instances for a
        class row and ``len(y_true)`` for the micro/macro rows.

    Raises
    ------
    ValueError
        If ``numerical_to_label_map`` is given but lacks a class present in
        ``y_true``.
    """
    # Classes come from the ground truth only; a label that occurs solely in
    # y_pred gets no row of its own.
    classes = np.unique(y_true)

    # Validate that every class has a display name when a map is provided.
    if numerical_to_label_map is not None:
        missing_labels = [cls for cls in classes if cls not in numerical_to_label_map]
        if missing_labels:
            raise ValueError(f'Missing labels for classes: {missing_labels}')

    data = []  # row dictionaries, turned into a DataFrame once at the end

    # Per-class rows: score each class one-vs-rest.
    for class_ in classes:
        y_true_binary = [1 if y == class_ else 0 for y in y_true]
        y_pred_binary = [1 if y == class_ else 0 for y in y_pred]

        precision, precision_ci = precision_score(y_true_binary, y_pred_binary, average='binary', method=binary_method)
        recall, recall_ci = recall_score(y_true_binary, y_pred_binary, average='binary', method=binary_method)
        binary_f1, binary_f1_ci = f1_score(y_true_binary, y_pred_binary, confidence_level=0.95, average='binary')

        if numerical_to_label_map and class_ in numerical_to_label_map:
            class_name = numerical_to_label_map[class_]
        else:
            class_name = f'Class {class_}'

        data.append(_report_row(
            class_name,
            precision, recall, binary_f1,
            precision_ci, recall_ci, binary_f1_ci,
            sum(y_true_binary),  # support: number of true instances of this class
            round_ndigits,
        ))

    # Aggregate rows on the original (multi-class) labels.
    for average in ('micro', 'macro'):
        avg_precision, avg_precision_ci = precision_score(y_true, y_pred, average=average)
        avg_recall, avg_recall_ci = recall_score(y_true, y_pred, average=average)
        avg_f1, avg_f1_ci = f1_score(y_true, y_pred, average=average)

        data.append(_report_row(
            average,
            avg_precision, avg_recall, avg_f1,
            avg_precision_ci, avg_recall_ci, avg_f1_ci,
            len(y_true),
            round_ndigits,
        ))

    return pd.DataFrame(data)

In [93]:
# Larger three-class example (20 samples) to exercise the report with default settings.
y_true = [0, 1, 2, 2, 2, 1, 1, 1, 0, 2, 2, 1, 0, 2, 2, 1, 2, 2, 1, 1]
y_pred = [0, 1, 0, 0, 2, 1, 1, 1, 0, 2, 2, 1, 0, 1, 2, 1, 2, 2, 1, 1]

classification_report_with_ci(y_true, y_pred)

Unnamed: 0,Class,Precision,Recall,F1-Score,Precision CI,Recall CI,F1-Score CI,Support
0,Class 0,0.6,1.0,0.75,"(0.231, 0.882)","(0.439, 1.0)","(0.408, 1.092)",3
1,Class 1,0.889,1.0,0.941,"(0.565, 0.98)","(0.676, 1.0)","(0.796, 1.086)",8
2,Class 2,1.0,0.667,0.8,"(0.61, 1.0)","(0.354, 0.879)","(0.562, 1.038)",9
3,micro,0.85,0.85,0.85,"(0.694, 1.006)","(0.694, 1.006)","(0.694, 1.006)",20
4,macro,0.83,0.889,0.83,"(0.702, 0.958)","(0.775, 1.002)","(0.548, 1.113)",20


In [96]:
# Same report, rounded to 2 decimals, with readable class names.
# Note that this rebinds `numerical_to_label`.
numerical_to_label = {
    0: "Cherries",
    1: "Olives",
    2: "Tangerines"
}

classification_report_with_ci(y_true, y_pred, round_ndigits=2, numerical_to_label_map = numerical_to_label)

Unnamed: 0,Class,Precision,Recall,F1-Score,Precision CI,Recall CI,F1-Score CI,Support
0,Cherries,0.6,1.0,0.75,"(0.23, 0.88)","(0.44, 1.0)","(0.41, 1.09)",3
1,Olives,0.89,1.0,0.94,"(0.57, 0.98)","(0.68, 1.0)","(0.8, 1.09)",8
2,Tangerines,1.0,0.67,0.8,"(0.61, 1.0)","(0.35, 0.88)","(0.56, 1.04)",9
3,micro,0.85,0.85,0.85,"(0.69, 1.01)","(0.69, 1.01)","(0.69, 1.01)",20
4,macro,0.83,0.89,0.83,"(0.7, 0.96)","(0.78, 1.0)","(0.55, 1.11)",20


In [69]:
from confidenceinterval import classification_report_with_ci

In [70]:
# NOTE(review): the preceding cell imports classification_report_with_ci from
# confidenceinterval, which rebinds the name; on a top-to-bottom run this call
# uses the library version rather than the function defined in this notebook.
y_true, y_pred = [[2, 0, 0, 2, 1, 1, 1, 1,2,2],[2, 0, 1, 0, 1, 0, 1, 1,2,2]]

classification_report_with_ci(y_true, y_pred)

  df = pd.concat([df, new_row], ignore_index=True)


Unnamed: 0,Class,Precision,Recall,F1-Score,Precision CI,Recall CI,F1-Score CI,Support
0,Class 0,0.333,0.5,0.4,"(0.061, 0.792)","(0.095, 0.905)","(-0.148, 0.948)",2
1,Class 1,0.75,0.75,0.75,"(0.301, 0.954)","(0.301, 0.954)","(0.389, 1.111)",4
2,Class 2,1.0,0.75,0.857,"(0.439, 1.0)","(0.301, 0.954)","(0.547, 1.167)",4
3,micro,0.7,0.7,0.7,"(0.416, 0.984)","(0.416, 0.984)","(0.416, 0.984)",10
4,macro,0.694,0.667,0.669,"(0.545, 0.844)","(0.436, 0.897)","(0.24, 1.098)",10


In [68]:
from confidenceinterval.bootstrap import bootstrap_ci
# Bootstrap CI for balanced accuracy on the binary toy example.
# A seeded random generator makes the resampled interval reproducible across
# runs; you can also pass None as random_state.
y_true, y_pred = [[0, 0, 0, 0, 0 ,0, 1, 1, 1,1,1,0,1,1],[0, 0, 0, 0, 0 ,0, 1, 1, 1,1,1,1,0,0]]

BOOTSTRAP_SEED = 42  # fixed so the notebook gives the same interval on every run
random_generator = np.random.default_rng(BOOTSTRAP_SEED)
bootstrap_ci(y_true=y_true,
             y_pred=y_pred,
             metric=sklearn.metrics.balanced_accuracy_score,
             confidence_level=0.95,
             n_resamples=9999,
             method='bootstrap_bca',
             random_state=random_generator)

(0.7857142857142857, (0.475, 0.9444444444444444))