# Evaluation
## Preparation
### Imports

In [1]:
import pandas as pd, polars as pl
from src import data
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report

from typing import Tuple
import numpy as np
from sklearn.metrics import confusion_matrix

from sklearn.metrics import precision_score, recall_score, f1_score, fbeta_score, accuracy_score
from imblearn.metrics import specificity_score

### Loading The Data

In [2]:
# Root folders of the preprocessed datasets and of the stored predictions
path_data = '../../../data/datasets/04_preprocessed'
path_predictions = '../../../data/predictions'

# Load each folder through the project helper, asking for polars objects.
# NOTE(review): presumably returns a dict keyed by subject/dataset name --
# the later cells rely on `.items()` and on `datasets[subject]`.
datasets = data.dict_from_directory(path_data, type='polars')
preds_sml = data.dict_from_directory(f'{path_predictions}/supervised_machine_learning', type='polars')
preds_llama = data.dict_from_directory(f'{path_predictions}/llama_3_1_8B', type='polars')

Add the true labels to the llama results and rename the llama columns so that they can be told apart from the SML predictions after merging:

In [3]:
# For every subject: give the llama output columns llama-specific names and
# append the dataset's 'include' column as the ground truth ('true_llama').
preds_llama = {
    name: frame.rename({'include': 'llama', 'reason': 'llama_reason'}).with_columns(
        datasets[name].select(pl.col('include').alias('true_llama'))
    )
    for name, frame in preds_llama.items()
}

Combine the predictions from SML and LLAMA into one dictionary.

**Caution: The predictions differ in size and order, which is why sml and llama have their own ground truth column**

In [4]:
# Per subject, place the SML predictions (ground truth renamed to 'true_sml')
# side by side with the llama ground truth, prediction and reason columns.
joint_preds = {
    name: pl.concat(
        [
            sml_frame.rename({'true': 'true_sml'}),
            preds_llama[name].select(['true_llama', 'llama', 'llama_reason']),
        ],
        how='horizontal',
    )
    for name, sml_frame in preds_sml.items()
}

### Helper Functions

In [5]:
def confusion_matrices(y_true, y_pred) -> Tuple[np.ndarray, np.ndarray]:
    """Compute the confusion matrix twice: as raw counts and normalized.

    Both matrices come from sklearn's ``confusion_matrix``; the second one is
    requested with ``normalize='true'``.

    Returns
    -------
    tuple
        ``(matrix, matrix_norm)`` -- the unnormalized matrix followed by the
        normalized one.
    """
    absolute, normalized = (
        confusion_matrix(y_true=y_true, y_pred=y_pred, normalize=mode)
        for mode in (None, 'true')
    )
    return absolute, normalized

In [6]:
def generate_bootstrap_samples(
        y_true, # true labels
        y_pred, # predicted labels
        n_resamples=1000, # number of bootstrap samples, 1000 is common
        random_state=None, # optional seed for reproducible resampling
    ):
    """Draw paired bootstrap resamples of the true and predicted labels.

    Each resample draws ``len(y_true)`` positions with replacement and applies
    the same positions to both inputs, so every true label stays paired with
    its prediction.

    Parameters
    ----------
    y_true, y_pred : array-like
        Labels of equal length. They are converted with ``np.asarray`` so that
        lists work and indexing is always positional.
    n_resamples : int
        Number of bootstrap samples to generate.
    random_state : int or None
        ``None`` (default) draws from NumPy's global random state, exactly as
        before, so ``np.random.seed`` still controls the result. Any other
        value seeds a dedicated ``np.random.default_rng(random_state)``.

    Returns
    -------
    list of dict
        ``n_resamples`` dictionaries with the keys ``'y_true'`` and
        ``'y_pred'`` holding the resampled arrays.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` differ in length.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    n_observations = len(y_true)
    if len(y_pred) != n_observations:
        raise ValueError(
            f'y_true and y_pred must have the same length, '
            f'got {n_observations} and {len(y_pred)}'
        )

    # the global module and a Generator both expose a compatible `choice`
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    # save the bootstrapped samples here
    samples = []

    # create n_resamples bootstrap samples
    for _ in range(n_resamples):
        indices = rng.choice(n_observations, size=n_observations, replace=True)
        samples.append(
            {
                'y_true': y_true[indices],
                'y_pred': y_pred[indices]
            }
        )

    return samples

In [7]:
def metric_with_ci(
        bootstraps,
        metric, # tuple: name and function
        round_ndigits=2,
        confidence_level=0.95
):
    """Score every bootstrap sample with one metric and summarise the result.

    Parameters
    ----------
    bootstraps : iterable of dict
        Bootstrap samples; each item provides the labels under the keys
        ``'y_true'`` and ``'y_pred'``.
    metric : tuple
        ``(name, function)``. The function is called as
        ``function(y_true, y_pred)``; when the name is ``'f2'`` it is also
        passed ``beta=2``.
    round_ndigits : int
        Number of digits the mean and the interval bounds are rounded to.
    confidence_level : float
        Central share of the score distribution covered by the interval,
        e.g. ``0.95`` uses the 2.5th and 97.5th percentiles.

    Returns
    -------
    tuple
        ``(mean, lower, upper, scores)`` -- the rounded mean of the bootstrap
        scores, the rounded percentile bounds and the unrounded scores as a
        numpy array.

    Raises
    ------
    ValueError
        If ``bootstraps`` contains no samples.
    """
    metric_name, metric_function = metric[0], metric[1]

    # the f2-score is fbeta with beta=2, every other metric needs no extras
    extra_arguments = {'beta': 2} if metric_name == 'f2' else {}

    # calculate the score for each bootstrap sample
    scores = [
        metric_function(sample['y_true'], sample['y_pred'], **extra_arguments)
        for sample in bootstraps
    ]

    if not scores:
        raise ValueError('bootstraps must contain at least one sample')

    scores = np.array(scores) # transform to numpy for calculations

    # calculate the mean and lower and upper bounds
    mean = np.mean(scores)
    lower, upper = np.percentile(
        scores,
        [
            (1 - confidence_level) / 2 * 100,
            (1 + confidence_level) / 2 * 100,
        ]
    )

    # round the results
    mean = round(mean, round_ndigits)
    lower = round(lower, round_ndigits)
    upper = round(upper, round_ndigits)

    return mean, lower, upper, scores

# Evaluate

## Define Structures
### Estimators

In [8]:
# Keys of the per-estimator entries in the results dictionary; the evaluation
# loop looks entries up by prediction column name, so these must match.
estimators = [
    'logistic_regression', 'random_forest',
    'support_vector_machine', 'naive_bayes',
    'llama',
]

### Metric Functions

In [9]:
# Metric name -> scoring function. 'f2' maps to fbeta_score, which
# metric_with_ci calls with beta=2.
metrics = dict(
    accuracy=accuracy_score,
    precision=precision_score,
    recall=recall_score,
    f1=f1_score,
    f2=fbeta_score,
    specificity=specificity_score,
)

### Scores Table
The structure for the large table that will save scores and confidence intervals across all datasets and estimators:

In [10]:
# identifying columns: which dataset and which estimator a row belongs to
schema_context = {'dataset': pl.String, 'estimator': pl.String}

# one struct column per metric holding the mean and both interval bounds
schema_metrics = {
    name: pl.Struct([
        pl.Field(f'{name}_mean', pl.Float64),
        pl.Field(f'{name}_lower', pl.Float64),
        pl.Field(f'{name}_upper', pl.Float64),
    ])
    for name in metrics
}

# full schema: context columns first, then the metric columns
schema_scores = {**schema_context, **schema_metrics}

### Results Dictionary
Every piece of the results will be saved within this dictionary for easy access:

In [11]:
results = {
    # empty table with the scores schema; one row per dataset/estimator is added later
    'scores': pl.DataFrame([], schema=schema_scores),
    # placeholders for every dataset/estimator combination; each combination
    # gets its own fresh containers
    'datasets': {
        dataset_name: {
            estimator_name: {
                'classification_report': None,
                'matrix': {},
                'bootstrap': {
                    'samples': None,
                    'scores': dict.fromkeys(metrics),
                },
            }
            for estimator_name in estimators
        }
        for dataset_name in joint_preds
    },
}

# Calculate Results 
Calculate the result metrics and save them, together with all accompanying data such as classification reports, confusion matrices, and bootstrap samples, in one results dictionary:

In [12]:
# Seed NumPy's global random state so the bootstrap resampling below (and with
# it every confidence interval) is the same on each Restart & Run All.
BOOTSTRAP_SEED = 42
np.random.seed(BOOTSTRAP_SEED)

# Start from an empty scores table with the same schema, so re-running this
# cell replaces the rows instead of stacking duplicates onto the previous run.
results['scores'] = results['scores'].clear()

# column order of the scores table, needed for stacking
column_order = ['dataset', 'estimator'] + list(metrics.keys())

for subject, dataset in tqdm(
    iterable=joint_preds.items(),  # iterate over the datasets
    desc='Datasets',
    total=len(joint_preds),
    leave=True
):

    # every remaining column holds the predictions of one estimator
    predictions_only = dataset.select(
        pl.exclude(
            ['true_sml', 'true_llama', 'llama_reason']
        )
    )

    for estimator in tqdm(
        iterable=predictions_only.iter_columns(),
        desc='Estimators',
        total=len(predictions_only.columns),
        leave=False
    ):

        model = estimator.name  # to access the results dictionary
        model_results = results['datasets'][subject][model]

        # llama and the sml models each have their own ground truth column
        true_column = 'true_llama' if model == 'llama' else 'true_sml'

        # Keep only the rows this estimator has a prediction for (the join
        # leaves empty rows) and take labels and predictions from the same
        # rows, so both arrays always stay aligned and equally long.
        predicted_rows = dataset.filter(
            pl.col(model).is_not_null()
        )
        y_true = predicted_rows[true_column].to_numpy()
        y_pred = predicted_rows[model].to_numpy()

        # add the classification report
        report = classification_report(y_true, y_pred)
        model_results['classification_report'] = report

        # add the confusion matrices
        matrix, matrix_norm = confusion_matrices(y_true, y_pred)
        model_results['matrix']['absolute'] = matrix
        model_results['matrix']['norm'] = matrix_norm

        estimator_scores = {}

        # calculate all metrics on the same set of bootstrap samples
        bootstraps = generate_bootstrap_samples(y_true, y_pred)
        # add the bootstraps to the results
        model_results['bootstrap']['samples'] = bootstraps

        # calculate metrics with confidence intervals by bootstrapping
        for metric, function in metrics.items():

            # calculate the metric with confidence intervals
            mean, lower, upper, bootstrap_scores = metric_with_ci(bootstraps, (metric, function))
            model_results['bootstrap']['scores'][metric] = bootstrap_scores

            # Create a Series with the struct values; the struct dtype is the
            # one already declared for this metric in the scores schema
            struct_series = pl.Series(
                name=metric,
                values=[(mean, lower, upper)],
                dtype=schema_metrics[metric]
            )

            # add the results struct to the list of scores
            estimator_scores[metric] = struct_series[0]

        # add the results to the scores dataframe
        scores = pl.DataFrame({
            est: [vals]
            for est, vals in estimator_scores.items()
        })

        scores = scores.with_columns(
            pl.lit(subject, pl.String).alias('dataset'),
            pl.lit(model, pl.String).alias('estimator')
        )

        # reorder the dataframe for stacking
        scores = scores.select(column_order)

        results['scores'] = results['scores'].vstack(scores)

Datasets:   0%|          | 0/6 [00:00<?, ?it/s]

Estimators:   0%|          | 0/5 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Estimators:   0%|          | 0/5 [00:00<?, ?it/s]

Estimators:   0%|          | 0/5 [00:00<?, ?it/s]

Estimators:   0%|          | 0/5 [00:00<?, ?it/s]

Estimators:   0%|          | 0/5 [00:00<?, ?it/s]

Estimators:   0%|          | 0/5 [00:00<?, ?it/s]

# Export

In [13]:
import pickle

# Serialize the complete results dictionary next to the notebook
with open('./results.pkl', 'wb') as file:
    pickle.dump(results, file, protocol=pickle.HIGHEST_PROTOCOL)