# Evaluation
## Preparation
### Imports

In [1]:
import pandas as pd, polars as pl
from src import data
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

### Loading The Data

In [2]:
# Root folders, given relative to the notebook's working directory
path_data = '../../../../data/datasets/04_preprocessed'
path_predictions = '../../../../data/predictions'

# Preprocessed datasets (the displayed frames are polars DataFrames)
datasets = data.dict_from_directory(path_data, type='polars')

# Predictions of the supervised models and of llama, loaded from sibling folders
preds_sml, preds_llama = (
    data.dict_from_directory(f'{path_predictions}/{model_dir}', type='polars')
    for model_dir in ('supervised_machine_learning', 'llama_3_1_8B')
)

In [3]:
# Inspect one preprocessed dataset to check its columns and dtypes
datasets['adhd']

index,include,title,abstract,first_author,year,journal,doi,pubmed_id,authors,pubmed_type,publication_types,mesh,webofscience_id,central_id,openalex_id,title_length,title_word_count,title_sentence_count,abstract_length,abstract_word_count,abstract_sentence_count,language_title,language_abstract
i64,bool,str,str,str,i64,str,str,i64,str,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64,str,str
0,false,"""The effectiveness of clonidine…","""To compare the analgesic effec…","""Reimer EJ""",1998,"""Can J Anaesth""","""https://doi.org/10.1007/bf0301…",10051933,"""Reimer EJ; Dunn GS; Montgomery…","""article""","""D016430: Clinical Trial; D0031…","""D000082: Acetaminophen; D00023…",,,"""https://openalex.org/W20826139…",79,10,1,1501,244,12,"""en""","""en"""
1,false,"""A Controlled Trial of Sustaine…","""Use of nicotinereplacement the…","""Jorenby DE""",1999,"""N Engl J Med""","""https://doi.org/10.1056/nejm19…",10053177,"""Jorenby DE; Leischow SJ; Nides…","""article""","""D016430: Clinical Trial; D0164…","""D000279: Administration, Cutan…",,,"""https://openalex.org/W23126093…",98,16,1,727,141,5,"""en""","""en"""
2,false,"""Effects of methylphenidate on …","""Three experiments were conduct…","""Berman T""",1999,"""J Abnorm Psychol""","""https://doi.org/10.1037/0021-8…",10066996,"""Berman T; Douglas VI; Barr RG""","""article""","""D016430: Clinical Trial; D0188…","""D000293: Adolescent; D000367: …",,,"""https://openalex.org/W20229048…",103,12,1,841,149,5,"""en""","""en"""
3,false,"""Spinal Clonidine Prolongs Labo…","""We sought to determine whether…","""D'Angelo R""",1999,"""Anesth Analg""","""https://doi.org/10.1097/000005…",10072008,"""D'Angelo R; Evans E; Dean LA; …","""article""","""D016430: Clinical Trial; D0164…","""D000328: Adult; D016362: Analg…",,,"""https://openalex.org/W20210973…",80,10,1,1322,224,8,"""en""","""en"""
4,false,"""LowDose Clozapine for the Trea…","""Druginduced psychosis is a dif…","""Parkinson Study Group""",1999,"""N Engl J Med""","""https://doi.org/10.1056/nejm19…",10072410,"""Parkinson Study Group""","""article""","""D016430: Clinical Trial; D0164…","""D000368: Aged; D000978: Antipa…",,,"""https://openalex.org/W42392839…",85,12,1,1994,336,12,"""en""","""en"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
845,false,"""Combination treatment with clo…","""Clozapine is a drug with many …","""Anghelescu I""",1998,"""Eur Neuropsychopharmacol""","""https://doi.org/10.1016/s0924-…",9928923,"""Anghelescu I; Szegedi A; Schle…","""article""","""D016430: Clinical Trial; D0188…","""D000328: Adult; D014150: Antip…",,,"""https://openalex.org/W20144890…",137,19,1,1703,269,12,"""en""","""en"""
846,false,"""Prolactin Levels and Adverse E…","""Hyperprolactinemia is a common…","""Kleinberg DL""",1999,"""J Clin Psychopharmacol""","""https://doi.org/10.1097/000047…",9934944,"""Kleinberg DL; Davis JM; de Cos…","""article""","""D016428: Journal Article""","""D000328: Adult; D014150: Antip…",,,"""https://openalex.org/W20158377…",72,10,1,1687,271,10,"""en""","""en"""
847,false,"""Dexmedetomidine Failed to Bloc…","""Orally administered clonidine …","""Fu W""",1999,"""Anesthesiology""","""https://doi.org/10.1097/000005…",9952147,"""Fu W; White PF""","""article""","""D016430: Clinical Trial; D0164…","""D000368: Aged; D018712: Analge…",,,"""https://openalex.org/W20335373…",93,11,1,1392,212,4,"""en""","""en"""
848,false,"""A DoubleBlind PlaceboControlle…","""A randomized doubleblind place…","""Wong YN""",1999,"""J Clin Pharmacol""","""https://doi.org/10.1177/009127…",9987698,"""Wong YN; Simcoe D; Hartman LN;…","""article""","""D016430: Clinical Trial; D0164…","""D000293: Adolescent; D000328: …",,,"""https://openalex.org/W21021969…",150,19,1,1711,295,14,"""en""","""en"""


In [4]:
# Prefix the llama output columns and attach the ground truth as `true_llama`.
# NOTE(review): the ground truth is attached by row position, which assumes
# that each llama prediction frame has the same row order as
# `datasets[subject]` -- TODO confirm (e.g. by comparing `pubmed_id`).
llama_column_names = {
    'include': 'llama',
    'reason': 'llama_reason'
}

preds_llama = {
    subject: predictions.rename(
        {
            old: new
            for old, new in llama_column_names.items()
            # this cell rebinds `preds_llama`; on a re-run the columns are
            # already renamed, so only rename what is still present
            if old in predictions.columns
        }
    ).with_columns(
        datasets[subject].select(
            pl.col('include').alias('true_llama')
        )
    )
    for subject, predictions in preds_llama.items()
}

In [5]:
# Check the renamed llama columns and the attached `true_llama` column
preds_llama

{'adhd': shape: (798, 6)
 ┌───────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┬────────────┐
 │ llama ┆ llama_reason        ┆ title               ┆ doi                 ┆ pubmed_id ┆ true_llama │
 │ ---   ┆ ---                 ┆ ---                 ┆ ---                 ┆ ---       ┆ ---        │
 │ bool  ┆ str                 ┆ str                 ┆ str                 ┆ i64       ┆ bool       │
 ╞═══════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╪════════════╡
 │ false ┆ The study           ┆ The effectiveness   ┆ https://doi.org/10. ┆ 10051933  ┆ false      │
 │       ┆ population does     ┆ of clonidine…       ┆ 1007/bf0301…        ┆           ┆            │
 │       ┆ not…                ┆                     ┆                     ┆           ┆            │
 │ false ┆ The study           ┆ A Controlled Trial  ┆ https://doi.org/10. ┆ 10053177  ┆ false      │
 │       ┆ population does     ┆ of Sustaine…        ┆ 10

Combine the predictions from SML and LLAMA into one dictionary.

**Caution:** The SML and llama predictions differ in size and row order, which is why each keeps its own ground-truth column (`true_sml` and `true_llama`).

In [6]:
# Place the SML and llama predictions side by side, one frame per subject.
joint_preds = {
    subject: pl.concat(
        [
            # the SML ground truth gets its own name to tell it apart from llama's
            sml_predictions.rename({'true': 'true_sml'}),
            # of the llama frame only the ground truth, label and reason are kept
            preds_llama[subject].select(
                ['true_llama', 'llama', 'llama_reason']
            ),
        ],
        how='horizontal'
    )
    for subject, sml_predictions in preds_sml.items()
}

In [7]:
# Spot-check one subject; the shorter SML columns end in nulls after the horizontal concat
joint_preds['oral_hypoglycemics']

true_sml,logistic_regression,random_forest,support_vector_machine,naive_bayes,true_llama,llama,llama_reason
bool,bool,bool,bool,bool,bool,bool,str
false,true,true,true,true,false,true,""" The study meets all the inclu…"
false,false,false,false,true,true,true,""" The study meets all the inclu…"
false,true,true,true,true,false,true,""" The study meets all the inclu…"
false,false,false,false,true,true,true,""" The study meets all the inclu…"
false,false,false,false,true,false,false,""" The study does not meet the i…"
…,…,…,…,…,…,…,…
,,,,,false,true,""" The study meets the inclusion…"
,,,,,false,true,""" The study meets the inclusion…"
,,,,,false,false,""" The study does not meet the i…"
,,,,,false,true,""" The study meets all the inclu…"


In [8]:
# Subset of the joint predictions restricted to the listed subjects
preds_small = {
    subject: frame
    for subject, frame in joint_preds.items()
    if subject in ['oral_hypoglycemics']
}

The ground truth labels were already incorporated into the llama prediction dataframes above (column `true_llama`).

### Helper Functions

In [9]:
from typing import Tuple
import numpy as np
from sklearn.metrics import confusion_matrix

def confusion_matrices(y_true, y_pred, normalize='pred') -> Tuple[np.ndarray, np.ndarray]:
    """Compute the absolute and a normalised confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned element-wise with ``y_true``.
    normalize : {'true', 'pred', 'all'}, default 'pred'
        Forwarded to ``sklearn.metrics.confusion_matrix`` for the second
        matrix. The default normalises over the predicted labels, so every
        column of the normalised matrix sums to 1.

    Returns
    -------
    tuple of np.ndarray
        ``(matrix, matrix_norm)``: the raw counts and the normalised matrix.
    """
    # raw counts
    matrix = confusion_matrix(
        y_true=y_true,
        y_pred=y_pred,
        normalize=None
    )

    # same matrix, normalised as requested
    matrix_norm = confusion_matrix(
        y_true=y_true,
        y_pred=y_pred,
        normalize=normalize
    )

    return matrix, matrix_norm

# Evaluate

In [10]:
from functools import partial

from sklearn.metrics import precision_score, recall_score, f1_score, fbeta_score, accuracy_score
from imblearn.metrics import specificity_score

# Names of the prediction columns that get evaluated
estimators = [
    'logistic_regression',
    'random_forest',
    'support_vector_machine',
    'naive_bayes',
    'llama'
]

# Metric name -> callable(y_true, y_pred).
# `fbeta_score` has no default for `beta`, so it is bound here; that way every
# entry can be called the same way. Passing `beta=2` again as a keyword
# argument is still accepted by the partial.
metrics = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
    'f2': partial(fbeta_score, beta=2),
    'specificity': specificity_score,
}

In [11]:
# Columns that identify a row of the scores frame
schema_context = dict(dataset=pl.String, estimator=pl.String)

# One struct column per metric holding the mean and both interval bounds
schema_metrics = {
    metric: pl.Struct(
        [
            pl.Field(f'{metric}_{suffix}', pl.Float64)
            for suffix in ('mean', 'lower', 'upper')
        ]
    )
    for metric in metrics
}

# Full schema: identifying columns first, then the metric structs
schema_scores = {**schema_context, **schema_metrics}

In [12]:
# Container for everything the evaluation produces.
results = {
    # empty frame with the final schema; score rows are stacked onto it later
    'scores': pl.DataFrame([], schema=schema_scores),
    # per dataset and estimator: report, confusion matrices, bootstrap samples
    'datasets': {
        subject: {
            estimator: {
                'classification_report': None,
                'matrices': {},
                'bootstraps': dict.fromkeys(metrics),
            }
            for estimator in estimators
        }
        for subject in preds_small  # CHANGE LATER!!!
    }
}

In [13]:
# The scores frame starts out empty; only its schema is set
results['scores']

dataset,estimator,accuracy,precision,recall,f1,f2,specificity
str,str,struct[3],struct[3],struct[3],struct[3],struct[3],struct[3]


In [14]:
def generate_bootstrap_samples(
        y_true, # true labels
        y_pred, # predicted labels
        n_resamples=1000, # number of bootstrap samples, 1000 is common
        random_state=None, # seed or np.random.Generator for reproducible resamples
    ):
    """Draw paired bootstrap resamples of labels and predictions.

    Every resample draws ``len(y_true)`` indices with replacement and applies
    the same indices to ``y_true`` and ``y_pred``, so the pairs stay aligned.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted labels; must have the same length as ``y_true``.
    n_resamples : int, default 1000
        Number of bootstrap samples to draw.
    random_state : None, int or np.random.Generator, default None
        ``None`` keeps using NumPy's global random state. Any other value is
        passed to ``np.random.default_rng`` to make the resamples reproducible.

    Returns
    -------
    list of dict
        One ``{'y_true': ..., 'y_pred': ...}`` dict of arrays per resample.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` differ in length.
    """
    # accept lists and other sequences, not only numpy arrays
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    n_observations = len(y_true)
    if len(y_pred) != n_observations:
        raise ValueError(
            f'y_true and y_pred must have the same length, '
            f'got {n_observations} and {len(y_pred)}'
        )

    if random_state is None:
        draw_indices = np.random.choice  # global state, honours np.random.seed
    else:
        draw_indices = np.random.default_rng(random_state).choice

    # save the bootstrapped samples here
    samples = []

    # create n_resamples bootstrap samples
    for _ in range(n_resamples):
        indices = draw_indices(n_observations, n_observations, replace=True)
        samples.append(
            {
                'y_true': y_true[indices],
                'y_pred': y_pred[indices]
            }
        )

    return samples

In [15]:
def metric_with_ci(
        bootstraps,
        metric, # tuple: name and function
        round_ndigits=2,
        confidence_level=0.95
):
    """Summarise a metric over bootstrap samples as mean and percentile interval.

    Parameters
    ----------
    bootstraps : iterable of dict
        Samples with the keys ``'y_true'`` and ``'y_pred'``.
    metric : tuple
        ``(name, function)``; the function is called as
        ``function(y_true, y_pred)``. For the name ``'f2'`` it is additionally
        given ``beta=2``.
    round_ndigits : int, default 2
        Number of digits the three results are rounded to.
    confidence_level : float, default 0.95
        Width of the percentile interval.

    Returns
    -------
    tuple
        ``(mean, lower, upper)`` of the per-sample scores, rounded.
    """
    metric_name, metric_function = metric[0], metric[1]

    # the f2-score is the only metric that needs an extra argument
    extra_kwargs = {'beta': 2} if metric_name == 'f2' else {}

    # one score per bootstrap sample, as numpy array for the calculations
    scores = np.array([
        metric_function(sample['y_true'], sample['y_pred'], **extra_kwargs)
        for sample in bootstraps
    ])

    # percentile ranks that cut off equal tails on both sides
    lower_rank = (1 - confidence_level) / 2 * 100
    upper_rank = (1 + confidence_level) / 2 * 100

    summary = (
        np.mean(scores),
        np.percentile(scores, lower_rank),
        np.percentile(scores, upper_rank),
    )

    mean, lower, upper = (round(value, round_ndigits) for value in summary)

    return mean, lower, upper

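**TODO:** Integrate the llama predictions into the loop. (The loop below already scores the `llama` column against `true_llama`; confirm this and then remove the note.)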

In [16]:
from sklearn.metrics import classification_report

# Evaluate every estimator of every dataset: classification report, confusion
# matrices and bootstrapped metrics with confidence intervals.
for subject, dataset in tqdm(
    iterable=preds_small.items(),  # iterate over the datasets
    desc='Datasets',
    total=len(preds_small),
    leave=True
):

    # keep only the prediction columns, one per estimator
    predictions_only = dataset.select(
        pl.exclude(
            ['true_sml', 'true_llama', 'llama_reason']
        )
    )

    for estimator in tqdm(
        iterable=predictions_only.iter_columns(),
        desc='Estimators',
        total=len(predictions_only.columns),
        leave=False
    ):

        model = estimator.name  # to access the results dictionary
        model_results = results['datasets'][subject][model]

        # llama and the supervised models each have their own ground truth
        truth_column = 'true_llama' if model == 'llama' else 'true_sml'

        # The horizontal concat leaves nulls where one frame is shorter.
        # Labels and predictions are taken from the same filtered rows so that
        # they stay aligned and equally long for every estimator.
        scored_rows = dataset.filter(pl.col(model).is_not_null())
        y_true = scored_rows[truth_column].to_numpy()
        y_pred = scored_rows[model].to_numpy()

        # add the classification report
        model_results['classification_report'] = classification_report(y_true, y_pred)

        # add the confusion matrices
        matrix, matrix_norm = confusion_matrices(y_true, y_pred)
        model_results['matrices']['absolute'] = matrix
        model_results['matrices']['normed'] = matrix_norm

        # Draw the bootstrap samples once per estimator: every metric is then
        # computed on the same resamples, and the resampling is not repeated
        # for each metric.
        boot = generate_bootstrap_samples(y_true, y_pred)

        estimator_scores = {}

        # calculate metrics with confidence intervals by bootstrapping
        for metric, function in metrics.items():

            # add the bootstrap samples to the results
            model_results['bootstraps'][metric] = boot

            # calculate the metric with confidence intervals
            mean, lower, upper = metric_with_ci(boot, (metric, function))

            # struct value for this metric's column of the scores frame
            estimator_scores[metric] = {
                f'{metric}_mean': float(mean),
                f'{metric}_lower': float(lower),
                f'{metric}_upper': float(upper),
            }

        # build the one-row frame for this dataset/estimator pair
        scores = pl.DataFrame(
            {
                metric: [values]
                for metric, values in estimator_scores.items()
            }
        ).with_columns(
            pl.lit(subject, pl.String).alias('dataset'),
            pl.lit(model, pl.String).alias('estimator')
        )

        # reorder the dataframe for stacking
        column_order = ['dataset', 'estimator'] + list(metrics.keys())
        scores = scores.select(column_order)

        # Replace an existing row of this dataset/estimator pair instead of
        # appending a duplicate when the cell is run again.
        previous_scores = results['scores'].filter(
            ~((pl.col('dataset') == subject) & (pl.col('estimator') == model))
        )
        results['scores'] = previous_scores.vstack(scores)

Datasets:   0%|          | 0/1 [00:00<?, ?it/s]

Estimators:   0%|          | 0/5 [00:00<?, ?it/s]

In [18]:
# One row per dataset/estimator; every metric struct holds mean, lower and upper bound
results['scores']

dataset,estimator,accuracy,precision,recall,f1,f2,specificity
str,str,struct[3],struct[3],struct[3],struct[3],struct[3],struct[3]
"""oral_hypoglycemics""","""logistic_regression""","{0.69,0.62,0.77}","{0.43,0.3,0.57}","{0.58,0.41,0.74}","{0.49,0.35,0.61}","{0.54,0.39,0.68}","{0.72,0.64,0.81}"
"""oral_hypoglycemics""","""random_forest""","{0.6,0.52,0.68}","{0.35,0.24,0.47}","{0.64,0.49,0.79}","{0.45,0.33,0.57}","{0.55,0.42,0.67}","{0.59,0.5,0.68}"
"""oral_hypoglycemics""","""support_vector_machine""","{0.73,0.66,0.8}","{0.48,0.32,0.63}","{0.53,0.37,0.69}","{0.5,0.36,0.64}","{0.52,0.37,0.66}","{0.8,0.72,0.88}"
"""oral_hypoglycemics""","""naive_bayes""","{0.38,0.3,0.46}","{0.3,0.22,0.38}","{1.0,1.0,1.0}","{0.46,0.36,0.55}","{0.68,0.58,0.76}","{0.17,0.09,0.23}"
"""oral_hypoglycemics""","""llama""","{0.45,0.4,0.49}","{0.31,0.26,0.36}","{0.89,0.83,0.94}","{0.46,0.4,0.51}","{0.64,0.59,0.69}","{0.29,0.24,0.34}"


In [21]:
# Confusion matrix of llama normalised over the predictions (normalize='pred'),
# i.e. every column sums to 1. Note that `report` holds a matrix here, not a
# classification report.
report = results['datasets']['oral_hypoglycemics']['llama']['matrices']['normed']
print(report)

[[0.88288288 0.69164265]
 [0.11711712 0.30835735]]
