In [1]:
import pandas as pd
from thefuzz import process

In [2]:
# Location of the CSV export of the guideline-recommendation extraction sheet.
# NOTE(review): absolute, machine-specific path — the notebook cannot run on another machine without editing it.
data_path = '/Users/jk1/Downloads/NI9RV3E7_extraction.xlsx - Sheet1.csv'

In [3]:
# Load the extraction table; the columns used further down are `recommendation`, `class` and `LOE`.
df = pd.read_csv(data_path)

In [4]:
# Display the loaded table (pandas elides the middle rows in the rendered output).
df

Unnamed: 0,Key,Title,DOI,recommendation,class,LOE
0,NI9RV3E7,2015 ESC/ERS Guidelines for the diagnosis and ...,10.1183/13993003.01032-2015,In patients with low echocardiographic probabi...,IIa,C
1,,,,In patients with low echocardiographic probabi...,IIa,C
2,,,,In patients with intermediate echocardiographi...,IIa,C
3,,,,In patients with intermediate echocardiographi...,IIb,C
4,,,,In patients with intermediate echocardiographi...,IIa,B
...,...,...,...,...,...,...
212,,,,Patients with PAH should be referred to expert...,I,C
213,,,,In patients with PAH with inadequate clinical ...,I,A
214,,,,"Initial combination therapy, including intrave...",I,B
215,,,,PAH-approved therapies are not recommended in ...,III,C


In [5]:
# Fuzzy-matching sanity check: query the recommendation column with the full text of row 0.
# Because a Series (not a plain list/array) is passed as choices, each hit in the output is a
# (text, score, index) triple.
process.extractBests('In patients with low echocardiographic probability of PH without risk factors for PAH or CTEPH, alternative diagnosis should be considered.', df.recommendation)

[('In patients with low echocardiographic probability of PH without risk factors for PAH or CTEPH, alternative diagnosis should be considered.',
  100,
  0),
 ('In patients with intermediate echocardiographic probability of PH without risk factors for PAH or CTEPH, alternative diagnosis and echo follow-up should be considered.',
  94,
  2),
 ('In patients with low echocardiographic probability of PH with risk factors for PAH or CTEPH, echo follow-up should be considered.',
  89,
  1),
 ('Vasoreactivity testing is indicated only in expert centres', 86, 15),
 ('Nitric oxide is recommended for performing vasoreactivity testing', 86, 18)]

In [6]:
import spacy
# Load the spaCy pipeline once at cell level; get_similarity_score reads it as a global.
nlp = spacy.load("en_core_web_lg")

def get_similarity_score(text1, text2):
    """
    Score how alike two texts are according to the loaded spaCy pipeline.

    Each text is run through the module-level ``nlp`` pipeline and the two
    resulting documents are compared with their ``similarity`` method.

    Parameters
    ----------
    text1 : str
        Text on the left-hand side of the comparison.
    text2 : str
        Text on the right-hand side of the comparison.

    Returns
    -------
    float
        The value reported by ``similarity`` for the two parsed documents.
    """
    # Parse left-to-right so the pipeline sees the texts in the same order as the arguments.
    parsed_first, parsed_second = nlp(text1), nlp(text2)
    return parsed_first.similarity(parsed_second)

In [13]:
from similarity_evaluation.similarity_models import SimilarityModel


def score_recommendation(recommendation_text:str, recommendation_grade:str, recommendation_level:str, recommendation_gt_df:pd.DataFrame,
                         semantic_model:"SimilarityModel", semantic_threshold:float=0.6,
                         interactive:bool=True, verbose:bool=True, fuzzy_threshold:int=95) -> tuple:
    """
    Match one extracted recommendation against the ground truth and check its class and level of evidence.

    Matching stages are tried in order and the first one that succeeds wins:

    1. exact text match (case-insensitive), reported with score 100;
    2. fuzzy match via ``process.extractBests`` on lower-cased text, accepted when the best score is
       at least ``fuzzy_threshold``;
    3. semantic match: ``semantic_model.compute_similarity`` is called once per ground-truth row and the
       best row is accepted when its similarity is at least ``semantic_threshold``;
    4. when ``interactive`` is True, the user is asked to pick one of the fuzzy candidates (or "None").

    Parameters
    ----------
    recommendation_text : str
        The recommendation text to be scored. A missing / non-string value is reported as "no match".
    recommendation_grade : str
        The recommendation class to compare with the ground-truth ``class`` column (case-insensitive).
    recommendation_level : str
        The level of evidence to compare with the ground-truth ``LOE`` column (case-insensitive).
    recommendation_gt_df : pd.DataFrame
        Ground truth with the columns ``recommendation``, ``class`` and ``LOE``. Rows without a
        recommendation text are ignored.
    semantic_model : SimilarityModel
        Object exposing ``compute_similarity(text_a, text_b)``. Pass ``None`` to skip the semantic stage.
    semantic_threshold : float
        Minimum similarity for a semantic match to be accepted.
    interactive : bool
        If True and no automatic match is found, prompt the user to choose among the fuzzy candidates.
    verbose : bool
        If True, print the fuzzy / semantic match that was found, or that no match was found.
    fuzzy_threshold : int
        Minimum fuzzy score for the best fuzzy candidate to be accepted automatically.

    Returns
    -------
    tuple
        ``(grade_eval, level_eval, matched_text, score, manual_validation)``.

        On a match, ``grade_eval`` / ``level_eval`` are booleans, ``matched_text`` is the ground-truth
        wording of the matched row, and ``score`` is 100 (exact), the fuzzy score, or the semantic
        similarity multiplied by 100. Without a match the first four items are all ``-1``.
        ``manual_validation`` is True only when the user made a valid choice at the prompt
        (including the explicit "None" option).
    """

    def _labels_agree(extracted, reference) -> bool:
        # Case-insensitive comparison. Missing values (None / NaN, e.g. empty CSV cells) have no
        # ``.lower()``, so they are handled explicitly: a missing label only agrees with a missing label.
        if pd.isna(extracted) or pd.isna(reference):
            return bool(pd.isna(extracted) and pd.isna(reference))
        return str(extracted).lower() == str(reference).lower()

    def _matched(gt_row, score, manual_validation:bool) -> tuple:
        # Single place where a ground-truth row is turned into the result tuple.
        return (_labels_agree(recommendation_grade, gt_row['class']),
                _labels_agree(recommendation_level, gt_row['LOE']),
                gt_row['recommendation'], score, manual_validation)

    def _unmatched(manual_validation:bool) -> tuple:
        return (-1, -1, -1, -1, manual_validation)

    # Rows without text cannot be matched and would break the string operations below.
    gt = recommendation_gt_df.dropna(subset=['recommendation'])

    # Nothing to compare (missing query text or empty ground truth): report "no match" instead of
    # failing later on ``matches[0]`` / ``max()`` of an empty sequence. There is nothing to offer at
    # the prompt either, so this applies in interactive mode too.
    if not isinstance(recommendation_text, str) or gt.empty:
        if verbose:
            print(f'No match found for "{recommendation_text}"')
        return _unmatched(False)

    gt_lower = gt['recommendation'].astype(str).str.lower()
    query_lower = recommendation_text.lower()

    def _row_for_lowered(lowered_text):
        # Map a lower-cased candidate back to the first ground-truth row carrying that text.
        return gt[(gt_lower == lowered_text).to_numpy()].iloc[0]

    # 1. Exact match (case-insensitive)
    exact_mask = (gt_lower == query_lower).to_numpy()
    if exact_mask.any():
        return _matched(gt[exact_mask].iloc[0], 100, False)

    # 2. Fuzzy match; candidates are (lower-cased text, score) pairs, best first
    candidates = process.extractBests(query_lower, gt_lower.to_numpy(), limit=10)
    if candidates and candidates[0][1] >= fuzzy_threshold:
        best_score = candidates[0][1]
        gt_row = _row_for_lowered(candidates[0][0])
        if verbose:
            matched_text = gt_row['recommendation']
            print(f'Fuzzy match found for "{recommendation_text}" with score {best_score}: "{matched_text}"')
        return _matched(gt_row, best_score, False)

    # 3. Semantic match: one similarity per ground-truth row, first best row wins
    if semantic_model is not None:
        similarities = [semantic_model.compute_similarity(recommendation_text, gt_text)
                        for gt_text in gt['recommendation']]
        best_position = max(range(len(similarities)), key=similarities.__getitem__)
        best_similarity = similarities[best_position]
        if best_similarity >= semantic_threshold:
            gt_row = gt.iloc[best_position]
            if verbose:
                matched_text = gt_row['recommendation']
                print(f'Semantic match found for "{recommendation_text}" with score {best_similarity:.3f}: "{matched_text}"')
            return _matched(gt_row, best_similarity * 100, False)

    if not interactive:
        if verbose:
            print(f'No match found for "{recommendation_text}"')
        return _unmatched(False)

    # 4. Manual choice among the fuzzy candidates
    options = [(_row_for_lowered(candidate[0]), candidate[1]) for candidate in candidates]
    menu_lines = ['No exact match found, please choose from the following options:',
                  f'- "{recommendation_text} -"']
    for position, (gt_row, score) in enumerate(options):
        option_text = gt_row['recommendation']
        menu_lines.append(f'{position}: {option_text} - {score}')
    menu_lines.append(f'{len(options)}: None')
    menu = '\n'.join(menu_lines)

    # The menu is both printed and repeated in the prompt, as in the original implementation
    # (presumably because some front-ends truncate long input prompts).
    print(menu)
    user_choice = input(f'{menu}\nPlease enter the number of your choice: ')

    try:
        choice = int(user_choice.strip())
    except ValueError:
        choice = None

    if choice is not None and 0 <= choice < len(options):
        chosen_row, chosen_score = options[choice]
        return _matched(chosen_row, chosen_score, True)

    # The entry after the last candidate is the explicit "None" option
    if choice is not None and choice == len(options):
        return _unmatched(True)

    print('Invalid choice, no match retained')
    return _unmatched(False)



'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [8]:
# Case-insensitive exact lookup of one recommendation — the same row filter score_recommendation
# applies once it has a matched text.
df[df.recommendation.str.lower() == ('Nitric oxide is recommended for performing vasoreactivity testing').lower()]

Unnamed: 0,Key,Title,DOI,recommendation,class,LOE
18,,,,Nitric oxide is recommended for performing vas...,I,C


In [14]:
from similarity_evaluation.similarity_models import SentenceTransformerSimilarityModel

# Build the similarity model and smoke-test the scorer with a shortened recommendation;
# the output below shows it being resolved by the semantic fallback.
model = SentenceTransformerSimilarityModel("neuml/pubmedbert-base-embeddings")
score_recommendation('Nitric oxide is recommended for testing', 'I', 'C', df, model)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: neuml/pubmedbert-base-embeddings


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic match found for "Nitric oxide is recommended for testing" with score 0.873: "Nitric oxide is recommended for performing vasoreactivity testing"


(True,
 True,
 'Nitric oxide is recommended for performing vasoreactivity testing',
 87.34111189842224,
 False)

In [15]:
def evaluate_guideline_extraction(df:pd.DataFrame, gt_df:pd.DataFrame, interactive:bool=True, verbose:bool=True,
                                  semantic_model:"SimilarityModel | None"=None) -> tuple:
    """
    Evaluate an extracted guideline recommendation dataframe against a ground truth dataframe.

    Every row of ``df`` is passed to ``score_recommendation``; the per-row results are collected and
    summarised as percentages.

    Parameters
    ----------
    df : pd.DataFrame
        Extracted recommendations with the columns ``recommendation``, ``class`` and ``LOE``.
    gt_df : pd.DataFrame
        Ground truth with the same three columns.
    interactive : bool
        Forwarded to ``score_recommendation``: prompt the user when no automatic match is found.
    verbose : bool
        Forwarded to ``score_recommendation``: print the matches that were found.
    semantic_model : SimilarityModel, optional
        Similarity model forwarded to ``score_recommendation``. When omitted, a
        ``SentenceTransformerSimilarityModel("neuml/pubmedbert-base-embeddings")`` is created for this
        call; pass an existing model to avoid reloading it for every evaluation.

    Returns
    -------
    tuple
        ``(accuracy_recommendation, accuracy_grade, accuracy_level, n_missing_recommendations,
        all_matches_df, missing_recommendations_df)``

        - ``accuracy_recommendation``: percentage of extracted rows that were matched to a ground-truth row.
        - ``accuracy_grade`` / ``accuracy_level``: percentage of the *matched* rows whose class / level
          agrees with the ground truth.
        - ``n_missing_recommendations``: number of rows in ``missing_recommendations_df``.
        - ``all_matches_df``: one row per extracted recommendation with the scoring result.
        - ``missing_recommendations_df``: ground-truth rows that no extracted recommendation was matched to.

        A percentage whose denominator is zero (empty ``df`` or no matched row) is reported as ``0.0``.
    """
    if semantic_model is None:
        semantic_model = SentenceTransformerSimilarityModel("neuml/pubmedbert-base-embeddings")

    def _percentage(numerator:int, denominator:int) -> float:
        # Guard against ZeroDivisionError when nothing was extracted or nothing was matched
        return numerator / denominator * 100 if denominator else 0.0

    n_recommendations = len(df)
    n_matched = 0
    n_correct_grades = 0
    n_correct_levels = 0
    all_matches = []

    # Score every extracted recommendation
    for _, row in df.iterrows():
        recommendation_text = row['recommendation']
        recommendation_grade = row['class']
        recommendation_level = row['LOE']

        grade_eval, level_eval, matched_text, score, manual_validation = score_recommendation(
            recommendation_text, recommendation_grade, recommendation_level, gt_df,
            semantic_model=semantic_model, interactive=interactive, verbose=verbose)

        # score_recommendation reports -1 in every slot when no match was retained,
        # so grade / level are only counted for matched rows
        if matched_text != -1:
            n_matched += 1
            n_correct_grades += int(bool(grade_eval))
            n_correct_levels += int(bool(level_eval))

        all_matches.append((recommendation_text, recommendation_grade, recommendation_level, matched_text,
                            grade_eval, level_eval, score, manual_validation))

    accuracy_recommendation = _percentage(n_matched, n_recommendations)
    accuracy_grade = _percentage(n_correct_grades, n_matched)
    accuracy_level = _percentage(n_correct_levels, n_matched)

    all_matches_df = pd.DataFrame(all_matches, columns=['recommendation_text', 'recommendation_grade', 'recommendation_level', 'matched_text', 'grade_eval', 'level_eval', 'match_score', 'manual_validation'])

    # Missing recommendations: ground-truth rows that no extracted row was matched to. The comparison
    # uses the matched texts (not the raw extracted texts) so rows found through fuzzy, semantic or
    # manual matching are not reported as missing; it is lower-cased because the matcher may report
    # the matched text in lower case.
    matched_lower = {text.lower() for text in all_matches_df['matched_text'] if isinstance(text, str)}
    gt_lower = gt_df['recommendation'].astype(str).str.lower()
    missing_recommendations_df = gt_df[(~gt_lower.isin(matched_lower)).to_numpy()]
    n_missing_recommendations = len(missing_recommendations_df)

    return (accuracy_recommendation, accuracy_grade, accuracy_level, n_missing_recommendations, all_matches_df, missing_recommendations_df)


In [16]:
# Sanity check: evaluate the table against itself, without prompts; every row should match.
accuracy_recommendation, accuracy_grade, accuracy_level, n_missing_recommendations, all_matches_df, missing_recommendations_df = evaluate_guideline_extraction(df, df, interactive=False, verbose=True)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: neuml/pubmedbert-base-embeddings


In [17]:
# (% of extracted rows matched, % of matched rows with correct class, % of matched rows with
#  correct LOE, number of ground-truth rows reported missing)
accuracy_recommendation, accuracy_grade, accuracy_level, n_missing_recommendations

(100.0, 100.0, 100.0, 0)