---

In [None]:
# pip install mlflow dagshub

In [None]:
# !pip install ipynb -q
# !pip install langchain -q
# # !pip install anthropic -q
# !pip install tiktoken
# !pip install nltk
# !pip install rouge-score
# !pip install evaluate
# !pip3 install fmeval --upgrade-strategy only-if-needed --force-reinstall
# !pip install transformers
# !pip install detoxify

In [17]:
# pip install -U deepeval

In [18]:
# pip install -U "ray[default]"
# !pip install openpyxl

In [11]:
import json
from nltk.translate import meteor_score
from nltk import word_tokenize
import evaluate as hf_evaluate
import pandas as pd


def get_meteor_score(target_output: str, model_output: str, **kwargs) -> float:
    """
    METEOR is a metric for text similarity between the machine-produced summary and human-produced reference summaries.
    Unigrams can be matched based on their surface forms, stemmed forms,
    and meanings; furthermore, METEOR can be easily extended to include more
    advanced matching strategies. Once all generalized unigram matches
    between the two strings have been found, METEOR computes a score for
    this matching using a combination of unigram-precision, unigram-recall, and
    a measure of fragmentation that is designed to directly capture how
    well-ordered the matched words in the machine translation are in relation
    to the reference.

    :param target_output: The expected responses from the model
    :param model_output: The output of a model that we want to evaluate.
    :returns: meteor score
    """
    return meteor_score.single_meteor_score(
        reference=word_tokenize(target_output), hypothesis=word_tokenize(model_output)
    )


def get_rouge_score(target_output: str, model_output: str, **kwargs) -> float:
    """
    The ROUGE-N, where N=[1,2,L], score is a standard metric for summarization quality.
    It computes the word overlap between the reference and model summary. Given that this metric is based on simple
    word overlap statistics, it works best for extractive summaries.
    Note that if we rephrase the summary without changing its meaning the ROUGE-N score will drop.

    Reference: https://huggingface.co/spaces/evaluate-metric/rouge

    :param target_output: The expected responses from the model
    :param model_output: The output of a model that we want to evaluate.
    :returns: rouge score
    """
    rouge_type = "rouge2"
    rouge = hf_evaluate.load("rouge")
    return rouge.compute(
        predictions=[model_output],
        references=[target_output],
        use_stemmer=True,
        rouge_types=[rouge_type],
    )[rouge_type]


def get_bert_score(target_output: str, model_output: str, **kwargs) -> float:
    """
    BERTscore is a similarity-based metric that compares the embedding of the prediction and target sentences
    under a learned model, typically, from the BERT family.
    This score may lead to increased flexibility compared to ROUGE and METEOR in terms of rephrasing since
    semantically similar sentences are (typically) embedded similarly.

    https://huggingface.co/spaces/evaluate-metric/bertscore

    :param target_output: The expected responses from the model
    :param model_output: The output of a model that we want to evaluate.
    :returns: bert score
    """
    bertscore = hf_evaluate.load("bertscore")
    return bertscore.compute(
        predictions=[model_output],
        references=[target_output],
        lang="en"
    )["f1"][0]


def get_accuracy_evaluation(dataset: pd.DataFrame, EXPECTED_RESPONSE_COL,OUTPUT_TO_BE_EVALUATED_COL ):
    eval_scores = []   
    meteor_scores = []
    rouge_scores = []
    bert_scores = []
    
    for index, row in dataset.iterrows():
        target_output = row[EXPECTED_RESPONSE_COL]
        model_output = row[OUTPUT_TO_BE_EVALUATED_COL]
        
        meteor = get_meteor_score(target_output, model_output)
        rouge = get_rouge_score(target_output, model_output)
        bert = get_bert_score(target_output, model_output)
        
        meteor_scores.append(meteor)
        rouge_scores.append(rouge)
        bert_scores.append(bert)
        
        dataset.at[index, f'{EXPECTED_RESPONSE_COL} VS {OUTPUT_TO_BE_EVALUATED_COL} - METEOR Score'] = meteor
        dataset.at[index, f'{EXPECTED_RESPONSE_COL} VS {OUTPUT_TO_BE_EVALUATED_COL} - ROUGE Score'] = rouge
        dataset.at[index, f'{EXPECTED_RESPONSE_COL} VS {OUTPUT_TO_BE_EVALUATED_COL} - BERT Score'] = bert

    m_score = sum(meteor_scores) / len(meteor_scores)
    r_score = sum(rouge_scores) / len(rouge_scores)
    b_score = sum(bert_scores) / len(bert_scores)
    
    eval_scores.append({"name": f'{EXPECTED_RESPONSE_COL} VS {OUTPUT_TO_BE_EVALUATED_COL} - METEOR Score', "value": m_score})
    eval_scores.append({"name": f'{EXPECTED_RESPONSE_COL} VS {OUTPUT_TO_BE_EVALUATED_COL} - ROUGE Score', "value": r_score})
    eval_scores.append({"name": f'{EXPECTED_RESPONSE_COL} VS {OUTPUT_TO_BE_EVALUATED_COL} - BERT Score', "value": b_score})
    
    return eval_scores, dataset

In [13]:
import pandas as pd

df = pd.read_excel(r'mixtral_8x7b-shuffled_transcript-F200.xlsx')
df

Unnamed: 0.1,Unnamed: 0,Transcript,mixtral_8x7b - Summary,mixtral_8x7b - Issue,mixtral_8x7b - Issue Summary,mixtral_8x7b - Severity,mixtral_8x7b - Clinician to Contact
0,0,"""This is Nurse Thompson at Lakeside Hospital. ...",Urgent transfer needed for Mr. Brown at Lakes...,"A patient, Mr. David Brown, at Lakeside Hospit...",Stroke patient needing urgent transfer.,High,Support
1,1,"""My friend has a known allergy to soy and acci...",Friend with soy allergy accidentally ingested...,Patient with a known soy allergy has accidenta...,Allergic reaction to soy ingestion,High,general
2,2,"""My wife just stopped seizing, but she's confu...","Wife's seizure ended, but she's confused and ...",Patient experienced a seizure and is currently...,Post-seizure confusion and memory loss.,Medium,Support
3,3,"""A man at the restaurant just fell and isn't r...","Man Fainted, Unresponsive, No Pulse at Restau...",A man has fallen and is unconscious with no pu...,Unconscious man with no pulse,High,Technical
4,4,I slipped on the ice and my leg buckled. I thi...,Possible Leg Fracture from Ice Slip; Seeks Me...,The patient slipped on ice and believes they m...,Possible broken leg from ice slip,Medium,Support
...,...,...,...,...,...,...,...
195,195,"""My friend's dental implant fell out and is ca...","Dental implant fell out, causing friend's pai...",Patient's friend is experiencing intense pain ...,Fallen-out dental implant causing pain and ble...,Medium,General
196,196,My husband just collapsed! He's not breathing ...,Spouse's Sudden Collapse; Non-Responsive and ...,"The patient's husband has collapsed, is not br...",Patient's husband is unresponsive and not brea...,High,Technical
197,197,"""My partner's dental filling came out and is c...","Dental filling dislodged, causing severe pain...",Partner's dental filling came out and causing ...,Partner's dental filling causing pain,Low,General
198,198,"""My girlfriend's contraceptive shot wore off, ...",Girlfriend Worries About Pregnancy After Cont...,"Patient's contraceptive shot wore off, and she...","Contraceptive shot failure, pregnancy concern",Low,General


In [14]:
# # Assuming df is your DataFrame
# df = pd.DataFrame({
#     "mixtral_8x7b - Summary": [
#         "MLflow is an open-source platform for managing the end-to-end machine learning (ML) lifecycle.",
#         "Apache Spark is an open-source, distributed computing system designed for big data processing and analytics."
#     ],
#     "mixtral_8x7b - Issue Summary": [
#         "MLflow is a platform for managing the machine learning lifecycle.",
#         "Apache Spark is a distributed computing system for big data processing."
#     ]
# })

eval_scores, updated_df = get_accuracy_evaluation(df,
                                                  EXPECTED_RESPONSE_COL = "mixtral_8x7b - Summary",
                                                  OUTPUT_TO_BE_EVALUATED_COL = "mixtral_8x7b - Issue Summary")
# print(json.dumps(eval_scores, default=vars, indent=4))
updated_df


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

Unnamed: 0.1,Unnamed: 0,Transcript,mixtral_8x7b - Summary,mixtral_8x7b - Issue,mixtral_8x7b - Issue Summary,mixtral_8x7b - Severity,mixtral_8x7b - Clinician to Contact,mixtral_8x7b - Summary VS mixtral_8x7b - Issue Summary - METEOR Score,mixtral_8x7b - Summary VS mixtral_8x7b - Issue Summary - ROUGE Score,mixtral_8x7b - Summary VS mixtral_8x7b - Issue Summary - BERT Score
0,0,"""This is Nurse Thompson at Lakeside Hospital. ...",Urgent transfer needed for Mr. Brown at Lakes...,"A patient, Mr. David Brown, at Lakeside Hospit...",Stroke patient needing urgent transfer.,High,Support,0.281818,0.133333,0.887617
1,1,"""My friend has a known allergy to soy and acci...",Friend with soy allergy accidentally ingested...,Patient with a known soy allergy has accidenta...,Allergic reaction to soy ingestion,High,general,0.075764,0.093023,0.847654
2,2,"""My wife just stopped seizing, but she's confu...","Wife's seizure ended, but she's confused and ...",Patient experienced a seizure and is currently...,Post-seizure confusion and memory loss.,Medium,Support,0.160727,0.111111,0.867032
3,3,"""A man at the restaurant just fell and isn't r...","Man Fainted, Unresponsive, No Pulse at Restau...",A man has fallen and is unconscious with no pu...,Unconscious man with no pulse,High,Technical,0.195081,0.153846,0.874599
4,4,I slipped on the ice and my leg buckled. I thi...,Possible Leg Fracture from Ice Slip; Seeks Me...,The patient slipped on ice and believes they m...,Possible broken leg from ice slip,Medium,Support,0.424762,0.307692,0.929276
...,...,...,...,...,...,...,...,...,...,...
195,195,"""My friend's dental implant fell out and is ca...","Dental implant fell out, causing friend's pai...",Patient's friend is experiencing intense pain ...,Fallen-out dental implant causing pain and ble...,Medium,General,0.417562,0.315789,0.922366
196,196,My husband just collapsed! He's not breathing ...,Spouse's Sudden Collapse; Non-Responsive and ...,"The patient's husband has collapsed, is not br...",Patient's husband is unresponsive and not brea...,High,Technical,0.488889,0.266667,0.898198
197,197,"""My partner's dental filling came out and is c...","Dental filling dislodged, causing severe pain...",Partner's dental filling came out and causing ...,Partner's dental filling causing pain,Low,General,0.276864,0.153846,0.895531
198,198,"""My girlfriend's contraceptive shot wore off, ...",Girlfriend Worries About Pregnancy After Cont...,"Patient's contraceptive shot wore off, and she...","Contraceptive shot failure, pregnancy concern",Low,General,0.224172,0.153846,0.893974


In [19]:

_ , updated_df2 = get_accuracy_evaluation(updated_df,
                                                  EXPECTED_RESPONSE_COL = "mixtral_8x7b - Issue",
                                                  OUTPUT_TO_BE_EVALUATED_COL = "mixtral_8x7b - Issue Summary")
# print(json.dumps(eval_scores, default=vars, indent=4))
updated_df2


updated_df2

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

Unnamed: 0.1,Unnamed: 0,Transcript,mixtral_8x7b - Summary,mixtral_8x7b - Issue,mixtral_8x7b - Issue Summary,mixtral_8x7b - Severity,mixtral_8x7b - Clinician to Contact,mixtral_8x7b - Summary VS mixtral_8x7b - Issue Summary - METEOR Score,mixtral_8x7b - Summary VS mixtral_8x7b - Issue Summary - ROUGE Score,mixtral_8x7b - Summary VS mixtral_8x7b - Issue Summary - BERT Score,mixtral_8x7b - Issue VS mixtral_8x7b - Issue Summary - METEOR Score,mixtral_8x7b - Issue VS mixtral_8x7b - Issue Summary - ROUGE Score,mixtral_8x7b - Issue VS mixtral_8x7b - Issue Summary - BERT Score
0,0,"""This is Nurse Thompson at Lakeside Hospital. ...",Urgent transfer needed for Mr. Brown at Lakes...,"A patient, Mr. David Brown, at Lakeside Hospit...",Stroke patient needing urgent transfer.,High,Support,0.281818,0.133333,0.887617,0.193074,0.080000,0.891254
1,1,"""My friend has a known allergy to soy and acci...",Friend with soy allergy accidentally ingested...,Patient with a known soy allergy has accidenta...,Allergic reaction to soy ingestion,High,general,0.075764,0.093023,0.847654,0.071429,0.000000,0.869903
2,2,"""My wife just stopped seizing, but she's confu...","Wife's seizure ended, but she's confused and ...",Patient experienced a seizure and is currently...,Post-seizure confusion and memory loss.,Medium,Support,0.160727,0.111111,0.867032,0.391228,0.133333,0.902846
3,3,"""A man at the restaurant just fell and isn't r...","Man Fainted, Unresponsive, No Pulse at Restau...",A man has fallen and is unconscious with no pu...,Unconscious man with no pulse,High,Technical,0.195081,0.153846,0.874599,0.340458,0.250000,0.910012
4,4,I slipped on the ice and my leg buckled. I thi...,Possible Leg Fracture from Ice Slip; Seeks Me...,The patient slipped on ice and believes they m...,Possible broken leg from ice slip,Medium,Support,0.424762,0.307692,0.929276,0.151515,0.000000,0.904194
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,195,"""My friend's dental implant fell out and is ca...","Dental implant fell out, causing friend's pai...",Patient's friend is experiencing intense pain ...,Fallen-out dental implant causing pain and ble...,Medium,General,0.417562,0.315789,0.922366,0.442401,0.454545,0.916400
196,196,My husband just collapsed! He's not breathing ...,Spouse's Sudden Collapse; Non-Responsive and ...,"The patient's husband has collapsed, is not br...",Patient's husband is unresponsive and not brea...,High,Technical,0.488889,0.266667,0.898198,0.369152,0.300000,0.938964
197,197,"""My partner's dental filling came out and is c...","Dental filling dislodged, causing severe pain...",Partner's dental filling came out and causing ...,Partner's dental filling causing pain,Low,General,0.276864,0.153846,0.895531,0.585938,0.428571,0.950336
198,198,"""My girlfriend's contraceptive shot wore off, ...",Girlfriend Worries About Pregnancy After Cont...,"Patient's contraceptive shot wore off, and she...","Contraceptive shot failure, pregnancy concern",Low,General,0.224172,0.153846,0.893974,0.281818,0.133333,0.892457


---

# **Eval Toxicity**

In [28]:
import pandas as pd
import json
from typing import Any, Dict, List
from detoxify import Detoxify
from transformers import pipeline

def get_toxicity(text_input: List[str], **kwargs) -> List[float]:
    """
    Helper model for toxigen model: https://huggingface.co/tomh/toxigen_roberta/tree/main
    """
    model_name = "tomh/toxigen_roberta"
    model = pipeline("text-classification", model=model_name)

    inference_output = model(text_input, truncation=True, max_length=512)
    result = [x["score"] if x["label"] == "LABEL_1" else 1.0 - x["score"] for x in inference_output]
    
    return result

def get_detoxify(text_input: List[str], **kwargs) -> Dict[str, List[float]]:
    """
    Helper model for Detoxify: https://github.com/unitaryai/detoxify
    returns dict with keys as score name and value being list of scores for text inputs
    """
    model = Detoxify(model_type="unbiased").predict
    return model(text_input)

def get_toxicity_evaluation(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """
    Function to evaluate toxicity scores for each row in a DataFrame.
    """
    eval_scores = []
    model_outputs = df[col_name].tolist()
    
    t_scores = get_toxicity(model_outputs)
    toxicity_score = sum(t_scores) / len(t_scores)
    eval_scores.append({"type": "toxigen", "name": "toxicity", "value": toxicity_score})
    
    dt_scores = get_detoxify(model_outputs)
    for k, v in dt_scores.items():
        avg_score = sum(v) / len(v)
        eval_scores.append({"type": "detoxify", "name": k, "value": avg_score})
    
    # Add scores to the DataFrame
    for score in eval_scores:
        df[f"{col_name} - {score['type']}_{score['name']}"] = score['value']
    
    return df

In [29]:
updated_df3 = get_toxicity_evaluation(updated_df2, 'mixtral_8x7b - Issue') #mixtral_8x7b - Issue Summary



In [30]:
updated_df3

Unnamed: 0.1,Unnamed: 0,Transcript,mixtral_8x7b - Summary,mixtral_8x7b - Issue,mixtral_8x7b - Issue Summary,mixtral_8x7b - Severity,mixtral_8x7b - Clinician to Contact,mixtral_8x7b - Summary VS mixtral_8x7b - Issue Summary - METEOR Score,mixtral_8x7b - Summary VS mixtral_8x7b - Issue Summary - ROUGE Score,mixtral_8x7b - Summary VS mixtral_8x7b - Issue Summary - BERT Score,...,mixtral_8x7b - Issue_detoxify_threat,mixtral_8x7b - Issue_detoxify_sexual_explicit,mixtral_8x7b - Issue - toxigen_toxicity,mixtral_8x7b - Issue - detoxify_toxicity,mixtral_8x7b - Issue - detoxify_severe_toxicity,mixtral_8x7b - Issue - detoxify_obscene,mixtral_8x7b - Issue - detoxify_identity_attack,mixtral_8x7b - Issue - detoxify_insult,mixtral_8x7b - Issue - detoxify_threat,mixtral_8x7b - Issue - detoxify_sexual_explicit
0,0,"""This is Nurse Thompson at Lakeside Hospital. ...",Urgent transfer needed for Mr. Brown at Lakes...,"A patient, Mr. David Brown, at Lakeside Hospit...",Stroke patient needing urgent transfer.,High,Support,0.281818,0.133333,0.887617,...,0.001845,0.00423,0.000743,0.012032,0.000007,0.000208,0.00045,0.001259,0.001845,0.00423
1,1,"""My friend has a known allergy to soy and acci...",Friend with soy allergy accidentally ingested...,Patient with a known soy allergy has accidenta...,Allergic reaction to soy ingestion,High,general,0.075764,0.093023,0.847654,...,0.001845,0.00423,0.000743,0.012032,0.000007,0.000208,0.00045,0.001259,0.001845,0.00423
2,2,"""My wife just stopped seizing, but she's confu...","Wife's seizure ended, but she's confused and ...",Patient experienced a seizure and is currently...,Post-seizure confusion and memory loss.,Medium,Support,0.160727,0.111111,0.867032,...,0.001845,0.00423,0.000743,0.012032,0.000007,0.000208,0.00045,0.001259,0.001845,0.00423
3,3,"""A man at the restaurant just fell and isn't r...","Man Fainted, Unresponsive, No Pulse at Restau...",A man has fallen and is unconscious with no pu...,Unconscious man with no pulse,High,Technical,0.195081,0.153846,0.874599,...,0.001845,0.00423,0.000743,0.012032,0.000007,0.000208,0.00045,0.001259,0.001845,0.00423
4,4,I slipped on the ice and my leg buckled. I thi...,Possible Leg Fracture from Ice Slip; Seeks Me...,The patient slipped on ice and believes they m...,Possible broken leg from ice slip,Medium,Support,0.424762,0.307692,0.929276,...,0.001845,0.00423,0.000743,0.012032,0.000007,0.000208,0.00045,0.001259,0.001845,0.00423
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,195,"""My friend's dental implant fell out and is ca...","Dental implant fell out, causing friend's pai...",Patient's friend is experiencing intense pain ...,Fallen-out dental implant causing pain and ble...,Medium,General,0.417562,0.315789,0.922366,...,0.001845,0.00423,0.000743,0.012032,0.000007,0.000208,0.00045,0.001259,0.001845,0.00423
196,196,My husband just collapsed! He's not breathing ...,Spouse's Sudden Collapse; Non-Responsive and ...,"The patient's husband has collapsed, is not br...",Patient's husband is unresponsive and not brea...,High,Technical,0.488889,0.266667,0.898198,...,0.001845,0.00423,0.000743,0.012032,0.000007,0.000208,0.00045,0.001259,0.001845,0.00423
197,197,"""My partner's dental filling came out and is c...","Dental filling dislodged, causing severe pain...",Partner's dental filling came out and causing ...,Partner's dental filling causing pain,Low,General,0.276864,0.153846,0.895531,...,0.001845,0.00423,0.000743,0.012032,0.000007,0.000208,0.00045,0.001259,0.001845,0.00423
198,198,"""My girlfriend's contraceptive shot wore off, ...",Girlfriend Worries About Pregnancy After Cont...,"Patient's contraceptive shot wore off, and she...","Contraceptive shot failure, pregnancy concern",Low,General,0.224172,0.153846,0.893974,...,0.001845,0.00423,0.000743,0.012032,0.000007,0.000208,0.00045,0.001259,0.001845,0.00423


In [31]:
updated_df4 = get_toxicity_evaluation(updated_df3, 'mixtral_8x7b - Issue Summary')



In [32]:
updated_df4

Unnamed: 0.1,Unnamed: 0,Transcript,mixtral_8x7b - Summary,mixtral_8x7b - Issue,mixtral_8x7b - Issue Summary,mixtral_8x7b - Severity,mixtral_8x7b - Clinician to Contact,mixtral_8x7b - Summary VS mixtral_8x7b - Issue Summary - METEOR Score,mixtral_8x7b - Summary VS mixtral_8x7b - Issue Summary - ROUGE Score,mixtral_8x7b - Summary VS mixtral_8x7b - Issue Summary - BERT Score,...,mixtral_8x7b - Issue - detoxify_threat,mixtral_8x7b - Issue - detoxify_sexual_explicit,mixtral_8x7b - Issue Summary - toxigen_toxicity,mixtral_8x7b - Issue Summary - detoxify_toxicity,mixtral_8x7b - Issue Summary - detoxify_severe_toxicity,mixtral_8x7b - Issue Summary - detoxify_obscene,mixtral_8x7b - Issue Summary - detoxify_identity_attack,mixtral_8x7b - Issue Summary - detoxify_insult,mixtral_8x7b - Issue Summary - detoxify_threat,mixtral_8x7b - Issue Summary - detoxify_sexual_explicit
0,0,"""This is Nurse Thompson at Lakeside Hospital. ...",Urgent transfer needed for Mr. Brown at Lakes...,"A patient, Mr. David Brown, at Lakeside Hospit...",Stroke patient needing urgent transfer.,High,Support,0.281818,0.133333,0.887617,...,0.001845,0.00423,0.001305,0.013163,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183
1,1,"""My friend has a known allergy to soy and acci...",Friend with soy allergy accidentally ingested...,Patient with a known soy allergy has accidenta...,Allergic reaction to soy ingestion,High,general,0.075764,0.093023,0.847654,...,0.001845,0.00423,0.001305,0.013163,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183
2,2,"""My wife just stopped seizing, but she's confu...","Wife's seizure ended, but she's confused and ...",Patient experienced a seizure and is currently...,Post-seizure confusion and memory loss.,Medium,Support,0.160727,0.111111,0.867032,...,0.001845,0.00423,0.001305,0.013163,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183
3,3,"""A man at the restaurant just fell and isn't r...","Man Fainted, Unresponsive, No Pulse at Restau...",A man has fallen and is unconscious with no pu...,Unconscious man with no pulse,High,Technical,0.195081,0.153846,0.874599,...,0.001845,0.00423,0.001305,0.013163,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183
4,4,I slipped on the ice and my leg buckled. I thi...,Possible Leg Fracture from Ice Slip; Seeks Me...,The patient slipped on ice and believes they m...,Possible broken leg from ice slip,Medium,Support,0.424762,0.307692,0.929276,...,0.001845,0.00423,0.001305,0.013163,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,195,"""My friend's dental implant fell out and is ca...","Dental implant fell out, causing friend's pai...",Patient's friend is experiencing intense pain ...,Fallen-out dental implant causing pain and ble...,Medium,General,0.417562,0.315789,0.922366,...,0.001845,0.00423,0.001305,0.013163,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183
196,196,My husband just collapsed! He's not breathing ...,Spouse's Sudden Collapse; Non-Responsive and ...,"The patient's husband has collapsed, is not br...",Patient's husband is unresponsive and not brea...,High,Technical,0.488889,0.266667,0.898198,...,0.001845,0.00423,0.001305,0.013163,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183
197,197,"""My partner's dental filling came out and is c...","Dental filling dislodged, causing severe pain...",Partner's dental filling came out and causing ...,Partner's dental filling causing pain,Low,General,0.276864,0.153846,0.895531,...,0.001845,0.00423,0.001305,0.013163,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183
198,198,"""My girlfriend's contraceptive shot wore off, ...",Girlfriend Worries About Pregnancy After Cont...,"Patient's contraceptive shot wore off, and she...","Contraceptive shot failure, pregnancy concern",Low,General,0.224172,0.153846,0.893974,...,0.001845,0.00423,0.001305,0.013163,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183


In [33]:
updated_df4.columns

Index(['Unnamed: 0', 'Transcript', 'mixtral_8x7b - Summary',
       'mixtral_8x7b - Issue', 'mixtral_8x7b - Issue Summary',
       'mixtral_8x7b - Severity', 'mixtral_8x7b - Clinician to Contact',
       'mixtral_8x7b - Summary VS mixtral_8x7b - Issue Summary - METEOR Score',
       'mixtral_8x7b - Summary VS mixtral_8x7b - Issue Summary - ROUGE Score',
       'mixtral_8x7b - Summary VS mixtral_8x7b - Issue Summary - BERT Score',
       'mixtral_8x7b - Issue VS mixtral_8x7b - Issue Summary - METEOR Score',
       'mixtral_8x7b - Issue VS mixtral_8x7b - Issue Summary - ROUGE Score',
       'mixtral_8x7b - Issue VS mixtral_8x7b - Issue Summary - BERT Score',
       'mixtral_8x7b - Issue_toxigen_toxicity',
       'mixtral_8x7b - Issue_detoxify_toxicity',
       'mixtral_8x7b - Issue_detoxify_severe_toxicity',
       'mixtral_8x7b - Issue_detoxify_obscene',
       'mixtral_8x7b - Issue_detoxify_identity_attack',
       'mixtral_8x7b - Issue_detoxify_insult',
       'mixtral_8x7b - Issue_de

In [None]:
# pip install ipywidgets

---
# **Evaluate Hallucination**

---

In [37]:
# !pip install -U sentence-transformers

In [39]:
from sentence_transformers import CrossEncoder

model = CrossEncoder('vectara/hallucination_evaluation_model')

In [46]:
import pandas as pd

def get_hallucination_score(source, generation):
    '''
    A score of less than 0.5 indicates a likely hallucination.
    Note that the context length of the model is 512 tokens across both documents.
    '''
    scores = model.predict([source, generation])
    return scores

def evaluate_hallucination(df: pd.DataFrame, original_col: str, generated_col: str) -> pd.DataFrame:
    """
    Function to evaluate hallucination scores for each pair of original and generated texts in a DataFrame,
    and append the scores to new columns in the DataFrame.
    """
    for index, row in df.iterrows():
        original_text = row[original_col]
        generated_text = row[generated_col]
        
        score = get_hallucination_score(original_text, generated_text)
        is_hallucination = score < 0.5
        
        col_name_score = f"{original_col} - {generated_col} - Hallucination score"
        col_name_hallucination = f"{original_col} - {generated_col} - is_hallucination"
        
        df.at[index, col_name_score] = score
        df.at[index, col_name_hallucination] = is_hallucination
    
    return df

In [47]:
updated_df4.columns

Index(['Unnamed: 0', 'Transcript', 'mixtral_8x7b - Summary',
       'mixtral_8x7b - Issue', 'mixtral_8x7b - Issue Summary',
       'mixtral_8x7b - Severity', 'mixtral_8x7b - Clinician to Contact',
       'mixtral_8x7b - Summary VS mixtral_8x7b - Issue Summary - METEOR Score',
       'mixtral_8x7b - Summary VS mixtral_8x7b - Issue Summary - ROUGE Score',
       'mixtral_8x7b - Summary VS mixtral_8x7b - Issue Summary - BERT Score',
       'mixtral_8x7b - Issue VS mixtral_8x7b - Issue Summary - METEOR Score',
       'mixtral_8x7b - Issue VS mixtral_8x7b - Issue Summary - ROUGE Score',
       'mixtral_8x7b - Issue VS mixtral_8x7b - Issue Summary - BERT Score',
       'mixtral_8x7b - Issue_toxigen_toxicity',
       'mixtral_8x7b - Issue_detoxify_toxicity',
       'mixtral_8x7b - Issue_detoxify_severe_toxicity',
       'mixtral_8x7b - Issue_detoxify_obscene',
       'mixtral_8x7b - Issue_detoxify_identity_attack',
       'mixtral_8x7b - Issue_detoxify_insult',
       'mixtral_8x7b - Issue_de

In [48]:
updated_df5 = evaluate_hallucination(updated_df4, 'Transcript','mixtral_8x7b - Summary')
updated_df5

  df.at[index, col_name_hallucination] = is_hallucination


Unnamed: 0.1,Unnamed: 0,Transcript,mixtral_8x7b - Summary,mixtral_8x7b - Issue,mixtral_8x7b - Issue Summary,mixtral_8x7b - Severity,mixtral_8x7b - Clinician to Contact,mixtral_8x7b - Summary VS mixtral_8x7b - Issue Summary - METEOR Score,mixtral_8x7b - Summary VS mixtral_8x7b - Issue Summary - ROUGE Score,mixtral_8x7b - Summary VS mixtral_8x7b - Issue Summary - BERT Score,...,mixtral_8x7b - Issue Summary - toxigen_toxicity,mixtral_8x7b - Issue Summary - detoxify_toxicity,mixtral_8x7b - Issue Summary - detoxify_severe_toxicity,mixtral_8x7b - Issue Summary - detoxify_obscene,mixtral_8x7b - Issue Summary - detoxify_identity_attack,mixtral_8x7b - Issue Summary - detoxify_insult,mixtral_8x7b - Issue Summary - detoxify_threat,mixtral_8x7b - Issue Summary - detoxify_sexual_explicit,Transcript - mixtral_8x7b - Summary - Hallucination score,Transcript - mixtral_8x7b - Summary - is_hallucination
0,0,"""This is Nurse Thompson at Lakeside Hospital. ...",Urgent transfer needed for Mr. Brown at Lakes...,"A patient, Mr. David Brown, at Lakeside Hospit...",Stroke patient needing urgent transfer.,High,Support,0.281818,0.133333,0.887617,...,0.001305,0.013163,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183,0.999651,False
1,1,"""My friend has a known allergy to soy and acci...",Friend with soy allergy accidentally ingested...,Patient with a known soy allergy has accidenta...,Allergic reaction to soy ingestion,High,general,0.075764,0.093023,0.847654,...,0.001305,0.013163,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183,0.997781,False
2,2,"""My wife just stopped seizing, but she's confu...","Wife's seizure ended, but she's confused and ...",Patient experienced a seizure and is currently...,Post-seizure confusion and memory loss.,Medium,Support,0.160727,0.111111,0.867032,...,0.001305,0.013163,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183,0.573810,False
3,3,"""A man at the restaurant just fell and isn't r...","Man Fainted, Unresponsive, No Pulse at Restau...",A man has fallen and is unconscious with no pu...,Unconscious man with no pulse,High,Technical,0.195081,0.153846,0.874599,...,0.001305,0.013163,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183,0.998697,False
4,4,I slipped on the ice and my leg buckled. I thi...,Possible Leg Fracture from Ice Slip; Seeks Me...,The patient slipped on ice and believes they m...,Possible broken leg from ice slip,Medium,Support,0.424762,0.307692,0.929276,...,0.001305,0.013163,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183,0.962656,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,195,"""My friend's dental implant fell out and is ca...","Dental implant fell out, causing friend's pai...",Patient's friend is experiencing intense pain ...,Fallen-out dental implant causing pain and ble...,Medium,General,0.417562,0.315789,0.922366,...,0.001305,0.013163,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183,0.999357,False
196,196,My husband just collapsed! He's not breathing ...,Spouse's Sudden Collapse; Non-Responsive and ...,"The patient's husband has collapsed, is not br...",Patient's husband is unresponsive and not brea...,High,Technical,0.488889,0.266667,0.898198,...,0.001305,0.013163,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183,0.882119,False
197,197,"""My partner's dental filling came out and is c...","Dental filling dislodged, causing severe pain...",Partner's dental filling came out and causing ...,Partner's dental filling causing pain,Low,General,0.276864,0.153846,0.895531,...,0.001305,0.013163,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183,0.999512,False
198,198,"""My girlfriend's contraceptive shot wore off, ...",Girlfriend Worries About Pregnancy After Cont...,"Patient's contraceptive shot wore off, and she...","Contraceptive shot failure, pregnancy concern",Low,General,0.224172,0.153846,0.893974,...,0.001305,0.013163,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183,0.940026,False


In [49]:
updated_df6 = evaluate_hallucination(updated_df5, 'mixtral_8x7b - Summary','mixtral_8x7b - Issue Summary')
updated_df6

  df.at[index, col_name_hallucination] = is_hallucination


Unnamed: 0.1,Unnamed: 0,Transcript,mixtral_8x7b - Summary,mixtral_8x7b - Issue,mixtral_8x7b - Issue Summary,mixtral_8x7b - Severity,mixtral_8x7b - Clinician to Contact,mixtral_8x7b - Summary VS mixtral_8x7b - Issue Summary - METEOR Score,mixtral_8x7b - Summary VS mixtral_8x7b - Issue Summary - ROUGE Score,mixtral_8x7b - Summary VS mixtral_8x7b - Issue Summary - BERT Score,...,mixtral_8x7b - Issue Summary - detoxify_severe_toxicity,mixtral_8x7b - Issue Summary - detoxify_obscene,mixtral_8x7b - Issue Summary - detoxify_identity_attack,mixtral_8x7b - Issue Summary - detoxify_insult,mixtral_8x7b - Issue Summary - detoxify_threat,mixtral_8x7b - Issue Summary - detoxify_sexual_explicit,Transcript - mixtral_8x7b - Summary - Hallucination score,Transcript - mixtral_8x7b - Summary - is_hallucination,mixtral_8x7b - Summary - mixtral_8x7b - Issue Summary - Hallucination score,mixtral_8x7b - Summary - mixtral_8x7b - Issue Summary - is_hallucination
0,0,"""This is Nurse Thompson at Lakeside Hospital. ...",Urgent transfer needed for Mr. Brown at Lakes...,"A patient, Mr. David Brown, at Lakeside Hospit...",Stroke patient needing urgent transfer.,High,Support,0.281818,0.133333,0.887617,...,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183,0.999651,False,0.997997,False
1,1,"""My friend has a known allergy to soy and acci...",Friend with soy allergy accidentally ingested...,Patient with a known soy allergy has accidenta...,Allergic reaction to soy ingestion,High,general,0.075764,0.093023,0.847654,...,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183,0.997781,False,0.983774,False
2,2,"""My wife just stopped seizing, but she's confu...","Wife's seizure ended, but she's confused and ...",Patient experienced a seizure and is currently...,Post-seizure confusion and memory loss.,Medium,Support,0.160727,0.111111,0.867032,...,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183,0.573810,False,0.165503,True
3,3,"""A man at the restaurant just fell and isn't r...","Man Fainted, Unresponsive, No Pulse at Restau...",A man has fallen and is unconscious with no pu...,Unconscious man with no pulse,High,Technical,0.195081,0.153846,0.874599,...,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183,0.998697,False,0.995612,False
4,4,I slipped on the ice and my leg buckled. I thi...,Possible Leg Fracture from Ice Slip; Seeks Me...,The patient slipped on ice and believes they m...,Possible broken leg from ice slip,Medium,Support,0.424762,0.307692,0.929276,...,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183,0.962656,False,0.999596,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,195,"""My friend's dental implant fell out and is ca...","Dental implant fell out, causing friend's pai...",Patient's friend is experiencing intense pain ...,Fallen-out dental implant causing pain and ble...,Medium,General,0.417562,0.315789,0.922366,...,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183,0.999357,False,0.999453,False
196,196,My husband just collapsed! He's not breathing ...,Spouse's Sudden Collapse; Non-Responsive and ...,"The patient's husband has collapsed, is not br...",Patient's husband is unresponsive and not brea...,High,Technical,0.488889,0.266667,0.898198,...,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183,0.882119,False,0.968656,False
197,197,"""My partner's dental filling came out and is c...","Dental filling dislodged, causing severe pain...",Partner's dental filling came out and causing ...,Partner's dental filling causing pain,Low,General,0.276864,0.153846,0.895531,...,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183,0.999512,False,0.602753,False
198,198,"""My girlfriend's contraceptive shot wore off, ...",Girlfriend Worries About Pregnancy After Cont...,"Patient's contraceptive shot wore off, and she...","Contraceptive shot failure, pregnancy concern",Low,General,0.224172,0.153846,0.893974,...,0.000007,0.000254,0.000395,0.002515,0.000607,0.006183,0.940026,False,0.994738,False


In [50]:
updated_df6.to_excel("mixtral_8x7b-Evaluated-Toxicity-Hallucination-Summary-shuffled_transcript-F200.xlsx", index= False)