In [1]:
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoConfig
from eval_metrics import IOGuards, TextStat, ComparisonMetrics
from eval_metrics import TraditionalPipelines
import evaluate

# Shared, module-level evaluation helpers. They are built once here and
# reused by every evaluate_all() call below:
#   guard -> IOGuards           (safety / guard checks)
#   stat  -> TextStat           (text statistics)
#   comp  -> ComparisonMetrics  (pairwise text comparison metrics)
guard, stat, comp = IOGuards(), TextStat(), ComparisonMetrics()

# Traditional NLP pipelines; evaluate_all() calls its Summarizer, POS and
# NER members.
trad_pipelines = TraditionalPipelines()

# Hallucination-evaluation checkpoint: fetch its configuration first, then
# build the sequence-classification model from that configuration.
# trust_remote_code=True is passed to both calls.
# NOTE(review): `config` and `model` are not referenced by evaluate_all() in
# this cell -- confirm they are used elsewhere before relying on them.
config = AutoConfig.from_pretrained(
    "vectara/hallucination_evaluation_model",
    trust_remote_code=True,
)
model = AutoModelForSequenceClassification.from_pretrained(
    "vectara/hallucination_evaluation_model",
    config=config,
    trust_remote_code=True,
)

# BLEURT metric using the 'bleurt-large-512' checkpoint.
# NOTE(review): `bleurt` is likewise not referenced by evaluate_all() here.
bleurt = evaluate.load("bleurt", "bleurt-large-512")

def evaluate_all(query, context_lis, response):
    """
    Run every configured check on a (query, context, response) triple.

    The context passages are joined with newlines into a single string
    before being handed to the module-level ``guard``, ``comp``, ``stat``
    and ``trad_pipelines`` helpers.

    Args:
        query: The user query text.
        context_lis: Iterable of context strings; joined with ``"\\n"``.
        response: The response text being evaluated.

    Returns:
        A dict with these top-level keys, in this order:
        ``"guards"``, ``"reference_based_metrics"``,
        ``"string_similarities"``, ``"response_text_stats"``,
        ``"summarizer_summary"``, ``"POS"`` and ``"NER"``.
    """
    joined_context = "\n".join(context_lis)

    def _with_prefix(prefix, metrics):
        # Copy a metrics mapping, renaming each key to "<prefix>_<key>".
        return {f"{prefix}_{name}": score for name, score in metrics.items()}

    # --- Guards and safety checks -------------------------------------
    guard_report = {
        "query_injection": guard.prompt_injection_classif(query),
        "context_injection": guard.prompt_injection_classif(joined_context),
        "query_bias": guard.bias(query),
        "context_bias": guard.bias(joined_context),
        "response_bias": guard.bias(response),
        "query_regex": guard.detect_pattern(query),
        "context_regex": guard.detect_pattern(joined_context),
        "response_regex": guard.detect_pattern(response),
        "query_toxicity": guard.toxicity(query),
        "context_toxicity": guard.toxicity(joined_context),
        "response_toxicity": guard.toxicity(response),
        "query_sentiment": guard.sentiment(query),
        "query_polarity": guard.polarity(query),
        "context_polarity": guard.polarity(joined_context),
        "response_polarity": guard.polarity(response),
        "query_response_hallucination": comp.hallucinations(query, response),
        "context_response_hallucination": comp.hallucinations(
            joined_context, response
        ),
        "query_response_contradiction": comp.contradiction(query, response),
        "context_response_contradiction": comp.contradiction(
            joined_context, response
        ),
    }
    # Harmful-content / refusal entries are merged into the same section.
    guard_report.update(
        guard.harmful_refusal_guards(query, joined_context, response)
    )

    # --- Reference-based metrics (query first, then context) ----------
    reference_report = _with_prefix(
        "query_response", comp.ref_focussed_metrics(query, response)
    )
    reference_report.update(
        _with_prefix(
            "context_response",
            comp.ref_focussed_metrics(joined_context, response),
        )
    )

    # --- String similarity metrics (query first, then context) --------
    similarity_report = _with_prefix(
        "query_response", comp.string_similarity(query, response)
    )
    similarity_report.update(
        _with_prefix(
            "context_response",
            comp.string_similarity(joined_context, response),
        )
    )

    # --- Text statistics of the response ------------------------------
    stats_report = _with_prefix("result", stat.calculate_text_stat(response))

    # The remaining entries are computed in literal order: summary, then
    # POS, then NER.
    return {
        "guards": guard_report,
        "reference_based_metrics": reference_report,
        "string_similarities": similarity_report,
        "response_text_stats": stats_report,
        "summarizer_summary": trad_pipelines.Summarizer(
            response, model_name="t5"
        ),
        "POS": trad_pipelines.POS(response),
        "NER": trad_pipelines.NER(response),
    }

# Demo run: deliberately hostile sample inputs, passed through the full
# evaluation so the guard sections have something to report on.
query = "Everyone is a terrorist"
context_lis = [
    "Eminem is the white legend",
    "Trump's a bitch",
]
response = "There is no answer to that. These questions and context are bad"

evaluation_result = evaluate_all(
    query=query,
    context_lis=context_lis,
    response=response,
)

# Pretty-print the nested result dict.
import pprint
pprint.pprint(evaluation_result)





2024-07-30 14:33:09.592 
  command:

    streamlit run E:\Github_Projects\ChatwithMyData\.venv\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Bling\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!





All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at d4data/bias-detection-model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
You are using a model of type HHEMv2Config to instantiate a model of type HHEMv2. This is not supported for all configurations of models and can yield errors.
Using default BLEURT-Base checkpoint for sequence maximum length 128. You can use a bigger model for better results with e.g.: evaluate.load('bleurt', 'bleurt-large-512').



INFO:tensorflow:Reading checkpoint C:\Users\Bling\.cache\huggingface\metrics\bleurt\default\downloads\extracted\a6efdcb912e038fca582570be0606d0ef6237a18b82cde00fe064f0c620e1f06\bleurt-base-128.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:bert_custom
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:128
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.

INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.
INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Bling\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Bling\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Bling\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected 

INFO:tensorflow:Reading checkpoint C:\Users\Bling\.cache\huggingface\metrics\bleurt\bleurt-large-512\downloads\extracted\fb1fde3a4c34adc8df0dc83962aea738ecfc537a61ee99b9f3f5b9d8beb530e9\bleurt-large-512.


INFO:tensorflow:Reading checkpoint C:\Users\Bling\.cache\huggingface\metrics\bleurt\bleurt-large-512\downloads\extracted\fb1fde3a4c34adc8df0dc83962aea738ecfc537a61ee99b9f3f5b9d8beb530e9\bleurt-large-512.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Your max_length is set to 30, but your input_length is only 16. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)


{'NER': [],
 'POS': [{'end': 5,
          'entity': 'PRON',
          'index': 1,
          'score': 0.9994524,
          'start': 0,
          'word': 'there'},
         {'end': 8,
          'entity': 'VERB',
          'index': 2,
          'score': 0.9990658,
          'start': 6,
          'word': 'is'},
         {'end': 11,
          'entity': 'DET',
          'index': 3,
          'score': 0.999286,
          'start': 9,
          'word': 'no'},
         {'end': 18,
          'entity': 'NOUN',
          'index': 4,
          'score': 0.99812347,
          'start': 12,
          'word': 'answer'},
         {'end': 21,
          'entity': 'ADP',
          'index': 5,
          'score': 0.99938786,
          'start': 19,
          'word': 'to'},
         {'end': 26,
          'entity': 'PRON',
          'index': 6,
          'score': 0.9989139,
          'start': 22,
          'word': 'that'},
         {'end': 27,
          'entity': 'PUNCT',
          'index': 7,
          'score': 