# Using IBM watsonx.governance metrics toolkit to evaluate the quality of your Prompt Template

In [None]:
# Install/upgrade the IBM client libraries used by this notebook.
# Each command is piped through `tail -n 1` so only the last line of pip's output is shown.
!pip install --upgrade ibm-watson-machine-learning   | tail -n 1
!pip install --upgrade ibm-watson-openscale --no-cache | tail -n 1
!pip install --upgrade ibm-metrics-plugin --no-cache | tail -n 1

In [None]:
# Install/upgrade the additional packages listed below.
# Only `datasets` is pinned to an exact version (2.10.0); the rest take the latest release.
# Each command is piped through `tail -n 1` so only the last line of pip's output is shown.
!pip install --upgrade evaluate --no-cache | tail -n 1
!pip install --upgrade rouge_score --no-cache | tail -n 1
!pip install --upgrade textstat --no-cache | tail -n 1
!pip install --upgrade sacrebleu --no-cache | tail -n 1
!pip install --upgrade sacremoses --no-cache | tail -n 1
!pip install --upgrade datasets==2.10.0 --no-cache | tail -n 1
!pip install --upgrade torchmetrics --no-cache | tail -n 1
!pip install --upgrade pyspellchecker spacy --no-cache | tail -n 1

In [None]:
# Download the small English spaCy model ("en_core_web_sm").
import spacy
spacy.cli.download("en_core_web_sm")
# Download the NLTK "punkt" resource through the nltk.downloader module.
!python -m nltk.downloader punkt

In [4]:
import warnings
warnings.filterwarnings('ignore')

## Provision services and configure credentials

If you have not already, provision an instance of IBM Watson OpenScale using the [OpenScale link in the Cloud catalog](https://cloud.ibm.com/catalog/services/watson-openscale).

Your Cloud API key can be generated by going to the [**Users** section of the Cloud console](https://cloud.ibm.com/iam#/users). From that page, click your name, scroll down to the **API Keys** section, and click **Create an IBM Cloud API key**. Give your key a name and click **Create**, then copy the created key and paste it below.

**NOTE:** You can also get OpenScale `API_KEY` using IBM CLOUD CLI.

How to install the IBM Cloud (Bluemix) CLI: [instruction](https://console.bluemix.net/docs/cli/reference/ibmcloud/download_cli.html#install_use)

How to get an API key using the CLI:
```
bx login --sso
bx iam api-key-create 'my_key'
```

In [6]:
import os

# Set to True only when running against a Cloud Pak for Data cluster
# (see the optional CPD credentials cell that follows).
use_cpd = False

# Prefer the CLOUD_API_KEY environment variable so the real key never has to be
# saved inside the notebook; the "***" placeholder remains the fallback and can
# still be replaced by hand.
CLOUD_API_KEY = os.environ.get("CLOUD_API_KEY") or "***"

# IAM token endpoint.
# NOTE(review): no other cell visible in this notebook reads IAM_URL.
IAM_URL="https://iam.ng.bluemix.net/oidc/token"

Uncomment and run the cell below only if you are running this notebook on a Cloud Pak for Data (CPD) cluster.

In [8]:
# Optional Cloud Pak for Data (CPD) settings: uncomment and fill in the values
# to switch the authentication cell to its CPD branch.
# NOTE(review): the authentication cell in this notebook reads only "url",
# "username" and "apikey" from this dict; "password" is not referenced there.
# use_cpd = True
# WOS_CREDENTIALS = {
#     "url": "xxxxx",
#     "username": "xxxxx",
#     "password": "xxxxx",
#     "apikey": "xxxxx"
# }

## IBM watsonx.governance authentication

In [None]:
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator,BearerTokenAuthenticator,CloudPakForDataAuthenticator

from ibm_watson_openscale import *
from ibm_watson_openscale.supporting_classes.enums import *
from ibm_watson_openscale.supporting_classes import *

# Build the authenticator and API client for the selected deployment target.
if not use_cpd:
    # IBM Cloud: authenticate with the Cloud API key.
    authenticator = IAMAuthenticator(apikey=CLOUD_API_KEY)
    client = APIClient(authenticator=authenticator)
else:
    # Cloud Pak for Data: authenticate with username + API key against the
    # cluster URL; disable_ssl_verification=True is passed to the authenticator.
    authenticator = CloudPakForDataAuthenticator(
        url=WOS_CREDENTIALS['url'],
        username=WOS_CREDENTIALS['username'],
        apikey=WOS_CREDENTIALS['apikey'],
        disable_ssl_verification=True,
    )
    client = APIClient(
        service_url=WOS_CREDENTIALS['url'],
        authenticator=authenticator,
    )

# Show the client version for whichever branch ran.
print(client.version)

# Common Imports

In [11]:
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMTextMetricGroup
from ibm_metrics_plugin.metrics.llm.utils.constants import  LLMGenerationMetrics
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMSummarizationMetrics
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMQAMetrics
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMClassificationMetrics
from ibm_metrics_plugin.metrics.llm.utils.constants import HAP_SCORE
from ibm_metrics_plugin.metrics.llm.utils.constants import PII_DETECTION
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMCommonMetrics
from ibm_metrics_plugin.metrics.llm.utils.constants import ContentValidationMetrics
from ibm_metrics_plugin.metrics.llm.utils.constants import ContentValidationMetricsParameters
from ibm_metrics_plugin.metrics.llm.utils.constants import FLESCH
import pandas as pd
import numpy as np

# Evaluating Summarization output from AWS/anthropic.claude-v2

## Test data containing the summarization output from model and the reference data

In [None]:
# Remove any previous copy, then download the summarization sample data.
!rm -fr llm_content.csv
!wget "https://raw.githubusercontent.com/IBM/watson-openscale-samples/main/IBM%20Cloud/WML/assets/data/watsonx/llm_content.csv"

In [None]:
# Read the downloaded summarization sample into a dataframe and preview it.
llm_data_all = pd.read_csv(filepath_or_buffer="llm_content.csv")
llm_data_all.head(n=5)

In [None]:
# Keep only the first 10 records for the evaluation, then preview them.
llm_data = llm_data_all.iloc[:10]
llm_data.head(n=5)

In [15]:
# Split the sample into single-column frames: source text, model output and
# reference summary (each an independent copy).
df_input, df_output, df_reference = (
    llm_data[[column]].copy()
    for column in ('input_text', 'generated_summary', 'reference_summary_1')
)

## Metrics configuration for evaluation

In [16]:
# Summarization metric group: every listed metric is requested with an empty
# (default) parameter dict.
metric_config = {
    "configuration": {
        LLMTextMetricGroup.SUMMARIZATION.value: {
            metric.value: {}
            for metric in (
                LLMSummarizationMetrics.ROUGE_SCORE,
                LLMSummarizationMetrics.SARI,
                LLMSummarizationMetrics.METEOR,
                LLMSummarizationMetrics.NORMALIZED_RECALL,
                LLMSummarizationMetrics.NORMALIZED_PRECISION,
                LLMSummarizationMetrics.NORMALIZED_F1_SCORE,
                LLMSummarizationMetrics.COSINE_SIMILARITY,
                LLMSummarizationMetrics.JACCARD_SIMILARITY,
                LLMSummarizationMetrics.BLEU,
                LLMSummarizationMetrics.FLESCH,
                LLMCommonMetrics.CONTENT_ANALYSIS,
                LLMCommonMetrics.KEYWORDS_INCLUSION,
            )
        }
    }
}

## Summarization Metrics Evaluation

In [None]:
import json

# Compute the configured summarization metrics from the input, output and
# reference frames.
result = client.llm_metrics.compute_metrics(
    metric_config,
    df_input,
    df_output,
    df_reference,
)

## Evaluated Metrics

In [None]:
# Pretty-print the metrics result as JSON with a 2-space indent.
print(
    json.dumps(obj=result, indent=2)
)

# Evaluating Content Generation output from the Foundation Model

## Test data containing the content generation output from model and the reference data

In [None]:
# Remove any previous copy, then download the content-generation sample data.
!rm -fr llm_content_generation.csv
!wget "https://raw.githubusercontent.com/IBM/watson-openscale-samples/main/IBM%20Cloud/WML/assets/data/watsonx/llm_content_generation.csv"

In [None]:
# Read the downloaded content-generation sample and preview it.
data = pd.read_csv(filepath_or_buffer="llm_content_generation.csv")
data.head(n=5)

In [22]:
# Split the sample into single-column frames: question, generated text and
# reference text (each an independent copy).
df_input, df_output, df_reference = (
    data[[column]].copy()
    for column in ('question', 'generated_text', 'reference_text')
)

## Metrics configuration for evaluation

In [23]:
# Content-generation metric group: every listed metric is requested with an
# empty (default) parameter dict. Common parameters, if any, belong at the top
# level of this dict.
metric_config = {
    "configuration": {
        LLMTextMetricGroup.GENERATION.value: {
            metric.value: {}
            for metric in (
                LLMGenerationMetrics.BLEU,
                LLMGenerationMetrics.ROUGE_SCORE,
                LLMGenerationMetrics.FLESCH,
                LLMGenerationMetrics.METEOR,
                LLMGenerationMetrics.NORMALIZED_RECALL,
                LLMGenerationMetrics.NORMALIZED_PRECISION,
                LLMGenerationMetrics.NORMALIZED_F1_SCORE,
            )
        }
    }
}

## Content Generation Metrics Evaluation

In [None]:
# Compute the configured content-generation metrics.
result = client.llm_metrics.compute_metrics(
    metric_config,
    df_input,
    df_output,
    df_reference,
)

## Evaluated Metrics

In [None]:
# Pretty-print the metrics result as JSON with a 2-space indent.
print(
    json.dumps(obj=result, indent=2)
)

# Evaluating Question and Answering output from the Foundation Model

## Test data containing the question and answer output from model and the reference data

In [None]:
# Remove any previous copy, then download the question-answering sample data.
!rm -fr llm_content_qa.csv
!wget "https://raw.githubusercontent.com/IBM/watson-openscale-samples/main/IBM%20Cloud/WML/assets/data/watsonx/llm_content_qa.csv"

In [None]:
# Read the downloaded question-answering sample and preview it.
data = pd.read_csv(filepath_or_buffer="llm_content_qa.csv")
data.head(n=5)

In [28]:
# Split the sample into single-column frames (each an independent copy).
# NOTE(review): the output and the reference are both taken from the
# 'answers' column, so they are identical here.
df_input, df_output, df_reference = (
    data[[column]].copy()
    for column in ('question', 'answers', 'answers')
)

## Metrics configuration for evaluation

In [29]:
# Question-answering metric group. Common parameters, if any, belong at the top
# level of this dict.
metric_config = {
    "configuration": {
        LLMTextMetricGroup.QA.value: {
            # Metrics requested with an empty (default) parameter dict.
            **{
                metric.value: {}
                for metric in (
                    LLMQAMetrics.EXACT_MATCH,
                    LLMQAMetrics.ROUGE_SCORE,
                    LLMQAMetrics.BLEU,
                    LLMCommonMetrics.UNSUCCESSFUL_REQUESTS,
                    LLMCommonMetrics.KEYWORDS_INCLUSION,
                )
            },
            # Question robustness is given a sub-metric list and excluded keywords.
            LLMCommonMetrics.QUESTION_ROBUSTNESS.value: dict(
                metrics=["spelling_robustness"],
                excluded_keywords=["chris", "bucharest"],
            ),
        }
    }
}

## Question and Answering Metrics Evaluation

In [30]:
# Compute the configured question-answering metrics.
result = client.llm_metrics.compute_metrics(
    metric_config,
    df_input,
    df_output,
    df_reference,
)

## Evaluated Metrics

In [None]:
# Pretty-print the metrics result as JSON with a 2-space indent.
print(
    json.dumps(obj=result, indent=2)
)

# Evaluating Text Classification output from the Foundation Model

## Test data containing the text classification output from model and the reference data

In [None]:
# Remove any previous copy, then download the text-classification sample data.
!rm -fr llm_content_classification.csv
!wget "https://raw.githubusercontent.com/IBM/watson-openscale-samples/main/IBM%20Cloud/WML/assets/data/watsonx/llm_content_classification.csv"

In [None]:
# Read the downloaded text-classification sample and preview it.
data = pd.read_csv(filepath_or_buffer="llm_content_classification.csv")
data.head(n=5)

In [34]:
# Encode the string class labels as integers: 'ham' -> 0, 'spam' -> 1.
data['label'] = data['label'].replace(
    to_replace={'ham': 0, 'spam': 1},
)

In [35]:
# Split the sample into single-column frames (each an independent copy).
# Output and reference both start from the 'label' column.
df_input, df_output, df_reference = (
    data[[column]].copy()
    for column in ('text', 'label', 'label')
)

## Shuffle the reference labels to create a more realistic reference column

In [36]:
# Shuffle the reference labels so that they no longer match the output labels
# one-to-one. A fixed random_state makes the shuffle, and therefore the
# classification metrics computed from it, reproducible across runs.
shuffled_column = (
    df_reference['label']
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
# The assignment aligns on index; reset_index(drop=True) above renumbers the
# shuffled values from 0.
# NOTE(review): assumes df_reference still has its default 0..n-1 index.
df_reference['label'] = shuffled_column

## Metrics configuration for evaluation

In [37]:
# Text-classification metric group: every listed metric is requested with an
# empty (default) parameter dict. Common parameters, if any, belong at the top
# level of this dict.
metric_config = {
    "configuration": {
        LLMTextMetricGroup.CLASSIFICATION.value: {
            metric.value: {}
            for metric in (
                LLMClassificationMetrics.ACCURACY,
                LLMClassificationMetrics.PRECISION,
                LLMClassificationMetrics.RECALL,
                LLMClassificationMetrics.F1_SCORE,
                LLMClassificationMetrics.MATTHEWS_CORRELATION,
            )
        }
    }
}

## Text Classification Metrics Evaluation

In [38]:
# Compute the configured text-classification metrics.
result = client.llm_metrics.compute_metrics(
    metric_config,
    df_input,
    df_output,
    df_reference,
)

## Evaluated Metrics

In [None]:
# Pretty-print the metrics result as JSON with a 2-space indent.
print(
    json.dumps(obj=result, indent=2)
)

# Evaluating Entity extraction output from the Foundation Model

## Test data containing the entity extraction output from model and the reference data

In [None]:
# Remove any previous copy, then download the entity-extraction sample data.
!rm -fr llm_extraction.csv
!wget "https://raw.githubusercontent.com/IBM/watson-openscale-samples/main/IBM%20Cloud/WML/assets/data/watsonx/llm_extraction.csv"

In [None]:
# Read the downloaded entity-extraction sample and preview it.
data = pd.read_csv(filepath_or_buffer="llm_extraction.csv")
data.head(n=5)

In [None]:
# Split the sample into single-column frames: input text, generated text and
# reference text (each an independent copy).
df_input, df_output, df_reference = (
    data[[column]].copy()
    for column in ('input_text', 'generated_text', 'reference_text')
)

## Metrics configuration for extraction

In [None]:
# LLMExtractionMetrics is used below but is not imported by the common imports
# cell, so this cell would fail with NameError without the import.
# NOTE(review): assumed to live in the same constants module as the other
# LLM*Metrics enums -- confirm against the installed ibm-metrics-plugin.
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMExtractionMetrics

# Entity-extraction metric group; each metric is requested with an empty
# (default) parameter dict.
metric_config = {
    # All common parameters go here
    "configuration": {
        LLMTextMetricGroup.EXTRACTION.value: {  # metric group
            LLMQAMetrics.EXACT_MATCH.value: {},
            LLMExtractionMetrics.MULTI_LABEL.value: {},
            FLESCH: {}
        }
    }
}

## Entity extraction Metrics Evaluation

In [None]:
# Compute the configured entity-extraction metrics.
result = client.llm_metrics.compute_metrics(
    metric_config,
    df_input,
    df_output,
    df_reference,
)

## Evaluated Metrics

In [None]:
# Pretty-print the metrics result as JSON with a 2-space indent.
print(
    json.dumps(obj=result, indent=2)
)

# Evaluating Retrieval-Augmented Generation (RAG) output from the Foundation Model

## Test data containing question, answer and relevant context from model output for RAG metrics.

In [40]:
# Remove any previous copy, then download the RAG sample data.
!rm -rf rag_ibm_faq.csv
!wget "https://raw.githubusercontent.com/IBM/watson-openscale-samples/main/IBM%20Cloud/WML/assets/data/watsonx/rag_ibm_faq.csv"

In [41]:
# Read the downloaded RAG sample and preview it.
data = pd.read_csv(filepath_or_buffer="rag_ibm_faq.csv")
data.head(n=5)

In [43]:
# Split the sample into frames (each an independent copy): the input carries
# both the contexts and the question; output and reference are single-column.
df_input, df_output, df_reference = (
    data[columns].copy()
    for columns in (["contexts", "question"], ["answer"], ["reference"])
)

#### Metrics configuration for evaluation
##### For Content-Analysis metrics, a list of sub-metrics can be passed: **["coverage","density","compression","abstractness","repetitiveness"]**. The RAG task type supports only **["coverage","density","abstractness"]**. The n-gram size ("ngrams") for abstractness and repetitiveness can also be configured (default 1).

e.g. **LLMCommonMetrics.CONTENT_ANALYSIS.value: {"metrics":["coverage","density","abstractness"], "abstractness":{"ngrams":2}}**

##### For the RAG task type, the source dataframe can have multiple columns. Specify them at the global level of the configuration: the key "context_columns" takes the list of context column names, and the key "question_column" takes the name of the question column.

##### For Unsuccessful-Requests, a list of custom phrases can be passed; it overrides the default phrases.

e.g. **LLMCommonMetrics.UNSUCCESSFUL_REQUESTS.value: {"unsuccessful_phrases":["i don't know", "i am not sure"]}**

##### For Question-Robustness a list of keywords can be passed with key as "excluded_keywords" which will be excluded from spell check

In [44]:
# RAG metric configuration: global settings name the context and question
# columns of the input frame; the RAG group lists the metrics to compute.
metric_config = {
    "configuration": {
        **dict(
            record_level=False,
            context_columns=["contexts"],
            question_column="question",
        ),
        LLMTextMetricGroup.RAG.value: {
            # Metrics requested with an empty (default) parameter dict.
            # (UNSUCCESSFUL_REQUESTS optionally takes "unsuccessful_phrases": [].)
            **{
                metric.value: {}
                for metric in (
                    LLMCommonMetrics.CONTENT_ANALYSIS,
                    LLMCommonMetrics.UNSUCCESSFUL_REQUESTS,
                    LLMCommonMetrics.KEYWORDS_INCLUSION,
                )
            },
            # Question robustness is given a sub-metric list and excluded keywords.
            LLMCommonMetrics.QUESTION_ROBUSTNESS.value: dict(
                metrics=["spelling_robustness"],
                excluded_keywords=["ibm", "watsonx", "openshift", "ocp"],
            ),
        },
    }
}

## RAG Metrics Evaluation

In [45]:
# Compute the configured RAG metrics.
result = client.llm_metrics.compute_metrics(
    metric_config,
    df_input,
    df_output,
    df_reference,
)

## Evaluated Metrics

In [None]:
# Pretty-print the metrics result as JSON with a 2-space indent.
print(
    json.dumps(obj=result, indent=2)
)

 # Evaluating content validation metrics

In [None]:
# Remove any previous copy, then download the content-validation sample data.
# wget saves the file under the URL's basename, llm_content_validation.csv.
!rm -rf llm_content_validation.csv
!wget "https://raw.githubusercontent.com/IBM/watson-openscale-samples/main/IBM%20Cloud/WML/assets/data/watsonx/llm_content_validation.csv"

In [None]:
# Load the content-validation sample. The download cell saves it as
# "llm_content_validation.csv", so that is the name to read here.
llm_data_all = pd.read_csv("llm_content_validation.csv")

# Content-validation metrics are computed from the model output column only.
predictions = llm_data_all[['predictions']].copy()

# Preview the data; kept as the last expression so the notebook displays it.
llm_data_all.head()

### Metrics configuration for evaluating content validation metrics under `SUMMARIZATION` task type.

If no sub-metric is specified for content validation, all the sub-metrics that do not need an additional parameter will be computed.

Note : 
- `CONTAINS_VALID_LINK` and `NO_INVALID_LINKS` cannot be computed accurately in an air-gapped environment.

#### Metrics configuration without sub metric

In [None]:
# Content validation under the SUMMARIZATION group with no sub-metrics listed
# (empty parameter dict).
metric_config = dict(
    configuration={
        LLMTextMetricGroup.SUMMARIZATION.value: {
            LLMCommonMetrics.CONTENT_VALIDATION.value: {},
        },
    },
)

In [None]:
# Compute content-validation metrics; only the configuration and the
# predictions frame are passed.
result = client.llm_metrics.compute_metrics(
    configuration=metric_config,
    predictions=predictions,
)

### Evaluated Metrics

In [None]:
import json

# Pretty-print the metrics result as JSON with a 2-space indent.
print(
    json.dumps(obj=result, indent=2)
)

#### Metrics configuration with sub metrics

In [None]:
# Content validation with explicit sub-metrics under the SUMMARIZATION group.
# Each sub-metric maps to its own parameter dict ({} when it takes none).
# The REGEX entry appeared twice with identical parameters; a dict literal keeps
# only the last value for a repeated key, so the duplicate has been removed.
metric_config = {
    "configuration": {
        LLMTextMetricGroup.SUMMARIZATION.value: {
            # NOTE(review): "record_level" sits inside the metric group here, while
            # the RAG configuration sets it at the "configuration" level -- confirm
            # which placement is intended.
            "record_level": True,
            LLMCommonMetrics.CONTENT_VALIDATION.value: {
                ContentValidationMetrics.CONTAINS_ANY.value: {
                    ContentValidationMetricsParameters.KEYWORDS.value: ['Distant', 'treatment'],
                    ContentValidationMetricsParameters.CASE_SENSITIVE.value: True},
                ContentValidationMetrics.CONTAINS_STRING.value: {
                    ContentValidationMetricsParameters.SUBSTRING.value: "Another",
                    ContentValidationMetricsParameters.CASE_SENSITIVE.value: True},
                ContentValidationMetrics.REGEX.value: {
                    ContentValidationMetricsParameters.PATTERN.value: "someone",
                    ContentValidationMetricsParameters.CASE_SENSITIVE.value: True},
                ContentValidationMetrics.CONTAINS_EMAIL.value: {},
                ContentValidationMetrics.CONTAINS_JSON.value: {},
                ContentValidationMetrics.CONTAINS_LINK.value: {},
                ContentValidationMetrics.CONTAINS_NONE.value: {
                    ContentValidationMetricsParameters.KEYWORDS.value: ["Distant", "New"],
                    ContentValidationMetricsParameters.CASE_SENSITIVE.value: True},
                ContentValidationMetrics.CONTAINS_VALID_LINK.value: {},
                ContentValidationMetrics.ENDS_WITH.value: {
                    ContentValidationMetricsParameters.SUBSTRING.value: "end",
                    ContentValidationMetricsParameters.CASE_SENSITIVE.value: True},
                ContentValidationMetrics.EQUALS_TO.value: {
                    ContentValidationMetricsParameters.TEXT.value: "Technology",
                    ContentValidationMetricsParameters.CASE_SENSITIVE.value: True},
                ContentValidationMetrics.IS_EMAIL.value: {},
                ContentValidationMetrics.IS_JSON.value: {},
                ContentValidationMetrics.LENGTH_GREATER_THAN.value: {
                    ContentValidationMetricsParameters.LENGTH.value: 100},
                ContentValidationMetrics.LENGTH_LESS_THAN.value: {
                    ContentValidationMetricsParameters.LENGTH.value: 5},
                ContentValidationMetrics.NO_INVALID_LINKS.value: {},
                ContentValidationMetrics.STARTS_WITH.value: {
                    ContentValidationMetricsParameters.SUBSTRING.value: "new",
                    ContentValidationMetricsParameters.CASE_SENSITIVE.value: True},
                ContentValidationMetrics.FUZZY_MATCH.value: {
                    ContentValidationMetricsParameters.SIMILARITY_RATIO.value: 50,
                    ContentValidationMetricsParameters.TEXT.value: "demo"
                }
            }
        }
    }
}

In [None]:
# Compute content-validation metrics with the sub-metric configuration; only
# the configuration and the predictions frame are passed.
result = client.llm_metrics.compute_metrics(
    configuration=metric_config,
    predictions=predictions,
)

### Evaluated Metrics

In [None]:
# Pretty-print the metrics result as JSON with a 2-space indent.
print(
    json.dumps(obj=result, indent=2)
)

Author: kishore.patel@in.ibm.com , ravi.chamarthy@in.ibm.com