# Using IBM watsonx.governance metrics toolkit to evaluate the quality of your Prompt Template

In [None]:
# Install or upgrade the IBM client packages; `| tail -n 1` keeps only the last
# line of each pip run so the cell output stays short.
!pip install --upgrade ibm-watson-machine-learning   | tail -n 1
!pip install --upgrade ibm-watson-openscale --no-cache | tail -n 1
!pip install --upgrade ibm-metrics-plugin --no-cache | tail -n 1

In [None]:
# Install or upgrade additional third-party packages.
# NOTE(review): presumably these back the text metrics computed later — confirm.
# `datasets` is pinned to 2.10.0 while every other package floats to the latest release.
!pip install --upgrade evaluate --no-cache | tail -n 1
!pip install --upgrade rouge_score --no-cache | tail -n 1
!pip install --upgrade textstat --no-cache | tail -n 1
!pip install --upgrade sacrebleu --no-cache | tail -n 1
!pip install --upgrade sacremoses --no-cache | tail -n 1
!pip install --upgrade datasets==2.10.0 --no-cache | tail -n 1
!pip install --upgrade torchmetrics --no-cache | tail -n 1

In [None]:
import warnings
warnings.filterwarnings('ignore')

## Provision services and configure credentials

If you have not already done so, provision an instance of IBM Watson OpenScale using the [OpenScale link in the Cloud catalog](https://cloud.ibm.com/catalog/services/watson-openscale).

Your Cloud API key can be generated by going to the [**Users** section of the Cloud console](https://cloud.ibm.com/iam#/users). From that page, click your name, scroll down to the **API Keys** section, and click **Create an IBM Cloud API key**. Give your key a name and click **Create**, then copy the created key and paste it below.

**NOTE:** You can also create the `API_KEY` with the IBM Cloud CLI.

How to install the IBM Cloud (formerly Bluemix) CLI: [instructions](https://console.bluemix.net/docs/cli/reference/ibmcloud/download_cli.html#install_use)

How to create an API key using the CLI:
```
bx login --sso
bx iam api-key-create 'my_key'
```

In [None]:
import os

# Set to True (see the commented-out CPD cell below) when running against a
# Cloud Pak for Data cluster instead of IBM Cloud.
use_cpd = False
# Prefer the CLOUD_API_KEY environment variable so the real key never has to be
# saved inside the notebook; when it is not set, fall back to the placeholder,
# which must then be replaced by hand before running the authentication cell.
CLOUD_API_KEY = os.environ.get("CLOUD_API_KEY", "***")
# NOTE(review): not referenced by any cell visible in this notebook — confirm
# whether it is still needed.
IAM_URL="https://iam.ng.bluemix.net/oidc/token"

Uncomment and run the cell below only if you are running this notebook on a Cloud Pak for Data (CPD) cluster.

In [None]:
# use_cpd = True
# WOS_CREDENTIALS = {
#     "url": "xxxxx",
#     "username": "xxxxx",
#     "password": "xxxxx"
# }

## IBM watsonx.governance authentication

In [None]:
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator,BearerTokenAuthenticator,CloudPakForDataAuthenticator

from ibm_watson_openscale import *
from ibm_watson_openscale.supporting_classes.enums import *
from ibm_watson_openscale.supporting_classes import *

# Build the authenticator/client pair for the selected environment.
if not use_cpd:
    # IBM Cloud: authenticate with the API key from the configuration cell.
    authenticator = IAMAuthenticator(apikey=CLOUD_API_KEY)
    client = APIClient(authenticator=authenticator)
else:
    # Cloud Pak for Data: authenticate with the cluster URL and user credentials.
    # NOTE: SSL certificate verification is disabled for this authenticator.
    authenticator = CloudPakForDataAuthenticator(
        url=WOS_CREDENTIALS['url'],
        username=WOS_CREDENTIALS['username'],
        password=WOS_CREDENTIALS['password'],
        disable_ssl_verification=True,
    )
    client = APIClient(
        service_url=WOS_CREDENTIALS['url'],
        authenticator=authenticator,
    )

# Show the client version once a client has been created.
print(client.version)

# Common Imports

In [None]:
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMTextMetricGroup
from ibm_metrics_plugin.metrics.llm.utils.constants import  LLMGenerationMetrics
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMSummarizationMetrics
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMQAMetrics
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMClassificationMetrics
from ibm_metrics_plugin.metrics.llm.utils.constants import HAP_SCORE
from ibm_metrics_plugin.metrics.llm.utils.constants import PII_DETECTION
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMExtractionMetrics, FLESCH

# Evaluating Summarization output from AWS/anthropic.claude-v2

## Test data containing the summarization output from model and the reference data

In [None]:
# Remove any earlier download of the file, then fetch the summarization sample CSV.
!rm -fr llm_content.csv
!wget "https://raw.githubusercontent.com/IBM/watson-openscale-samples/main/IBM%20Cloud/WML/assets/data/watsonx/llm_content.csv"

In [None]:
import pandas as pd
import numpy as np
# Load the downloaded CSV into a DataFrame and preview its first rows.
llm_data_all = pd.read_csv("llm_content.csv")
llm_data_all.head()

In [None]:
# Keep only the first 10 rows for the evaluation, then preview the first 5.
llm_data = llm_data_all.iloc[:10]
llm_data.iloc[:5]

In [None]:
# Copy each role (source text, generated summary, reference summary) into its
# own single-column DataFrame.
df_input, df_output, df_reference = (
    llm_data[[column]].copy()
    for column in ('input_text', 'generated_summary', 'reference_summary_1')
)

## Metrics configuration for evaluation

In [None]:
# Summarization metric group: each listed metric is enabled with an empty
# per-metric settings dict.
metric_config = {
    "configuration": {
        LLMTextMetricGroup.SUMMARIZATION.value: {
            metric.value: {}
            for metric in (
                LLMSummarizationMetrics.ROUGE_SCORE,
                LLMSummarizationMetrics.SARI,
                LLMSummarizationMetrics.METEOR,
                LLMSummarizationMetrics.NORMALIZED_RECALL,
                LLMSummarizationMetrics.NORMALIZED_PRECISION,
                LLMSummarizationMetrics.NORMALIZED_F1_SCORE,
                LLMSummarizationMetrics.COSINE_SIMILARITY,
                LLMSummarizationMetrics.JACCARD_SIMILARITY,
                LLMSummarizationMetrics.BLEU,
                LLMSummarizationMetrics.FLESCH,
            )
        }
    }
}

## Summarization Metrics Evaluation

In [None]:
import json
# Run the metrics computation; arguments are passed positionally as
# (metric configuration, input frame, model-output frame, reference frame).
result = client.llm_metrics.compute_metrics(metric_config,df_input,df_output, df_reference)

## Evaluated Metrics

In [None]:
# Pretty-print the summarization metrics result as JSON with 2-space indentation.
print(json.dumps(result,indent=2))

# Evaluating Content Generation output from the Foundation Model

## Test data containing the content generation output from model and the reference data

In [None]:
# Remove any earlier download of the file, then fetch the content-generation sample CSV.
!rm -fr llm_content_generation.csv
!wget "https://raw.githubusercontent.com/IBM/watson-openscale-samples/main/IBM%20Cloud/WML/assets/data/watsonx/llm_content_generation.csv"

In [None]:
# Load the content-generation CSV and preview its first rows.
data = pd.read_csv("llm_content_generation.csv")
data.head()

In [None]:
# Copy each role (question, generated text, reference text) into its own
# single-column DataFrame.
df_input, df_output, df_reference = (
    data[[column]].copy()
    for column in ('question', 'generated_text', 'reference_text')
)

## Metrics configuration for evaluation

In [None]:
# Content-generation metric group: each listed metric is enabled with an empty
# per-metric settings dict.
metric_config = {
    "configuration": {
        LLMTextMetricGroup.GENERATION.value: {
            metric.value: {}
            for metric in (
                LLMGenerationMetrics.BLEU,
                LLMGenerationMetrics.ROUGE_SCORE,
                LLMGenerationMetrics.FLESCH,
                LLMGenerationMetrics.METEOR,
                LLMGenerationMetrics.NORMALIZED_RECALL,
                LLMGenerationMetrics.NORMALIZED_PRECISION,
                LLMGenerationMetrics.NORMALIZED_F1_SCORE,
            )
        }
    }
}

## Content Generation Metrics Evaluation

In [None]:
# Run the metrics computation for the content-generation data; arguments are
# (metric configuration, input frame, model-output frame, reference frame).
result = client.llm_metrics.compute_metrics(metric_config,df_input,df_output, df_reference)

## Evaluated Metrics

In [None]:
# Pretty-print the content-generation metrics result as JSON with 2-space indentation.
print(json.dumps(result,indent=2))

# Evaluating Question and Answering output from the Foundation Model

## Test data containing the question and answer output from model and the reference data

In [None]:
# Remove any earlier download of the file, then fetch the question-answering sample CSV.
!rm -fr llm_content_qa.csv
!wget "https://raw.githubusercontent.com/IBM/watson-openscale-samples/main/IBM%20Cloud/WML/assets/data/watsonx/llm_content_qa.csv"

In [None]:
# Load the question-answering CSV and preview its first rows.
data = pd.read_csv("llm_content_qa.csv")
data.head()

In [None]:
# Copy the question column into its own single-column DataFrame.
df_input = data[['question']].copy()
# NOTE(review): the model output and the reference are both copied from the same
# `answers` column, so the metrics compare that column with itself — confirm
# whether the dataset has a separate generated-answer column to use for df_output.
df_output = data[['answers']].copy()
df_reference = data[['answers']].copy()

## Metrics configuration for evaluation

In [None]:
# Question-answering metric group: each listed metric is enabled with an empty
# per-metric settings dict.
metric_config = {
    "configuration": {
        LLMTextMetricGroup.QA.value: {
            metric.value: {}
            for metric in (
                LLMQAMetrics.EXACT_MATCH,
                LLMQAMetrics.ROUGE_SCORE,
                LLMQAMetrics.BLEU,
            )
        }
    }
}

## Question and Answering Metrics Evaluation

In [None]:
# Run the metrics computation for the question-answering data; arguments are
# (metric configuration, input frame, model-output frame, reference frame).
result = client.llm_metrics.compute_metrics(metric_config,df_input,df_output, df_reference)

## Evaluated Metrics

In [None]:
# Pretty-print the question-answering metrics result as JSON with 2-space indentation.
print(json.dumps(result,indent=2))

# Evaluating Text Classification output from the Foundation Model

## Test data containing the text classification output from model and the reference data

In [None]:
# Remove any earlier download of the file, then fetch the text-classification sample CSV.
!rm -fr llm_content_classification.csv
!wget "https://raw.githubusercontent.com/IBM/watson-openscale-samples/main/IBM%20Cloud/WML/assets/data/watsonx/llm_content_classification.csv"

In [None]:
# Load the text-classification CSV and preview its first rows.
data = pd.read_csv("llm_content_classification.csv")
data.head()

In [None]:
# Encode the string labels as integers ('ham' -> 0, 'spam' -> 1); any other
# value in the column is left as it is.
data['label'] = data['label'].replace({'ham': 0, 'spam': 1})

In [None]:
# Copy the text column as the input, and the `label` column twice: once as the
# model output and once as the reference.
df_input, df_output, df_reference = (
    data[[column]].copy()
    for column in ('text', 'label', 'label')
)

## Create a more realistic reference column by shuffling the labels

In [None]:
# Shuffle the reference labels so they no longer match the predictions
# row-for-row (both frames were copied from the same `label` column).
# A fixed random_state makes the permutation, and therefore the reported
# metrics, identical on every Restart & Run All.
# reset_index(drop=True) renumbers the shuffled rows 0..n-1 so the assignment
# below, which aligns on index, actually stores the permuted order.
shuffled_column = (
    df_reference['label']
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df_reference['label'] = shuffled_column

## Metrics configuration for evaluation

In [None]:
# Classification metric group: each listed metric is enabled with an empty
# per-metric settings dict.
metric_config = {
    "configuration": {
        LLMTextMetricGroup.CLASSIFICATION.value: {
            metric.value: {}
            for metric in (
                LLMClassificationMetrics.ACCURACY,
                LLMClassificationMetrics.PRECISION,
                LLMClassificationMetrics.RECALL,
                LLMClassificationMetrics.F1_SCORE,
                LLMClassificationMetrics.MATTHEWS_CORRELATION,
            )
        }
    }
}

## Text Classification Metrics Evaluation

In [None]:
# Run the metrics computation for the classification data; arguments are
# (metric configuration, input frame, model-output frame, reference frame).
result = client.llm_metrics.compute_metrics(metric_config,df_input,df_output, df_reference)

## Evaluated Metrics

In [None]:
# Pretty-print the classification metrics result as JSON with 2-space indentation.
print(json.dumps(result,indent=2))

# Evaluating Entity extraction output from the Foundation Model

## Test data containing the entity extraction output from model and the reference data

In [None]:
!rm -fr llm_content_classification.csv
!wget "https://raw.githubusercontent.com/IBM/watson-openscale-samples/main/IBM%20Cloud/WML/assets/data/watsonx/llm_extraction.csv"

In [None]:
# Load the entity-extraction CSV and preview its first rows.
data = pd.read_csv("llm_extraction.csv")
data.head()

In [None]:
# Copy each role (input text, generated text, reference text) into its own
# single-column DataFrame.
df_input, df_output, df_reference = (
    data[[column]].copy()
    for column in ('input_text', 'generated_text', 'reference_text')
)

## Metrics configuration for extraction

In [None]:
# Entity-extraction metric group: each listed metric key is enabled with an
# empty per-metric settings dict.
# NOTE(review): EXACT_MATCH is taken from LLMQAMetrics rather than
# LLMExtractionMetrics, and FLESCH is used as a bare constant instead of an
# enum `.value` — confirm both are intentional.
metric_config = {
    "configuration": {
        LLMTextMetricGroup.EXTRACTION.value: {
            metric_key: {}
            for metric_key in (
                LLMQAMetrics.EXACT_MATCH.value,
                LLMExtractionMetrics.MULTI_LABEL.value,
                FLESCH,
            )
        }
    }
}

## Entity extraction Metrics Evaluation

In [None]:
# Run the metrics computation for the entity-extraction data; arguments are
# (metric configuration, input frame, model-output frame, reference frame).
result = client.llm_metrics.compute_metrics(metric_config,df_input,df_output,df_reference)

## Evaluated Metrics

In [None]:
# Pretty-print the entity-extraction metrics result as JSON with 2-space indentation.
print(json.dumps(result,indent=2))

Author: kishore.patel@in.ibm.com , ravi.chamarthy@in.ibm.com