# Using IBM watsonx.governance metrics toolkit to evaluate the quality of your Prompt Template

In [None]:
# Install/upgrade the IBM Watson Machine Learning, OpenScale and metrics-plugin packages.
# `| tail -n 1` keeps only the last line of pip's output so the cell stays short.
!pip install --upgrade ibm-watson-machine-learning   | tail -n 1
!pip install --upgrade ibm-watson-openscale --no-cache | tail -n 1
!pip install --upgrade ibm-metrics-plugin --no-cache | tail -n 1

In [None]:
# Install/upgrade additional third-party packages (presumably needed by the text metrics
# computed later in the notebook — TODO confirm). Note that `datasets` is pinned to 2.10.0.
!pip install --upgrade evaluate --no-cache | tail -n 1
!pip install --upgrade rouge_score --no-cache | tail -n 1
!pip install --upgrade textstat --no-cache | tail -n 1
!pip install --upgrade sacrebleu --no-cache | tail -n 1
!pip install --upgrade sacremoses --no-cache | tail -n 1
!pip install --upgrade torchmetrics --no-cache | tail -n 1
!pip install --upgrade datasets==2.10.0 --no-cache | tail -n 1

In [46]:
import warnings
# Silence all Python warnings for the rest of the session to keep cell output short.
warnings.filterwarnings('ignore')

### Set the ARTIFACTORY_USERNAME and ARTIFACTORY_API_KEY to install watson_nlp from artifactory
Watson NLP artifacts are stored in an Artifactory repository. To pull Watson NLP’s dependencies or download models, you must configure two variables, ARTIFACTORY_USERNAME and ARTIFACTORY_API_KEY. See the <a href="https://pages.github.ibm.com/ai-foundation/watson-nlp-documentation/installation.html#obtaining-artifactory-credential">instructions</a> for obtaining ARTIFACTORY_USERNAME and ARTIFACTORY_API_KEY.

Note: Replace ARTIFACTORY_USERNAME and ARTIFACTORY_API_KEY with your artifactory credentials.


In [97]:
import os

# Artifactory credentials used to install watson_nlp and to download its models.
# Prefer exporting ARTIFACTORY_USERNAME / ARTIFACTORY_API_KEY in the environment so
# that secrets are never saved inside the notebook; the placeholders below are only
# used when the corresponding environment variable is not set.
ARTIFACTORY_USERNAME = os.environ.get("ARTIFACTORY_USERNAME", "<Your artifactory username>")
ARTIFACTORY_API_KEY = os.environ.get("ARTIFACTORY_API_KEY", "<Your artifactory api key>")

In [None]:
#install watson_nlp from the artifactory
!pip install --index-url https://<ARTIFACTORY_USERNAME>:<ARTIFACTORY_API_KEY>@na.artifactory.swg-devops.com/artifactory/api/pypi/wcp-ai-foundation-team-pypi-virtual/simple 'watson_nlp[all]' --no-cache | tail -n 1


## Provision services and configure credentials

If you have not already, provision an instance of IBM Watson OpenScale using the [OpenScale link in the Cloud catalog](https://cloud.ibm.com/catalog/services/watson-openscale).

Your Cloud API key can be generated by going to the [**Users** section of the Cloud console](https://cloud.ibm.com/iam#/users). From that page, click your name, scroll down to the **API Keys** section, and click **Create an IBM Cloud API key**. Give your key a name and click **Create**, then copy the created key and paste it below.

**NOTE:** You can also get OpenScale `API_KEY` using IBM CLOUD CLI.

How to install the IBM Cloud (formerly Bluemix) CLI: [instruction](https://console.bluemix.net/docs/cli/reference/ibmcloud/download_cli.html#install_use)

How to get an API key using the CLI:
```
bx login --sso
bx iam api-key-create 'my_key'
```

In [102]:
import os

# IBM Cloud API key used to authenticate against watsonx.governance (OpenScale).
# It is read from the CLOUD_API_KEY environment variable when available so the key
# does not have to be stored in the notebook; otherwise the placeholder is used.
CLOUD_API_KEY = os.environ.get("CLOUD_API_KEY", "***")
# IAM token endpoint. Not referenced by any other cell visible in this notebook.
IAM_URL="https://iam.ng.bluemix.net/oidc/token"

## IBM watsonx.governance authentication

In [103]:
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator,BearerTokenAuthenticator

# NOTE(review): APIClient is not imported by name; presumably it is provided by one of the
# wildcard imports below — confirm before narrowing them.
from ibm_watson_openscale import *
from ibm_watson_openscale.supporting_classes.enums import *
from ibm_watson_openscale.supporting_classes import *


# Authenticate with the IBM Cloud API key and build the client used by all metric computations.
authenticator = IAMAuthenticator(apikey=CLOUD_API_KEY)
client = APIClient(authenticator=authenticator)
# Display the client version as the cell output.
client.version

'3.0.34'

# Common Imports

In [104]:
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMTextMetricGroup
from ibm_metrics_plugin.metrics.llm.utils.constants import  LLMGenerationMetrics
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMSummarizationMetrics
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMQAMetrics
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMClassificationMetrics
from ibm_metrics_plugin.metrics.llm.utils.constants import HAP_SCORE
from ibm_metrics_plugin.metrics.llm.utils.constants import PII_DETECTION

# Evaluating Summarization output from AWS/anthropic.claude-v2

## Test data containing the summarization output from model and the reference data

In [105]:
# Delete any earlier copy, then download a fresh llm_content.csv (summarization test data).
!rm -fr llm_content.csv
!wget "https://raw.githubusercontent.com/ravichamarthy/custom_metrics/main/llm_content.csv"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
--2024-02-01 17:24:02--  https://raw.githubusercontent.com/ravichamarthy/custom_metrics/main/llm_content.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31230 (30K) [text/plain]
Saving to: ‘llm_cont

In [106]:
import pandas as pd
import numpy as np
# Load the downloaded summarization data set and preview its first rows.
llm_data_all = pd.read_csv("llm_content.csv")
llm_data_all.head()

Unnamed: 0,input_text,generated_summary,reference_summary_1,reference_summary_2
0,Scientists have discovered a new species of de...,New bioluminescent fish species found in deep ...,Discovery of deep-sea fish emitting soothing l...,Scientists find new bioluminescent fish specie...
1,An international team of astronomers has ident...,Distant exoplanet\'s water vapor-filled atmosp...,Astronomers identify exoplanet with water vapo...,Discovery of exoplanet with water vapor in its...
2,Researchers have developed a novel nanotechnol...,New nanotechnology-based cancer treatment demo...,Researchers create cancer treatment using nano...,Innovative cancer treatment utilizing nanotech...
3,A new app is aiming to reduce food waste by co...,App connects local restaurants with customers ...,New sustainability-focused app facilitates sal...,Initiative to reduce food waste involves app c...
4,Archaeologists have uncovered an ancient city ...,"Ancient city dating back over 4,000 years disc...",Archaeological find in Iraq reveals ancient ci...,"Discovery of 4,000-year-old ancient city in Ku..."


In [107]:
# Two extra records for exercising the PII and HAP metrics. This cell only defines them;
# they are added to the DataFrame in a later cell.
#   1. input and summary contain a social-security-number string (PII).
#   2. the generated summary contains profanity (HAP).
# Only input_text, generated_summary and reference_summary_1 are provided for these records.
new_row = [{'input_text': "A new app is aiming to reduce food waste by connecting local restaurants with customers to sell excess food at discounted prices. This initiative could play a significant role in promoting sustainability. Quote the social security number `051-0301802` to get the discounts on orders.",
           'generated_summary': "App connects local restaurants with customers to sell surplus food, contributing to the reduction of food waste. Use social security number `051-0301802` to get the discounts",
           'reference_summary_1':"New sustainability-focused app facilitates sale of excess restaurant food, combating food waste."
          },
          { 'input_text': "According to a recent study, regular exercise in middle age can lead to a reduced risk of dementia in later life. The research found that individuals who engaged in physical activity in their 40s and 50s had a 26% lower risk of developing dementia compared to those who were inactive.",
           'generated_summary': "Oh shit! Middle-age exercise linked to lower risk 26% of dementia only in old age.",
           'reference_summary_1':"New study suggests that staying physically active in your 40s and 50s can significantly decrease the risk of dementia later in life."
          }
          ]

In [109]:
# Keep the first 10 rows of the data set and add the PII/HAP example records to them.
# pd.concat is the public pandas API for this; DataFrame._append is a private method
# that may change or disappear without notice.
llm_data_df = llm_data_all.head(10)
llm_data = pd.concat([llm_data_df, pd.DataFrame(new_row)], ignore_index=True)
llm_data.head()

Unnamed: 0,input_text,generated_summary,reference_summary_1,reference_summary_2
0,Scientists have discovered a new species of de...,New bioluminescent fish species found in deep ...,Discovery of deep-sea fish emitting soothing l...,Scientists find new bioluminescent fish specie...
1,An international team of astronomers has ident...,Distant exoplanet\'s water vapor-filled atmosp...,Astronomers identify exoplanet with water vapo...,Discovery of exoplanet with water vapor in its...
2,Researchers have developed a novel nanotechnol...,New nanotechnology-based cancer treatment demo...,Researchers create cancer treatment using nano...,Innovative cancer treatment utilizing nanotech...
3,A new app is aiming to reduce food waste by co...,App connects local restaurants with customers ...,New sustainability-focused app facilitates sal...,Initiative to reduce food waste involves app c...
4,Archaeologists have uncovered an ancient city ...,"Ancient city dating back over 4,000 years disc...",Archaeological find in Iraq reveals ancient ci...,"Discovery of 4,000-year-old ancient city in Ku..."


In [110]:
# Split the data set into three independent single-column DataFrames:
# the input text, the generated summary and the first reference summary.
df_input, df_output, df_reference = (
    llm_data[[column]].copy()
    for column in ('input_text', 'generated_summary', 'reference_summary_1')
)

### Set the environment variables to download the Watson NLP models for HAP and PII

In [111]:
import os

# Export the Artifactory credentials and a JVM option to the process environment.
# NOTE(review): -Xnocompressedrefs is presumably required by the Watson NLP runtime — confirm.
os.environ.update({
    "ARTIFACTORY_USERNAME": ARTIFACTORY_USERNAME,
    "ARTIFACTORY_API_KEY": ARTIFACTORY_API_KEY,
    "JAVA_TOOL_OPTIONS": "-Xnocompressedrefs",
})

### Create models dir to download the nlp models

In [64]:
# Create ./models under the current working directory (if it is not there yet) and
# make it the working directory. exist_ok=True keeps the cell re-runnable: the previous
# os.mkdir call raised FileExistsError once the directory already existed.
cwd = os.getcwd()
models_dir = os.path.join(cwd, "models")
os.makedirs(models_dir, exist_ok=True)
os.chdir(models_dir)

In [65]:
import watson_nlp
    
# Download the two Watson NLP models by name (HAP classifier and PII entity-mentions model).
# The recorded output shows the last model path under the `models` directory entered above.
watson_nlp.download("classification_transformer_en_slate.38m.hap")
watson_nlp.download("entity-mentions_rbr_multi_pii")


'/Users/swapna/workspace/git/notebooks/watsonx/models/entity-mentions_rbr_multi_pii'

In [None]:
# Move back up to the parent directory (the working directory used before entering models/).
os.chdir('../')
# Display the resulting working directory as the cell output.
os.getcwd()

## Metrics configuration for evaluation

In [112]:
# Metrics to compute for the summarization group. Every metric is requested with an
# empty options dict ({}).
metric_config = {
    "configuration": {
        LLMTextMetricGroup.SUMMARIZATION.value: {
            **{
                metric.value: {}
                for metric in (
                    LLMSummarizationMetrics.ROUGE_SCORE,
                    LLMSummarizationMetrics.SARI,
                    LLMSummarizationMetrics.METEOR,
                    LLMSummarizationMetrics.NORMALIZED_RECALL,
                    LLMSummarizationMetrics.NORMALIZED_PRECISION,
                    LLMSummarizationMetrics.NORMALIZED_F1_SCORE,
                    LLMSummarizationMetrics.COSINE_SIMILARITY,
                    LLMSummarizationMetrics.JACCARD_SIMILARITY,
                    LLMSummarizationMetrics.BLEU,
                    LLMSummarizationMetrics.FLESCH,
                )
            },
            # HAP_SCORE and PII_DETECTION are used directly as keys (no .value).
            HAP_SCORE: {},
            PII_DETECTION: {},
        }
    }
}

## Summarization Metrics Evaluation

In [None]:
import json
# Compute the configured metrics; the arguments are the metric configuration followed by
# the input, output and reference DataFrames.
result = client.llm_metrics.compute_metrics(metric_config,df_input,df_output, df_reference)

## Evaluated Metrics

In [114]:
# Pretty-print the metrics result as JSON with 2-space indentation.
print(json.dumps(result,indent=2))

{
  "rouge_score": {
    "rouge1": {
      "metric_value": 0.3089
    },
    "rouge2": {
      "metric_value": 0.093
    },
    "rougeL": {
      "metric_value": 0.2515
    },
    "rougeLsum": {
      "metric_value": 0.2515
    }
  },
  "meteor": {
    "metric_value": 0.2605789083446832
  },
  "sari": {
    "metric_value": 44.15487276975899
  },
  "cosine_similarity": {
    "metric_value": 0.20308697348194513,
    "mean": 0.20308697348194513,
    "min": 0.10354961707236612,
    "max": 0.3212096323232406,
    "std": 0.061556841041606164
  },
  "jaccard_similarity": {
    "metric_value": 0.14099871538589684,
    "mean": 0.14099871538589684,
    "min": 0.034482758620689655,
    "max": 0.25,
    "std": 0.06314965811781187
  },
  "normalized_precision": {
    "metric_value": 0.2767176110926111,
    "mean": 0.2767176110926111,
    "min": 0.07692307692307693,
    "max": 0.42857142857142855,
    "std": 0.09952895598865172
  },
  "normalized_f1": {
    "metric_value": 0.25993800106703335,
    "

# Evaluating Content Generation output from the Foundation Model

## Test data containing the content generation output from model and the reference data

In [12]:
# Delete any earlier copy, then download a fresh llm_content_generation.csv.
!rm -fr llm_content_generation.csv
!wget "https://raw.githubusercontent.com/ravichamarthy/custom_metrics/main/llm_content_generation.csv"

--2023-12-05 13:21:49--  https://raw.githubusercontent.com/ravichamarthy/custom_metrics/main/llm_content_generation.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11794 (12K) [text/plain]
Saving to: ‘llm_content_generation.csv’


2023-12-05 13:21:50 (11.8 MB/s) - ‘llm_content_generation.csv’ saved [11794/11794]



In [13]:
# Load the content-generation test data and preview its first rows.
data = pd.read_csv("llm_content_generation.csv")
data.head()

Unnamed: 0,question,generated_text,reference_text
0,What are the benefits of regular exercise?,"Regular exercise has numerous benefits, includ...","Regular exercise has numerous benefits, includ..."
1,What is the process of photosynthesis?,Photosynthesis is the process by which plants ...,Photosynthesis is the process by which plants ...
2,What are the key features of a smartphone?,A smartphone is a mobile device that typically...,A smartphone is a mobile device that typically...
3,How does the immune system work?,The immune system is a complex network of cell...,The immune system is a complex network of cell...
4,What is the capital of France?,"The capital of France is Paris, which is known...","The capital of France is Paris, which is known..."


In [14]:
# Split the data set into three independent single-column DataFrames:
# the question, the generated text and the reference text.
df_input, df_output, df_reference = (
    data[[column]].copy()
    for column in ('question', 'generated_text', 'reference_text')
)

## Metrics configuration for evaluation

In [15]:
# Metrics to compute for the content-generation group. Every metric is requested with an
# empty options dict ({}); parameters common to all metrics would sit next to "configuration".
metric_config = {
    "configuration": {
        LLMTextMetricGroup.GENERATION.value: {
            metric.value: {}
            for metric in (
                LLMGenerationMetrics.BLEU,
                LLMGenerationMetrics.ROUGE_SCORE,
                LLMGenerationMetrics.FLESCH,
                LLMGenerationMetrics.METEOR,
                LLMGenerationMetrics.NORMALIZED_RECALL,
                LLMGenerationMetrics.NORMALIZED_PRECISION,
                LLMGenerationMetrics.NORMALIZED_F1_SCORE,
            )
        }
    }
}

## Content Generation Metrics Evaluation

In [16]:
# Compute the configured content-generation metrics from the input, output and reference DataFrames.
result = client.llm_metrics.compute_metrics(metric_config,df_input,df_output, df_reference)

[nltk_data] Downloading package wordnet to /home/wsuser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/wsuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/wsuser/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Evaluated Metrics

In [17]:
# Pretty-print the metrics result as JSON with 2-space indentation.
print(json.dumps(result,indent=2))

{
  "flesch": {
    "flesch_reading_ease": {
      "metric_value": 39.10217391304347,
      "mean": 39.10217391304347,
      "min": -11.44,
      "max": 69.62,
      "std": 20.153544505710833
    },
    "flesch_kincaid_grade": {
      "metric_value": 12.673913043478263,
      "mean": 12.673913043478263,
      "min": 8.0,
      "max": 18.6,
      "std": 3.2043743730833554
    }
  },
  "bleu": {
    "precisions": [
      1.0,
      0.9949174078780177,
      0.9947643979057592,
      0.9946018893387314
    ],
    "brevity_penalty": 0.7138823993242189,
    "length_ratio": 0.7479224376731302,
    "translation_length": 810,
    "reference_length": 1083,
    "metric_value": 0.711075655695426
  },
  "rouge_score": {
    "rouge1": {
      "metric_value": 0.8451
    },
    "rouge2": {
      "metric_value": 0.8402
    },
    "rougeL": {
      "metric_value": 0.8451
    },
    "rougeLsum": {
      "metric_value": 0.8451
    }
  },
  "normalized_recall": {
    "metric_value": 0.7335547397366776,
  

# Evaluating Question and Answering output from the Foundation Model

## Test data containing the question and answer output from model and the reference data

In [18]:
# Delete any earlier copy, then download a fresh llm_content_qa.csv.
!rm -fr llm_content_qa.csv
!wget "https://raw.githubusercontent.com/ravichamarthy/custom_metrics/main/llm_content_qa.csv"

--2023-12-05 13:21:56--  https://raw.githubusercontent.com/ravichamarthy/custom_metrics/main/llm_content_qa.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3109 (3.0K) [text/plain]
Saving to: ‘llm_content_qa.csv’


2023-12-05 13:21:56 (9.89 MB/s) - ‘llm_content_qa.csv’ saved [3109/3109]



In [19]:
# Load the question-answering test data and preview its first rows.
data = pd.read_csv("llm_content_qa.csv")
data.head()

Unnamed: 0,question,answers
0,who did chris carter play for last year,Milwaukee Brewers
1,what is the latest version of safari on mac,Safari 11
2,when did bucharest become the capital of romania,1862
3,who did jeffrey dean morgan play on supernatural,John Eric Winchester
4,who is the shortest man that ever lived,Chandra Bahadur Dangi


In [20]:
# Split the data set into three independent single-column DataFrames.
# NOTE(review): the 'answers' column feeds both the output and the reference frame, so the
# evaluation below compares each answer with itself.
df_input, df_output, df_reference = (
    data[[column]].copy()
    for column in ('question', 'answers', 'answers')
)

## Metrics configuration for evaluation

In [21]:
# Metrics to compute for the question-answering group. Every metric is requested with an
# empty options dict ({}); parameters common to all metrics would sit next to "configuration".
metric_config = {
    "configuration": {
        LLMTextMetricGroup.QA.value: {
            metric.value: {}
            for metric in (
                LLMQAMetrics.EXACT_MATCH,
                LLMQAMetrics.ROUGE_SCORE,
                LLMQAMetrics.BLEU,
            )
        }
    }
}

## Question and Answering Metrics Evaluation

In [22]:
# Compute the configured question-answering metrics from the input, output and reference DataFrames.
result = client.llm_metrics.compute_metrics(metric_config,df_input,df_output, df_reference)

## Evaluated Metrics

In [23]:
# Pretty-print the metrics result as JSON with 2-space indentation.
print(json.dumps(result,indent=2))

{
  "exact_match": {
    "metric_value": 1.0
  },
  "bleu": {
    "precisions": [
      1.0,
      1.0,
      1.0,
      1.0
    ],
    "brevity_penalty": 1.0,
    "length_ratio": 1.0,
    "translation_length": 133,
    "reference_length": 133,
    "metric_value": 1.0
  },
  "rouge_score": {
    "rouge1": {
      "metric_value": 1.0
    },
    "rouge2": {
      "metric_value": 0.74
    },
    "rougeL": {
      "metric_value": 1.0
    },
    "rougeLsum": {
      "metric_value": 1.0
    }
  }
}


# Evaluating Text Classification output from the Foundation Model

## Test data containing the text classification output from model and the reference data

In [24]:
# Delete any earlier copy, then download a fresh llm_content_classification.csv.
!rm -fr llm_content_classification.csv
!wget "https://raw.githubusercontent.com/ravichamarthy/custom_metrics/main/llm_content_classification.csv"

--2023-12-05 13:22:01--  https://raw.githubusercontent.com/ravichamarthy/custom_metrics/main/llm_content_classification.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 480803 (470K) [text/plain]
Saving to: ‘llm_content_classification.csv’


2023-12-05 13:22:01 (51.2 MB/s) - ‘llm_content_classification.csv’ saved [480803/480803]



In [25]:
# Load the text-classification test data and preview its first rows.
data = pd.read_csv("llm_content_classification.csv")
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [26]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
data['label'] = data['label'].replace({'ham': 0, 'spam': 1})

In [27]:
# Split the data set into three independent single-column DataFrames.
# The 'label' column is used for both the output and the reference frame.
df_input, df_output, df_reference = (
    data[[column]].copy()
    for column in ('text', 'label', 'label')
)

## Create a more realistic reference column (shuffle the labels)

In [28]:
# Shuffle the reference labels so they no longer line up row-for-row with df_output.
# random_state pins the permutation so the metrics are reproducible across runs;
# reset_index(drop=True) makes the assignment positional instead of index-aligned.
shuffled_column = df_reference['label'].sample(frac=1, random_state=42).reset_index(drop=True)
df_reference['label'] = shuffled_column

## Metrics configuration for evaluation

In [29]:
# Metrics to compute for the classification group. Every metric is requested with an
# empty options dict ({}); parameters common to all metrics would sit next to "configuration".
metric_config = {
    "configuration": {
        LLMTextMetricGroup.CLASSIFICATION.value: {
            metric.value: {}
            for metric in (
                LLMClassificationMetrics.ACCURACY,
                LLMClassificationMetrics.PRECISION,
                LLMClassificationMetrics.RECALL,
                LLMClassificationMetrics.F1_SCORE,
                LLMClassificationMetrics.MATTHEWS_CORRELATION,
            )
        }
    }
}

## Text Classification Metrics Evaluation

In [30]:
# Compute the configured classification metrics from the input, output and reference DataFrames.
result = client.llm_metrics.compute_metrics(metric_config,df_input,df_output, df_reference)

## Evaluated Metrics

In [31]:
# Pretty-print the metrics result as JSON with 2-space indentation.
print(json.dumps(result,indent=2))

{
  "accuracy": {
    "accuracy": 0.7674084709260589
  },
  "precision": {
    "precision": 0.13253012048192772
  },
  "matthews_correlation": {
    "matthews_correlation": -0.001770397652787315
  },
  "recall": {
    "recall": 0.13253012048192772
  },
  "f1": {
    "f1": 0.13253012048192772
  }
}


Author: kishore.patel@in.ibm.com , ravi.chamarthy@in.ibm.com