# Using IBM watsonx.governance metrics toolkit to evaluate the quality of your Large Language Models

In [None]:
!pip install --upgrade ibm-watson-machine-learning   | tail -n 1
!pip install --upgrade ibm-watson-openscale --no-cache | tail -n 1
!pip install --upgrade ibm-metrics-plugin --no-cache | tail -n 1

In [None]:
!pip install --upgrade evaluate --no-cache | tail -n 1
!pip install --upgrade rouge_score --no-cache | tail -n 1
!pip install --upgrade textstat --no-cache | tail -n 1
!pip install --upgrade sacrebleu --no-cache | tail -n 1
!pip install --upgrade sacremoses --no-cache | tail -n 1
!pip install --upgrade datasets==2.10.0 --no-cache | tail -n 1

In [None]:
!pip install boto3 -U --no-cache | tail -n 1

In [1]:
import warnings
# Silence every warning category for the rest of the session. This keeps the cell outputs
# short, but it also hides pandas and deprecation warnings raised by the cells below.
warnings.filterwarnings('ignore')

## Provision services and configure credentials

If you have not already, provision an instance of IBM Watson OpenScale using the [OpenScale link in the Cloud catalog](https://cloud.ibm.com/catalog/services/watson-openscale).

Your Cloud API key can be generated by going to the [**Users** section of the Cloud console](https://cloud.ibm.com/iam#/users). From that page, click your name, scroll down to the **API Keys** section, and click **Create an IBM Cloud API key**. Give your key a name and click **Create**, then copy the created key and paste it below.

**NOTE:** You can also generate the OpenScale `API_KEY` using the IBM Cloud CLI.

How to install the IBM Cloud CLI (formerly Bluemix CLI): [instructions](https://cloud.ibm.com/docs/cli?topic=cli-getting-started)

How to get an API key using the CLI:
```
ibmcloud login --sso
ibmcloud iam api-key-create 'my_key'
```

In [2]:
import os

# Prefer the CLOUD_API_KEY environment variable so a real key never has to be saved with the
# notebook; the "xxxxx" placeholder remains as a fallback for readers who paste a key in here.
CLOUD_API_KEY = os.environ.get("CLOUD_API_KEY", "xxxxx")
# IBM Cloud IAM token endpoint (replaces the legacy iam.ng.bluemix.net host).
IAM_URL = "https://iam.cloud.ibm.com/identity/token"

## IBM watsonx.governance authentication

In [3]:
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator,BearerTokenAuthenticator

from ibm_watson_openscale import *
from ibm_watson_openscale.supporting_classes.enums import *
from ibm_watson_openscale.supporting_classes import *


# Authenticate against IBM Cloud IAM with the API key configured in the previous section.
authenticator = IAMAuthenticator(apikey=CLOUD_API_KEY)
# Client whose llm_metrics.compute_metrics method is used by the evaluation cells below.
client = APIClient(authenticator=authenticator)
# Bare expression: shows the client version as the cell output.
client.version

'3.0.33'

# Common Imports

In [4]:
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMTextMetricGroup
from ibm_metrics_plugin.metrics.llm.utils.constants import  LLMGenerationMetrics
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMSummarizationMetrics
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMQAMetrics
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMClassificationMetrics
from ibm_metrics_plugin.metrics.llm.utils.constants import HAP_SCORE
from ibm_metrics_plugin.metrics.llm.utils.constants import PII_DETECTION

# Evaluating Summarization output from AWS/anthropic.claude-v2

## Test data containing the summarization output from model and the reference data

In [5]:
!rm -fr llm_content.csv
!wget "https://raw.githubusercontent.com/ravichamarthy/custom_metrics/main/llm_content.csv"

--2023-12-05 13:12:22--  https://raw.githubusercontent.com/ravichamarthy/custom_metrics/main/llm_content.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31230 (30K) [text/plain]
Saving to: ‘llm_content.csv’


2023-12-05 13:12:22 (19.2 MB/s) - ‘llm_content.csv’ saved [31230/31230]



In [6]:
import pandas as pd
import numpy as np
llm_data_all = pd.read_csv("llm_content.csv")
llm_data_all.head()

Unnamed: 0,input_text,generated_summary,reference_summary_1,reference_summary_2
0,Scientists have discovered a new species of de...,New bioluminescent fish species found in deep ...,Discovery of deep-sea fish emitting soothing l...,Scientists find new bioluminescent fish specie...
1,An international team of astronomers has ident...,Distant exoplanet\'s water vapor-filled atmosp...,Astronomers identify exoplanet with water vapo...,Discovery of exoplanet with water vapor in its...
2,Researchers have developed a novel nanotechnol...,New nanotechnology-based cancer treatment demo...,Researchers create cancer treatment using nano...,Innovative cancer treatment utilizing nanotech...
3,A new app is aiming to reduce food waste by co...,App connects local restaurants with customers ...,New sustainability-focused app facilitates sal...,Initiative to reduce food waste involves app c...
4,Archaeologists have uncovered an ancient city ...,"Ancient city dating back over 4,000 years disc...",Archaeological find in Iraq reveals ancient ci...,"Discovery of 4,000-year-old ancient city in Ku..."


In [7]:
# Work on the first 10 rows only. The explicit .copy() makes llm_data an independent frame,
# so the summary column that is added to it later is not written into a slice of
# llm_data_all (which pandas reports as SettingWithCopyWarning).
llm_data = llm_data_all.head(10).copy()
llm_data.head()

Unnamed: 0,input_text,generated_summary,reference_summary_1,reference_summary_2
0,Scientists have discovered a new species of de...,New bioluminescent fish species found in deep ...,Discovery of deep-sea fish emitting soothing l...,Scientists find new bioluminescent fish specie...
1,An international team of astronomers has ident...,Distant exoplanet\'s water vapor-filled atmosp...,Astronomers identify exoplanet with water vapo...,Discovery of exoplanet with water vapor in its...
2,Researchers have developed a novel nanotechnol...,New nanotechnology-based cancer treatment demo...,Researchers create cancer treatment using nano...,Innovative cancer treatment utilizing nanotech...
3,A new app is aiming to reduce food waste by co...,App connects local restaurants with customers ...,New sustainability-focused app facilitates sal...,Initiative to reduce food waste involves app c...
4,Archaeologists have uncovered an ancient city ...,"Ancient city dating back over 4,000 years disc...",Archaeological find in Iraq reveals ancient ci...,"Discovery of 4,000-year-old ancient city in Ku..."


In [8]:
import boto3, json

# AWS Access credentials

In [9]:
import os

# Read the AWS credentials from the standard environment variables when they are set, so real
# keys do not have to be stored in the notebook; the placeholders remain as fallbacks.
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID', 'xxxxxx')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY', 'xxxxxx')

In [10]:
session = boto3.Session()

## Create the bedrock service client

In [11]:
# Client for the 'bedrock' service in us-east-1; the next section uses it to list
# the available foundation models.
bedrock = session.client(
    service_name='bedrock',
    region_name='us-east-1',
    endpoint_url='https://bedrock.us-east-1.amazonaws.com',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

## List the available foundation models to use

In [12]:
fm_model_list = bedrock.list_foundation_models()

In [13]:
# Collect the modelId of every entry in the model summaries and print one id per line.
fm_model_names = [summary['modelId'] for summary in fm_model_list['modelSummaries']]
print("\n".join(map(str, fm_model_names)))

amazon.titan-tg1-large
amazon.titan-image-generator-v1:0
amazon.titan-image-generator-v1
amazon.titan-embed-g1-text-02
amazon.titan-text-lite-v1:0:4k
amazon.titan-text-lite-v1
amazon.titan-text-express-v1:0:8k
amazon.titan-text-express-v1
amazon.titan-embed-text-v1:2:8k
amazon.titan-embed-text-v1
amazon.titan-embed-image-v1:0
amazon.titan-embed-image-v1
stability.stable-diffusion-xl
stability.stable-diffusion-xl-v0
stability.stable-diffusion-xl-v1:0
stability.stable-diffusion-xl-v1
ai21.j2-grande-instruct
ai21.j2-jumbo-instruct
ai21.j2-mid
ai21.j2-mid-v1
ai21.j2-ultra
ai21.j2-ultra-v1
anthropic.claude-instant-v1:2:100k
anthropic.claude-instant-v1
anthropic.claude-v1:3:18k
anthropic.claude-v1:3:100k
anthropic.claude-v1
anthropic.claude-v2:0:18k
anthropic.claude-v2:0:100k
anthropic.claude-v2:1:18k
anthropic.claude-v2:1:200k
anthropic.claude-v2:1
anthropic.claude-v2
cohere.command-text-v14:7:4k
cohere.command-text-v14
cohere.command-light-text-v14:7:4k
cohere.command-light-text-v14
cohere

## Let's use the `anthropic.claude-v2` model for inference

In [14]:
# Model to invoke (change this to use a different version from the model provider), plus the
# accept / contentType values that are passed along with every invoke_model request.
modelId = "anthropic.claude-v2"
accept = contentType = "application/json"

## Create bedrock-runtime client

In [15]:
# Client for the 'bedrock-runtime' service in us-east-1; invoke_model requests go through it.
bedrock_runtime = session.client(
    service_name='bedrock-runtime',
    region_name='us-east-1',
    endpoint_url='https://bedrock-runtime.us-east-1.amazonaws.com',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

## Prompt for inferencing

In [16]:
def get_prompt(text):
    """Build the Claude text-completion prompt asking for a short summary of ``text``.

    The Claude text-completions format expects the prompt to open with a
    ``"\\n\\nHuman:"`` turn and to close with ``"\\n\\nAssistant:"``, so both markers are
    emitted with their leading blank line and without stray indentation.

    Args:
        text: The passage to be summarized; it is inserted verbatim into the prompt.

    Returns:
        The complete prompt string, ending with the ``Assistant:`` marker.
    """
    return (
        "\n\nHuman: Please provide a summary of the following text with maximum of 20 words."
        f"\n\n{text}"
        "\n\nAssistant:"
    )

## Evaluate the prompt

In [17]:
def prompt_evaluation(text):
    """Summarize ``text`` with the configured Bedrock model and return the summary.

    The prompt built by ``get_prompt`` is sent through ``bedrock_runtime.invoke_model`` and
    the ``completion`` field of the JSON response is post-processed: when the completion
    contains blank-line-separated paragraphs, the second paragraph is used
    (NOTE(review): this presumably skips a lead-in sentence from the model; confirm against
    real completions). If that paragraph is empty, the whole completion is kept instead.

    Args:
        text: The passage to summarize.

    Returns:
        The summary with leading and trailing whitespace removed. It is also printed
        between two ``-----`` separator lines.
    """
    prompt = get_prompt(text)
    body = json.dumps({"prompt": prompt,
                     "max_tokens_to_sample":2048,
                     "temperature":0.1,
                     "top_k":250,
                     "top_p":0.5,
                     "stop_sequences":[]
                      })
    response = bedrock_runtime.invoke_model(body=body, modelId=modelId, accept=accept, contentType=contentType)
    response_body = json.loads(response.get('body').read())
    completion = response_body['completion']
    summary = completion
    if '\n\n' in completion:
        candidate = completion.split("\n\n")[1]
        # A completion that merely ends with a blank line has an empty second paragraph;
        # fall back to the full completion rather than returning an empty summary.
        if candidate.strip():
            summary = candidate
    # Completions frequently start with a space; strip it so the stored summary is clean.
    summary = summary.strip()
    print('-----')
    print(summary)
    print('-----')
    return summary

## Append the generated summary, obtained as part of the inferencing against `anthropic.claude-v2` model, to the LLM data set

In [18]:
# Calls prompt_evaluation once for every value in input_text and stores the returned
# summaries in a new column of llm_data.
llm_data['anthropic_generated_summary'] = llm_data['input_text'].apply(prompt_evaluation)

-----
Scientists discovered a new bioluminescent deep-sea fish species that emits a soft light, which could advance underwater exploration.
-----
-----
Astronomers discovered a distant exoplanet with water vapor in its atmosphere, bringing us closer to understanding habitability beyond our solar system.
-----
-----
Novel nanotechnology cancer treatment precisely targets tumor cells, holding promise for more effective and less invasive therapies.
-----
-----
New app connects restaurants and customers to sell excess food, promoting sustainability by reducing food waste.
-----
-----
 Archaeologists uncovered 4,000 year old ancient city in Iraq's Kurdistan region, providing insights into the region's history and culture.
-----
-----
New solar panel is highly efficient in low light, enabling renewable energy production even in areas with limited sunlight.
-----
-----
Regular exercise in middle age can reduce dementia risk later in life, according to a recent study.
-----
-----
Scientists cr

In [19]:
llm_data.head()

Unnamed: 0,input_text,generated_summary,reference_summary_1,reference_summary_2,anthropic_generated_summary
0,Scientists have discovered a new species of de...,New bioluminescent fish species found in deep ...,Discovery of deep-sea fish emitting soothing l...,Scientists find new bioluminescent fish specie...,Scientists discovered a new bioluminescent dee...
1,An international team of astronomers has ident...,Distant exoplanet\'s water vapor-filled atmosp...,Astronomers identify exoplanet with water vapo...,Discovery of exoplanet with water vapor in its...,Astronomers discovered a distant exoplanet wit...
2,Researchers have developed a novel nanotechnol...,New nanotechnology-based cancer treatment demo...,Researchers create cancer treatment using nano...,Innovative cancer treatment utilizing nanotech...,Novel nanotechnology cancer treatment precisel...
3,A new app is aiming to reduce food waste by co...,App connects local restaurants with customers ...,New sustainability-focused app facilitates sal...,Initiative to reduce food waste involves app c...,New app connects restaurants and customers to ...
4,Archaeologists have uncovered an ancient city ...,"Ancient city dating back over 4,000 years disc...",Archaeological find in Iraq reveals ancient ci...,"Discovery of 4,000-year-old ancient city in Ku...","Archaeologists uncovered 4,000 year old ancie..."


## Get the necessary data for evaluating the quality of the prompt template

In [20]:
# Independent single-column frames for the evaluator: source text, model output, and the
# reference summary (only reference_summary_2 is used here).
df_input, df_output, df_reference = (
    llm_data[[column]].copy()
    for column in ('input_text', 'anthropic_generated_summary', 'reference_summary_2')
)

## Metrics configuration for evaluation

In [21]:
# Summarization metric group; every listed metric is registered with an empty options dict.
metric_config = {
    "configuration": {
        LLMTextMetricGroup.SUMMARIZATION.value: {
            metric.value: {}
            for metric in (
                LLMSummarizationMetrics.ROUGE_SCORE,
                LLMSummarizationMetrics.SARI,
                LLMSummarizationMetrics.METEOR,
                LLMSummarizationMetrics.NORMALIZED_RECALL,
                LLMSummarizationMetrics.NORMALIZED_PRECISION,
                LLMSummarizationMetrics.NORMALIZED_F1_SCORE,
                LLMSummarizationMetrics.COSINE_SIMILARITY,
                LLMSummarizationMetrics.JACCARD_SIMILARITY,
                LLMSummarizationMetrics.BLEU,
                LLMSummarizationMetrics.FLESCH,
            )
        }
    }
}

## Summarization Metrics Evaluation

In [22]:
import json
result = client.llm_metrics.compute_metrics(metric_config,sources = df_input, predictions = df_output, references = df_reference)

Please install adversarial-robustness-toolbox package
please install adversarial-robustness-toolbox package
please install adversarial-robustness-toolbox package
please install adversarial-robustness-toolbox package
please install adversarial-robustness-toolbox package
Please install `blanc` package
Please install watson_nlp package to detect PII information
Please install watson_nlp package to detect PII information
Please install watson_nlp package to detect PII information
Please install watson_nlp package to compute HAP score
Please install watson_nlp package to compute HAP score
Please install watson_nlp package to compute HAP score


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/12.1k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /home/wsuser/nltk_data...
[nltk_data] Downloading package punkt to /home/wsuser/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /home/wsuser/nltk_data...


## Evaluated Metrics

In [23]:
print(json.dumps(result,indent=2))

{
  "flesch": {
    "flesch_reading_ease": {
      "metric_value": 28.874000000000002,
      "mean": 28.874000000000002,
      "min": 7.86,
      "max": 45.76,
      "std": 11.491078452434305
    },
    "flesch_kincaid_grade": {
      "metric_value": 13.66,
      "mean": 13.66,
      "min": 11.1,
      "max": 17.4,
      "std": 1.787288449019911
    }
  },
  "bleu": {
    "precisions": [
      0.45685279187817257,
      0.24064171122994651,
      0.15254237288135594,
      0.10179640718562874
    ],
    "brevity_penalty": 1.0,
    "length_ratio": 1.1257142857142857,
    "translation_length": 197,
    "reference_length": 175,
    "metric_value": 0.20326730756556333
  },
  "sari": {
    "metric_value": 47.6455102721839
  },
  "rouge_score": {
    "rouge1": {
      "metric_value": 0.4922
    },
    "rouge2": {
      "metric_value": 0.274
    },
    "rougeL": {
      "metric_value": 0.4396
    },
    "rougeLsum": {
      "metric_value": 0.4396
    }
  },
  "normalized_recall": {
    "metri

# Evaluating Content Generation output from the Foundation Model

## Test data containing the content generation output from model and the reference data

In [24]:
!rm -fr llm_content_generation.csv
!wget "https://raw.githubusercontent.com/ravichamarthy/custom_metrics/main/llm_content_generation.csv"

--2023-12-05 13:13:21--  https://raw.githubusercontent.com/ravichamarthy/custom_metrics/main/llm_content_generation.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11794 (12K) [text/plain]
Saving to: ‘llm_content_generation.csv’


2023-12-05 13:13:22 (16.7 MB/s) - ‘llm_content_generation.csv’ saved [11794/11794]



In [25]:
data = pd.read_csv("llm_content_generation.csv")
data.head()

Unnamed: 0,question,generated_text,reference_text
0,What are the benefits of regular exercise?,"Regular exercise has numerous benefits, includ...","Regular exercise has numerous benefits, includ..."
1,What is the process of photosynthesis?,Photosynthesis is the process by which plants ...,Photosynthesis is the process by which plants ...
2,What are the key features of a smartphone?,A smartphone is a mobile device that typically...,A smartphone is a mobile device that typically...
3,How does the immune system work?,The immune system is a complex network of cell...,The immune system is a complex network of cell...
4,What is the capital of France?,"The capital of France is Paris, which is known...","The capital of France is Paris, which is known..."


In [26]:
# Independent single-column frames for the evaluator: question, generated text, reference text.
df_input, df_output, df_reference = (
    data[[column]].copy()
    for column in ('question', 'generated_text', 'reference_text')
)

## Metrics configuration for evaluation

In [27]:
metric_config = {
    # Common parameters go here
    "configuration": {
        # Generation metric group; each metric is registered with an empty options dict.
        LLMTextMetricGroup.GENERATION.value: {
            metric.value: {}
            for metric in (
                LLMGenerationMetrics.BLEU,
                LLMGenerationMetrics.ROUGE_SCORE,
                LLMGenerationMetrics.FLESCH,
                LLMGenerationMetrics.METEOR,
                LLMGenerationMetrics.NORMALIZED_RECALL,
                LLMGenerationMetrics.NORMALIZED_PRECISION,
                LLMGenerationMetrics.NORMALIZED_F1_SCORE,
            )
        }
    }
}

## Content Generation Metrics Evaluation

In [28]:
result = client.llm_metrics.compute_metrics(metric_config,df_input,df_output, df_reference)

[nltk_data] Downloading package wordnet to /home/wsuser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/wsuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/wsuser/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Evaluated Metrics

In [29]:
print(json.dumps(result,indent=2))

{
  "flesch": {
    "flesch_reading_ease": {
      "metric_value": 39.10217391304347,
      "mean": 39.10217391304347,
      "min": -11.44,
      "max": 69.62,
      "std": 20.153544505710833
    },
    "flesch_kincaid_grade": {
      "metric_value": 12.673913043478263,
      "mean": 12.673913043478263,
      "min": 8.0,
      "max": 18.6,
      "std": 3.2043743730833554
    }
  },
  "bleu": {
    "precisions": [
      1.0,
      0.9949174078780177,
      0.9947643979057592,
      0.9946018893387314
    ],
    "brevity_penalty": 0.7138823993242189,
    "length_ratio": 0.7479224376731302,
    "translation_length": 810,
    "reference_length": 1083,
    "metric_value": 0.711075655695426
  },
  "rouge_score": {
    "rouge1": {
      "metric_value": 0.8451
    },
    "rouge2": {
      "metric_value": 0.8402
    },
    "rougeL": {
      "metric_value": 0.8451
    },
    "rougeLsum": {
      "metric_value": 0.8451
    }
  },
  "normalized_recall": {
    "metric_value": 0.7335547397366776,
  

# Evaluating Question Answering output from the Foundation Model

## Test data containing the question and answer output from model and the reference data

In [30]:
!rm -fr llm_content_qa.csv
!wget "https://raw.githubusercontent.com/ravichamarthy/custom_metrics/main/llm_content_qa.csv"

--2023-12-05 13:13:30--  https://raw.githubusercontent.com/ravichamarthy/custom_metrics/main/llm_content_qa.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3109 (3.0K) [text/plain]
Saving to: ‘llm_content_qa.csv’


2023-12-05 13:13:31 (11.9 MB/s) - ‘llm_content_qa.csv’ saved [3109/3109]



In [31]:
data = pd.read_csv("llm_content_qa.csv")
data.head()

Unnamed: 0,question,answers
0,who did chris carter play for last year,Milwaukee Brewers
1,what is the latest version of safari on mac,Safari 11
2,when did bucharest become the capital of romania,1862
3,who did jeffrey dean morgan play on supernatural,John Eric Winchester
4,who is the shortest man that ever lived,Chandra Bahadur Dangi


In [32]:
# Independent single-column frames for the evaluator. NOTE: predictions and references are
# both taken from the 'answers' column, so the metrics computed from these frames compare
# the reference answers with themselves.
df_input, df_output, df_reference = (
    data[[column]].copy()
    for column in ('question', 'answers', 'answers')
)

## Metrics configuration for evaluation

In [33]:
metric_config = {
    # Common parameters go here
    "configuration": {
        # QA metric group; each metric is registered with an empty options dict.
        LLMTextMetricGroup.QA.value: {
            metric.value: {}
            for metric in (
                LLMQAMetrics.EXACT_MATCH,
                LLMQAMetrics.ROUGE_SCORE,
                LLMQAMetrics.BLEU,
            )
        }
    }
}

## Question Answering Metrics Evaluation

In [34]:
result = client.llm_metrics.compute_metrics(metric_config,df_input,df_output, df_reference)

Downloading builder script:   0%|          | 0.00/5.67k [00:00<?, ?B/s]

## Evaluated Metrics

In [35]:
print(json.dumps(result,indent=2))

{
  "exact_match": {
    "metric_value": 1.0
  },
  "bleu": {
    "precisions": [
      1.0,
      1.0,
      1.0,
      1.0
    ],
    "brevity_penalty": 1.0,
    "length_ratio": 1.0,
    "translation_length": 133,
    "reference_length": 133,
    "metric_value": 1.0
  },
  "rouge_score": {
    "rouge1": {
      "metric_value": 1.0
    },
    "rouge2": {
      "metric_value": 0.74
    },
    "rougeL": {
      "metric_value": 1.0
    },
    "rougeLsum": {
      "metric_value": 1.0
    }
  }
}


# Evaluating Text Classification output from the Foundation Model

## Test data containing the text classification output from model and the reference data

In [36]:
!rm -fr llm_content_classification.csv
!wget "https://raw.githubusercontent.com/ravichamarthy/custom_metrics/main/llm_content_classification.csv"

--2023-12-05 13:13:37--  https://raw.githubusercontent.com/ravichamarthy/custom_metrics/main/llm_content_classification.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 480803 (470K) [text/plain]
Saving to: ‘llm_content_classification.csv’


2023-12-05 13:13:37 (57.6 MB/s) - ‘llm_content_classification.csv’ saved [480803/480803]



In [37]:
data = pd.read_csv("llm_content_classification.csv")
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [38]:
# Encode the string labels as integers in place: 'ham' -> 0, 'spam' -> 1.
data['label'] = data['label'].replace({'ham': 0, 'spam': 1})

In [39]:
# Independent single-column frames for the evaluator. Predictions and references both start
# out as copies of the same 'label' column.
df_input, df_output, df_reference = (
    data[[column]].copy()
    for column in ('text', 'label', 'label')
)

## Create a more realistic reference column by shuffling the labels

In [40]:
# Shuffle the reference labels so predictions and references no longer match row for row.
# The fixed random_state makes the shuffle, and therefore the reported metric values,
# reproducible across runs; reset_index realigns the shuffled values with the original rows.
shuffled_column = df_reference['label'].sample(frac=1, random_state=42).reset_index(drop=True)
df_reference['label'] = shuffled_column

## Metrics configuration for evaluation

In [41]:
metric_config = {
    # Common parameters go here
    "configuration": {
        # Classification metric group; each metric is registered with an empty options dict.
        LLMTextMetricGroup.CLASSIFICATION.value: {
            metric.value: {}
            for metric in (
                LLMClassificationMetrics.ACCURACY,
                LLMClassificationMetrics.PRECISION,
                LLMClassificationMetrics.RECALL,
                LLMClassificationMetrics.F1_SCORE,
                LLMClassificationMetrics.MATTHEWS_CORRELATION,
            )
        }
    }
}

## Text Classification Metrics Evaluation

In [42]:
result = client.llm_metrics.compute_metrics(metric_config,df_input,df_output, df_reference)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.60k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

## Evaluated Metrics

In [43]:
print(json.dumps(result,indent=2))

{
  "accuracy": {
    "accuracy": 0.7674084709260589
  },
  "precision": {
    "precision": 0.13253012048192772
  },
  "matthews_correlation": {
    "matthews_correlation": -0.001770397652787315
  },
  "recall": {
    "recall": 0.13253012048192772
  },
  "f1": {
    "f1": 0.13253012048192772
  }
}


Author: kishore.patel@in.ibm.com , ravi.chamarthy@in.ibm.com