# Evaluating how well a language model performs

In [1]:
import openai
from pprint import pprint
import math

In [2]:
from rouge import Rouge
import sacrebleu

In [3]:
# Set your OpenAI API key here
openai.api_key = ''

## (1) Checking model performance using next token probabilities

In [4]:
client = openai.OpenAI()

In [5]:
def get_token_probabilities(prompt, model="davinci-002", max_tokens=3, logprobs=5):
    """
    Get token probabilities for a given prompt using the OpenAI API.
    
    Parameters:
    - prompt: str. The input text to generate completions for.
    - model: str, optional. The model to use for generating completions.
    - max_tokens: int, optional. The maximum number of tokens to generate.
    - logprobs: int, optional. The number of log probabilities to return for each token.
    
    Returns:
    - response: dict. The API response containing token probabilities.
    """
    try:
        response = client.completions.create(
            model=model,
            prompt=prompt,
            max_tokens=max_tokens,
            logprobs=logprobs
        )
        return response
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage
prompt_text = "The quick brown fox jumps over the lazy"
response = get_token_probabilities(prompt_text)
print(response)

Completion(id='cmpl-9ZHHS0d7zWKhpcglACeLyBhePNu9m', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=Logprobs(text_offset=[39, 43, 44], token_logprobs=[-0.03429916, -2.5415063, -1.0932422], tokens=[' dog', '"', ' \n'], top_logprobs=[{' dog': -0.03429916, ' dogs': -4.873874, ' cat': -5.661428, ' brown': -5.7598166, ' d': -6.34283}, {'"': -2.5415063, '\n\n': -1.8044438, '.\n\n': -2.2798486, '”': -2.6530614, '.': -2.7223058}, {' \n': -1.0932422, ' has': -2.759645, ' is': -2.967853, ' -': -3.0105128, ' in': -3.1142306}]), text=' dog" \n')], created=1718194934, model='davinci-002', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=3, prompt_tokens=8, total_tokens=11))


In [6]:
choices = response.choices

In [7]:
 if choices:
    # Focus on the first choice's 'logprobs' for simplicity
    logprobs = choices[0].logprobs
    if logprobs:
        # Iterate over all tokens and probabilities
        for token, token_probs in zip(logprobs.tokens, logprobs.top_logprobs):
            # Print each token and its highest probability
            print(f"Token: {token}")
            for prob_token, prob_value in token_probs.items():
                print(f"  {prob_token}: {math.exp(prob_value):.4f}")
            print()  

Token:  dog
   dog: 0.9663
   dogs: 0.0076
   cat: 0.0035
   brown: 0.0032
   d: 0.0018

Token: "
  ": 0.0787
  

: 0.1646
  .

: 0.1023
  ”: 0.0704
  .: 0.0657

Token:  

   
: 0.3351
   has: 0.0633
   is: 0.0514
   -: 0.0493
   in: 0.0444



## (2) Calculating ROUGE scores

ROUGE-N is a measure of **the overlap of n-grams between the generated text and the reference text.**

* ROUGE-1: Overlap of unigrams (individual words).
* ROUGE-2: Overlap of bigrams (two-word pairs).
* ROUGE-N: Overlap of n-grams (where n can be any number).

"ROUGE, or Recall-Oriented Understudy for Gisting Evaluation, is a set of metrics and a software package used for evaluating automatic summarization and machine translation software in natural language processing. The metrics compare an automatically produced summary or translation against a reference or a set of references (human-produced) summary or translation." [source](https://huggingface.co/spaces/evaluate-metric/rouge)

In [8]:
def generate_text(prompt, model="davinci-002", max_tokens=10):
    """
    Generate text using the OpenAI API.
    
    Parameters:
    - prompt: str. The input text to generate completions for.
    - model: str, optional. The model to use for generating completions.
    - max_tokens: int, optional. The maximum number of tokens to generate.
    
    Returns:
    - generated_text: str. The text generated by the model.
    """
    try:
        response = client.completions.create(
            model=model,
            prompt=prompt,
            max_tokens=max_tokens
        )
        generated_text = response.choices[0].text.strip()
        return generated_text
    except Exception as e:
        print(f"An error occurred: {e}")
        return ""

In [9]:
def evaluate_rouge(generated_text, reference_text):
    """
    Evaluate the ROUGE score between generated and reference texts.
    
    Parameters:
    - generated_text: str. The text generated by the model.
    - reference_text: str. The reference text to compare against.
    
    Returns:
    - rouge_scores: dict. The ROUGE scores.
    """
    rouge = Rouge()
    scores = rouge.get_scores(generated_text, reference_text, avg=True)
    return scores

In [10]:
# Example usage
prompt_text = "The quick brown fox"
reference_text = "The quick brown fox jumps over the lazy dog."

In [11]:
# Generate text using GPT-3
generated_text = generate_text(prompt_text)
print(generated_text)

jumped over the lazy dog"

The spell check of


In [12]:
# Evaluate ROUGE score
rouge_scores = evaluate_rouge(generated_text, reference_text)

print("Generated Text:", generated_text)
print("Reference Text:", reference_text)
print("ROUGE Scores:", rouge_scores)

Generated Text: jumped over the lazy dog"

The spell check of
Reference Text: The quick brown fox jumps over the lazy dog.
ROUGE Scores: {'rouge-1': {'r': 0.4444444444444444, 'p': 0.4444444444444444, 'f': 0.44444443944444445}, 'rouge-2': {'r': 0.25, 'p': 0.25, 'f': 0.24999999500000009}, 'rouge-l': {'r': 0.3333333333333333, 'p': 0.3333333333333333, 'f': 0.3333333283333334}}


## (3) Calculating BLEU scores

Definition: "BLEU evaluates the degree to which the text generated by the LLM is comparable to human-written material, going beyond simple factual accuracy.

It contrasts the produced text with professionally translated human references. Through the analysis of factors such as n-gram overlap (the frequency with which word sequences occur in both the produced text and that reference—BLEU generates a score that accounts for readability, fluency, and grammatical accuracy." [source](https://www.labellerr.com/blog/evaluating-large-language-models/)

In [13]:
def evaluate_bleu(generated_text, reference_text):
    """
    Evaluate the BLEU score between generated and reference texts.
    
    Parameters:
    - generated_text: str. The text generated by the model.
    - reference_text: str. The reference text to compare against.
    
    Returns:
    - bleu_score: float. The BLEU score.
    """
    bleu = sacrebleu.corpus_bleu([generated_text], [[reference_text]])
    return bleu.score

In [14]:
# Evaluate BLEU score
bleu_score = evaluate_bleu(generated_text, reference_text)
print("Generated Text:", generated_text)
print("Reference Text:", reference_text)
print("BLEU Score:", bleu_score)

Generated Text: jumped over the lazy dog"

The spell check of
Reference Text: The quick brown fox jumps over the lazy dog.
BLEU Score: 27.77619034011791


## (4) Evaluating perplexity 

Definition: "[P]erplexity measures the confidence a model has in its (next word) predictions. The concept of perplexity measures how confused the model is in predicting the next word in a sequence.

Lower perplexity indicates that the model is more certain about its predictions. In comparison, higher perplexity suggests the model is more uncertain. Perplexity is a crucial metric for evaluating the performance of language models in tasks like machine translation, speech recognition, and text generation." [source](https://blog.uptrain.ai/decoding-perplexity-and-its-significance-in-llms/)

In [15]:
def calculate_perplexity(text, model="davinci-002"):
    # Tokenize the input text
    tokens = client.completions.create(
        model=model,
        prompt=text,
        max_tokens=0,
        logprobs=1,
        echo=True
    ).choices[0].logprobs.tokens

    # Get log probabilities of the tokens
    logprobs = client.completions.create(
        model=model,
        prompt=tokens,
        max_tokens=1,
        logprobs=1
    ).choices[0].logprobs.token_logprobs

    # Calculate perplexity
    N = len(logprobs)
    log_sum = sum(logprobs)
    perplexity = math.exp(-log_sum / N)
    
    return perplexity

In [17]:
prompt = "Once upon a time"
generated_text = generate_text(prompt)
print("Generated Text:")
print(generated_text)

perplexity = calculate_perplexity(generated_text)
print("\nPerplexity of the generated text:", perplexity)

Generated Text:
, iPhone costs $8 billion to make, and

Perplexity of the generated text: 27.9051204315316


In [18]:
prompt = "Once upon a time"
generated_text = generate_text(prompt)
print("Generated Text:")
print(generated_text)

perplexity = calculate_perplexity(generated_text)
print("\nPerplexity of the generated text:", perplexity)

Generated Text:
, sports betting online existed solely […]

In this V

Perplexity of the generated text: 15970.29958070121
