In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from rouge import Rouge
import torch
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm
2023-07-23 13:16:32.535296: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
model_name = 'microsoft/DialoGPT-large'

tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')
model = AutoModelForCausalLM.from_pretrained(model_name)

In [3]:
def get_response_and_scores(prompt, reference):
    # Encode & Decode
    inputs = tokenizer.encode(prompt + tokenizer.eos_token, return_tensors="pt")
    outputs = model.generate(inputs, max_length=50, num_return_sequences=1, temperature=0.7)
    response = tokenizer.decode(outputs[0], skip_special_tokens=False)

    input_ids = tokenizer.encode(response, return_tensors='pt')

    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
    loss = outputs.loss
    perplexity = torch.exp(loss)

    rouge = Rouge()
    scores = rouge.get_scores(response, reference)

    df = pd.DataFrame({
        'Metric': ['ROUGE-1', 'ROUGE-2', 'ROUGE-L'],
        'Recall': [scores[0]['rouge-1']['r'], scores[0]['rouge-2']['r'], scores[0]['rouge-l']['r']],
        'Precision': [scores[0]['rouge-1']['p'], scores[0]['rouge-2']['p'], scores[0]['rouge-l']['p']],
        'F1 Score': [scores[0]['rouge-1']['f'], scores[0]['rouge-2']['f'], scores[0]['rouge-l']['f']],
    })

    print(f'ROUGE scores\n{df}')

    return response, perplexity.item(), scores

In [4]:
new_prompt = 'Imagine you are an expert on AI. Explain the impact of AI.'
reference_text = 'AI has a significant impact on various fields, including healthcare, transportation, and finance. It can automate tasks, improve decision-making, and provide new insights from data.'

response, perplexity, rouge_scores = get_response_and_scores(new_prompt, reference_text)
print('\nResponse:', response)
print('Perplexity:', perplexity)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


ROUGE scores
    Metric    Recall  Precision  F1 Score
0  ROUGE-1  0.208333   0.178571  0.192308
1  ROUGE-2  0.000000   0.000000  0.000000
2  ROUGE-L  0.125000   0.107143  0.115385

Response: Imagine you are an expert on AI. Explain the impact of AI.<|endoftext|>I'm not an expert on AI, but I can tell you that it's not a good idea to use AI to make money.<|endoftext|>
Perplexity: 31.15668296813965
