In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
sys.path.append('../src')

from lm_polygraph.utils.model import WhiteboxModel
from lm_polygraph.estimators import *
from lm_polygraph.stat_calculators import *
from lm_polygraph.utils.openai_chat import OpenAIChat
from lm_polygraph.utils.deberta import MultilingualDeberta

In [3]:
# Candidate model identifiers. Only `yi6b` is passed to the loader below;
# the others are kept so the notebook can be switched to another checkpoint.
yi6b = "01-ai/Yi-6B"
yi6bchat = "01-ai/Yi-6B-Chat"
t5base = "google-t5/t5-base"
bloomz560m = "bigscience/bloomz-560m"  # poor quality but fast calculations
bloomz = bloomz560m  # second name for the same identifier

# Load the white-box model used by every calculator in this notebook,
# passing add_bos_token=False to the loader.
model = WhiteboxModel.from_pretrained(yi6b, add_bos_token=False)

In [4]:
# Single Chinese prompt (roughly: "Please introduce Cao Cao.").
texts = ["请介绍一下曹操。"]

# Accumulator for all statistics; every calculator below reads from it and
# its returned dict is merged back in.
stat = {}

# MBZ Budget
# Only fall back to the placeholder when OPENAI_KEY is not already set, so a
# real key exported in the environment is never overwritten by this cell.
# Replace "YOUR_API_KEY" (or export OPENAI_KEY) before running the OpenAI steps.
os.environ.setdefault("OPENAI_KEY", "YOUR_API_KEY")

# Run the calculators in order; each one receives the statistics gathered so far.
for calculator in [
    GreedyProbsCalculator(),
    EntropyCalculator(),
    GreedyLMProbsCalculator(),
]:
    stat.update(calculator(stat, texts, model))

# Extract claims from the generation with a gpt-4o backed extractor configured
# for Chinese (language="zh"), and merge its output into `stat` as well.
claim_extractor = ClaimsExtractor(OpenAIChat("gpt-4o"), language="zh")
stat.update(claim_extractor(stat, texts, model))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [5]:
# List every statistic collected in `stat` so far.
stat.keys()

dict_keys(['input_texts', 'input_tokens', 'greedy_log_probs', 'greedy_tokens', 'greedy_tokens_alternatives', 'greedy_texts', 'greedy_log_likelihoods', 'embeddings_decoder', 'entropy', 'greedy_lm_log_probs', 'greedy_lm_log_likelihoods', 'claims', 'claim_texts_concatenated', 'claim_input_texts_concatenated'])

In [6]:
# Display the extracted claims: a list with one inner list of Claim objects per input text.
stat['claims']

[[Claim(claim_text='曹操是三国时期的政治家。', sentence=' 曹操是三国时期最伟大的政治家', aligned_tokens=[1, 2, 3, 4, 7]),
  Claim(claim_text='曹操是三国时期最伟大的政治家。', sentence=' 曹操是三国时期最伟大的政治家', aligned_tokens=[1, 2, 3, 4, 5, 6, 7]),
  Claim(claim_text='他一生中为官。', sentence='他一生中为官，为官期间，他多次被封为侯，并多次被封为侯爵', aligned_tokens=[9, 10, 11, 12, 13]),
  Claim(claim_text='他多次被封为侯。', sentence='他一生中为官，为官期间，他多次被封为侯，并多次被封为侯爵', aligned_tokens=[9, 20, 21, 22, 23]),
  Claim(claim_text='他多次被封为侯爵。', sentence='他一生中为官，为官期间，他多次被封为侯，并多次被封为侯爵', aligned_tokens=[9, 20, 21, 22, 23, 30]),
  Claim(claim_text='曹操一生中为官。', sentence='曹操一生中为官，为官期间，他多次被封为侯，并多次被封为侯爵', aligned_tokens=[32, 33, 34, 35, 36]),
  Claim(claim_text='为官期间，曹操多次被封为侯。', sentence='曹操一生中为官，为官期间，他多次被封为侯，并多次被封为侯爵', aligned_tokens=[32, 35, 36, 38, 39, 40, 43, 44, 45, 46]),
  Claim(claim_text='为官期间，曹操多次被封为侯爵。', sentence='曹操一生中为官，为官期间，他多次被封为侯，并多次被封为侯爵', aligned_tokens=[32, 35, 36, 40, 43, 44, 45, 46, 53])]]

In [7]:
# Print the greedy generation for the first (and only) input text.
print("Output:", stat["greedy_texts"][0])

Output:  曹操是三国时期最伟大的政治家。他一生中为官，为官期间，他多次被封为侯，并多次被封为侯爵。曹操一生中为官，为官期间，他多次被封为侯，并多次被封为侯爵。


In [8]:
# Show the greedy generation for the first input, followed by a blank line.
print("Output:", stat["greedy_texts"][0], end="\n\n")

# Print each extracted claim together with its aligned_tokens list,
# leaving one blank line after every claim.
for claim in stat["claims"][0]:
    print("claim:", claim.claim_text)
    print("aligned tokens:", claim.aligned_tokens, end="\n\n")

Output:  曹操是三国时期最伟大的政治家。他一生中为官，为官期间，他多次被封为侯，并多次被封为侯爵。曹操一生中为官，为官期间，他多次被封为侯，并多次被封为侯爵。

claim: 曹操是三国时期的政治家。
aligned tokens: [1, 2, 3, 4, 7]

claim: 曹操是三国时期最伟大的政治家。
aligned tokens: [1, 2, 3, 4, 5, 6, 7]

claim: 他一生中为官。
aligned tokens: [9, 10, 11, 12, 13]

claim: 他多次被封为侯。
aligned tokens: [9, 20, 21, 22, 23]

claim: 他多次被封为侯爵。
aligned tokens: [9, 20, 21, 22, 23, 30]

claim: 曹操一生中为官。
aligned tokens: [32, 33, 34, 35, 36]

claim: 为官期间，曹操多次被封为侯。
aligned tokens: [32, 35, 36, 38, 39, 40, 43, 44, 45, 46]

claim: 为官期间，曹操多次被封为侯爵。
aligned tokens: [32, 35, 36, 40, 43, 44, 45, 46, 53]



In [9]:
# Maximum Claim Probability: build the estimator and apply it to the collected statistics.
max_prob = MaximumClaimProbability()
max_prob(stat)  # One uncertainty value per claim; the higher the value, the less certain the claim

[[5.517868,
  8.203161,
  11.508308,
  10.462716,
  11.252705,
  7.277602,
  4.004912,
  3.6238604]]

In [10]:
# Perplexity: build the claim-level perplexity estimator and apply it to the collected statistics.
# NOTE(review): the variable name below is misspelled ("perlexity"); it is left
# unchanged here so that any other reference to it keeps working.
perlexity_claim = PerplexityClaim()
perlexity_claim (stat)  

[[1.1035736,
  1.1718801,
  2.3016617,
  2.0925431,
  1.8754507,
  1.4555204,
  0.40049118,
  0.40265116]]

In [11]:
# Maximum Token Entropy: build the estimator and apply it to the collected
# statistics; the displayed result holds one value per claim.
max_token_ent = MaxTokenEntropyClaim()
max_token_ent(stat)  

[[2.0959154e-05,
  2.0959154e-05,
  2.4157525e-05,
  2.5509149e-05,
  2.5509149e-05,
  2.243754e-05,
  2.126221e-05,
  2.126221e-05]]

In [12]:
# Pointwise Mutual Information: build the estimator and apply it to the
# collected statistics; the displayed result holds one value per claim.
pmi = PointwiseMutualInformationClaim()
pmi (stat)  

[[-31.81642296910286,
  -45.93048223853111,
  -30.54413414001465,
  -29.65266525745392,
  -36.55968350172043,
  -34.77716279029846,
  -109.19080419000238,
  -97.22904723981628]]

In [13]:
# P(True) at claim level.
# Build a PromptCalculator from a prompt template with {q} and {a} placeholders,
# the string "True", and the name "p_true_claim". The input text is taken from
# stat["claim_input_texts_concatenated"] and the generation text from
# stat["claim_texts_concatenated"] (both produced by the claim extractor);
# no sample-text dependency is used.
# NOTE(review): the second and third positional arguments are presumably the
# expected answer token and the output statistic key — confirm against
# PromptCalculator's signature.
# NOTE(review): the template has a space after each "\n" and a trailing space;
# left untouched because changing the prompt changes what the model sees.
p_true_calculator_stat = PromptCalculator(
    "Question: {q}\n Possible answer:{a}\n "
    "Is the possible answer True or False? The possible answer is: ",
    "True",
    "p_true_claim",
    input_text_dependency="claim_input_texts_concatenated",
    sample_text_dependency=None,
    generation_text_dependency="claim_texts_concatenated",
)

# Merge the calculator's output into `stat` so the estimator below can read it.
stat.update(p_true_calculator_stat(stat, texts, model))

# Apply the claim-level P(True) estimator; the displayed result holds one value per claim.
ptrue_claim = PTrueClaim()
ptrue_claim (stat)  

[[9.566464,
  9.849717,
  10.163636,
  11.212116,
  11.415592,
  9.818885,
  10.598446,
  10.781264]]

In [14]:
# Claim-Conditioned Probability (CCP).
# First add the NLI statistics over greedy-token alternatives, computed with a
# multilingual DeBERTa model, to `stat`.
calculator = GreedyAlternativesNLICalculator(MultilingualDeberta())
stat.update(calculator(stat, texts, model))

# Then apply the claim-level CCP estimator; one value per claim is displayed.
ccp = ClaimConditionedProbabilityClaim()
ccp(stat)

[[-0.3098740519439675,
  -0.25099232306072716,
  -0.6779380193045293,
  -0.45294784600333193,
  -0.4083827155215617,
  -0.8225784658523201,
  -0.8718909106823165,
  -0.8765211222672762]]

In [15]:
# CCP variant using the "fact_pref" NLI context.
# First add the fact-pref NLI statistics over greedy-token alternatives,
# computed with a multilingual DeBERTa model, to `stat`.
calculator = GreedyAlternativesFactPrefNLICalculator(MultilingualDeberta())
stat.update(calculator(stat, texts, model))

# Then apply the CCP estimator configured with nli_context="fact_pref".
ccp_no_cxt = ClaimConditionedProbabilityClaim(nli_context="fact_pref")
ccp_no_cxt(stat)

[[-0.5257379541381006,
  -0.5118784852870085,
  -0.6496051811740936,
  -0.801711036128341,
  -0.801711036128341,
  -0.8218088030097428,
  -0.8774868957560673,
  -0.8775706039132196]]

In [16]:
# Fact-check the extracted claims with an OpenAI model (gpt-4o, language="zh").
# NOTE(review): this import sits in the middle of the notebook; consider moving
# it to the import cell at the top so all dependencies are visible in one place.
from lm_polygraph.generation_metrics.openai_fact_check import OpenAIFactCheck
chinese_checker = OpenAIFactCheck('gpt-4o', language="zh")
# Only `stat` is supplied; the two remaining positional arguments are passed as
# None (presumably they are not needed by this metric — TODO confirm).
chatgpt_response = chinese_checker(stat, None, None)

In [17]:
# Display the fact-check result: one entry per claim for the single input text.
# The saved output contains 0, 1 and nan; their exact meaning is defined by
# OpenAIFactCheck (TODO confirm).
chatgpt_response

[[0, nan, 1, 1, 0, 0, 1, 0]]