## Setup (please don't re-run; it will go out of memory (OOM) if run twice without restarting the kernel)

In [1]:
# Imports
import logging
from cacher import CachedPromptModel
from logging_config import setup_logging
from config import ExperimentConfig
from llm_wrapper import LLMWrapper
from metrics import MetricsCalculator
from qa_manager import QAManager
import pprint


# Load data
logger = logging.getLogger(__name__)
config = ExperimentConfig.from_file("config.yaml")
# Override the temperature from config.yaml for this notebook.
# NOTE(review): what `post_hoc_temperature` controls is defined in MetricsCalculator, not here.
config.post_hoc_temperature = 5.

# Load the LLM only once per kernel. The checkpoint shards fetched by this step total
# roughly 15 GB, and constructing a second wrapper while the first one is still referenced
# presumably is what exhausts memory when this cell is run twice (TODO confirm).
# Re-running the cell now reuses the already-loaded, already-wrapped model.
# To load a different `config.model_name`, restart the kernel first.
if "llm_wrapper" not in globals():
    llm_wrapper = LLMWrapper(model_name=config.model_name)
    # Swap the raw model for the caching wrapper, placed on the GPU.
    llm_wrapper.model = CachedPromptModel(llm_wrapper.model, device="cuda")

metric_calculator = MetricsCalculator(config=config, llm_wrapper=llm_wrapper)

# One QAManager per edge-case file; the variable suffix names the answer distribution
# that the corresponding notebook section exercises.
qa_manager_almostdirac = QAManager("edgecase_files/edgecase_majority_almostdirac.json")
qa_manager_dirac = QAManager("edgecase_files/edgecase_wording_dirac.json")
qa_manager_uniform = QAManager("edgecase_files/edgecase_idk_cases.json")


# Precompute reference answers
# For every edge-case file, cache one entry per question (indices 0-4) on the manager as
# [question, answers, best_summary, precompute]; calc_kl_div later reads slots 0 and 3.
for qa_manager in (qa_manager_almostdirac, qa_manager_dirac, qa_manager_uniform):
    qa_manager.cached_task = []
    for question_idx in range(5):
        question = qa_manager.get_question_text(question_idx)
        answers = qa_manager.get_answers(question_idx)
        summaries = qa_manager.get_summaries(question_idx)
        # Prefer the "pct" summary; fall back to "good" when it is missing or falsy.
        best_summary = summaries.get("pct") or summaries.get("good")
        # The answer-side part of the metric does not depend on the summary being scored,
        # so it is computed once here and reused for every custom summary.
        precompute = metric_calculator.precompute_masked_out_infilling_with_answers(
            question=question, answers=answers
        )
        qa_manager.cached_task.append(
            [question, answers, best_summary, precompute]
        )


# Calculate and print how good a user provided summary is
def calc_kl_div(summary, qa_manager, question_idx=0):
    """Score a summary against the cached answers of one question and print a report.

    Args:
        summary: The summary text to evaluate.
        qa_manager: A QA manager whose ``cached_task`` list was filled by the
            precompute step; entry ``[question_idx]`` must hold the question text
            at position 0 and the precomputed answer-side results at position 3.
        question_idx: Index into ``qa_manager.cached_task``. Defaults to 0.

    Returns:
        The unmodified result dict returned by
        ``metric_calculator.calculate_masked_out_infilling_for_summary``.

    Side effects:
        Prints the overall score (``result["KL"]``) followed by one line per
        ``answer_id`` with that answer's summed KL divergence divided by the total
        number of masked tokens over *all* answers.
    """
    task = qa_manager.cached_task[question_idx]
    infilling_results = metric_calculator.calculate_masked_out_infilling_for_summary(
        question=task[0],
        summary=summary,
        summary_label="",
        precomputed_with_answers=task[3],
    )

    # Pull the score out first: reusing double quotes inside an f-string replacement
    # field is a SyntaxError on Python < 3.12.
    kl_score = infilling_results["KL"]
    print(f"Our metric scores your summary as {kl_score:.3f} (lower = better).\n\n")
    print("Here's your score broken down into the distance to each sample of the LLM distribution:\n")

    # answer_id -> [summed KL divergence, answer text]
    all_res = {}
    n_tokens = 0
    for answer in infilling_results["detailed_logging"]["answers"]:
        masked = answer["masked_words"]
        answer_idx = masked["answer_id"]
        if answer_idx not in all_res:
            all_res[answer_idx] = [0., answer["answer_text"]]
        all_res[answer_idx][0] += masked["KL_div_sum"]
        n_tokens += len(masked["KL_div_per_token"])

    for kl_sum, answer_text in all_res.values():
        # Normalise by the global token count (not per answer).
        # Report NaN instead of raising ZeroDivisionError when no tokens were masked.
        distance = kl_sum / n_tokens if n_tokens else float("nan")
        print(f"Distance {distance:.3f} for answer: {answer_text}")

    return infilling_results

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

## Questions with one majority and some minority answers

Feel free to edit the `custom_summary`, then run the cell, and see if you can beat the reference score (lower = better). If you feel that our metric gave your summary a better score than the reference summary even though it shouldn't have, send me (Michael Kirchhof) a Slack message with a screenshot!

Feel free to edit anything; the original state is backed up in git :)

In [2]:
# Playground cell: edit `custom_summary`, re-run, and compare with the reference (lower = better).
# Reference score: 0.032
# Reference summary: I'm 70% sure that Henri Becquerel won the first Nobel Prize in Physics in 1903, but it could also have been Wilhelm Conrad Röntgen in 1901 (20% sure), or Hendrik Antoon Lorentz and Pieter Zeeman in 1902 (10% sure).

custom_summary = (
    "I'm 70% sure that Henri Becquerel won the first Nobel Prize in Physics in 1903, but it could also have been Wilhelm Conrad Röntgen in 1901 (20% sure), or Hendrik Antoon Lorentz and Pieter Zeeman in 1902 (10% sure)."
)
res = calc_kl_div(
    summary=custom_summary,
    qa_manager=qa_manager_almostdirac,
    question_idx=0,
)

Our metric scores your summary as 0.032 (lower = better).


Here's your score broken down into the distance to each sample of the LLM distribution:

Distance 0.002 for answer: Henri Becquerel won the first Nobel Prize in Physics in 1903.
Distance 0.004 for answer: The first Nobel Prize in Physics was awarded to Wilhelm Conrad Röntgen in 1901 for his discovery of X-rays.
Distance 0.002 for answer: Henri Becquerel won the first Nobel Prize in Physics in 1903.
Distance 0.006 for answer: Albert Einstein did not receive the first Nobel Prize in Physics. The first Nobel Prize in Physics was awarded to Wilhelm Conrad Röntgen in 1901 for his discovery of X-rays.
Distance 0.002 for answer: Henri Becquerel received the first Nobel Prize in Physics in 1903.
Distance 0.008 for answer: Hendrik Antoon Lorentz and Pieter Zeeman received the first Nobel Prize in Physics in 1902 for their work on the effect of magnetic fields on radiation emissions, commonly known as the Zeeman effect.
Distance 0.002 f

In [4]:
# Playground cell: edit `custom_summary`, re-run, and compare with the reference (lower = better).
# Reference score: 0.031
# Reference summary: I'm 70% sure that the first African American Air Force unit, the 99th Pursuit Squadron (which later became part of the Tuskegee Airmen), trained at the Tuskegee Army Air Field in Tuskegee, Alabama, but it could also have been the Selfridge Field in Michigan (30% sure).

custom_summary = (
    "I'm 70% sure that the first African American Air Force unit, the 99th Pursuit Squadron (which later became part of the Tuskegee Airmen), trained at the Tuskegee Army Air Field in Tuskegee, Alabama, but it could also have been the Selfridge Field in Michigan (30% sure)."
)
res = calc_kl_div(
    summary=custom_summary,
    qa_manager=qa_manager_almostdirac,
    question_idx=2,
)

Our metric scores your summary as 0.031 (lower = better).


Here's your score broken down into the distance to each sample of the LLM distribution:

Distance 0.002 for answer: The first African American air force unit, the 99th Pursuit Squadron, trained at Tuskegee, Alabama.
Distance 0.003 for answer: The first African American air force unit, the 99th Pursuit Squadron, trained at Selfridge Field in Romeo, Michigan.
Distance 0.003 for answer: The first African American air force unit, the 99th Pursuit Squadron (later part of the famed Tuskegee Airmen), trained at Tuskegee Army Air Field in Alabama.
Distance 0.004 for answer: The first African American air force unit, the 99th Pursuit Squadron (later part of the Tuskegee Airmen), trained at Moton Field and Tuskegee Army Air Field in Tuskegee, Alabama.
Distance 0.003 for answer: The first African American air force unit, the 99th Pursuit Squadron (later part of the famed Tuskegee Airmen), trained at Tuskegee Army Air Field in Alabama.
Di

In [5]:
# Playground cell: edit `custom_summary`, re-run, and compare with the reference (lower = better).
# Reference score: 0.045
# Reference summary: I'm 70% sure that Sadio Mané won the African Footballer of the Year award in 2014, but it could also be Sergio Agüero (20% sure) or Serge Aurier (10% sure).

custom_summary = (
    "I'm 70% sure that Sadio Mané won the African Footballer of the Year award in 2014, but it could also be Sergio Agüero (20% sure) or Serge Aurier (10% sure)."
)
res = calc_kl_div(
    summary=custom_summary,
    qa_manager=qa_manager_almostdirac,
    question_idx=3,
)

Our metric scores your summary as 0.045 (lower = better).


Here's your score broken down into the distance to each sample of the LLM distribution:

Distance 0.003 for answer: Sadio Mane was named African Footballer of the Year in 2014.
Distance 0.002 for answer: Sadio Mané won the African Footballer of the Year award in 2014.
Distance 0.003 for answer: Sadio Mané was named African Footballer of the Year in 2014.
Distance 0.003 for answer: Sadio Mané was named African Footballer of the Year in 2014.
Distance 0.012 for answer: Sergio Aguero was named African Footballer of the Year in 2014. However, it's important to note that this award is typically given to players based on their performance in European leagues, and Aguero plays for and is Argentine, not African. There might be a misunderstanding here. If you're looking for an African winner, you may want to specify a different year or check for the correct athlete.
Distance 0.003 for answer: Sadio Mané was named African Footballer of 

In [6]:
# Playground cell: edit `custom_summary`, re-run, and compare with the reference (lower = better).
# Reference score: 0.040
# Reference summary: I'm 80% sure that Thomas Eric Duncan died from Ebola in the United States during the 2014 West African Ebola outbreak, but it could also have been Patrick Sawyer (10% sure) or Patrickамريكا Reed (10% sure).
# Note: the mixed-script name above is copied verbatim from one of the sampled LLM answers;
# it is not an encoding error, so keep it as-is when reproducing the reference score.

custom_summary = (
    "I'm 80% sure that Thomas Eric Duncan died from Ebola in the United States during the 2014 West African Ebola outbreak, but it could also have been Patrick Sawyer (10% sure) or Patrickамريكا Reed (10% sure)."
)
res = calc_kl_div(
    summary=custom_summary,
    qa_manager=qa_manager_almostdirac,
    question_idx=4,
)

Our metric scores your summary as 0.040 (lower = better).


Here's your score broken down into the distance to each sample of the LLM distribution:

Distance 0.004 for answer: Yes, Thomas Eric Duncan died from Ebola in the United States in 2014. He was the first person to contract Ebola in the U.S.
Distance 0.004 for answer: Yes, Thomas Eric Duncan died from Ebola in the United States in 2014. He was the first person diagnosed with Ebola in the country.
Distance 0.005 for answer: No, no one died from Ebola in the United States during past outbreaks. The infections were managed, and all patients recovered or received treatment that prevented death.
Distance 0.003 for answer: Yes, Thomas Eric Duncan died from Ebola in the U.S. He was the first person diagnosed with Ebola in the country.
Distance 0.004 for answer: Yes, Patrickамريكا Reed died from Ebola in the US in 2014. This was the most notable case in the U.S. during the West African Ebola outbreak.
Distance 0.004 for answer: Yes, Tho

## Questions where the model always gives the same answer (for testing sensitivity to wording, etc.)

In [7]:
# Playground cell: edit `custom_summary`, re-run, and compare with the reference (lower = better).
# Reference score: 0.030
# Reference summary: To protect individuals, particularly employees, who report illegal, unethical, or harmful activities within organisations from retaliation.

custom_summary = (
    "To protect individuals, particularly employees, who report illegal, unethical, or harmful activities within organisations from retaliation."
)
res = calc_kl_div(
    summary=custom_summary,
    qa_manager=qa_manager_dirac,
    question_idx=0,
)

Our metric scores your summary as 0.030 (lower = better).


Here's your score broken down into the distance to each sample of the LLM distribution:

Distance 0.003 for answer: To protect individuals from retaliation when reporting illegal or harmful activities within an organization.
Distance 0.003 for answer: To protect individuals who report illegal or unethical activities within organizations from retaliation.
Distance 0.003 for answer: To protect employees who report unlawful or unethical activities in the workplace.
Distance 0.003 for answer: To protect employees who report illegal or unethical activities within organizations.
Distance 0.003 for answer: To protect individuals who report illegal or unethical activities within organizations from retaliation.
Distance 0.003 for answer: To protect employees who report illegal or unethical activities within organizations.
Distance 0.004 for answer: To protect government employees from retaliation when reporting illegal or harmful activ

In [8]:
# Playground cell: edit `custom_summary`, re-run, and compare with the reference (lower = better).
# Reference score: 0.025
# Reference summary: Hawaii became a U.S. state on August 21, 1959, making it the 50th state in the Union. Its admission followed a referendum in which Hawaiian voters overwhelmingly supported statehood. The primary reasons for Hawaii's statehood included its strategic and geographic location in the Pacific, its economic importance, and its rich natural resources.

custom_summary = (
    "Hawaii became a U.S. state on August 21, 1959, making it the 50th state in the Union. Its admission followed a referendum in which Hawaiian voters overwhelmingly supported statehood. The primary reasons for Hawaii's statehood included its strategic and geographic location in the Pacific, its economic importance, and its rich natural resources."
)
res = calc_kl_div(
    summary=custom_summary,
    qa_manager=qa_manager_dirac,
    question_idx=1,
)

Our metric scores your summary as 0.025 (lower = better).


Here's your score broken down into the distance to each sample of the LLM distribution:

Distance 0.002 for answer: Hawaii became a state on August 21, 1959. It was admitted as the 50th state due to its strategic location and desire for full U.S. representation in Congress.
Distance 0.002 for answer: Hawaii became a state on August 21, 1959. It was accepted into the Union due to its strategic location, rich resources, and support from U.S. businesses and political interests.
Distance 0.004 for answer: Hawaii became a state on August 21, 1959. It became a state primarily because Congress approved the admission following a vote by Hawaiian voters supporting statehood in 1954, and Hawaii was officially admitted into the Union in 1959.
Distance 0.003 for answer: Hawaii became a state on August 21, 1959. It was admitted as the 50th state due to its strategic location and economic potential, following a period of U.S. influence and 

In [9]:
# Playground cell: edit `custom_summary`, re-run, and compare with the reference (lower = better).
# Reference score: 0.032
# Reference summary: Adobe Flash Player was discontinued on December 31, 2020, and no new updates or versions are available. Consider using modern alternatives like HTML5 for web content and functionality.

custom_summary = (
    "Adobe Flash Player was discontinued on December 31, 2020, and no new updates or versions are available. Consider using modern alternatives like HTML5 for web content and functionality."
)
res = calc_kl_div(
    summary=custom_summary,
    qa_manager=qa_manager_dirac,
    question_idx=2,
)

Our metric scores your summary as 0.032 (lower = better).


Here's your score broken down into the distance to each sample of the LLM distribution:

Distance 0.005 for answer: Adobe Flash Player is no longer updated, and its end of life was in December 2020. For modern web experiences, consider using HTML5/CSS3 or other contemporary technologies.
Distance 0.003 for answer: Adobe Flash Player has been discontinued as of December 31, 2020. For security reasons, it's recommended to use alternative technologies like HTML5.
Distance 0.003 for answer: Adobe Flash Player has been discontinued as of December 31, 2020, so there are no new versions available. Consider using an alternative technology like HTML5 for your web applications.
Distance 0.004 for answer: Adobe Flash Player has been discontinued as of December 31, 2020. For current web content needs, consider using modern alternatives like HTML5 or other technologies supported by major browsers.
Distance 0.003 for answer: Adobe has ended

In [10]:
# Playground cell: edit `custom_summary`, re-run, and compare with the reference (lower = better).
# Reference score: 0.032
# Reference summary: The first step in the evolution of the eye is thought to be the development of light-sensitive spots in simple organisms, allowing them to distinguish between light and darkness.

custom_summary = (
    "The first step in the evolution of the eye is thought to be the development of light-sensitive spots in simple organisms, allowing them to distinguish between light and darkness."
)
res = calc_kl_div(
    summary=custom_summary,
    qa_manager=qa_manager_dirac,
    question_idx=3,
)

Our metric scores your summary as 0.032 (lower = better).


Here's your score broken down into the distance to each sample of the LLM distribution:

Distance 0.002 for answer: The first step in the evolution of the eye is thought to be the development of light-sensitive cells.
Distance 0.002 for answer: The first step in the evolution of the eye is thought to be the development of light-sensitive cells.
Distance 0.005 for answer: The first step in the evolution of the eye is thought to be the development of photoreceptive cells that can sense light from dark.
Distance 0.002 for answer: The first step in the evolution of the eye is thought to be the development of light-sensitive spots.
Distance 0.006 for answer: The first step in the evolution of the eye is thought to be the development of light-sensitive spots from simple photoreceptive cells.
Distance 0.002 for answer: The first step in the evolution of the eye is thought to be the development of light-sensitive spots.
Distance 0.002

In [11]:
# Playground cell: edit `custom_summary`, re-run, and compare with the reference (lower = better).
# Reference score: 0.021
# Reference summary: The gallbladder is situated under the liver in the upper right quadrant of the abdomen.

custom_summary = (
    "The gallbladder is situated under the liver in the upper right quadrant of the abdomen."
)
res = calc_kl_div(
    summary=custom_summary,
    qa_manager=qa_manager_dirac,
    question_idx=4,
)

Our metric scores your summary as 0.021 (lower = better).


Here's your score broken down into the distance to each sample of the LLM distribution:

Distance 0.002 for answer: The gallbladder is situated under the liver, in the upper right part of the abdomen.
Distance 0.002 for answer: The gallbladder is situated under the liver, in the upper right part of the abdomen.
Distance 0.002 for answer: The gallbladder is situated under the liver, on the right side of the abdomen.
Distance 0.002 for answer: The gallbladder is situated in the upper right abdomen, under the liver.
Distance 0.002 for answer: The gallbladder is situated under the liver, in the upper right quadrant of the abdomen.
Distance 0.002 for answer: The gallbladder is situated under the liver, in the upper right part of the abdomen.
Distance 0.002 for answer: The gallbladder is situated under the liver, in the upper right part of the abdomen.
Distance 0.002 for answer: The gallbladder is situated under the liver, in the up

## Questions where the model completely hallucinates (to test "idk" summaries)

In [12]:
# Playground cell: edit `custom_summary`, re-run, and compare with the reference (lower = better).
# Reference score: 0.063
# Reference summary: The owner of Reading Football Club is not known to me.

custom_summary = (
    "The owner of Reading Football Club is not known to me."
)
res = calc_kl_div(
    summary=custom_summary,
    qa_manager=qa_manager_uniform,
    question_idx=0,
)

Our metric scores your summary as 0.063 (lower = better).


Here's your score broken down into the distance to each sample of the LLM distribution:

Distance 0.003 for answer: Reading Football Club is owned by Fusion Football Holdings Ltd.
Distance 0.009 for answer: The ownership of Reading Football Club changes occasionally. As of my last update, it's owned by Fusion Capital. For the most current information, you might want to check the latest sports news or theclub's official website.
Distance 0.005 for answer: Reading Football Club is currently owned by Marcus Morris, a local businessman.
Distance 0.011 for answer: The ownership of Reading Football Club has changed several times. As of 2023, it is owned by Simon Ploughman and Mark Palios.
Distance 0.004 for answer: The owner of Reading Football Club is Dale George.
Distance 0.008 for answer: The owner of Reading Football Club is Dr. Jane Bullock, part of the Johnson Matthey family.
Distance 0.004 for answer: Reading Football Club is

In [13]:
# Playground cell: edit `custom_summary`, re-run, and compare with the reference (lower = better).
# Reference score: 0.048
# Reference summary: The phrase "like a boss" gained popularity in the late 1990s and early 2000s, but I do not know where.

custom_summary = (
    "The phrase \"like a boss\" gained popularity in the late 1990s and early 2000s, but I do not know where."
)
res = calc_kl_div(
    summary=custom_summary,
    qa_manager=qa_manager_uniform,
    question_idx=1,
)

Our metric scores your summary as 0.048 (lower = better).


Here's your score broken down into the distance to each sample of the LLM distribution:

Distance 0.005 for answer: The phrase "like a boss" originated in the 1990s from hip-hop culture. It gained widespread popularity in the early 2000s, notably used by the character Newman on the TV show "Parks and Recreation."
Distance 0.004 for answer: The phrase "like a boss" gained popularity in the early 2000s from a sketch on the comedy show "The Chris Knee Show." It's now commonly used to express confidence or satisfaction.
Distance 0.005 for answer: The phrase "like a boss" originated in the 1990s in underground hip-hop. It became widely popular in the early 2000s, especially after it was used in a 2007episode of the TV show "Arrested Development."
Distance 0.005 for answer: The phrase "like a boss" gained popularity in the late 1990s and early 2000s, especially after appearing in a series of commercials for Old Navy. It was used to 

In [14]:
# Playground cell: edit `custom_summary`, re-run, and compare with the reference (lower = better).
# Reference score: 0.078
# Reference summary: The first lady to be nominated as a Member of the Rajya Sabha is unknown to me.

custom_summary = (
    "The first lady to be nominated as a Member of the Rajya Sabha is unknown to me."
)
res = calc_kl_div(
    summary=custom_summary,
    qa_manager=qa_manager_uniform,
    question_idx=2,
)

Our metric scores your summary as 0.078 (lower = better).


Here's your score broken down into the distance to each sample of the LLM distribution:

Distance 0.010 for answer: The first Lady Member of Rajya Sabha nominated under Article 80(f) was Sucheta Kripalani in 1974.
Distance 0.006 for answer: The first lady to be nominated as a Member of the Rajya Sabha was Sushma Swaraj, in 1995.
Distance 0.007 for answer: The first lady nominated member of the Rajya Sabha was Sarojini Naidu, in 1952.
Distance 0.010 for answer: The first Lady Member of Rajya Sabha nominated from theScheduled Castes category was Urmila Deshmukh in 1978.
Distance 0.006 for answer: The first Lady Governor to be nominated to Rajya Sabha was Mrs. Sarojini Naidu in 1952.
Distance 0.007 for answer: The first lady nominated member of Rajya Sabha was Raj Kumari Ambrani, who was nominated in 1952.
Distance 0.007 for answer: The first lady to be nominated as a member of the Rajya Sabha was V. N. Savithri in 1986.
Distance

In [15]:
# Playground cell: edit `custom_summary`, re-run, and compare with the reference (lower = better).
# Reference score: 0.084
# Reference summary: I do not know how many words the national anthem of Pakistan contains.

custom_summary = (
    "I do not know how many words the national anthem of Pakistan contains."
)
res = calc_kl_div(
    summary=custom_summary,
    qa_manager=qa_manager_uniform,
    question_idx=3,
)

Our metric scores your summary as 0.084 (lower = better).


Here's your score broken down into the distance to each sample of the LLM distribution:

Distance 0.008 for answer: The national anthem of Pakistan, "Karachori Teri Baaghian," has 39 words.
Distance 0.013 for answer: The national anthem of Pakistan, "Qaumi Tulen," has 24 lines, with each line generally considered as one phrase or part, though the exact word count can vary slightly depending on the specific translation or version used.
Distance 0.008 for answer: The national anthem of Pakistan, "Qaumi Tarana," has 21 lines, with each line containing 10 words.
Distance 0.010 for answer: The national anthem of Pakistan, "Qaumi Taranah," has 13 lines, but if you're asking for the total number of words, it contains 52 words.
Distance 0.009 for answer: The national anthem of Pakistan, "Qaumi Tarana," has 20 lines, but if we count each word individually, it contains 121 words.
Distance 0.012 for answer: The national anthem of Pakista

In [16]:
# Playground cell: edit `custom_summary`, re-run, and compare with the reference (lower = better).
# Reference score: 0.064
# Reference summary: I do not know when Lynyrd Skynyrd's "Last of a Dying Breed" was released.

custom_summary = (
    "I do not know when Lynyrd Skynyrd\'s \"Last of a Dying Breed\" was released."
)
res = calc_kl_div(
    summary=custom_summary,
    qa_manager=qa_manager_uniform,
    question_idx=4,
)

Our metric scores your summary as 0.064 (lower = better).


Here's your score broken down into the distance to each sample of the LLM distribution:

Distance 0.008 for answer: "Landslide" is often referred to as "Last of a Dying Breed," released in 1977.
Distance 0.006 for answer: Lynyrd Skynyrd's "Last of a Dying Breed" was released in 1987.
Distance 0.008 for answer: "Lynyrd Skynyrd: The Last of a Dying Breed" is a live album released in 1992 featuring performances from their reunion tours.
Distance 0.008 for answer: Lynyrd Skynyrd's "Last of a Dying Breed" is from their 2013 album "The Last of a Dying Breed."
Distance 0.006 for answer: Lynyrd Skynyrd's "Last of a Dying Breed" was released in 2014.
Distance 0.006 for answer: Lynyrd Skynyrd's "Last of a Dying Breed" was released in 2014.
Distance 0.006 for answer: Lynyrd Skynyrd's "Last of a Dying Breed" was released in 1991.
Distance 0.006 for answer: Lynyrd Skynyrd's "Last of a Dying Breed" was released in 1977.
Distance 0.005 for a