In [1]:
# Loading a bunch of different datasets, filtering for obscenity, ensuring minimal content length, and producing one or more potential outputs.
import json
from enum import Enum
from pydantic import BaseModel, ValidationError, Field
from typing import Optional, Union

# Test Cases:

- Named Entity Recognition (classical): Given a sentence, pull out names, locations, organizations, and 'misc'
- Named 'Thing' Recognition: Like NER, but without a hard constraint on the type.
- Unit Extraction: Pull out measurements like '10cm', '1 year', 'one olive'. Only explicitly mentioned objects with quantities, so 'a moose' could be {quantity: 1, unit: 'moose'}.
- Event Extraction: Pull out dates, names, and locations (lat/long or city/region).
- Free-form generation: given no dataset, but given a description of a task, generate output.

# Datasets:

- Wikipedia.
- Twitter.
- Reddit comments.
- Emails.
- News Articles.

# Models:

- OpenAI GPT-4o
- Anthropic Clive
- NuMind NuExtract
- Microsoft Phi
- FAIR Llama

# Generation Modes:

- Function Calling
- Plain Prompting
- Constrained Decoding

# Evaluation / Failure Cases:

- Bad JSON. (JSON itself is malformed and cannot be parsed.)
- Good JSON structure, bad JSON content.  (Proper structure, but required fields/keys are missing or renamed.  Mismatch of the schema, for instance, but with valid JSON.)  (Do we want a separate field for data being good?)
- Good JSON structure, good JSON content, bad data.  (Proper structure with all fields present, but data hallucinated.)
- Good JSON structure, good JSON content, invalid data. (Proper data, but constraints like range are invalidated? Separate from bad data?)
- Good JSON structure, good JSON content, extra data.  (There is at least one correct JSON row with all the expected fields and content, but there are additional hallucinated rows.)

In [52]:
# Named Entity Extraction

# Single Document -> Single JSON Object:
class NERMultiExtraction(BaseModel):
    names: list[str]
    locations: list[str]
    organizations: list[str]
    misc: list[str]

# Single Document -> Many JSON Entries with ENUM constraint:
class _NERObject(str, Enum):
    name = 'name'
    location = 'location'
    organization = 'organization'
    misc = 'misc'

class _NERExtraction(BaseModel):
    text: str
    extraction_type: _NERObject

class NERNestedExtraction(BaseModel):
    named_entities: list[_NERExtraction]

In [3]:
# "Thing" extraction. 

# Similar to NER, but without a hard restriction on a type.
class ThingExtraction(BaseModel):
    text: str
    thingtype: str

class NestedThingExtraction(BaseModel):
    things: list[ThingExtraction]

In [4]:
# Unit Extraction
class UnitExtraction(BaseModel):
    quantity: float
    unit: str

class NestedUnitExtraction(BaseModel):
    items: list[UnitExtraction]

In [5]:
# Event Extraction
class GeographicLocation(BaseModel):
    latitude: float
    longitude: float

class NamedLocation(BaseModel):
    fine: str = Field(description="The finest/narrowest defined region, for example: Kyiv, San Francisco, Wabash and Lake, Navy Pier.")
    coarse: str = Field(description="A coarse disambiguating region, for example: California, the Midwest, France.")

class EventExtraction(BaseModel):
    year: int
    month: int = Field(le=12)  # Zero indexed?  One indexed?
    day: int = Field(le=31)
    hour: int = Field(le=23)
    minute: int = Field(le=60)
    name: str
    location: Optional[Union[GeographicLocation, NamedLocation]]

class NestedEventExtraction(BaseModel):
    events: list[EventExtraction]

In [89]:
from enum import Enum

class Task(str, Enum):
    NER_FLAT = "ner_flat"
    NER_NESTED = "ner_nested"
    THING_EXTRACTION = "thing_extraction"
    UNIT_EXTRACTION = "unit_extraction"
    EVENT_EXTRACTION = "event_extraction"

SCHEMAS = {
    Task.NER_FLAT: json.dumps(NERMultiExtraction.model_json_schema()),
    Task.NER_NESTED: json.dumps(NERNestedExtraction.model_json_schema()),
    Task.THING_EXTRACTION: json.dumps(NestedThingExtraction.model_json_schema()),
    Task.UNIT_EXTRACTION: json.dumps(NestedUnitExtraction.model_json_schema()),
    Task.EVENT_EXTRACTION: json.dumps(NestedEventExtraction.model_json_schema()),
}

# NuExtract largely fails with the JSON Schema standard. We need to custom-define the schema for the examples.
NUEXTRACT_SCHEMAS = {
    Task.NER_FLAT: 
"""{
    "names": [],
    "organizations": [],
    "locations": [],
    "misc": []
}""",
    Task.NER_NESTED: 
"""{
    "named_entities": [
        {
            "text": "",
            "extraction_type": "name|location|organization|misc"
        }
    ]
}""",
    Task.THING_EXTRACTION:
"""{
    "things": [
        {
            "text": "",
            "thingtype": ""
        }
    ]
}""",
    Task.UNIT_EXTRACTION:
"""{
    "items": [
        {
            "quantity": "",
            "unit": ""
        }
    ]
}""",
    Task.EVENT_EXTRACTION:
"""{
    "events": [
        {
            "year": "",
            "month": "",
            "day": "",
            "hour": "",
            "minute": "",
            "name": "",
            "location": {
                "fine": "",
                "coarse": "",
                "latitude": "",
                "longitude": ""
            }
        }
    ]
}""" 
}

EXAMPLES = {
    Task.NER_FLAT: [
"""{
    "names": ["Bob Loblaw", "Orson Wells", "Aaron Spacemuseum"],
    "organizations": ["Bob Loblaw's Law Blog", "Air and Space Museum"],
    "locations": ["Washington D.C."],
    "misc": []
}""",
"""{
    "names": [],
    "organizations": ["NASA", "The National Aeronotics and Space Administration"],
    "locations": [],
    "misc": ["Straight Outta Compton"]
}""",
"""{
    "names": [],
    "organizations": [],
    "locations": ["San Francisco", "Oakland, CA"],
    "misc": []
}"""
    ],
    Task.NER_NESTED: [
"""{
    "named_entities": [
        {
            "text": "Aaron Spacemuseum",
            "extraction_type": "name"
        },
        {
            "text": "NASA",
            "extraction_type": "organization"
        },
        {
            "text": "San Francisco",
            "extraction_type": "location"
        }
    ]
}""",
"""{
    "named_entities": [
        {
            "text": "Norm Alman",
            "extraction_type": "name"
        },
        {
            "text": "Dark Side of the Moon",
            "extraction_type": "misc"
        }
    ]
}""",
"""{
    "named_entities": [
        {
            "text": "Moe Thegrass",
            "extraction_type": "name"
        },
        {
            "text": "Huge Yakman",
            "extraction_type": "name"
        }
    ]
}""",],
    Task.THING_EXTRACTION: [
"""{
    "things": [
        {
            "text": "Pink Floyd",
            "thingtype": "band"
        },
        {
            "text": "San Francisco Bay Area",
            "thingtype": "location"
        },
        {
            "text": "google.com",
            "thingtype": "website"
        }
    ]
}""",
"""{
    "things": [
        {
            "text": "Riverside",
            "thingtype": "location"
        },
        {
            "text": "Bob Loblaw's Law Blog",
            "thingtype": "website"
        }
    ]
}""",
"""{
    "things": [
        {
            "text": "Earth",
            "thingtype": "planet"
        },
        {
            "text": "The Odyssey",
            "thingtype": "book"
        }
    ]
}"""],
    Task.UNIT_EXTRACTION:[
"""{
    "items": [
        {
            "quantity": "10",
            "unit": "oz"
        },
        {
            "quantity": "320",
            "unit": "ml"
        }
    ]
}""",
"""{
    "items": [
        {
            "quantity": "2.61",
            "unit": "lightyears"
        }
    ]
}""",
    """{
    "items": [
        {
            "quantity": "3.0x10^8",
            "unit": "m/s"
        },
        {
            "quantity": "9.81",
            "unit": "m/s^2"
        }
    ]
}""",],
    Task.EVENT_EXTRACTION: [
"""{
    "events": [
        {
            "year": 2024,
            "month": 7,
            "day": 13,
            "hour": null,
            "minute": null,
            "name": "Fusion Breakthrough",
            "location": {
                "fine": "Lawrence Livermore Labs",
                "coarse": "California"
            }
        }
    ]
}""",
"""{
    "events": [
        {
            "year": 2023,
            "month": 12,
            "day": 28,
            "hour": 12,
            "minute": 0,
            "name": "Largest snowfall ever recorded",
            "location": {
                "fine": "Minneapolis",
                "coarse": "Minnesota"
            }
        },
        {
            "year": 2023,
            "month": 12,
            "day": 29,
            "hour": 18,
            "minute": 0,
            "name": "Snow plow deployment paused",
            "location": {
                "latitude": 44.986656,
                "longitude": -93.258133
            }
        }
    ]
}""",
"""{
    "events": [
        {
            "year": 1999,
            "month": 1,
            "day": 1,
            "hour": 0,
            "minute": 1,
            "name": "New Year's Day Celebration",
            "location": {
                "fine": "Times Square",
                "coarse": "United States"
            }
        }
    ]
}""",
    ]}


In [91]:
print("Test NUEXTRACT SCHEMAS")
for t in Task:
    json.loads(NUEXTRACT_SCHEMAS[t])

print("Test SCHEMAS")
for t in Task:
    json.loads(NUEXTRACT_SCHEMAS[t])

print("Test EXAMPLES")
for t in Task:
    for ex in EXAMPLES[t]:
        json.loads(ex)

print("All okay!")

Test NUEXTRACT SCHEMAS
Test SCHEMAS
Test EXAMPLES
All okay!


# Data Loading:

In [158]:
import csv
import gzip
import json
import os

MAX_SAMPLES_PER_SOURCE = 10

SAVE_NAME = "saved_test_cases.json.gz"

# If we already did the data processing, load it.
if os.path.isfile(SAVE_NAME):
    print("Loading data")
    with gzip.open(SAVE_NAME, 'rt') as fin:
        data = json.load(fin)
        news_articles = data['news']
        print(f"{len(news_articles)} news articles")
        wikipedia = data['wikipedia']
        print(f"{len(wikipedia)} wikipedia pages")

Loading data
10 news articles
10 wikipedia pages


In [69]:
# Wikipedia, extracting N articles at random.
# Wikipedia was extracted from the RedPanda dataset, then sampled down to 0.1% of available articles for a total of 6552.
# It was then further sampled as follows:
with open("/Users/josephcatrambone/Datasets/wikipedia_en_sampled.json", 'r') as fin:
    wikipedia = json.load(fin)
wikipedia = [article for article in wikipedia if "References" in article and len(article) > 250]
# Cut off the 'References' and everything at the end because it tends to be noise.
trimmed_wikipedia = list()
for article in wikipedia:
    trimmed_wikipedia.append(article[:article.index("References")])
wikipedia = trimmed_wikipedia

In [121]:
print(wikipedia[0])

Aberdeen (Lakota: Ablíla) is a city in and the county seat of Brown County, South Dakota, United States, located approximately  northeast of Pierre. The city population was 28,495 at the 2020 census, making it the third most populous city in the state after Sioux Falls and Rapid City. Aberdeen is the principal city of the Aberdeen Micropolitan Statistical Area, which includes all of Brown and Edmunds counties and has a population of 42,287 in 2020. Aberdeen is considered a college town, being the home of both Northern State University and Presentation College.

History

Settlement
Before Aberdeen or Brown County was inhabited by European settlers, it was inhabited by the Sioux Indians from approximately 1700 to 1879. Europeans entered the region for business, founding fur trading posts during the 1820s; these trading posts operated until the mid-1830s. The first "settlers" of this region were the Arikara Indians, but they would later be joined by others.

The first group of Euro-Americ

In [108]:
# Twitter, extract N tweets at random (omitting obscene entries).
tweets = list()
with open("/Users/josephcatrambone/Datasets/tweets_sampled.csv", 'rt') as fin:
    cin = csv.reader(fin)
    header = next(cin)
    text_column = header.index("Tweet Content")
    rt_column = header.index("Tweet Type")
    for row in cin:
        if row[rt_column].lower() == "retweet":
            continue
        text = row[text_column]
        tweets.append(text)

In [None]:
# Reddit, extract N comments.

In [None]:
# EMail, extract N messages at random.

In [133]:
# News Articles, extracting N articles.
#%pip install unidecode
import csv
import random
import zipfile
from io import StringIO

from unidecode import unidecode

csv.field_size_limit(1000000)

csv_buffer = StringIO()
with zipfile.ZipFile("/Users/josephcatrambone/Datasets/guardian_articles.zip", 'r') as fin:
    #print(fin.filelist)
    article_bytes = fin.read(name=fin.filelist[0].filename)
    csv_buffer.write(article_bytes.decode("utf-8"))
    csv_buffer.seek(0)
cin = csv.reader(csv_buffer, )
header = next(cin)
body_index = header.index("bodyContent")
print(f"Article content at index {body_index}")

news_articles = list()
for line in cin:
    article = unidecode(line[body_index])
    if len(article.strip()) > 100:
        news_articles.append(article)

random.shuffle(news_articles)
news_articles = news_articles[:MAX_SAMPLES_PER_SOURCE]
del csv_buffer

Article content at index 4


In [18]:
news_articles[2]

'The Mintel survey comes as no surprise to anyone who has been to fashion week recently. High heels - which were, until a few years ago, a non-negotiable element of the looks both on and off the catwalk - no longer have the monopoly on status footwear. In 2016, the right trainer is more alpha than Manolo. The defining moment in the rise of the trainer as fashionable footwear came in January 2014, when Karl Lagerfeld\'s dressed every model on his Chanel haute couture catwalk in a pair of Chanel trainers, to go with their PS100,000 ballgowns. Trainers had been in the ascendency among fashion\'s more minimalist dressers since 2010, when Celine designer Phoebe Philo\'s habit of wearing Stan Smiths to take her catwalk bow sparked a slew of front-row copycats, but Chanel\'s endorsement broke down the last remaining barriers. Once trainers were deemed chic enough for Chanel haute couture, there was no stopping them.\nAthleisure has all but cannibalised fashion in the past 18 months. Tracksuit

In [162]:
## Save All Datasets:
data = {
    "wikipedia": wikipedia,
    "news": news_articles,
}

import gzip
with gzip.open(SAVE_NAME, 'wt') as fout:
    json.dump(data, fout)

In [63]:
#%pip install pandas
import pandas as pd

# There's a combinatorial explosion of factors which we can add to 'data' below, but this is, sadly, a job for a data frame.
"""
for gr_state in ("without_guardrails", "with_guardrails"):
    data[gr_state] = dict()
    for task in ("ner_flat_task", "ner_nested_task", "thing_extraction", "unit_extraction", "event_extraction"):
        data[gr_state][task] = dict()
        for model in ("openai", "anthropic", "phi", "numind", "gpt2"):
            data[gr_state][task][model] = dict()
            for ds in ("wikipedia", "news"):
                data[gr_state][task][model][ds] = list()
"""

def make_df():
    return pd.DataFrame(columns=["doc_idx", "task", "model", "dataset", "guardrails", "num_shots", "time", "raw_model_output", "exception"])
raw_results = make_df()
raw_results


Unnamed: 0,doc_idx,task,model,dataset,guardrails,num_shots,time,raw_model_output,exception


In [64]:
# Make sure we can save results.
def append(df, new_data: dict):
    columns = list(df.columns)
    new_row = pd.Series([new_data[k] for k in columns], index=df.columns)
    df = pd.concat([df, new_row.to_frame().T], ignore_index=True)
    return df

def append_run(df, doc_idx, task, model, dataset, guardrails, num_shots, time, raw_model_output, exc):
    return append(df, {"doc_idx": doc_idx, "task": task, "model": model, "dataset": dataset, "guardrails": guardrails, "num_shots": num_shots, "time": time, "raw_model_output": raw_model_output, "exception": exc})

raw_results = append_run(raw_results, 1, "ner_flat_task", "nuextract", "news", "None", 0, 1.0, '{"fake json output": true}', "")
#raw_results.at[0, "ner_flat_task", "nuextract", "wikipedia", "None"] = '{"fake json output": true}'

raw_results.to_pickle('test.pkl')
pd.read_pickle('test.pkl')

raw_results.to_json('test.json')
pd.read_json('test.json')

Unnamed: 0,doc_idx,task,model,dataset,guardrails,num_shots,time,raw_model_output,exception
0,1,ner_flat_task,nuextract,news,,0,1,"{""fake json output"": true}",


# Model Loading/Runs:

In [67]:
# Set up the inference for the numind nuextract case:
#%pip install torch
#%pip install transformers

# Taken directly from the NuMind NuExtract documentation
# https://huggingface.co/numind/NuExtract-tiny

from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda"

def predict_NuExtract(model, tokenizer, text, schema, examples=["","",""], device="cuda"):
    schema = json.dumps(json.loads(schema), indent=4)
    input_llm =  "<|input|>\n### Template:\n" +  schema + "\n"
    for i in examples:
      if i != "":
          input_llm += "### Example:\n"+ json.dumps(json.loads(i), indent=4)+"\n"
    
    input_llm +=  "### Text:\n"+text +"\n<|output|>\n"
    input_ids = tokenizer(input_llm, return_tensors="pt", truncation=True, max_length=4000).to(device)

    output = tokenizer.decode(model.generate(**input_ids)[0], skip_special_tokens=True)
    return output.split("<|output|>")[1].split("<|end-output|>")[0]


nuextract_model = AutoModelForCausalLM.from_pretrained("numind/NuExtract-tiny", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("numind/NuExtract-tiny", trust_remote_code=True)

nuextract_model.to("cuda")
nuextract_model.eval()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
    )
    (norm): Qwen2RMSNorm()
  )
  (lm_head): Line

In [174]:
from enum import Enum
#%pip install openai
#%pip install anthropic
from openai import OpenAI
import anthropic
import time

anthropic_client = anthropic.Anthropic() # Default: api_key=os.environ.get("ANTHROPIC_API_KEY"),
openai_client = OpenAI() # Defaults to os.environ.get("OPENAI_API_KEY")

#results = make_df()

#prompt = "Please extract JSON from the following document. Return only a JSON object with the given schema.\nDOCUMENT:\n{doc}\nSCHEMA:\n{schema}"
checkpoint_count = 0
for task in Task:
    for model_provider, model_name in (("numind", "nuextract"), ("openai", "gpt-3.5-turbo"), ("openai", "gpt-4-turbo"), ("anthropic", "claude-3-opus-20240229")):
        for datasource in ("wikipedia", "news"):
            for num_examples in (0, 1, 3):
                for doc_idx, doc_text in enumerate(data[datasource]):
                    print(f"Run: {model_name} - {datasource} - {num_examples} - {doc_idx} ", end="")
                    # Check to see if we already ran this data.  If we did, we can skip it.
                    maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
                    row_data = maybe_row.to_dict(orient='records')
                    existing_row_idx = None  
                    if row_data:
                        existing_row_idx = maybe_row.index[0]
                        #unfilled_results.loc[existing_row_idx, "exception"] = ""
                        if row_data[0]["exception"] == "" and row_data[0]["raw_model_output"] != "":
                            print("- SKIPPED!")
                            continue
                        else:
                            print("- RERUNNING")
                    else:
                        print("...")
                    # We need to run this.
                    try:
                        schema = SCHEMAS[task]
                        if model_provider == "numind":
                            schema = NUEXTRACT_SCHEMAS[task]
                        examples = EXAMPLES[task]
                        maybe_examples = ""
                        if num_examples > 0:
                            maybe_examples = f"The following are examples of the schema:\n"
                            for x in range(num_examples):
                                maybe_examples += examples[x] + "\n\n"
                        prediction = ""
                        ex = ""
                        start_time = time.time()
                        if model_provider == "openai":
                            prediction = openai_client.chat.completions.create(
                              model=model_name,
                              messages=[
                                {
                                  "role": "system",
                                  "content": f"You will be provided with unstructured data in the form of a document. Your task is to create a JSON object that adheres to the following schema:\n{schema}\n{maybe_examples}\nReturn only the result JSON. If data for a given field is not present, provide an empty array ([]) for arrays, an empty string for strings, and null for missing values."
                                },
                                {
                                  "role": "user",
                                  "content": doc_text
                                }
                              ],
                              temperature=0.1,
                              max_tokens=1024,
                              top_p=1
                            ).choices[0].message.content
                        elif model_provider == "anthropic":
                            prediction = anthropic_client.messages.create(
                                model=model_name,
                                messages=[
                                    {
                                        "role": "user",
                                        "content": f"You will be provided with unstructured data in the form of a document. Your task is to create a JSON object that adheres to the following schema:\n{schema}\n{maybe_examples}\nReturn only the result JSON. If data for a given field is not present, provide an empty array ([]) for arrays, an empty string for strings, and null for missing values.\nInput Document:\n" + doc_text,
                                    }
                                ],
                                max_tokens=1024,
                                temperature=0.1,
                                top_p=1
                            ).content[0].text
                            time.sleep(1)  # The best rate-limiting.
                        elif model_provider == "numind":
                            prediction = predict_NuExtract(nuextract_model, tokenizer, doc_text, schema, examples[:num_examples])
                        else:
                            print(f"EXCEPTION: Typo in model_provider: {model_provider}")
                    except Exception as e:
                        ex = str(e)
                    end_time = time.time()
                    if existing_row_idx is None:
                        results = append_run(results, doc_idx, task, model_name, datasource, "None", num_examples, end_time - start_time, prediction, ex)
                    else:
                        results.loc[existing_row_idx, "doc_idx"] = doc_idx
                        results.loc[existing_row_idx, "task"] = task
                        results.loc[existing_row_idx, "num_shots"] = num_examples
                        results.loc[existing_row_idx, "model"] = model_name
                        results.loc[existing_row_idx, "dataset"] = datasource
                        results.loc[existing_row_idx, "time"] = end_time-start_time
                        results.loc[existing_row_idx, "raw_model_output"] = prediction
                        results.loc[existing_row_idx, "exception"] = ex
                        
                results.to_json(f'checkpoint_{checkpoint_count}.json')
                checkpoint_count += 1

Run: nuextract - wikipedia - 0 - 0 - SKIPPED!
Run: nuextract - wikipedia - 0 - 1 - SKIPPED!
Run: nuextract - wikipedia - 0 - 2 - SKIPPED!
Run: nuextract - wikipedia - 0 - 3 - SKIPPED!
Run: nuextract - wikipedia - 0 - 4 - SKIPPED!
Run: nuextract - wikipedia - 0 - 5 - SKIPPED!
Run: nuextract - wikipedia - 0 - 6 - SKIPPED!
Run: nuextract - wikipedia - 0 - 7 - SKIPPED!
Run: nuextract - wikipedia - 0 - 8 - SKIPPED!
Run: nuextract - wikipedia - 0 - 9 - SKIPPED!
Run: nuextract - wikipedia - 1 - 0 - SKIPPED!
Run: nuextract - wikipedia - 1 - 1 - SKIPPED!
Run: nuextract - wikipedia - 1 - 2 - SKIPPED!
Run: nuextract - wikipedia - 1 - 3 - SKIPPED!
Run: nuextract - wikipedia - 1 - 4 - SKIPPED!
Run: nuextract - wikipedia - 1 - 5 - SKIPPED!
Run: nuextract - wikipedia - 1 - 6 - SKIPPED!
Run: nuextract - wikipedia - 1 - 7 - SKIPPED!
Run: nuextract - wikipedia - 1 - 8 - SKIPPED!
Run: nuextract - wikipedia - 1 - 9 - SKIPPED!
Run: nuextract - wikipedia - 3 - 0 - SKIPPED!
Run: nuextract - wikipedia - 3 - 1

  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource

Run: gpt-3.5-turbo - wikipedia - 3 - 0 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 3 - 1 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 3 - 2 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 3 - 3 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 3 - 4 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 3 - 5 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 3 - 6 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 3 - 7 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 3 - 8 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 3 - 9 - SKIPPED!
Run: gpt-3.5-turbo - news - 0 - 0 - SKIPPED!
Run: gpt-3.5-turbo - news - 0 - 1 - SKIPPED!
Run: gpt-3.5-turbo - news - 0 - 2 - SKIPPED!
Run: gpt-3.5-turbo - news - 0 - 3 - SKIPPED!
Run: gpt-3.5-turbo - news - 0 - 4 - SKIPPED!
Run: gpt-3.5-turbo - news - 0 - 5 - SKIPPED!
Run: gpt-3.5-turbo - news - 0 - 6 - SKIPPED!
Run: gpt-3.5-turbo - news - 0 - 7 - SKIPPED!
Run: gpt-3.5-turbo - news - 0 - 8 - SKIPPED!
Run: gpt-3.5-turbo - news - 0 - 9 - SKIPPED!
Run: gpt-3.5-turbo - news - 1 - 0 - SKIPPED!
Run: 

  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource

Run: gpt-4-turbo - news - 0 - 0 - SKIPPED!
Run: gpt-4-turbo - news - 0 - 1 - SKIPPED!
Run: gpt-4-turbo - news - 0 - 2 - SKIPPED!
Run: gpt-4-turbo - news - 0 - 3 - SKIPPED!
Run: gpt-4-turbo - news - 0 - 4 - SKIPPED!
Run: gpt-4-turbo - news - 0 - 5 - SKIPPED!
Run: gpt-4-turbo - news - 0 - 6 - SKIPPED!
Run: gpt-4-turbo - news - 0 - 7 - SKIPPED!
Run: gpt-4-turbo - news - 0 - 8 - SKIPPED!
Run: gpt-4-turbo - news - 0 - 9 - SKIPPED!
Run: gpt-4-turbo - news - 1 - 0 - SKIPPED!
Run: gpt-4-turbo - news - 1 - 1 - SKIPPED!
Run: gpt-4-turbo - news - 1 - 2 - SKIPPED!
Run: gpt-4-turbo - news - 1 - 3 - SKIPPED!
Run: gpt-4-turbo - news - 1 - 4 - SKIPPED!
Run: gpt-4-turbo - news - 1 - 5 - SKIPPED!
Run: gpt-4-turbo - news - 1 - 6 - SKIPPED!
Run: gpt-4-turbo - news - 1 - 7 - SKIPPED!
Run: gpt-4-turbo - news - 1 - 8 - SKIPPED!
Run: gpt-4-turbo - news - 1 - 9 - SKIPPED!
Run: gpt-4-turbo - news - 3 - 0 - SKIPPED!
Run: gpt-4-turbo - news - 3 - 1 - SKIPPED!
Run: gpt-4-turbo - news - 3 - 2 - SKIPPED!
Run: gpt-4-

  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource

- SKIPPED!
Run: claude-3-opus-20240229 - news - 1 - 7 - SKIPPED!
Run: claude-3-opus-20240229 - news - 1 - 8 - SKIPPED!
Run: claude-3-opus-20240229 - news - 1 - 9 - SKIPPED!
Run: claude-3-opus-20240229 - news - 3 - 0 - SKIPPED!
Run: claude-3-opus-20240229 - news - 3 - 1 - SKIPPED!
Run: claude-3-opus-20240229 - news - 3 - 2 - SKIPPED!
Run: claude-3-opus-20240229 - news - 3 - 3 - SKIPPED!
Run: claude-3-opus-20240229 - news - 3 - 4 - SKIPPED!
Run: claude-3-opus-20240229 - news - 3 - 5 - SKIPPED!
Run: claude-3-opus-20240229 - news - 3 - 6 - SKIPPED!
Run: claude-3-opus-20240229 - news - 3 - 7 - SKIPPED!
Run: claude-3-opus-20240229 - news - 3 - 8 - SKIPPED!
Run: claude-3-opus-20240229 - news - 3 - 9 - SKIPPED!
Run: nuextract - wikipedia - 0 - 0 - SKIPPED!
Run: nuextract - wikipedia - 0 - 1 - SKIPPED!
Run: nuextract - wikipedia - 0 - 2 - SKIPPED!
Run: nuextract - wikipedia - 0 - 3 - SKIPPED!
Run: nuextract - wikipedia - 0 - 4 - SKIPPED!
Run: nuextract - wikipedia - 0 - 5 - SKIPPED!
Run: nuextr

  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource

- SKIPPED!
Run: nuextract - news - 3 - 8 - SKIPPED!
Run: nuextract - news - 3 - 9 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 0 - 0 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 0 - 1 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 0 - 2 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 0 - 3 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 0 - 4 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 0 - 5 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 0 - 6 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 0 - 7 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 0 - 8 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 0 - 9 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 1 - 0 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 1 - 1 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 1 - 2 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 1 - 3 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 1 - 4 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 1 - 5 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 1 - 6 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 1 - 7 - SKIPPED!
Run: gp

  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource

Run: gpt-4-turbo - wikipedia - 1 - 0 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 1 - 1 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 1 - 2 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 1 - 3 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 1 - 4 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 1 - 5 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 1 - 6 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 1 - 7 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 1 - 8 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 1 - 9 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 3 - 0 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 3 - 1 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 3 - 2 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 3 - 3 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 3 - 4 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 3 - 5 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 3 - 6 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 3 - 7 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 3 - 8 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 3 - 9 - SKIPPED!
Run: gpt-4-turbo - news - 0 - 0 - SKIPPE

  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource

Run: claude-3-opus-20240229 - news - 0 - 0 - SKIPPED!
Run: claude-3-opus-20240229 - news - 0 - 1 - SKIPPED!
Run: claude-3-opus-20240229 - news - 0 - 2 - SKIPPED!
Run: claude-3-opus-20240229 - news - 0 - 3 - SKIPPED!
Run: claude-3-opus-20240229 - news - 0 - 4 - SKIPPED!
Run: claude-3-opus-20240229 - news - 0 - 5 - SKIPPED!
Run: claude-3-opus-20240229 - news - 0 - 6 - SKIPPED!
Run: claude-3-opus-20240229 - news - 0 - 7 - SKIPPED!
Run: claude-3-opus-20240229 - news - 0 - 8 - SKIPPED!
Run: claude-3-opus-20240229 - news - 0 - 9 - SKIPPED!
Run: claude-3-opus-20240229 - news - 1 - 0 - SKIPPED!
Run: claude-3-opus-20240229 - news - 1 - 1 - SKIPPED!
Run: claude-3-opus-20240229 - news - 1 - 2 - SKIPPED!
Run: claude-3-opus-20240229 - news - 1 - 3 - SKIPPED!
Run: claude-3-opus-20240229 - news - 1 - 4 - SKIPPED!
Run: claude-3-opus-20240229 - news - 1 - 5 - SKIPPED!
Run: claude-3-opus-20240229 - news - 1 - 6 - SKIPPED!
Run: claude-3-opus-20240229 - news - 1 - 7 - SKIPPED!
Run: claude-3-opus-20240229 

  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource

Run: nuextract - news - 1 - 0 - SKIPPED!
Run: nuextract - news - 1 - 1 - SKIPPED!
Run: nuextract - news - 1 - 2 - SKIPPED!
Run: nuextract - news - 1 - 3 - SKIPPED!
Run: nuextract - news - 1 - 4 - SKIPPED!
Run: nuextract - news - 1 - 5 - SKIPPED!
Run: nuextract - news - 1 - 6 - SKIPPED!
Run: nuextract - news - 1 - 7 - SKIPPED!
Run: nuextract - news - 1 - 8 - SKIPPED!
Run: nuextract - news - 1 - 9 - SKIPPED!
Run: nuextract - news - 3 - 0 - SKIPPED!
Run: nuextract - news - 3 - 1 - SKIPPED!
Run: nuextract - news - 3 - 2 - SKIPPED!
Run: nuextract - news - 3 - 3 - SKIPPED!
Run: nuextract - news - 3 - 4 - SKIPPED!
Run: nuextract - news - 3 - 5 - SKIPPED!
Run: nuextract - news - 3 - 6 - SKIPPED!
Run: nuextract - news - 3 - 7 - SKIPPED!
Run: nuextract - news - 3 - 8 - SKIPPED!
Run: nuextract - news - 3 - 9 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 0 - 0 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 0 - 1 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia - 0 - 2 - SKIPPED!
Run: gpt-3.5-turbo - wikipedia

  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource

- SKIPPED!
Run: gpt-4-turbo - wikipedia - 0 - 0 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 0 - 1 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 0 - 2 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 0 - 3 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 0 - 4 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 0 - 5 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 0 - 6 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 0 - 7 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 0 - 8 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 0 - 9 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 1 - 0 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 1 - 1 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 1 - 2 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 1 - 3 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 1 - 4 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 1 - 5 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 1 - 6 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 1 - 7 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 1 - 8 - SKIPPED!
Run: gpt-4-turbo - wikipedia - 1 - 9 - SKIPPED!
Run: gpt-4-turbo - wikipedia 

  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource

Run: claude-3-opus-20240229 - wikipedia - 1 - 0 - SKIPPED!
Run: claude-3-opus-20240229 - wikipedia - 1 - 1 - SKIPPED!
Run: claude-3-opus-20240229 - wikipedia - 1 - 2 - SKIPPED!
Run: claude-3-opus-20240229 - wikipedia - 1 - 3 - SKIPPED!
Run: claude-3-opus-20240229 - wikipedia - 1 - 4 - SKIPPED!
Run: claude-3-opus-20240229 - wikipedia - 1 - 5 - SKIPPED!
Run: claude-3-opus-20240229 - wikipedia - 1 - 6 - SKIPPED!
Run: claude-3-opus-20240229 - wikipedia - 1 - 7 - SKIPPED!
Run: claude-3-opus-20240229 - wikipedia - 1 - 8 - SKIPPED!
Run: claude-3-opus-20240229 - wikipedia - 1 - 9 - SKIPPED!
Run: claude-3-opus-20240229 - wikipedia - 3 - 0 - SKIPPED!
Run: claude-3-opus-20240229 - wikipedia - 3 - 1 - SKIPPED!
Run: claude-3-opus-20240229 - wikipedia - 3 - 2 - SKIPPED!
Run: claude-3-opus-20240229 - wikipedia - 3 - 3 - SKIPPED!
Run: claude-3-opus-20240229 - wikipedia - 3 - 4 - SKIPPED!
Run: claude-3-opus-20240229 - wikipedia - 3 - 5 - SKIPPED!
Run: claude-3-opus-20240229 - wikipedia - 3 - 6 - SKIPPE

  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource

Run: nuextract - wikipedia - 3 - 0 - SKIPPED!
Run: nuextract - wikipedia - 3 - 1 - SKIPPED!
Run: nuextract - wikipedia - 3 - 2 - SKIPPED!
Run: nuextract - wikipedia - 3 - 3 - SKIPPED!
Run: nuextract - wikipedia - 3 - 4 - SKIPPED!
Run: nuextract - wikipedia - 3 - 5 - SKIPPED!
Run: nuextract - wikipedia - 3 - 6 - SKIPPED!
Run: nuextract - wikipedia - 3 - 7 - SKIPPED!
Run: nuextract - wikipedia - 3 - 8 - SKIPPED!
Run: nuextract - wikipedia - 3 - 9 - SKIPPED!
Run: nuextract - news - 0 - 0 - SKIPPED!
Run: nuextract - news - 0 - 1 - SKIPPED!
Run: nuextract - news - 0 - 2 - SKIPPED!
Run: nuextract - news - 0 - 3 - SKIPPED!
Run: nuextract - news - 0 - 4 - SKIPPED!
Run: nuextract - news - 0 - 5 - SKIPPED!
Run: nuextract - news - 0 - 6 - SKIPPED!
Run: nuextract - news - 0 - 7 - SKIPPED!
Run: nuextract - news - 0 - 8 - SKIPPED!
Run: nuextract - news - 0 - 9 - SKIPPED!
Run: nuextract - news - 1 - 0 - SKIPPED!
Run: nuextract - news - 1 - 1 - SKIPPED!
Run: nuextract - news - 1 - 2 - SKIPPED!
Run: nu

  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource

- SKIPPED!
Run: gpt-3.5-turbo - news - 0 - 0 - SKIPPED!
Run: gpt-3.5-turbo - news - 0 - 1 - SKIPPED!
Run: gpt-3.5-turbo - news - 0 - 2 - SKIPPED!
Run: gpt-3.5-turbo - news - 0 - 3 - SKIPPED!
Run: gpt-3.5-turbo - news - 0 - 4 - SKIPPED!
Run: gpt-3.5-turbo - news - 0 - 5 - SKIPPED!
Run: gpt-3.5-turbo - news - 0 - 6 - SKIPPED!
Run: gpt-3.5-turbo - news - 0 - 7 - SKIPPED!
Run: gpt-3.5-turbo - news - 0 - 8 - SKIPPED!
Run: gpt-3.5-turbo - news - 0 - 9 - SKIPPED!
Run: gpt-3.5-turbo - news - 1 - 0 - SKIPPED!
Run: gpt-3.5-turbo - news - 1 - 1 - SKIPPED!
Run: gpt-3.5-turbo - news - 1 - 2 - SKIPPED!
Run: gpt-3.5-turbo - news - 1 - 3 - SKIPPED!
Run: gpt-3.5-turbo - news - 1 - 4 - SKIPPED!
Run: gpt-3.5-turbo - news - 1 - 5 - SKIPPED!
Run: gpt-3.5-turbo - news - 1 - 6 - SKIPPED!
Run: gpt-3.5-turbo - news - 1 - 7 - SKIPPED!
Run: gpt-3.5-turbo - news - 1 - 8 - SKIPPED!
Run: gpt-3.5-turbo - news - 1 - 9 - SKIPPED!
Run: gpt-3.5-turbo - news - 3 - 0 - SKIPPED!
Run: gpt-3.5-turbo - news - 3 - 1 - SKIPPED!

  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource

Run: gpt-4-turbo - news - 1 - 0 - SKIPPED!
Run: gpt-4-turbo - news - 1 - 1 - SKIPPED!
Run: gpt-4-turbo - news - 1 - 2 - SKIPPED!
Run: gpt-4-turbo - news - 1 - 3 - SKIPPED!
Run: gpt-4-turbo - news - 1 - 4 - SKIPPED!
Run: gpt-4-turbo - news - 1 - 5 - SKIPPED!
Run: gpt-4-turbo - news - 1 - 6 - SKIPPED!
Run: gpt-4-turbo - news - 1 - 7 - SKIPPED!
Run: gpt-4-turbo - news - 1 - 8 - SKIPPED!
Run: gpt-4-turbo - news - 1 - 9 - SKIPPED!
Run: gpt-4-turbo - news - 3 - 0 - SKIPPED!
Run: gpt-4-turbo - news - 3 - 1 - SKIPPED!
Run: gpt-4-turbo - news - 3 - 2 - SKIPPED!
Run: gpt-4-turbo - news - 3 - 3 - SKIPPED!
Run: gpt-4-turbo - news - 3 - 4 - SKIPPED!
Run: gpt-4-turbo - news - 3 - 5 - SKIPPED!
Run: gpt-4-turbo - news - 3 - 6 - SKIPPED!
Run: gpt-4-turbo - news - 3 - 7 - SKIPPED!
Run: gpt-4-turbo - news - 3 - 8 - SKIPPED!
Run: gpt-4-turbo - news - 3 - 9 - SKIPPED!
Run: claude-3-opus-20240229 - wikipedia - 0 - 0 - SKIPPED!
Run: claude-3-opus-20240229 - wikipedia - 0 - 1 - SKIPPED!
Run: claude-3-opus-202

  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource

Run: claude-3-opus-20240229 - wikipedia - 0 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 0 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 0 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 0 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 0 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 0 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 1 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 1 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 1 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 1 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 1 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 1 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 1 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 1 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 1 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 1 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 3 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 3 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 3 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 3 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 3 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 3 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 3 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 3 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 3 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 3 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 0 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 0 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 0 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 0 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 0 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 0 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 0 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 0 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 0 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 0 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 1 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 1 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 1 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 1 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 1 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 1 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 1 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 1 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 1 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 1 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 3 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 3 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 3 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 3 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 3 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 3 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 3 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 3 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 3 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 3 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource

Run: nuextract - wikipedia - 0 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - wikipedia - 0 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - wikipedia - 0 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - wikipedia - 0 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - wikipedia - 0 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][resul

Run: nuextract - wikipedia - 0 - 5 ...
Run: nuextract - wikipedia - 0 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - wikipedia - 0 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][resul

Run: nuextract - wikipedia - 0 - 8 ...
Run: nuextract - wikipedia - 0 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - wikipedia - 1 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - wikipedia - 1 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - wikipedia - 1 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - wikipedia - 1 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - wikipedia - 1 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][resul

Run: nuextract - wikipedia - 1 - 5 ...
Run: nuextract - wikipedia - 1 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - wikipedia - 1 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][resul

Run: nuextract - wikipedia - 1 - 8 ...
Run: nuextract - wikipedia - 1 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - wikipedia - 3 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - wikipedia - 3 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - wikipedia - 3 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - wikipedia - 3 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - wikipedia - 3 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][resul

Run: nuextract - wikipedia - 3 - 5 ...
Run: nuextract - wikipedia - 3 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - wikipedia - 3 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][resul

Run: nuextract - wikipedia - 3 - 8 ...
Run: nuextract - wikipedia - 3 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 0 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 0 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 0 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 0 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 0 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 0 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 0 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 0 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 0 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 0 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 1 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 1 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 1 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 1 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 1 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 1 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 1 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 1 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 1 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 1 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 3 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 3 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 3 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 3 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 3 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 3 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 3 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 3 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 3 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
Setting `pad_token_id` to `eos_token_id`:151646 for open-end generation.


Run: nuextract - news - 3 - 9 ...
Run: gpt-3.5-turbo - wikipedia - 0 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 0 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 0 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 0 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 0 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 0 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 0 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 0 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 0 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 0 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 1 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 1 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 1 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 1 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 1 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 1 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 1 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 1 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 1 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 1 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 3 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 3 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 3 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 3 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 3 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 3 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 3 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 3 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 3 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - wikipedia - 3 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 0 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 0 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 0 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 0 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 0 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 0 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 0 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 0 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 0 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 0 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 1 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 1 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 1 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 1 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 1 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 1 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 1 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 1 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 1 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 1 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 3 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 3 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 3 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 3 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 3 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 3 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 3 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 3 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 3 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-3.5-turbo - news - 3 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 0 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 0 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 0 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 0 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 0 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 0 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 0 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 0 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 0 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 0 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 1 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 1 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 1 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 1 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 1 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 1 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 1 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 1 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 1 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 1 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 3 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 3 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 3 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 3 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 3 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 3 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 3 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 3 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 3 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - wikipedia - 3 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 0 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 0 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 0 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 0 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 0 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 0 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 0 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 0 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 0 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 0 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 1 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 1 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 1 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 1 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 1 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 1 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 1 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 1 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 1 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 1 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 3 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 3 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 3 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 3 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 3 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 3 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 3 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 3 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 3 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: gpt-4-turbo - news - 3 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 0 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 0 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 0 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 0 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 0 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 0 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 0 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 0 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 0 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 0 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 1 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 1 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 1 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 1 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 1 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 1 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 1 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 1 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 1 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 1 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 3 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 3 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 3 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 3 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 3 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 3 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 3 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 3 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 3 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - wikipedia - 3 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 0 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 0 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 0 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 0 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 0 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 0 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 0 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 0 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 0 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 0 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 1 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 1 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 1 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 1 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 1 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 1 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 1 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 1 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 1 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 1 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 3 - 0 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 3 - 1 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 3 - 2 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 3 - 3 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 3 - 4 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 3 - 5 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 3 - 6 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 3 - 7 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 3 - 8 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


Run: claude-3-opus-20240229 - news - 3 - 9 ...


  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]
  maybe_row = results.loc[results.doc_idx == doc_idx][results.num_shots==num_examples][results.task==task][results.model == model_name][results.dataset == datasource]


In [176]:
results.to_json("results.json")

In [189]:
import sqlite3
con = sqlite3.connect("results.db")
results.to_sql("results", con)

1200

In [190]:
failed_rows = results[results.exception != ""]
failed_rows

Unnamed: 0,doc_idx,task,model,dataset,guardrails,num_shots,time,raw_model_output,exception


In [185]:
#results.loc[215, 'doc_idx'] = 5

In [184]:
r = results.loc[results.doc_idx == 1][results.num_shots==1][results.task==Task.NER_NESTED][results.model == "claude-3-opus-20240229"][results.dataset == "wikipedia"]

  r = results.loc[results.doc_idx == 1][results.num_shots==1][results.task==Task.NER_NESTED][results.model == "claude-3-opus-20240229"][results.dataset == "wikipedia"]
  r = results.loc[results.doc_idx == 1][results.num_shots==1][results.task==Task.NER_NESTED][results.model == "claude-3-opus-20240229"][results.dataset == "wikipedia"]
  r = results.loc[results.doc_idx == 1][results.num_shots==1][results.task==Task.NER_NESTED][results.model == "claude-3-opus-20240229"][results.dataset == "wikipedia"]
  r = results.loc[results.doc_idx == 1][results.num_shots==1][results.task==Task.NER_NESTED][results.model == "claude-3-opus-20240229"][results.dataset == "wikipedia"]


In [179]:
r.to_dict(orient='records')

[{'doc_idx': 1,
  'task': <Task.NER_NESTED: 'ner_nested'>,
  'model': 'claude-3-opus-20240229',
  'dataset': 'wikipedia',
  'guardrails': 'None',
  'num_shots': 1,
  'time': 8.838821172714233,
  'raw_model_output': '{\n    "named_entities": [\n        {\n            "text": "Marecia Pemberton",\n            "extraction_type": "name"\n        },\n        {\n            "text": "Kittian",\n            "extraction_type": "misc"\n        },\n        {\n            "text": "Florida State Seminoles",\n            "extraction_type": "organization"\n        },\n        {\n            "text": "Commonwealth Games",\n            "extraction_type": "misc"\n        },\n        {\n            "text": "Pan American Games",\n            "extraction_type": "misc"\n        },\n        {\n            "text": "World Junior Championships",\n            "extraction_type": "misc"\n        },\n        {\n            "text": "Greensboro",\n            "extraction_type": "location"\n        },\n        {\n     

In [180]:
r.index[0]

431

In [205]:
# For the sake of my sanity, we'll flatten results into a single list of dicts.
import itertools
result_list = list()
for t, model_name, num_examples, datasource, doc_idx in itertools.product(
    [task for task in Task], 
    ("nuextract", "gpt-3.5-turbo", "gpt-4-turbo", "claude-3-opus-20240229"), 
    (0, 1, 3),
    ("wikipedia", "news"), 
    (0, 1, 2, 3, 4, 5, 6, 7, 8, 9),
):
    #r = results.loc[unfilled_results.doc_idx == 1, unfilled_results.num_shots==1, unfilled_results.task==Task.NER_FLAT, unfilled_results.model == "claude-3-opus-20240229", results.dataset == "wikipedia"]
    #r = results.loc[results.doc_idx == 1, results.num_shots==1, results.task==Task.NER_FLAT, results.model == "claude-3-opus-20240229", results.dataset == "wikipedia"]
    r = results.loc[results.num_shots==num_examples][results.task==t][results.model == model_name][results.dataset == datasource][results.doc_idx==doc_idx]
    r_idx = r.index[0]
    r_data = r.to_dict(orient='records')
    assert len(r_data) == 1
    result_list.append(r_data[0])
print(len(result_list))

  r = results.loc[results.num_shots==num_examples][results.task==t][results.model == model_name][results.dataset == datasource][results.doc_idx==doc_idx]
  r = results.loc[results.num_shots==num_examples][results.task==t][results.model == model_name][results.dataset == datasource][results.doc_idx==doc_idx]
  r = results.loc[results.num_shots==num_examples][results.task==t][results.model == model_name][results.dataset == datasource][results.doc_idx==doc_idx]
  r = results.loc[results.num_shots==num_examples][results.task==t][results.model == model_name][results.dataset == datasource][results.doc_idx==doc_idx]
  r = results.loc[results.num_shots==num_examples][results.task==t][results.model == model_name][results.dataset == datasource][results.doc_idx==doc_idx]
  r = results.loc[results.num_shots==num_examples][results.task==t][results.model == model_name][results.dataset == datasource][results.doc_idx==doc_idx]
  r = results.loc[results.num_shots==num_examples][results.task==t][results.

1200


  r = results.loc[results.num_shots==num_examples][results.task==t][results.model == model_name][results.dataset == datasource][results.doc_idx==doc_idx]
  r = results.loc[results.num_shots==num_examples][results.task==t][results.model == model_name][results.dataset == datasource][results.doc_idx==doc_idx]
  r = results.loc[results.num_shots==num_examples][results.task==t][results.model == model_name][results.dataset == datasource][results.doc_idx==doc_idx]
  r = results.loc[results.num_shots==num_examples][results.task==t][results.model == model_name][results.dataset == datasource][results.doc_idx==doc_idx]
  r = results.loc[results.num_shots==num_examples][results.task==t][results.model == model_name][results.dataset == datasource][results.doc_idx==doc_idx]
  r = results.loc[results.num_shots==num_examples][results.task==t][results.model == model_name][results.dataset == datasource][results.doc_idx==doc_idx]
  r = results.loc[results.num_shots==num_examples][results.task==t][results.

In [221]:
#%pip install partial-json-parser
from dataclasses import dataclass
from partial_json_parser import loads as loads_partial

@dataclass
class Score:
    malformed_json: bool = True
    schema_mismatch: bool = True  # Missing required fields, but JSON is valid
    constraint_violation: bool = True  # JSON is valid and all fields are present, but one or more has a violation (like out of range)
    hallucinated_content: bool = True  # Does not contain the desired data or contains hallucinated data
    additional_content: bool = True  # Contains the desired valid data but has multiple copies of it
    noisy_content: bool = True  # Contains the desired valid data but has more than 50% other unrelated data (badly hallucinated content)

    def as_int(self):
        s = 0
        if not self.malformed_json:
            s += 1
        if not self.schema_mismatch:
            s += 1
        if not self.hallucinated_content:
            s += 1
        if not self.additional_content:
            s += 1
        if not self.noisy_content:
            s += 1
        return s

    def as_float(self):
        return s/5.0

for r in result_list:
    if r['exception']:
        print("--- Fail case ---")
        print(r)
        print("---")

In [220]:
loads_partial(result_list[115]['raw_model_output'])

{'names': ['Maria Boada Sana',
  'Manuela Giacomini',
  'Olga Kikou',
  'Tilly Metz'],
 'organizations': ['Animal Welfare Foundation (AWF)',
  "Spain's Ministry of Agriculture, Fisheries and Food",
  'World Organisation for Animal Health (OIE)',
  'European commission',
  'Compassion in World Farming EU',
  'International Maritime Organization',
  "European parliament's animal transport inquiry committee"],
 'locations': ['Spain',
  'Libya',
  'Cyprus',
  'Sardinia',
  'Italy',
  'Genoa',
  'Greece',
  'Luxembourg'],
 'misc': ['bluetongue',
  'Tarragona',
  'Cartagena',
  'Cagliari',
  'Queen Hind',
  'resting time',
  'transport time',
  'disease outbreaks',
  'feed shortages',
  'refusals to unload',
  'Animals farmed monthly update',
  'animalsfarmed@theguardian.com']}