In [1]:
import os
import re
import json
import statistics
from functools import cache
from typing import List, Tuple, Union
from difflib import SequenceMatcher
from pathlib import Path
import json
# JSON Schema (draft-07) describing one record of the answers file.
# The records written by evaluate_and_generate() carry the same five required
# keys: id, question, passages, final_prompt and answer.
# NOTE(review): nothing in the visible code validates records against this
# schema; it appears to serve as documentation of the output format.
ANSWER_SCHEMA = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "title": "Answer file schema",
    "type": "object",
    "properties": {
        "id":        {"type": "integer", "description": "Question ID"},
        "question":  {"type": "string",  "description": "The question"},
        "passages":  {
            "type": "array",
            "description": "Passages used and related FineWeb doc IDs, ordered by decreasing importance",
            "items": {
                "type": "object",
                "properties": {
                    "passage": {"type": "string",  "description": "Passage text"},
                    "doc_IDs": {
                        "type": "array",
                        "description": "Passage related FineWeb doc IDs, ordered by decreasing importance",
                        "items": {
                            "type": "string",
                            "description": "FineWeb doc ID, e.g., <urn:uuid:d69cbebc-133a-4ebe-9378-68235ec9f091>"
                        }
                    }
                },
                "required": ["passage", "doc_IDs"]
            }
        },
        "final_prompt": {"type": "string", "description": "Final prompt, as submitted to Falcon LLM"},
        "answer":       {"type": "string", "description": "Your answer"}
    },
    "required": ["id", "question", "passages", "final_prompt", "answer"]
}

def make_passage_objects(ranked):
    """
    Convert ranked retrieval results into passage records for the answer file.

    ranked: iterable of (passage_id, passage_text, score) triples, expected to
            be sorted by score already (high → low); the order is preserved.

    Returns a list of {"passage": text, "doc_IDs": [passage_id]} dicts.
    The score itself is dropped.
    """
    passage_objs = []
    for passage_id, passage_text, _score in ranked:
        passage_objs.append({"passage": passage_text, "doc_IDs": [passage_id]})
    return passage_objs


import boto3
import torch
from tqdm import tqdm
from transformers import (
    AutoTokenizer, AutoConfig,
    AutoModelForSeq2SeqLM, AutoModelForCausalLM,
    GenerationConfig, AutoModel
)
from sentence_transformers import CrossEncoder
from pinecone import Pinecone
from opensearchpy import OpenSearch, AWSV4SignerAuth, RequestsHttpConnection
import openai
# ── Run configuration ─────────────────────────────────────────────────────────
# AWS profile/region used for the SSM parameter lookups and for signing
# OpenSearch requests.
AWS_PROFILE_NAME        = "sigir-participant"
AWS_REGION_NAME         = "us-east-1"
# Names of the dense (Pinecone) and sparse (OpenSearch) indices.
PINECONE_INDEX_NAME     = "fineweb10bt-512-0w-e5-base-v2"
OPENSEARCH_INDEX_NAME   = "fineweb10bt-512-0w-e5-base-v2"
PINECONE_NAMESPACE      = "default"
# Model identifiers.
EMBEDDING_MODEL_NAME    = "intfloat/e5-base-v2"
# NOTE(review): CROSS_ENCODER_NAME is not referenced in the visible code.
CROSS_ENCODER_NAME      = "cross-encoder/ms-marco-MiniLM-L-6-v2"
LOCAL_LLM_NAME          = "tiiuae/Falcon3-10B-Instruct"
# SSM parameter names holding the Pinecone token and the OpenSearch endpoint.
SSM_PINECONE_TOKEN      = "/pinecone/ro_token"
SSM_OPENSEARCH_ENDPOINT = "/opensearch/endpoint"
# BIG_K: number of candidates requested from hybrid_recall();
# CONTEXT_DOCS: number of re-ranked passages kept for the prompt.
BIG_K                   = 256
CONTEXT_DOCS            = 8
# NOTE(review): ALPHA and CONFIDENCE_THRESHOLD are not referenced in the
# visible code.
ALPHA                   = 1.2
CONFIDENCE_THRESHOLD    = 0.7
# Model name for the OpenAI-based grader.
OPENAI_EVAL_MODEL       = "gpt-4o"
# Small LM used by QueryDocScorer to score log P(query | passage).
SCORE_LLM_NAME = "HuggingFaceTB/SmolLM2-135M-Instruct"

# Load the scoring LM at import time. The checkpoint's config decides whether
# it is instantiated as a seq2seq (encoder-decoder) or a causal LM.
_score_cfg = AutoConfig.from_pretrained(SCORE_LLM_NAME)
ScoreModel = (
    AutoModelForSeq2SeqLM
    if getattr(_score_cfg, "is_encoder_decoder", False)
    else AutoModelForCausalLM
)

# Tokenizer and model are later handed to QueryDocScorer in
# evaluate_and_generate().
score_tokenizer = AutoTokenizer.from_pretrained(SCORE_LLM_NAME)
score_model     = ScoreModel.from_pretrained(
    SCORE_LLM_NAME,
    device_map="auto"
)
score_model.eval()

# Function schema handed to the OpenAI chat completion in evaluate_answer().
# The call there forces the model to invoke "grade_answer", so the grader's
# reply arrives as arguments matching this schema (explanation first, then
# the two rubric scores).
grade_func = {
    "name": "grade_answer",
    "description": "Return rubric-based scores for correctness and faithfulness, starting with an explanation for the scores",
    "parameters": {
        "type": "object",
        "properties": {
            "explanation": {
                "type": "string",
                "description": "Explanation behind the scores you are going to give according to the rubric"
            },
            "correctness_score": {
                "type": "number",
                "description": "Rubric 1.1 score (-1, 0, 1, or 2)"
            },
            "faithfulness_score": {
                "type": "number",
                "description": "Rubric 1.2 score (-1, 0, or 1)"
            }
        },
        "required": ["explanation","correctness_score", "faithfulness_score"]
    }
}

# Load the answer-generation LM at import time.
# LOCAL_LLM_NAME is re-assigned here with the same value it already has in the
# configuration constants earlier in the file.
LOCAL_LLM_NAME = "tiiuae/Falcon3-10B-Instruct"
# Device that tokenized inputs are moved to (see run_llm / query_dense).
device = "cuda" if torch.cuda.is_available() else "cpu"

# Pick the model class from the checkpoint config: seq2seq vs. causal LM.
_config = AutoConfig.from_pretrained(LOCAL_LLM_NAME)
if getattr(_config, "is_encoder_decoder", False):
    GenModel = AutoModelForSeq2SeqLM
else:
    GenModel = AutoModelForCausalLM

gen_tokenizer = AutoTokenizer.from_pretrained(LOCAL_LLM_NAME)
gen_model     = GenModel.from_pretrained(LOCAL_LLM_NAME, device_map="auto")
gen_model.eval()

def run_llm(prompts: Union[str, List[str]], max_tokens: int = 512) -> Union[str, List[str]]:
    """
    Greedy-decode one completion per prompt with the local generation model.

    Parameters
    ----------
    prompts : a single prompt or a list of prompts; prompts are processed one
        at a time (no batching).
    max_tokens : maximum number of newly generated tokens per prompt.

    Returns
    -------
    A bare string when exactly one prompt was processed (even if it was passed
    inside a list), otherwise a list of strings in input order.

    NOTE(review): the whole generated sequence is decoded; for decoder-only
    models this presumably still contains the prompt text — callers use
    extract_answer() to cut at the last "Answer:" marker.
    """
    batch = [prompts] if isinstance(prompts, str) else prompts
    generation_kwargs = {
        "max_new_tokens": max_tokens,
        "do_sample": False,
        "output_scores": False,
        "return_dict_in_generate": False,
    }

    decoded = []
    for text_in in batch:
        encoded = gen_tokenizer(text_in, return_tensors="pt").to(device)
        generation_cfg = GenerationConfig(**generation_kwargs)
        generated = gen_model.generate(**encoded, generation_config=generation_cfg)
        decoded.append(gen_tokenizer.decode(generated[0], skip_special_tokens=True))

    if len(decoded) == 1:
        return decoded[0]
    return decoded

from __future__ import annotations
from typing import List, Tuple, Sequence

import torch
import torch.nn.functional as F
from contextlib import nullcontext


class QueryDocScorer:
    """
    Rank passages by `log P(query | passage)` using a Hugging Face model.

    Parameters
    ----------
    tokenizer : transformers.PreTrainedTokenizerBase
    model     : transformers.PreTrainedModel
    max_passage_tokens : int
        Token budget for the prompted passage (instructions + passage); longer
        prompts are truncated to keep GPU usage bounded.
    amp : bool
        If True and running on CUDA, the forward pass runs under autocast.
    """

    def __init__(
        self,
        tokenizer,
        model,
        max_passage_tokens: int = 512,
        amp: bool = True,
    ):
        self.tok = tokenizer
        self.mdl = model.eval()                              # disable dropout
        self.max_passage_tokens = max_passage_tokens
        self.is_enc_dec = bool(getattr(model.config, "is_encoder_decoder", False))
        self.amp = amp and (self.mdl.device.type == "cuda")

        # Padding id used when building batches; GPT-like tokenizers sometimes
        # lack a pad token, in which case eos is used instead.
        self.pad = self.tok.pad_token_id
        if self.pad is None:
            self.pad = self.tok.eos_token_id

    def _prompt(self, passage: str) -> str:
        """
        Build the instruction that asks the LM to *generate a query* from the
        document. The real user query is then scored under that distribution.
        """
        return (
            "You are a query-creation assistant. "
            "Given the following document, write ONE concise question that could be "
            "answered *solely* from the document. Ensure the question is directly related to the document topic.\n\n"
            f"Document:\n{passage.strip()}\n\nQuestion:"
        )

    @torch.no_grad()
    def _enc_dec_score(
        self, enc_inputs, query_ids: torch.Tensor
    ) -> torch.Tensor:  # shape (batch,)
        """
        Log-likelihood of the query for encoder-decoder models (T5/BART/UL2…).

        enc_inputs : output of tokenizer() for the *prompted passage*.
        query_ids  : (batch, seq_len) – already padded with ``self.pad``.

        Returns a (batch,) tensor with the summed log-probability of every
        non-pad query token.
        """
        # Teacher forcing: the decoder input is the target shifted right, with
        # the model's decoder start token in front. Feeding query[:-1] directly
        # (no start token) would leave the first query token unscored.
        cfg = getattr(self.mdl, "config", None)
        start_id = getattr(cfg, "decoder_start_token_id", None)
        if start_id is None:
            start_id = self.pad
        if start_id is None:
            raise ValueError("model defines neither decoder_start_token_id nor a pad/eos token")

        start_col = torch.full(
            (query_ids.size(0), 1), start_id,
            dtype=query_ids.dtype, device=query_ids.device,
        )
        dec_in = torch.cat([start_col, query_ids[:, :-1]], dim=1)
        tgt    = query_ids

        with (torch.autocast("cuda") if self.amp else nullcontext()):
            logits = self.mdl(
                **enc_inputs,
                decoder_input_ids=dec_in,
                use_cache=False,
                return_dict=False,
            )[0]                              # (batch, seq, vocab)

        log_probs = F.log_softmax(logits, dim=-1)
        token_lp  = log_probs.gather(2, tgt.unsqueeze(2)).squeeze(2)
        mask      = tgt.ne(self.pad)          # ignore padding in the target
        return token_lp.masked_fill(~mask, 0.0).sum(1)

    @torch.no_grad()
    def _causal_score(
        self,
        merged_ids: torch.Tensor,
        prompt_lens: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """
        Log-likelihood of the query for decoder-only models (GPT/Llama…).

        merged_ids     : token ids of [prompt + query], right-padded  (batch, L)
        prompt_lens    : number of prompt tokens in each row          (batch,)
        attention_mask : optional (batch, L) mask, 1 = real token. When given
                         it is forwarded to the model and used to exclude
                         padding from the score; when omitted, positions equal
                         to ``self.pad`` are excluded instead.

        Returns a (batch,) tensor with the summed log-probability of the query
        tokens only (prompt and padding positions contribute nothing).
        """
        model_kwargs = {"use_cache": False, "return_dict": False}
        if attention_mask is not None:
            model_kwargs["attention_mask"] = attention_mask

        with (torch.autocast("cuda") if self.amp else nullcontext()):
            logits = self.mdl(merged_ids, **model_kwargs)[0]

        # Shift for causal LM: position t predicts token t+1.
        log_probs = F.log_softmax(logits, dim=-1)[:, :-1]
        tgt = merged_ids[:, 1:]

        # Keep only the query part: target index j holds merged token j+1, so
        # the first query token (merged index = prompt_len) sits at j = len-1.
        positions   = torch.arange(tgt.size(1), device=tgt.device).unsqueeze(0)
        first_query = (torch.as_tensor(prompt_lens, device=tgt.device) - 1).clamp(min=0)
        mask = positions >= first_query.unsqueeze(1)

        # Never score padding: rows shorter than the batch width would
        # otherwise accumulate log-probs of pad tokens.
        if attention_mask is not None:
            mask &= attention_mask[:, 1:].to(device=tgt.device, dtype=torch.bool)
        elif self.pad is not None:
            mask &= tgt.ne(self.pad)

        token_lp = log_probs.gather(2, tgt.unsqueeze(2)).squeeze(2)
        return token_lp.masked_fill(~mask, 0.0).sum(1)

    # --------------------------------------------------------------------- #
    # PUBLIC API
    # --------------------------------------------------------------------- #
    def score(self, passages: Sequence[str], query: str) -> List[float]:
        """
        Return list[ log P(query | passage) ] (higher = more relevant).

        ● Operates on a *batch* of passages for speed.
        ● Returns [] for an empty batch.
        """
        if not passages:
            return []

        prompts = [self._prompt(p) for p in passages]
        device  = self.mdl.device

        if self.is_enc_dec:
            enc_inputs = self.tok(
                prompts,
                padding=True,
                truncation=True,
                max_length=self.max_passage_tokens,
                return_tensors="pt",
            ).to(device)
            query_ids = self.tok(
                [query] * len(passages), padding=True, return_tensors="pt"
            ).input_ids.to(device)
            scores = self._enc_dec_score(enc_inputs, query_ids)
        else:
            # Build [prompt ids + query ids] by hand so that the prompt length
            # used for masking is exactly the prompt length the model sees.
            # (Re-tokenising the concatenated text without truncation, as
            # before, disagreed with the truncated prompt length and depended
            # on the tokenizer's padding side.)
            # NOTE(review): truncation cuts the *end* of the prompt, so for
            # long passages the trailing "Question:" cue is lost — consider
            # truncating the passage text instead.
            prompt_rows = self.tok(
                prompts,
                truncation=True,
                max_length=self.max_passage_tokens,
            )["input_ids"]
            query_row = self.tok(query, add_special_tokens=False)["input_ids"]

            rows  = [list(ids) + list(query_row) for ids in prompt_rows]
            width = max(len(r) for r in rows)
            fill  = self.pad if self.pad is not None else 0

            merged_ids = torch.full((len(rows), width), fill, dtype=torch.long)
            attention  = torch.zeros((len(rows), width), dtype=torch.long)
            for r, row in enumerate(rows):
                merged_ids[r, : len(row)] = torch.tensor(row, dtype=torch.long)
                attention[r, : len(row)] = 1

            prompt_lens = torch.tensor(
                [len(ids) for ids in prompt_rows], dtype=torch.long
            )
            scores = self._causal_score(
                merged_ids.to(device),
                prompt_lens.to(device),
                attention_mask=attention.to(device),
            )

        return scores.tolist()

    def rank(
        self,
        query: str,
        passages: List[Tuple[str, str]],
        top_k: int | None = None,
        chunk_size: int = 256,
    ) -> List[Tuple[str, str, float]]:
        """
        Rank (doc_id, text) tuples by relevance to *query*.

        Passages are scored in chunks of ``chunk_size``; ``top_k=None`` keeps
        every passage.

        Returns
        -------
        List[(doc_id, text, score)]  sorted by descending score.
        """
        if not passages:
            return []

        doc_ids, texts = zip(*passages)

        scores: list[float] = []
        for start in range(0, len(texts), chunk_size):
            batch_texts = texts[start : start + chunk_size]
            scores.extend(self.score(batch_texts, query))

        scored = list(zip(doc_ids, texts, scores))
        scored.sort(key=lambda x: x[2], reverse=True)
        return scored if top_k is None else scored[:top_k]

# ──────────────────────────────────────────────────────────────────────────────
# Helpers & recall functions (unchanged)
# ──────────────────────────────────────────────────────────────────────────────
# helpers.py  (or wherever _ssm() lives)
@cache
def _ssm(key: str) -> str:
    """
    Fetch a decrypted parameter value from AWS SSM Parameter Store.

    The session uses AWS_PROFILE_NAME / AWS_REGION_NAME. Results are memoised
    per key by ``functools.cache``, so each key is fetched at most once.
    """
    session = boto3.Session(
        profile_name=AWS_PROFILE_NAME,
        region_name=AWS_REGION_NAME,
    )
    ssm_client = session.client("ssm")
    response = ssm_client.get_parameter(Name=key, WithDecryption=True)
    return response["Parameter"]["Value"]

@cache
def pinecone_index():
    """
    Return the (memoised) Pinecone index handle used for dense retrieval.

    The API token is read from SSM under SSM_PINECONE_TOKEN and the index is
    selected by PINECONE_INDEX_NAME, so both follow the module configuration
    instead of duplicated string literals.
    """
    return Pinecone(api_key=_ssm(SSM_PINECONE_TOKEN)).Index(PINECONE_INDEX_NAME)

@cache
def opensearch_client():
    """
    Return the (memoised) OpenSearch client used for sparse retrieval.

    Requests are signed with the credentials of the AWS_PROFILE_NAME session;
    the host name is read from SSM under SSM_OPENSEARCH_ENDPOINT and contacted
    over HTTPS on port 443 with certificate verification enabled.
    """
    session = boto3.Session(
        profile_name=AWS_PROFILE_NAME,
        region_name=AWS_REGION_NAME,
    )
    signer = AWSV4SignerAuth(session.get_credentials(), region=AWS_REGION_NAME)
    endpoint = _ssm(SSM_OPENSEARCH_ENDPOINT)

    client_kwargs = {
        "hosts": [{"host": endpoint, "port": 443}],
        "http_auth": signer,
        "use_ssl": True,
        "verify_certs": True,
        "connection_class": RequestsHttpConnection,
    }
    return OpenSearch(**client_kwargs)


# ─── Embedding model for dense retrieval ──────────────────────────────────────
# Loaded separately from the generation and scoring LMs; used by query_dense()
# to embed the question before querying Pinecone.
# EMBEDDING_MODEL_NAME is re-assigned here with the same value it has in the
# configuration constants earlier in the file.
EMBEDDING_MODEL_NAME = "intfloat/e5-base-v2"

embed_tok   = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
embed_model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME).to(device)
embed_model.eval()

def _mean_pool(last_hidden_state, attention_mask):
    """
    Average token embeddings over the sequence axis, ignoring masked positions.

    last_hidden_state : (batch, seq, dim) token embeddings
    attention_mask    : (batch, seq) with 1 for real tokens and 0 for padding

    Returns a (batch, dim) tensor. The divisor is clamped to 1e-9 so a row
    whose mask is all zeros yields zeros instead of dividing by zero.
    """
    weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    weighted_sum = (last_hidden_state * weights).sum(dim=1)
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    pooled = weighted_sum / token_counts
    return pooled

# ─── Rewrite query_dense to use the new encoder ───────────────────────────────
def query_dense(query: str, k: int):
    """
    Dense retrieval: embed *query* with the E5 encoder and return the top-k
    Pinecone matches as a list of (id, text) tuples, best match first.

    Parameters
    ----------
    query : the user question (without any E5 role prefix).
    k     : number of matches to request.
    """
    # E5 checkpoints are trained with role prefixes; search queries must be
    # encoded as "query: <text>" or similarity against the indexed passages
    # degrades.
    prefixed = f"query: {query}"
    inp = embed_tok([prefixed], padding=True, truncation=True, return_tensors="pt").to(device)

    with torch.no_grad():
        out   = embed_model(**inp)
        emb_t = _mean_pool(out.last_hidden_state, inp["attention_mask"])
        emb_t = torch.nn.functional.normalize(emb_t, p=2, dim=1)  # E5 uses cosine sims
        emb   = emb_t[0].cpu().tolist()

    # Query the configured namespace explicitly; without it Pinecone searches
    # its unnamed default namespace, which is not the same as PINECONE_NAMESPACE.
    matches = pinecone_index().query(
        vector=emb,
        top_k=k,
        namespace=PINECONE_NAMESPACE,
        include_metadata=True
    ).matches
    return [(m["id"], m["metadata"]["text"]) for m in matches]


def query_sparse(keywords: str, k: int):
    """
    Sparse retrieval: run an OpenSearch ``match`` query for *keywords* on the
    "text" field of OPENSEARCH_INDEX_NAME and return up to *k* hits as a list
    of (id, text) tuples in the order OpenSearch returned them.
    """
    body = {"query": {"match": {"text": keywords}}, "size": k}
    response = opensearch_client().search(index=OPENSEARCH_INDEX_NAME, body=body)
    hits = response["hits"]["hits"]
    return [(h["_id"], h["_source"]["text"]) for h in hits]

def hybrid_recall(question: str, big_k: int=64):
    """
    Retrieve candidates with both the dense and the sparse retriever and merge
    them into at most *big_k* unique (id, text) tuples.

    The two ranked lists are interleaved (dense #1, sparse #1, dense #2, …)
    and de-duplicated by id, so the best hits of each retriever survive the
    cut and the result is deterministic. (Truncating after iterating a ``set``
    of ids kept an arbitrary subset that could differ between runs.)
    """
    from itertools import zip_longest

    dense  = query_dense(question, big_k)
    sparse = query_sparse(" ".join(re.findall(r"\w+", question.lower())), big_k)

    # When both retrievers return the same id, the sparse text wins
    # (same precedence as a dict merge dense | sparse).
    texts = {**dict(dense), **dict(sparse)}

    ordered_ids = []
    seen = set()
    for dense_hit, sparse_hit in zip_longest(dense, sparse):
        for hit in (dense_hit, sparse_hit):
            if hit is None or hit[0] in seen:
                continue
            seen.add(hit[0])
            ordered_ids.append(hit[0])

    return [(pid, texts[pid]) for pid in ordered_ids[:big_k]]

def filter_duplicates(passages, threshold=0.9):
    """
    Drop near-duplicate passages, keeping the first occurrence of each.

    passages  : iterable of (passage_id, text) tuples
    threshold : a passage is discarded when its difflib similarity ratio to an
                already-kept text is strictly greater than this value

    Returns the kept (passage_id, text) tuples in their original order.
    """
    kept = []
    kept_texts = []
    for pid, txt in passages:
        for earlier in kept_texts:
            if SequenceMatcher(None, txt, earlier).ratio() > threshold:
                break                       # too similar to something we kept
        else:
            kept_texts.append(txt)
            kept.append((pid, txt))
    return kept

def parse_confidence(text: str) -> Tuple[str,float]:
    """
    Split a model reply of the form "<body> Confidence: <number>".

    Returns (body, confidence). The split happens at the *last* "Confidence:"
    marker. The confidence is 0.0 when the marker is missing or when what
    follows it is not a valid float.
    """
    if "Confidence:" not in text:
        return text.strip(), 0.0

    body, conf = text.rsplit("Confidence:", 1)
    try:
        return body.strip(), float(conf.strip())
    except ValueError:
        # Only a malformed number is expected here; anything else (e.g.
        # KeyboardInterrupt) must not be swallowed.
        return body.strip(), 0.0
import re

def extract_answer(model_output: str) -> str:
    """
    Return the text that follows the *last* literal 'Answer:' in the output.

    A trailing stop sequence (</s>, ###, <<END>>, </assistant>; matched
    case-insensitively) is removed and the result is whitespace-stripped.
    When no 'Answer:' marker is present the whole output is returned,
    stripped of surrounding whitespace only.
    """
    _head, marker, tail = model_output.rpartition("Answer:")
    if not marker:                     # marker absent → nothing to cut
        return model_output.strip()

    # Drop a stop sequence the model may have appended at the very end
    without_stop = re.sub(r"(</s>|###|<<END>>|</assistant>)\s*$", "", tail, flags=re.I)
    return without_stop.strip()

# ──────────────────────────────────────────────────────────────────────────────
# Evaluation and main loop
# ──────────────────────────────────────────────────────────────────────────────
# ──────────────────────────────────────────────────────────────────────────────
# Evaluation Rubric & Prompt Template
# ──────────────────────────────────────────────────────────────────────────────
# Scoring rubric shown to the grader; its score ranges match the descriptions
# in grade_func (correctness: -1..2, faithfulness: -1..1).
RUBRIC = """\
### 1.1 Correctness   (integer score ∈ 2, 1, 0, -1)
  2 — Fully correct. Directly answers the question, no factual errors, includes all key details.
  1 — Partly correct. Factually sound but incomplete or contains irrelevant details.
  0 — Abstained / no answer.
 -1 — Incorrect. Contains at least one factual error or contradiction.

### 1.2 Faithfulness  (integer score ∈ 1, 0, -1)
  1 — Fully grounded. Every claim is supported by the supplied context.
  0 — Partly grounded. Some claims are supported, but at least one is not traceable.
 -1 — Ungrounded. Most or all claims are unsupported or contradicted by the context.
"""

# Grader prompt template. This is an f-string: RUBRIC is inlined right here,
# while the doubled braces leave literal {context}/{question}/{pred}/{gold}
# placeholders that evaluate_answer() fills in later via str.format().
EVAL_TMPL = f"""You are an impartial evaluator.

## Task
Using *Context* as evidence, compare the **Generated answer** with the **Reference answer**.
Assign integer scores for *Correctness* and *Faithfulness* exactly as defined below:

{RUBRIC}

## Inputs
Context:
{{context}}

Question:
{{question}}

Generated answer:
{{pred}}

Reference answer:
{{gold}}
"""
def generate_until_nonempty(prompts, *, max_tokens=384, max_tries=2):
    """
    Generate an answer for each prompt, retrying prompts whose extracted
    answer is empty.

    prompts   : str | list[str]
    max_tokens: forwarded to run_llm() as the new-token limit
    max_tries : maximum number of generation rounds

    Returns the same shape as `prompts` (a string for a string, a list for a
    list). Prompts that never produced a non-empty answer yield "".

    NOTE(review): run_llm() decodes greedily, so a retry with an unchanged
    prompt will presumably reproduce the same (empty) output.
    """
    single = isinstance(prompts, str)
    prompt_list = [prompts] if single else list(prompts)
    results = [""] * len(prompt_list)

    todo_idx = list(range(len(prompt_list)))   # indices still needing output
    tries = 0
    while todo_idx and tries < max_tries:
        tries += 1
        gens = run_llm([prompt_list[i] for i in todo_idx], max_tokens=max_tokens)
        # run_llm() returns a bare string whenever it processed exactly one
        # prompt — also when a multi-prompt call has shrunk to a single retry.
        # Normalise on the returned type rather than on the caller's input
        # shape, otherwise zip() below would iterate over characters.
        if isinstance(gens, str):
            gens = [gens]

        still_empty = []
        for i, gen in zip(todo_idx, gens):
            ans = extract_answer(gen)
            if ans:
                results[i] = ans
            else:
                still_empty.append(i)
        todo_idx = still_empty             # continue until all filled or max_tries
    return results[0] if single else results
# ──────────────────────────────────────────────────────────────────────────────
# Function-Call Argument Parsing
# ──────────────────────────────────────────────────────────────────────────────
def _parse_function_args(raw):
    """
    Parse the arguments of a function call into a dict with canonical keys.

    raw : a dict, or a string/bytes payload holding JSON, a Python literal, or
          almost-JSON with bare keys / single quotes.

    Returns a dict in which the key variants "correctness",
    "correctnessScore", "faithfulness" and "faithfulnessScore" are renamed to
    "correctness_score" / "faithfulness_score". Returns {} when the payload
    is not a dict (or not text at all).

    Raises json.JSONDecodeError when a string payload cannot be parsed even
    after the repair attempt.
    """
    import ast
    import json
    import re

    # 1️⃣ Coerce to dict
    if isinstance(raw, dict):
        data = raw
    else:
        if isinstance(raw, (bytes, bytearray)):
            raw = raw.decode("utf-8", "replace")
        if not isinstance(raw, str):
            return {}
        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            try:
                data = ast.literal_eval(raw)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # add quotes around bare keys, replace single→double quotes
                fixed = re.sub(r"([{\[,]\s*)([A-Za-z_][A-Za-z0-9_]*)\s*:", r'\1"\2":', raw)
                fixed = fixed.replace("'", '"')
                data = json.loads(fixed)
    if not isinstance(data, dict):
        return {}

    # 2️⃣ Canonicalize key names
    mapping = {
        "correctness":       "correctness_score",
        "correctnessScore":  "correctness_score",
        "faithfulness":      "faithfulness_score",
        "faithfulnessScore": "faithfulness_score",
    }
    cleaned = {}
    for k, v in data.items():
        # Python-literal payloads may carry non-string keys; keep them as-is.
        name = k.strip('"') if isinstance(k, str) else k
        cleaned[mapping.get(name, name)] = v
    return cleaned

# ──────────────────────────────────────────────────────────────────────────────
# GPT-4o Evaluation via OpenAI Functions
# ──────────────────────────────────────────────────────────────────────────────
def evaluate_answer(question: str, pred: str, gold: str, context: str):
    """
    Grade a generated answer against a reference answer with the OpenAI model
    named by OPENAI_EVAL_MODEL.

    The prompt is EVAL_TMPL filled with the four arguments; the model is
    forced to call the `grade_answer` function described by grade_func.

    Returns
    -------
    (correctness_score, faithfulness_score) as returned by the grader.

    Raises
    ------
    RuntimeError : the response contains no function call.
    KeyError     : one of the two scores is missing from the call arguments.
    """
    prompt = EVAL_TMPL.format(context=context, question=question, pred=pred, gold=gold)

    messages = [
        {"role": "system", "content": "You are an expert QA assistant. Use ONLY the information in the context; do NOT hallucinate."},
        {"role": "user",   "content": prompt}
    ]
    # Prefer a key supplied through the environment; the literal below is only
    # a placeholder and must not overwrite a real key on every call.
    openai.api_key = os.environ.get("OPENAI_API_KEY", "API_KEY_HERE")
    resp = openai.chat.completions.create(
        model=OPENAI_EVAL_MODEL,
        messages=messages,
        functions=[grade_func],
        function_call={"name": "grade_answer"},
        temperature=0.0
    )

    call = resp.choices[0].message.function_call
    if not call:
        raise RuntimeError("Model did not invoke grade_answer")

    args = _parse_function_args(call.arguments)
    # ensure both scores are present
    for key in ("correctness_score", "faithfulness_score"):
        if key not in args:
            raise KeyError(f"Missing {key} in function call: {args}")

    return args["correctness_score"], args["faithfulness_score"]
def _load_entries(in_path: str) -> list:
    """Read request entries from a .jsonl file or a 'Request N:'-delimited dump."""
    with open(in_path, encoding="utf-8") as fin:
        if in_path.lower().endswith(".jsonl"):
            return [json.loads(line) for line in fin if line.strip()]
        raw = fin.read()
    return [
        json.loads(chunk)
        for chunk in re.split(r"Request \d+:\n", raw)
        if chunk.strip()
    ]


def _first_result(entry: dict) -> dict:
    """Return entry["response"]["result"][0], or {} when that path is absent or empty."""
    results = (entry.get("response") or {}).get("result") or [{}]
    first = results[0]
    return first if isinstance(first, dict) else {}


def evaluate_and_generate(
        in_path: str,
        *,
        start_idx: int = 0,          # first entry to keep  (0-based, inclusive)
        end_idx:   int | None = None,  # last entry to keep (inclusive).  None → no upper bound
        out_path: str = "answers.jsonl",
        max_examples: int | None = None   # optional per-run cap, applied *after* slicing
    ):
    """
    Generate answers only for entries in the closed interval
    [start_idx, end_idx].  Pass end_idx=None for “to the end”.

    For every selected entry the function retrieves BIG_K candidates,
    re-ranks them down to CONTEXT_DOCS, builds the final prompt, generates an
    answer and appends one JSON record per line to `out_path` (the file is
    overwritten). When the entry carries a reference answer, the result is
    also graded and the running averages are shown on the progress bar.

    Processing is best-effort: an entry that raises is reported on stdout and
    skipped, so the output may contain fewer records than selected entries.

    Example
    -------
    evaluate_and_generate(
        "questions.jsonl",
        start_idx=450,
        end_idx=500,
        max_examples=None           # or leave out entirely
    )
    """
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

    # ── Load input ───────────────────────────────────────────────────────────
    entries = _load_entries(in_path)

    # Keep only the desired slice
    if end_idx is None:
        sliced = entries[start_idx:]
    else:
        sliced = entries[start_idx:end_idx + 1]      # +1 because end is inclusive

    # Optional global cap, applied *after* slicing
    if max_examples is not None:
        sliced = sliced[:max_examples]

    scorer  = QueryDocScorer(score_tokenizer, score_model)
    corr, faith = [], []

    # Records are written with ensure_ascii=False, so pin the encoding instead
    # of relying on the platform default.
    with open(out_path, "w", encoding="utf-8") as fout, tqdm(sliced, desc="Requests") as bar:
        for i, entry in enumerate(bar, start=start_idx):   # preserve original IDs
            try:
                # ── Inputs ───────────────────────────────────────────────
                first = _first_result(entry)
                q = entry.get("question", first.get("question"))
                gold = first.get("answer", None)
                if not q:
                    raise ValueError("entry has no question")

                # ── Retrieval (BIG_K) ───────────────────────────────────
                passages = hybrid_recall(q, BIG_K)

                # ── Re-ranking (CONTEXT_DOCS) ───────────────────────────
                ranked = scorer.rank(q, passages, top_k=CONTEXT_DOCS)

                passage_objs = make_passage_objects(ranked)

                # ── Build context & generate answer ─────────────────────
                # Documents are listed in reverse rank order, i.e. the best
                # passage ends up closest to the question.
                context = "\n\n".join(
                    f"Document {j+1} (ID={pid}):\n{txt}"
                    for j, (pid, txt, _) in enumerate(reversed(ranked))
                )
                final_prompt = (
                    "<|system|>Context:\n<CONTEXT>"
                    f"{context}"
                    "</CONTEXT>\n\nSystem Prompt: You are an expert Question Answering assistant. "
                    "Given a user query below, comprehensively construct an answer in 300 or fewer words that gives them the information they are most likely searching for. "
                    "Use the information inside the <CONTEXT> … </CONTEXT> section to assist your answer. "
                    f"Question: <|user|>{q}<|assistant|>\nAnswer:"
                )
                ans = generate_until_nonempty(final_prompt)

                # ── Optional evaluation ────────────────────────────────
                if gold is not None:
                    c, f = evaluate_answer(q, ans, gold, context)
                    corr.append(c)
                    faith.append(f)
                    bar.set_postfix_str(
                        f"AvgCorr={statistics.mean(corr):.2f}  "
                        f"AvgFaith={statistics.mean(faith):.2f}"
                    )

                # ── Persist ─────────────────────────────────────────────
                rec = {
                    "id":         int(entry.get("id", i)),
                    "question":   q,
                    "passages":   passage_objs,
                    "final_prompt": final_prompt,
                    "answer":     ans.strip(),
                }
                fout.write(json.dumps(rec, ensure_ascii=False) + "\n")
                fout.flush()      # keep finished records if a later entry crashes the run

            except Exception as e:
                # Best-effort run: report which entry failed and move on.
                print("ERROR:", f"[entry {i}]", e)

# Script entry point: answer the entries of questions.jsonl at positions 0..500
# (inclusive), capped at 500 examples; output goes to the default "answers.jsonl".
if __name__=="__main__":
    evaluate_and_generate("questions.jsonl", max_examples=500, start_idx=0, end_idx=500)



Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Requests:   0%|          | 0/50 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Requests:   2%|▏         | 1/50 [00:18<15:18, 18.75s/it]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Requests:   4%|▍         | 2/50 [00:32<12:50, 16.06s/it]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Requests:   6%|▌         | 3/50 [00:50<13:16, 16.95s/it]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Requests:   8%|▊         | 4/50 [01:06<12:41, 16.56s/it]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Requests:  10%|█         | 5/50 [01:23<12:23, 16.52s/it]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Requests:  12%|█▏        | 6/50 [01:35<10:54, 14.88s/it]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Requests:  14%|█▍        | 7/50 [01:47<10:09, 14.17s/it]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Requests