Cell 1 — Environment bootstrap (must run first)

In [1]:
# ===== MUST RUN FIRST CELL (before importing pyterrier / jnius) =====
import os, glob

# A. Prefer system JDK (if it really exists)
sys_java_home = "/usr/lib/jvm/java-17-openjdk-amd64"
cand = [
    os.path.join(sys_java_home, "lib", "server", "libjvm.so"),
    "/gz-data/jdk/current/lib/server/libjvm.so",  # If you installed to /gz-data/jdk/current as per previous steps
]

jvm_path = next((p for p in cand if os.path.isfile(p)), None)

if jvm_path is None:
    # Fallback: Search in common directories
    for base in ("/usr/lib/jvm", "/gz-data/jdk"):
        for p in glob.glob(base + "/**/lib/server/libjvm.so", recursive=True):
            jvm_path = p
            break

if not jvm_path:
    raise RuntimeError("Cannot find libjvm.so. Please install JDK 17 (system or /gz-data) and try again.")

java_home = os.path.abspath(os.path.join(jvm_path, "..", ".."))  # Remove /lib/server
os.environ["JAVA_HOME"] = java_home
os.environ["JVM_PATH"]  = jvm_path
os.environ["LD_LIBRARY_PATH"] = os.path.dirname(jvm_path) + ":" + os.environ.get("LD_LIBRARY_PATH", "")
os.environ["PATH"] = os.path.join(java_home, "bin") + ":" + os.environ.get("PATH", "")

# Optional: Specify Java temp directory and memory
os.environ.setdefault("JAVA_TOOL_OPTIONS", "-Djava.io.tmpdir=/gz-data/tmp")
os.environ.setdefault("_JAVA_OPTIONS", "-Xms512m -Xmx8g")
os.makedirs("/gz-data/tmp", exist_ok=True)

print("JAVA_HOME =", os.environ["JAVA_HOME"])
print("JVM_PATH  =", os.environ["JVM_PATH"])

# ===== Now import PyTerrier and initialize JVM =====
import pyterrier as pt
if not pt.started():
    # Use traditional init for stability (includes Java initialization); add mem / jvm_opts if needed
    pt.init()  # Or pt.init(tail=False)
print("PyTerrier started =", pt.started())

JAVA_HOME = /gz-data/jdk/current/lib
JVM_PATH  = /gz-data/jdk/current/lib/server/libjvm.so


  if not pt.started():
Picked up JAVA_TOOL_OPTIONS: -Djava.io.tmpdir=/gz-data/tmp
Picked up _JAVA_OPTIONS: -Xms512m -Xmx8g


PyTerrier started = True


Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
java is now started automatically with default settings. To force initialisation early, run:
pt.java.init() # optional, forces java initialisation
  pt.init()  # 或 pt.init(tail=False)
  print("PyTerrier started =", pt.started())


Cell 2 — ColBERT pipeline: environment & constants

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
# ========== Environment & Pipeline Setup ==========
import os, re, torch, random, warnings, pandas as pd
from datasets import load_dataset
import pyterrier as pt
from colbert import Indexer
from colbert.infra import Run, RunConfig, ColBERTConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from tqdm import tqdm
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from bert_score import score as bert_score
from colbert.modeling.checkpoint import Checkpoint
from colbert.modeling.colbert import colbert_score

# Cache goes to data disk
os.environ["HF_HOME"] = "/gz-data/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/gz-data/hf_cache"
os.environ["HF_DATASETS_CACHE"] = "/gz-data/hf_cache"
os.environ["TORCH_HOME"] = "/gz-data/torch_cache"
os.environ["PYTERRIER_CACHE"] = "/gz-data/pyterrier_cache"
# Java & temp directory also go to data disk
os.environ["TMPDIR"] = "/gz-data/tmp"
os.environ["JAVA_TOOL_OPTIONS"] = "-Djava.io.tmpdir=/gz-data/tmp"

# Ensure directories exist
for p in ["/gz-data/hf_cache", "/gz-data/torch_cache", "/gz-data/pyterrier_cache", "/gz-data/tmp"]:
    os.makedirs(p, exist_ok=True)

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

# ★ ROOT on data disk to avoid writing to system disk
ROOT = "/gz-data/nlquad_colbert"

BM25_TOPK = 5
MAXLEN, GEN_MAXLEN = 250, 384
MIN_SPLIT_LEN, DESIRED_SEG_LEN = 1000, 250
COLBERT_MODEL = "/gz-data/models/colbertv2.0"
GEN_MODEL = "deepseek-ai/deepseek-llm-7b-chat"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

import pyterrier as pt
# Initialize Java with new API (safe for multiple calls)
try:
    if not pt.java.started():
        pt.java.init()
except Exception:
    # Some versions start automatically, ignore
    pass

os.makedirs(ROOT, exist_ok=True)

Cell 3 — Stage-1 (ColBERT): same-document Top-K → re-rank → per-sample metrics

In [5]:
# ========== Environment & Pipeline Setup ==========
import os, re, torch, random, warnings, pandas as pd
from datasets import load_dataset
import pyterrier as pt
from colbert import Indexer
from colbert.infra import Run, RunConfig, ColBERTConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from tqdm import tqdm
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from bert_score import score as bert_score
from colbert.modeling.checkpoint import Checkpoint
from colbert.modeling.colbert import colbert_score

warnings.filterwarnings("ignore")

# ====== Cache & Path Config ======
os.environ["HF_HOME"] = "/gz-data/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/gz-data/hf_cache"
os.environ["HF_DATASETS_CACHE"] = "/gz-data/hf_cache"
os.makedirs("/gz-data/hf_cache", exist_ok=True)
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

ROOT = "/gz-data/nlquad_colbert"
os.makedirs(ROOT, exist_ok=True)
BM25_TOPK=5
MAXLEN, GEN_MAXLEN = 250, 384
MIN_SPLIT_LEN, DESIRED_SEG_LEN = 1000, 250
COLBERT_MODEL = "/gz-data/models/colbertv2.0"
GEN_MODEL = "/gz-data/models/deepseek-llm-7b-chat"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

random.seed(42); torch.manual_seed(42)
if not pt.started():
    pt.init()

# ===== Load Data & Preprocess =====
print(">>> Loading NLQuAD...")
dataset = load_dataset("LLukas22/NLQuAD", split="test")
records = []
for art in dataset:
    for para in art["paragraphs"]:
        ctx = para["context"]
        if len(ctx.split()) >= MIN_SPLIT_LEN:
            cid = para["qas"][0]["id"].split("_")[0]
            for qa in para["qas"]:
                if qa["answers"]:
                    records.append({
                        "context_id": cid,
                        "context": ctx,
                        "question": qa["question"],
                        "answer": qa["answers"][0]["text"],
                        "qa_id": qa["id"]
                    })
df = pd.DataFrame(records)
df = df.sort_values(["context_id", "qa_id"]).reset_index(drop=True)

# ===== Paragraph Splitting =====
def semantic_split(text, max_words=DESIRED_SEG_LEN):
    sents = re.split(r"(?<=[.!?])\s+", text.strip())
    buf, out = [], []
    for s in sents:
        if s.strip():
            buf.append(s)
            if len(" ".join(buf).split()) >= max_words:
                out.append(" ".join(buf))
                buf = []
    if buf: out.append(" ".join(buf))
    return out

para_records = []
for cid, grp in df.groupby("context_id"):
    context = grp["context"].iloc[0]
    for i, seg in enumerate(semantic_split(context)):
        para_records.append({"docno": f"{cid}_{i}", "text": seg, "cid": cid})
para_df = pd.DataFrame(para_records)
para_df["docid"] = para_df.index.astype(str)
docno_to_docid = dict(zip(para_df["docno"], para_df["docid"]))
para_text_map = dict(zip(para_df["docid"], para_df["text"]))

def clean_query(q): 
    return re.sub(r"[^A-Za-z0-9 ]", "", q.strip())

# ===== Read eligible qids (S5 unified set) =====
ELIGIBLE_CSV = f"{ROOT}/eligible_qids_top5.csv"
eligible = None
if os.path.exists(ELIGIBLE_CSV):
    try:
        eligible = set(pd.read_csv(ELIGIBLE_CSV)["qa_id"].astype(str))
        print(f">>> Loaded eligible S5 set: {len(eligible)} qids")
    except Exception as e:
        print(f"⚠️ Failed to load {ELIGIBLE_CSV}: {e}. Will fallback to >=5 check.")

# ===== Build ColBERT Index =====
print(">>> Building ColBERT Index...")
with Run().context(RunConfig(nranks=1, root=ROOT, experiment="nlquad", index_root=ROOT)):
    conf = ColBERTConfig(nbits=2, root=ROOT, doc_maxlen=MAXLEN)
    idxer = Indexer(checkpoint=COLBERT_MODEL, config=conf)
    idxer.index(name="nlquad", collection=para_df["text"].tolist(), overwrite="reuse")
    checkpoint = Checkpoint(COLBERT_MODEL, colbert_config=conf)

# ===== BM25 Index =====
print(">>> Building BM25 Index...")
index_ref = f"{ROOT}/pt_index"
if not os.path.exists(index_ref):
    index_ref = pt.IterDictIndexer(f"{ROOT}/pt_index", meta={"docno":44,"text":60000}, overwrite=True).index(para_df.to_dict("records"))
index = pt.IndexFactory.of(index_ref)
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

# ===== Load LLM =====
print(">>> Loading LLM...")
tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL, trust_remote_code=True)
quant_cfg = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    GEN_MODEL,
    quantization_config=quant_cfg,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)
torch.backends.cuda.matmul.allow_tf32 = True
model = torch.compile(model, mode="reduce-overhead")
print(">>> Model ready!")

# ===== Prompt Template =====
def build_prompt(question, context):
    return f"""You are an AI assistant. Based on the context, answer the question in the following format:

Context: {context}

Question: {question}

Final Answer:"""

# ===== Dynamic Batch Inference =====
def batch_generate_dynamic(prompts, initial_bs=16, max_bs=64):
    results = []
    i, bs = 0, initial_bs
    last_safe_bs = initial_bs
    while i < len(prompts):
        batch = prompts[i:i+bs]
        try:
            inputs = tokenizer(batch, return_tensors="pt", padding=True,
                               truncation=True, max_length=2048).to(DEVICE)
            outputs = model.generate(
                **inputs,
                max_new_tokens=GEN_MAXLEN,
                min_new_tokens=256,
                length_penalty=1.2,
                pad_token_id=tokenizer.eos_token_id
            )
            decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            results.extend([d.strip() for d in decoded])
            i += bs
            if bs < max_bs:
                last_safe_bs = bs
                bs = min(bs * 2, max_bs)
        except RuntimeError as e:
            if "CUDA out of memory" in str(e):
                torch.cuda.empty_cache()
                print(f"⚠️ OOM at bs={bs}, rollback to {last_safe_bs}")
                bs = max(last_safe_bs // 2, 1)
                if bs < 1:
                    print("❌ Even bs=1 failed, aborting.")
                    break
            else:
                raise
    return results

# ===== Metric Calculation =====
def compute_metrics(gens, refs):
    scorer = rouge_scorer.RougeScorer(["rouge1","rouge2","rougeL"], use_stemmer=True)
    _, _, bert_f1 = bert_score(gens, refs, lang="en", model_type="roberta-large", verbose=False)
    res = []
    for g, r, b in zip(gens, refs, bert_f1):
        scr = scorer.score(r, g)
        bleu = sentence_bleu([r.split()], g.split())
        res.append({
            "rouge1": round(scr["rouge1"].fmeasure, 4),
            "rouge2": round(scr["rouge2"].fmeasure, 4),
            "rougeL": round(scr["rougeL"].fmeasure, 4),
            "bleu": round(bleu, 4),
            "bertscore": round(b.item(), 4)
        })
    return res

# ====== Answer Cleaning ======
def clean_answer(text):
    text = text.strip()
    if "Final Answer:" in text:
        return text.split("Final Answer:", 1)[1].strip()
    paragraphs = re.split(r"\n\s*\n", text)
    cleaned_paras = []
    for para in paragraphs:
        first_line = para.strip().splitlines()[0] if para.strip() else ""
        if first_line.startswith(("Question:", "Context:", "RULES:", "Answer the question")):
            continue
        cleaned_paras.append(para.strip())
    return "\n\n".join(p for p in cleaned_paras if p)

# ===== Top-K Configuration =====
CTX_TOPK_LIST = [1, 2, 3, 4, 5]
topk_list = [(k, f"top{k}") for k in CTX_TOPK_LIST]
MAX_REQUIRED_K = max(CTX_TOPK_LIST)  # 5

# ===== Main Loop =====
results = []

for cid, grp in tqdm(df.groupby("context_id")):
    batch_prompts, meta = [], []
    for _, row in grp.iterrows():
        q, gt, qid = row["question"], row["answer"], str(row["qa_id"])

        if eligible is not None and qid not in eligible:
            continue

        # --- BM25 filter by cid ---
        bm25_in = pd.DataFrame({"qid": ["0"], "query": [clean_query(q)]})
        out = bm25.transform(bm25_in)
        out = out[out["docno"].str.startswith(cid)].head(BM25_TOPK)
        ids = [int(docno_to_docid[d]) for d in out["docno"] if d in docno_to_docid]

        # Uniform sample requirement (ensure top5 available)
        if eligible is None and len(ids) < MAX_REQUIRED_K:
            continue

        paras = [para_text_map[str(i)] for i in ids]

        # --- ColBERT reranking ---
        Qt = checkpoint.queryFromText([q])
        Dt = checkpoint.docFromText(paras)
        Dm = torch.ones(Dt.size()[:2], dtype=torch.long, device=Dt.device)
        scores = colbert_score(Qt, Dt, Dm).tolist()
        pairs = sorted(zip(ids, paras, scores), key=lambda x: x[2], reverse=True)

        if len(pairs) < MAX_REQUIRED_K:
            continue

        # --- Slice based on same reranked list, record topk's "rank1/2/.." text ---
        for topk, tag in topk_list:
            _, top_ps, _ = zip(*pairs[:topk])  # Top K paragraph texts sorted by ColBERT
            topk_ranked_context = "\n".join([f"rank{i+1}: {top_ps[i]}" for i in range(len(top_ps))])

            numbered = [f"Paragraph {i+1}: {p}" for i, p in enumerate(top_ps)]

            # Run all topk sequentially; add reversed for topk>=2; add shuffled for topk>=3
            strategies = [("sequential", numbered)]
            if topk >= 2 and len(numbered) > 1:
                strategies += [("reversed", list(reversed(numbered)))]
            if topk >= 3 and len(numbered) > 1:
                strategies += [("shuffled", random.sample(numbered, len(numbered)))]

            for strat_name, context in strategies:
                batch_prompts.append(build_prompt(q, "\n".join(context)))
                # Include topk_ranked_context in results
                meta.append((f"{strat_name}_{tag}", gt, row["qa_id"], q, topk, topk_ranked_context, cid))

    if not batch_prompts:
        continue

    answers = batch_generate_dynamic(batch_prompts, initial_bs=16, max_bs=64)
    for ans, (strat, gt, qid, q, topk_val, topk_ranked_ctx, cid_val) in zip(answers, meta):
        if not ans:
            continue
        ans_clean = clean_answer(ans)
        m = compute_metrics([ans_clean], [gt])[0]
        results.append({
            "cid": cid_val,
            "qid": qid,
            "question": q,
            "topk": topk_val,
            "topk_ranked_context": topk_ranked_ctx,  # << Added: ColBERT-ranked rank1/2/... text
            "strategy": strat,
            "answer_clean": ans.strip(),
            "answer_for_eval": ans_clean,
            **m
        })

# ===== Save Results =====
df_res = pd.DataFrame(results)

# Fixed column order
cols_order = [
    "cid","qid","question",
    "topk","topk_ranked_context","strategy",
    "answer_clean","answer_for_eval",
    "rouge1","rouge2","rougeL","bleu","bertscore"
]
df_res = df_res[cols_order]

# Convert \n in topk_ranked_context to actual newlines (will auto-wrap in Excel/tables)
df_res["topk_ranked_context"] = df_res["topk_ranked_context"].apply(lambda x: x.replace("\n", "\r\n"))

df_res.to_csv(f"{ROOT}/final_results_stage1.csv", index=False)
avg_m = df_res.groupby("strategy")[["rouge1","rouge2","rougeL","bleu","bertscore"]].mean().reset_index()
avg_m.to_csv(f"{ROOT}/average_metrics_stage1.csv", index=False)

print("✅ Done. Added reversed for top-2 and `topk_ranked_context` column (rank1/2/... by ColBERT).")

>>> Loading NLQuAD...
>>> Building ColBERT Index...


[Aug 13, 17:26:36] #> Note: Output directory /gz-data/nlquad_colbert/nlquad already exists


>>> Building BM25 Index...
>>> Loading LLM...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

>>> Model ready!


  0%|          | 1/235 [00:00<00:39,  5.93it/s]


#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: Where are Syrian migrants trying to go?, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  2073,  2024,  9042, 16836,  2667,  2000,  2175,  1029,
          102,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103], device='cuda:0')
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


  8%|▊         | 19/235 [38:00<6:14:32, 104.04s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoi

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


 13%|█▎        | 31/235 [1:07:40<7:58:22, 140.70s/it]

⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


 16%|█▌        | 37/235 [1:40:00<9:11:32, 167.13s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkp

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


 25%|██▍       | 58/235 [2:04:10<3:44:52, 76.23s/it]

⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


 26%|██▋       | 62/235 [2:08:04<2:49:20, 58.73s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpo

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


 28%|██▊       | 66/235 [2:13:50<3:18:28, 70.47s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpo

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


 44%|████▍     | 104/235 [3:23:20<1:46:04, 48.58s/it]

⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


 49%|████▊     | 114/235 [3:33:04<1:20:26, 39.89s/it]

⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


 54%|█████▍    | 128/235 [3:49:21<1:15:10, 42.15s/it]

⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


 56%|█████▌    | 132/235 [4:01:04<2:43:55, 95.49s/it]

⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


 58%|█████▊    | 136/235 [4:10:51<2:32:36, 92.49s/it]

⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


 63%|██████▎   | 148/235 [4:45:05<2:46:35, 114.89s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model check

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


 66%|██████▌   | 154/235 [4:50:51<1:39:25, 73.65s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkp

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


 69%|██████▉   | 162/235 [4:58:21<54:02, 44.42s/it]

⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


 73%|███████▎  | 172/235 [5:22:11<1:46:15, 101.20s/it]

⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


 76%|███████▌  | 178/235 [5:33:41<1:35:06, 100.12s/it]

⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


 78%|███████▊  | 183/235 [5:41:05<1:18:28, 90.55s/it]

⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

⚠️ OOM at bs=16, rollback to 16
⚠️ OOM at bs=16, rollback to 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

✅ Done. Added reversed for top-2 and `topk_ranked_context` column (rank1/2/... by ColBERT).


Cell 4 — Stage-2 (LLM judging): coherence & informativeness → per-sample scores

In [7]:
# ========== Stage 2: LLM-based Scoring (Coherence / Informativeness) ==========
import os, re, gc, torch, warnings
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

warnings.filterwarnings("ignore")

# ====== Path Configuration ======
ROOT = "/gz-data/nlquad_colbert"
RESULT_FILE = f"{ROOT}/final_results_stage1.csv"      # Stage 1 output file
STAGE2_FILE = f"{ROOT}/final_results_stage2.csv"      # Stage 2 scoring results save path
LLM_SCORE_MODEL = "/gz-data/models/qwen2.5"          # Qwen scoring model

# ====== GPU Memory Limit Setup ======
def auto_max_memory():
    """Set (total memory - 1GiB) limit per GPU to avoid memory overflow"""
    assert torch.cuda.is_available(), "CUDA not available."
    mem = {}
    for i in range(torch.cuda.device_count()):
        total_gb = int(torch.cuda.get_device_properties(i).total_memory / (1024**3))
        mem[i] = f"{max(total_gb - 1, 1)}GiB"
    return mem

# ====== Load NLQuAD Gold Answers ======
print(">>> Loading NLQuAD to recover gold answers ...")
dataset = load_dataset("LLukas22/NLQuAD", split="test")
gold_map = {}
for article in dataset:
    for para in article["paragraphs"]:
        for qa in para["qas"]:
            if qa["answers"]:
                gold_map[qa["id"]] = qa["answers"][0]["text"]

# ====== Load Stage 1 Results ======
df_res = pd.read_csv(RESULT_FILE)
df_res["gold_answer"] = df_res["qid"].map(gold_map)

# ====== Load Qwen Scoring Model ======
print(">>> Loading scorer (Qwen2.5-7B-Instruct) on multi-GPU ...")
tok_score = AutoTokenizer.from_pretrained(LLM_SCORE_MODEL)
score_model = AutoModelForCausalLM.from_pretrained(
    LLM_SCORE_MODEL,
    torch_dtype=torch.float16,
    device_map="balanced",          # Balanced multi-GPU
    max_memory=auto_max_memory()
)
print(">>> Scorer ready!")

# ====== Construct Prompt ======
def build_score_prompt(question, gold, generated):
    return f"""
You are an expert evaluator. Based on the question, gold answer, and generated answer, provide two scores:
1. Coherence (1-5): Logical clarity and fluency of the generated answer.
2. Informativeness (1-5): Amount of correct and relevant information compared to the gold answer.

Question: {question}
Gold Answer: {gold}
Generated Answer: {generated}

IMPORTANT:
- Do NOT provide any explanation or extra text.
- Only output two integers, separated by a comma, in the LAST line.
Format:
x,y

Now output the result:
"""

# ====== Parse LLM Output ======
def parse_llm_output(text):
    lines = text.strip().splitlines()
    for line in reversed(lines):
        if re.match(r"^\s*[1-5]\s*,\s*[1-5]\s*$", line):
            a, b = [int(x.strip()) for x in line.split(",")]
            return {"coherence": a, "informativeness": b}
    nums = re.findall(r"\b[1-5]\b", text)
    if len(nums) >= 2:
        return {"coherence": int(nums[-2]), "informativeness": int(nums[-1])}
    return {"coherence": 0, "informativeness": 0}

# ====== Batch Scoring ======
def batch_generate_scores(prompts, initial_bs=32, min_bs=1, max_new_tokens=64):
    """
    Batch generate Qwen scores with automatic batch size fallback to prevent OOM.
    initial_bs: Initial batch size
    min_bs: Minimum batch size
    max_new_tokens: Maximum tokens generated per sample
    """
    results = []
    i = 0
    bs = initial_bs

    while i < len(prompts):
        batch = prompts[i:i+bs]
        try:
            inputs = tok_score(batch, return_tensors="pt", padding=True, truncation=True).to(score_model.device)
            with torch.inference_mode():
                outputs = score_model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=False,
                    pad_token_id=tok_score.eos_token_id
                )
            decoded = tok_score.batch_decode(outputs, skip_special_tokens=True)
            results.extend(decoded)
            i += bs  # Proceed on success
            if bs < initial_bs:  # If fallback occurred, try recovering batch size
                bs = min(bs * 2, initial_bs)
        except RuntimeError as e:
            if "CUDA out of memory" in str(e):
                torch.cuda.empty_cache()
                print(f"⚠️ OOM at batch_size={bs}, reducing to {max(bs // 2, min_bs)}")
                bs = max(bs // 2, min_bs)
                if bs < 1:
                    raise RuntimeError("❌ Even batch_size=1 failed. Aborting Stage 2.")
            else:
                raise
    return results

# ====== Stage 2 Main Loop ======
print(">>> Stage 2 (LLM-based scoring) running ...")
BATCH_SIZE = 8
prompts, meta, results_s2 = [], [], []

for _, row in tqdm(df_res.iterrows(), total=len(df_res)):
    prompt = build_score_prompt(row["question"], row["gold_answer"], row["answer_for_eval"])
    prompts.append(prompt)
    meta.append(row.to_dict())

    if len(prompts) >= BATCH_SIZE:
        outs = batch_generate_scores(prompts, initial_bs=BATCH_SIZE)
        for out, row_data in zip(outs, meta):
            sc = parse_llm_output(out)
            row_data.update(sc)
            results_s2.append(row_data)
        prompts, meta = [], []

# Flush remaining data
if prompts:
    outs = batch_generate_scores(prompts, initial_bs=BATCH_SIZE)
    for out, row_data in zip(outs, meta):
        sc = parse_llm_output(out)
        row_data.update(sc)
        results_s2.append(row_data)

# ====== Save Stage 2 Results ======
pd.DataFrame(results_s2).to_csv(STAGE2_FILE, index=False)
print(f"✅ Stage 2 completed. Results saved at {STAGE2_FILE}")

>>> Loading NLQuAD to recover gold answers ...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


>>> Loading scorer (Qwen2.5-7B-Instruct) on multi-GPU ...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

>>> Scorer ready!
>>> Stage 2 (LLM-based scoring) running ...


 32%|███▏      | 896/2796 [05:26<11:12,  2.82it/s]

⚠️ OOM at batch_size=8, reducing to 4


 32%|███▏      | 904/2796 [05:33<15:29,  2.04it/s]

⚠️ OOM at batch_size=8, reducing to 4


100%|██████████| 2796/2796 [17:10<00:00,  2.71it/s]


✅ Stage 2 completed. Results saved at /gz-data/nlquad_colbert/final_results_stage2.csv


Cell 5 — Merge baseline vs Top-K (strictly paired on qid)

In [9]:
import pandas as pd
import re
import os

ROOT = "/gz-data/nlquad_colbert"
BASELINE_STAGE2 = f"{ROOT}/final_results_stage2-baseline.csv"
TOPK_STAGE2     = f"{ROOT}/final_results_stage2.csv"
MERGED_FILE     = f"{ROOT}/final_results_stage2_merged.csv"

# 1) Read
b = pd.read_csv(BASELINE_STAGE2, dtype={"qid": str, "cid": str})
t = pd.read_csv(TOPK_STAGE2,     dtype={"qid": str, "cid": str})

# 2) Keep only qids from baseline that also appear in topK
keep_qids = set(t["qid"].astype(str))
b = b[b["qid"].astype(str).isin(keep_qids)].copy()

# 3) Column union & alignment
all_cols = list(dict.fromkeys(list(b.columns) + list(t.columns)))  # Preserve original order
for col in all_cols:
    if col not in b.columns:
        b[col] = pd.NA
    if col not in t.columns:
        t[col] = pd.NA

# 4) Fill missing fields
if "strategy" not in b.columns:
    b["strategy"] = "baseline"
if "topk" not in b.columns:
    b["topk"] = pd.NA
if "topk" not in t.columns:
    t["topk"] = t["strategy"].str.extract(r'top(\d+)').astype(float)

# 5) Merge
df = pd.concat([b[all_cols], t[all_cols]], ignore_index=True)

# 6) Sort
def parse_topk_from_any(row):
    k = row.get("topk", pd.NA)
    if pd.isna(k):
        m = re.search(r"top(\d+)", str(row.get("strategy", "")))
        return int(m.group(1)) if m else None
    try:
        return int(k)
    except Exception:
        return None

def strat_rank(row):
    s = str(row.get("strategy", ""))
    if s == "baseline":
        return (0, 0)

    k = parse_topk_from_any(row)
    if k is None:
        return (999, 0)

    base = 1 + (k - 1) * 3
    if "sequential" in s:
        off = 0
    elif "reversed" in s:
        off = 1
    elif "shuffled" in s:
        off = 2
    else:
        off = 3
    return (base + off, 0)

df["qid"] = df["qid"].astype(str)
df["__rank"] = df.apply(strat_rank, axis=1)
df = df.sort_values(by=["qid", "__rank"]).drop(columns="__rank")

# 7) Save
df.to_csv(MERGED_FILE, index=False, encoding="utf-8")
print(f"✅ Merge completed, saved to: {MERGED_FILE}")
print(df.head(10))

✅ 合并完成，保存到: /gz-data/nlquad_colbert/final_results_stage2_merged.csv
      cid     qid                                 question         strategy  \
0    1018  1018_0  Where are Syrian migrants trying to go?         baseline   
233  1018  1018_0  Where are Syrian migrants trying to go?  sequential_top1   
234  1018  1018_0  Where are Syrian migrants trying to go?  sequential_top2   
235  1018  1018_0  Where are Syrian migrants trying to go?    reversed_top2   
236  1018  1018_0  Where are Syrian migrants trying to go?  sequential_top3   
237  1018  1018_0  Where are Syrian migrants trying to go?    reversed_top3   
238  1018  1018_0  Where are Syrian migrants trying to go?    shuffled_top3   
239  1018  1018_0  Where are Syrian migrants trying to go?  sequential_top4   
240  1018  1018_0  Where are Syrian migrants trying to go?    reversed_top4   
241  1018  1018_0  Where are Syrian migrants trying to go?    shuffled_top4   

                                          answer_clean  \
0   

Cell 11 — Weighted global mean across shards (BM25-only)

In [12]:
import pandas as pd
import numpy as np

# ===== 1) Shard Files and Sample Counts =====
files = [
    "average_metrics_stage1_bm25only_shard0of2.csv",
    "average_metrics_stage1_bm25only_shard1of2_part0of2.csv",
    "average_metrics_stage1_bm25only_shard1of2_part1of4.csv",
    "average_metrics_stage1_bm25only_shard1of2_part3of4.csv",
]
weights = [118, 59, 29, 29]  # Sample counts for each file

# Only compute these 5 standard metrics
METRICS = ["rouge1", "rouge2", "rougeL", "bleu", "bertscore"]

# ===== 2) Read and Concatenate (with Weight Column) =====
dfs = []
for f, w in zip(files, weights):
    df = pd.read_csv(f)
    # Keep only required columns; fill missing columns
    for m in METRICS:
        if m not in df.columns:
            df[m] = np.nan
    # Convert to numeric
    for m in METRICS:
        df[m] = pd.to_numeric(df[m], errors="coerce")
    # Ensure 'strategy' column exists
    if "strategy" not in df.columns:
        raise ValueError(f"{f} lacks 'strategy' column")
    df["weight"] = w
    dfs.append(df[["strategy"] + METRICS + ["weight"]])

all_df = pd.concat(dfs, ignore_index=True)

# ===== 3) Calculate Weighted Global Mean by Strategy =====
# Numerator: sum(weight * metric); Denominator: sum(weight). If a strategy is missing in a shard, it only weights existing shards.
den = all_df.groupby("strategy")["weight"].sum()

num = {}
for m in METRICS:
    num[m] = (all_df[m] * all_df["weight"]).groupby(all_df["strategy"]).sum()

global_avg = pd.DataFrame({m: num[m] / den for m in METRICS})
global_avg = global_avg.reset_index().sort_values("strategy")

print("Global weighted average for each strategy:")
print(global_avg)

# ===== 4) Save =====
out_path = "average_metrics_stage1_bm25only_GLOBAL_weighted.csv"
global_avg.to_csv(out_path, index=False)
print(f"Saved: {out_path}")

每个 strategy 的全局加权平均：
           strategy    rouge1    rouge2    rougeL      bleu  bertscore
0     reversed_top2  0.392171  0.173500  0.231797  0.099525   0.850005
1     reversed_top3  0.416581  0.196240  0.245911  0.114723   0.860794
2     reversed_top4  0.413218  0.191645  0.247533  0.111949   0.858227
3     reversed_top5  0.415737  0.195228  0.250364  0.116019   0.858161
4   sequential_top1  0.368549  0.140299  0.214201  0.073202   0.834958
5   sequential_top2  0.383072  0.160384  0.225492  0.088783   0.847785
6   sequential_top3  0.404833  0.190940  0.246829  0.114810   0.860038
7   sequential_top4  0.409974  0.191287  0.245108  0.113799   0.858077
8   sequential_top5  0.397751  0.173905  0.233842  0.099162   0.856954
9     shuffled_top3  0.408437  0.190075  0.244006  0.114828   0.860800
10    shuffled_top4  0.414323  0.197391  0.251090  0.115628   0.859264
11    shuffled_top5  0.408344  0.184700  0.243492  0.107811   0.856123
已保存：average_metrics_stage1_bm25only_GLOBAL_weighted.csv


Cell 12 — BM25-only global heatmaps (raw-tight & delta)

In [15]:
# -*- coding: utf-8 -*-
# Plot heatmaps from BM25-only global weighted mean table (raw values, no normalization)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import re

GLOBAL_FILE = "average_metrics_stage1_bm25only_GLOBAL_weighted.csv"  # Input file
SAVE_DIR = Path("bm25only_heatmaps")
SAVE_DIR.mkdir(exist_ok=True)

METRICS = ["rouge1", "rouge2", "rougeL", "bleu", "bertscore"]
DELTA_RANGE = 0.05   # Delta heatmap color axis range: ±0.05; for more sensitivity, use 0.03 or 0.02

def to_num(df, cols):
    for c in cols:
        if c not in df.columns:
            df[c] = np.nan
        df[c] = pd.to_numeric(df[c], errors="coerce")
    return df

def sort_by_strategy_topk(idx):
    order = {"baseline":0, "sequential":1, "reversed":2, "shuffled":3}
    def key(lbl):
        s = str(lbl).lower()
        m = re.match(r"^(baseline|sequential|reversed|shuffled)(?:_top(\d+))?$", s)
        if not m: return (99, 999, s)
        base = order.get(m.group(1), 98)
        tk = int(m.group(2)) if m.group(2) else 0
        return (base, tk, s)
    return sorted(idx, key=key)

def plot_heatmap(arr, row_labels, col_labels, out_path, title, vmin=None, vmax=None, fmt=".3f"):
    n_rows, n_cols = arr.shape
    fig_h = max(4.5, n_rows * 0.7)
    fig_w = max(6.5, n_cols * 1.2)
    fig, ax = plt.subplots(figsize=(fig_w, fig_h))
    im = ax.imshow(arr, aspect="auto", vmin=vmin, vmax=vmax)
    if n_rows == 1: ax.set_ylim(-0.5, 0.5)
    if n_cols == 1: ax.set_xlim(-0.5, 0.5)
    ax.set_xticks(range(n_cols)); ax.set_xticklabels(col_labels, rotation=45, ha="right")
    ax.set_yticks(range(n_rows)); ax.set_yticklabels(row_labels)
    for i in range(n_rows):
        for j in range(n_cols):
            v = arr[i, j]
            if not np.isnan(v):
                ax.text(j, i, format(v, fmt), ha="center", va="center")
    fig.colorbar(im, ax=ax).set_label("Score")
    ax.set_title(title)
    plt.tight_layout()
    fig.savefig(out_path, dpi=160)
    plt.close(fig)

# Read global table
g = pd.read_csv(GLOBAL_FILE)
assert "strategy" in g.columns, "Global table must include 'strategy' column"
g = to_num(g, METRICS)
g = g.set_index("strategy")
g = g.loc[sort_by_strategy_topk(g.index)]

# 1) Full table: raw_tight (tight color scale)
arr = g[METRICS].values.astype(float)
mn, mx = np.nanmin(arr), np.nanmax(arr)
margin = max(0.005, (mx - mn) * 0.05)
plot_heatmap(arr, g.index.tolist(), METRICS,
             str(SAVE_DIR / "bm25only_global_raw_tight.png"),
             "BM25-only Global (raw, tight scale)",
             vmin=max(0.0, mn - margin), vmax=min(1.0, mx + margin))

# 2) Full table: delta (difference from column mean, centered at 0)
col_means = np.nanmean(arr, axis=0, keepdims=True)
delta = arr - col_means
plot_heatmap(delta, g.index.tolist(), METRICS,
             str(SAVE_DIR / f"bm25only_global_delta_pm{int(DELTA_RANGE*100):02d}.png"),
             f"BM25-only Global (delta vs column mean, ±{DELTA_RANGE})",
             vmin=-DELTA_RANGE, vmax=+DELTA_RANGE, fmt="+.3f")

# 3) Per-strategy subplots (skip if no rows for a strategy)
for prefix in ["sequential", "reversed", "shuffled", "baseline"]:
    rows = [i for i in g.index if str(i).lower().startswith(prefix)]
    if not rows:
        continue
    sub = g.loc[rows, METRICS]
    arr = sub.values.astype(float)
    # raw_tight
    mn, mx = np.nanmin(arr), np.nanmax(arr)
    margin = max(0.005, (mx - mn) * 0.05)
    plot_heatmap(arr, sub.index.tolist(), METRICS,
                 str(SAVE_DIR / f"bm25only_{prefix}_raw_tight.png"),
                 f"BM25-only {prefix.capitalize()} (raw, tight scale)",
                 vmin=max(0.0, mn - margin), vmax=min(1.0, mx + margin))
    # delta
    col_means = np.nanmean(arr, axis=0, keepdims=True)
    delta = arr - col_means
    plot_heatmap(delta, sub.index.tolist(), METRICS,
                 str(SAVE_DIR / f"bm25only_{prefix}_delta_pm{int(DELTA_RANGE*100):02d}.png"),
                 f"BM25-only {prefix.capitalize()} (delta vs column mean, ±{DELTA_RANGE})",
                 vmin=-DELTA_RANGE, vmax=+DELTA_RANGE, fmt="+.3f")

print(f"Saved to {SAVE_DIR}: bm25only_global_raw_tight.png, bm25only_global_delta_*.png, and per-strategy subplots.")

已输出到 bm25only_heatmaps ：bm25only_global_raw_tight.png、bm25only_global_delta_*.png 以及分策略子图。


Cell 13 — Zoomed heatmaps for Stage-1 (all strategies)

In [14]:
# -*- coding: utf-8 -*-
# Enhanced resolution heatmaps: raw_tight (tight color scale) and delta (difference from column mean)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import re

FILE_MAIN = "average_metrics_stage1.csv"           # Contains sequential_* / reversed_* / shuffled_*
FILE_BASE = "average_metrics_stage1-baseline.csv"  # Baseline
SAVE_DIR  = Path("heatmaps_stage1_zoom")

METRICS = ["rouge1", "rouge2", "rougeL", "bleu", "bertscore"]
DELTA_RANGE = 0.05   # Delta heatmap color axis range: ±0.05; for finer detail, set to 0.02 or 0.03

Path(SAVE_DIR).mkdir(exist_ok=True)

def to_numeric(df, metrics):
    for c in metrics:
        if c not in df.columns:
            df[c] = np.nan
        df[c] = pd.to_numeric(df[c], errors="coerce")
    return df

def plot_heatmap(arr, row_labels, col_labels, out_path, title, vmin=None, vmax=None, fmt=".3f"):
    n_rows, n_cols = arr.shape
    fig_h = max(4.5, n_rows * 0.7)
    fig_w = max(6.5, n_cols * 1.2)
    fig, ax = plt.subplots(figsize=(fig_w, fig_h))
    im = ax.imshow(arr, aspect="auto", vmin=vmin, vmax=vmax)  # Default cmap
    if n_rows == 1: ax.set_ylim(-0.5, 0.5)
    if n_cols == 1: ax.set_xlim(-0.5, 0.5)
    ax.set_xticks(range(n_cols)); ax.set_xticklabels(col_labels, rotation=45, ha="right")
    ax.set_yticks(range(n_rows)); ax.set_yticklabels(row_labels)
    for i in range(n_rows):
        for j in range(n_cols):
            v = arr[i, j]
            if not np.isnan(v):
                ax.text(j, i, format(v, fmt), ha="center", va="center")
    cb = fig.colorbar(im, ax=ax)
    cb.set_label("Score")
    ax.set_title(title)
    plt.tight_layout()
    fig.savefig(out_path, dpi=160)
    plt.close(fig)

def extract_block(df, prefix):
    # prefix in ["sequential","reversed","shuffled"]
    mask = df["strategy"].str.lower().str.startswith(prefix + "_")
    block = df.loc[mask, ["strategy"] + METRICS].copy()
    block["__topk__"] = block["strategy"].str.extract(r"_top(\d+)", expand=False).astype(float)
    block = block.sort_values(by="__topk__", kind="mergesort")
    return block

# Read data
main = pd.read_csv(FILE_MAIN)
base = pd.read_csv(FILE_BASE)
main = to_numeric(main, METRICS)
base = to_numeric(base, METRICS)

# Baseline (typically one row)
dfb = base[base["strategy"].str.lower().eq("baseline")].copy()
if not dfb.empty:
    arr = dfb[METRICS].values.astype(float)
    # raw_tight: Use min/max of this plot with small margin
    mn, mx = np.nanmin(arr), np.nanmax(arr)
    margin = max(0.005, (mx - mn) * 0.05)
    plot_heatmap(arr, ["baseline"] * len(dfb), METRICS,
                 str(Path(SAVE_DIR, "baseline_raw_tight.png")),
                 f"Baseline (raw, tight scale)",
                 vmin=max(0.0, mn - margin), vmax=min(1.0, mx + margin))
    # Delta for baseline is meaningless (single row), so skip

# Three strategies: Plot raw_tight and delta
for prefix in ["sequential", "reversed", "shuffled"]:
    blk = extract_block(main, prefix)
    if blk.empty:
        print(f"[Warn] No {prefix}_* found")
        continue

    row_labels = blk["strategy"].tolist()
    arr = blk[METRICS].values.astype(float)

    # 1) raw_tight: Tight color scale based on subtable min/max (no value modification)
    mn, mx = np.nanmin(arr), np.nanmax(arr)
    margin = max(0.005, (mx - mn) * 0.05)  # 5% margin to avoid edge clipping
    plot_heatmap(arr, row_labels, METRICS,
                 str(Path(SAVE_DIR, f"{prefix}_raw_tight.png")),
                 f"{prefix.capitalize()} (raw, tight scale)",
                 vmin=max(0.0, mn - margin), vmax=min(1.0, mx + margin))

    # 2) delta: Difference from column mean (actual difference, color centered at 0)
    col_means = np.nanmean(arr, axis=0, keepdims=True)
    delta = arr - col_means
    plot_heatmap(delta, row_labels, METRICS,
                 str(Path(SAVE_DIR, f"{prefix}_delta_pm{int(DELTA_RANGE*100):02d}.png")),
                 f"{prefix.capitalize()} (delta vs column mean, ±{DELTA_RANGE})",
                 vmin=-DELTA_RANGE, vmax=+DELTA_RANGE, fmt="+.3f")

print(f"Done. See: {SAVE_DIR}/*_raw_tight.png and *_delta_*.png")

Done. See: heatmaps_stage1_zoom/*_raw_tight.png 以及 *_delta_*.png
