In [1]:
# ================== Notebook B - Config (GPU:1, shard=1) ==================
import os, warnings, torch
warnings.filterwarnings("ignore")

# Fixed to use GPU:1
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Cache & Java
os.environ["HF_HOME"] = "/gz-data/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/gz-data/hf_cache"
os.environ["HF_DATASETS_CACHE"] = "/gz-data/hf_cache"
os.environ.setdefault("JAVA_HOME", "/gz-data/jdk/current")
os.environ.setdefault("JVM_PATH", "/gz-data/jdk/current/lib/server/libjvm.so")
os.environ.setdefault("LD_LIBRARY_PATH", "/gz-data/jdk/current/lib/server:" + os.environ.get("LD_LIBRARY_PATH", ""))
os.environ.setdefault("JAVA_TOOL_OPTIONS", "-Djava.io.tmpdir=/gz-data/tmp")

ROOT = "/gz-data/nlquad_colbert"
GEN_MODEL = "/gz-data/models/deepseek-llm-7b-chat"
BM25_TOPK = 5
MAXLEN, GEN_MAXLEN = 250, 384
MIN_SPLIT_LEN, DESIRED_SEG_LEN = 1000, 250
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

SHARD = 1
NUM_SHARDS = 2

import os
for p in ["/gz-data/hf_cache", "/gz-data/tmp", ROOT]:
    os.makedirs(p, exist_ok=True)
print("Notebook B ready. Using GPU:", os.environ["CUDA_VISIBLE_DEVICES"], "Shard:", SHARD, "/", NUM_SHARDS)

Notebook B ready. Using GPU: 1 Shard: 1 / 2


In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"              # Restrict this kernel to see only physical GPU 1
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["HF_HOME"] = "/gz-data/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/gz-data/hf_cache"
os.environ["HF_DATASETS_CACHE"] = "/gz-data/hf_cache"

import torch
print("B sees", torch.cuda.device_count(), "GPU(s). logical cuda:0 =>", torch.cuda.get_device_name(0))
assert torch.cuda.device_count() == 1, "B sees multiple GPUs, indicating environment variables did not take effect; please restart the kernel and run this cell again."

# ===== MUST RUN FIRST CELL (before importing pyterrier / jnius) =====
import os, glob

# A. Prefer system JDK (if it exists)
sys_java_home = "/usr/lib/jvm/java-17-openjdk-amd64"
cand = [
    os.path.join(sys_java_home, "lib", "server", "libjvm.so"),
    "/gz-data/jdk/current/lib/server/libjvm.so",  # If installed to /gz-data/jdk/current as per previous steps
]

jvm_path = next((p for p in cand if os.path.isfile(p)), None)

if jvm_path is None:
    # Fallback: Search in common directories
    for base in ("/usr/lib/jvm", "/gz-data/jdk"):
        for p in glob.glob(base + "/**/lib/server/libjvm.so", recursive=True):
            jvm_path = p
            break

if not jvm_path:
    raise RuntimeError("Could not find libjvm.so. Please install JDK 17 (system or /gz-data) and try again.")

java_home = os.path.abspath(os.path.join(jvm_path, "..", ".."))  # Remove /lib/server
os.environ["JAVA_HOME"] = java_home
os.environ["JVM_PATH"] = jvm_path
os.environ["LD_LIBRARY_PATH"] = os.path.dirname(jvm_path) + ":" + os.environ.get("LD_LIBRARY_PATH", "")
os.environ["PATH"] = os.path.join(java_home, "bin") + ":" + os.environ.get("PATH", "")

# Optional: Specify Java temporary directory and memory
os.environ.setdefault("JAVA_TOOL_OPTIONS", "-Djava.io.tmpdir=/gz-data/tmp")
os.environ.setdefault("_JAVA_OPTIONS", "-Xms512m -Xmx8g")
os.makedirs("/gz-data/tmp", exist_ok=True)

print("JAVA_HOME =", os.environ["JAVA_HOME"])
print("JVM_PATH  =", os.environ["JVM_PATH"])

# ===== Now import PyTerrier and initialize JVM =====
import pyterrier as pt
if not pt.started():
    # Use standard init for stability (includes Java initialization); add mem/jvm_opts if needed
    pt.init()  # Or pt.init(tail=False)
print("PyTerrier started =", pt.started())

B sees 1 GPU(s). logical cuda:0 => NVIDIA GeForce RTX 4090
JAVA_HOME = /gz-data/jdk/current/lib
JVM_PATH  = /gz-data/jdk/current/lib/server/libjvm.so
PyTerrier started = True


Picked up JAVA_TOOL_OPTIONS: -Djava.io.tmpdir=/gz-data/tmp
Picked up _JAVA_OPTIONS: -Xms512m -Xmx8g
Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
java is now started automatically with default settings. To force initialisation early, run:
pt.java.init() # optional, forces java initialisation
  pt.init()  # 或 pt.init(tail=False)


In [3]:
# ========== Environment & Pipeline Setup (BM25-only + Dual-GPU Sharding) ==========
import os, re, torch, random, warnings, pandas as pd
from datasets import load_dataset
import pyterrier as pt
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from tqdm import tqdm
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from bert_score import score as bert_score

warnings.filterwarnings("ignore")

# ===== Manual Settings: Single GPU + Shard Parameters =====
GPU_VISIBLE = "1"   # Notebook A uses "0", Notebook B uses "1"
SHARD       = 1     # A=0, B=1
NUM_SHARDS  = 2     # Set to 2 for two GPUs in parallel

# ====== Cache & Path Config ======
os.environ["CUDA_VISIBLE_DEVICES"] = GPU_VISIBLE
os.environ["HF_HOME"] = "/gz-data/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/gz-data/hf_cache"
os.environ["HF_DATASETS_CACHE"] = "/gz-data/hf_cache"
os.makedirs("/gz-data/hf_cache", exist_ok=True)

ROOT = "/gz-data/nlquad_colbert"
os.makedirs(ROOT, exist_ok=True)
BM25_TOPK=5
MAXLEN, GEN_MAXLEN = 250, 384
MIN_SPLIT_LEN, DESIRED_SEG_LEN = 1000, 250
GEN_MODEL = "/gz-data/models/deepseek-llm-7b-chat"  # Consistent with original (local path)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

random.seed(42); torch.manual_seed(42)
if not pt.started():
    pt.init()

print(f"Using GPU_VISIBLE={GPU_VISIBLE} | SHARD {SHARD}/{NUM_SHARDS}")

# ===== Load Data & Preprocess =====
print(">>> Loading NLQuAD...")
dataset = load_dataset("LLukas22/NLQuAD", split="test")
records = []
for art in dataset:
    for para in art["paragraphs"]:
        ctx = para["context"]
        if len(ctx.split()) >= MIN_SPLIT_LEN:
            cid = para["qas"][0]["id"].split("_")[0]
            for qa in para["qas"]:
                if qa["answers"]:
                    records.append({
                        "context_id": cid,
                        "context": ctx,
                        "question": qa["question"],
                        "answer": qa["answers"][0]["text"],
                        "qa_id": qa["id"]
                    })
df = pd.DataFrame(records)
df = df.sort_values(["context_id", "qa_id"]).reset_index(drop=True)

# ===== Paragraph Splitting =====
def semantic_split(text, max_words=DESIRED_SEG_LEN):
    sents = re.split(r"(?<=[.!?])\s+", text.strip())
    buf, out = [], []
    for s in sents:
        if s.strip():
            buf.append(s)
            if len(" ".join(buf).split()) >= max_words:
                out.append(" ".join(buf))
                buf = []
    if buf: out.append(" ".join(buf))
    return out

para_records = []
for cid, grp in df.groupby("context_id"):
    context = grp["context"].iloc[0]
    for i, seg in enumerate(semantic_split(context)):
        para_records.append({"docno": f"{cid}_{i}", "text": seg, "cid": cid})
para_df = pd.DataFrame(para_records)
para_df["docid"] = para_df.index.astype(str)
docno_to_docid = dict(zip(para_df["docno"], para_df["docid"]))
para_text_map = dict(zip(para_df["docid"], para_df["text"]))

def clean_query(q):
    return re.sub(r"[^A-Za-z0-9 ]", "", q.strip())

# ===== Read eligible qids (S5 unified set) =====
ELIGIBLE_CSV = f"{ROOT}/eligible_qids_top5.csv"
eligible = None
if os.path.exists(ELIGIBLE_CSV):
    try:
        eligible = set(pd.read_csv(ELIGIBLE_CSV)["qa_id"].astype(str))
        print(f">>> Loaded eligible S5 set: {len(eligible)} qids")
    except Exception as e:
        print(f"⚠️ Failed to load {ELIGIBLE_CSV}: {e}. Will fallback to >=5 check.")

# ===== BM25 Index =====
print(">>> Building BM25 Index...")
index_ref = f"{ROOT}/pt_index"
if not os.path.exists(index_ref):
    index_ref = pt.IterDictIndexer(f"{ROOT}/pt_index", meta={"docno":44,"text":60000}, overwrite=True).index(para_df.to_dict("records"))
index = pt.IndexFactory.of(index_ref)
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

# ===== Load LLM (Consistent with original: 8-bit + device_map="auto" + no compile) =====
print(">>> Loading LLM (B notebook, allow CPU offload)...")
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import os, torch

tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL, trust_remote_code=True)

bnb_cfg = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True   # Allow FP32 layers to offload to CPU
)

offload_dir = "/gz-data/offload_deepseek_B"
os.makedirs(offload_dir, exist_ok=True)

# Note: In Notebook B, CUDA_VISIBLE_DEVICES="1" is set
# For this process, the only visible GPU is still called "cuda:0"
model = AutoModelForCausalLM.from_pretrained(
    GEN_MODEL,
    quantization_config=bnb_cfg,
    device_map="auto",
    max_memory={ 0: "19GiB", "cpu": "200GiB" },  # Adjust based on available GPU memory (18~20GiB)
    offload_folder=offload_dir,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    trust_remote_code=True
).eval()

torch.backends.cuda.matmul.allow_tf32 = True
# 8-bit + offload does not recommend torch.compile, so keep it commented out or removed
# model = torch.compile(model, mode="reduce-overhead")

print(">>> Model ready (B with CPU offload)!")

# ===== Prompt Template =====
def build_prompt(question, context):
    return f"""You are an AI assistant. Based on the context, answer the question in the following format:

Context: {context}

Question: {question}

Final Answer:"""

# ===== Dynamic Batch Inference =====
def batch_generate_dynamic(prompts, initial_bs=16, max_bs=64):
    results = []
    i, bs = 0, initial_bs
    last_safe_bs = initial_bs
    while i < len(prompts):
        batch = prompts[i:i+bs]
        try:
            inputs = tokenizer(batch, return_tensors="pt", padding=True,
                               truncation=True, max_length=2048).to(DEVICE)
            outputs = model.generate(
                **inputs,
                max_new_tokens=GEN_MAXLEN,
                min_new_tokens=256,
                length_penalty=1.2,             # Keep as original
                pad_token_id=tokenizer.eos_token_id
            )
            decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            results.extend([d.strip() for d in decoded])
            i += bs
            if bs < max_bs:
                last_safe_bs = bs
                bs = min(bs * 2, max_bs)
        except RuntimeError as e:
            if "CUDA out of memory" in str(e):
                torch.cuda.empty_cache()
                print(f"⚠️ OOM at bs={bs}, rollback to {last_safe_bs}")
                bs = max(last_safe_bs // 2, 1)
                if bs < 1:
                    print("❌ Even bs=1 failed, aborting.")
                    break
            else:
                raise
    return results

# ===== Metric Calculation =====
def compute_metrics(gens, refs):
    scorer = rouge_scorer.RougeScorer(["rouge1","rouge2","rougeL"], use_stemmer=True)
    _, _, bert_f1 = bert_score(gens, refs, lang="en", model_type="roberta-large", verbose=False)
    res = []
    for g, r, b in zip(gens, refs, bert_f1):
        scr = scorer.score(r, g)
        bleu = sentence_bleu([r.split()], g.split())
        res.append({
            "rouge1": round(scr["rouge1"].fmeasure, 4),
            "rouge2": round(scr["rouge2"].fmeasure, 4),
            "rougeL": round(scr["rougeL"].fmeasure, 4),
            "bleu": round(bleu, 4),
            "bertscore": round(b.item(), 4)
        })
    return res

# ====== Answer Cleaning ======
def clean_answer(text):
    text = text.strip()
    if "Final Answer:" in text:
        return text.split("Final Answer:", 1)[1].strip()
    paragraphs = re.split(r"\n\s*\n", text)
    cleaned_paras = []
    for para in paragraphs:
        first_line = para.strip().splitlines()[0] if para.strip() else ""
        if first_line.startswith(("Question:", "Context:", "RULES:", "Answer the question")):
            continue
        cleaned_paras.append(para.strip())
    return "\n\n".join(p for p in cleaned_paras if p)

# ===== Top-K Configuration =====
CTX_TOPK_LIST = [1, 2, 3, 4, 5]
topk_list = [(k, f"top{k}") for k in CTX_TOPK_LIST]
MAX_REQUIRED_K = max(CTX_TOPK_LIST)  # 5

# ====== Sharding (Key Addition): Split context_id to this shard ======
all_groups = list(df.groupby("context_id"))
shard_groups = [g for i, g in enumerate(all_groups) if i % NUM_SHARDS == SHARD]
print(f">>> Shard {SHARD}/{NUM_SHARDS} groups = {len(shard_groups)}")

# ===== Main Loop (BM25-only, remove ColBERT reranking, keep rest unchanged) =====
results = []

for cid, grp in tqdm(shard_groups):
    batch_prompts, meta = [], []
    for _, row in grp.iterrows():
        q, gt, qid = row["question"], row["answer"], str(row["qa_id"])

        if eligible is not None and qid not in eligible:
            continue

        # --- BM25 filter by cid ---
        bm25_in = pd.DataFrame({"qid": ["0"], "query": [clean_query(q)]})
        out = bm25.transform(bm25_in)
        out = out[out["docno"].str.startswith(cid)].head(BM25_TOPK)
        ids = [int(docno_to_docid[d]) for d in out["docno"] if d in docno_to_docid]

        # Uniform sample requirement (ensure top5 available)
        if eligible is None and len(ids) < MAX_REQUIRED_K:
            continue

        paras = [para_text_map[str(i)] for i in ids]

        # ===== Remove ColBERT reranking, use BM25 order directly =====
        pairs = list(zip(ids, paras, [None]*len(paras)))

        if len(pairs) < MAX_REQUIRED_K:
            continue

        # --- Slice based on BM25 list, record topk's "rank1/2/.." text ---
        for topk, tag in topk_list:
            _, top_ps, _ = zip(*pairs[:topk])
            topk_ranked_context = "\n".join([f"rank{i+1}: {top_ps[i]}" for i in range(len(top_ps))])

            numbered = [f"Paragraph {i+1}: {p}" for i, p in enumerate(top_ps)]

            # Run all topk sequentially; add reversed for topk>=2; add shuffled for topk>=3
            strategies = [("sequential", numbered)]
            if topk >= 2 and len(numbered) > 1:
                strategies += [("reversed", list(reversed(numbered)))]
            if topk >= 3 and len(numbered) > 1:
                strategies += [("shuffled", random.sample(numbered, len(numbered)))]

            for strat_name, context in strategies:
                batch_prompts.append(build_prompt(q, "\n".join(context)))
                meta.append((f"{strat_name}_{tag}", gt, row["qa_id"], q, topk, topk_ranked_context, cid))

    if not batch_prompts:
        continue

    answers = batch_generate_dynamic(batch_prompts, initial_bs=16, max_bs=64)
    for ans, (strat, gt, qid, q, topk_val, topk_ranked_ctx, cid_val) in zip(answers, meta):
        if not ans:
            continue
        ans_clean = clean_answer(ans)
        m = compute_metrics([ans_clean], [gt])[0]
        results.append({
            "cid": cid_val,
            "qid": qid,
            "question": q,
            "topk": topk_val,
            "topk_ranked_context": topk_ranked_ctx,  # Keep column name unchanged
            "strategy": strat,
            "answer_clean": ans.strip(),
            "answer_for_eval": ans_clean,
            **m
        })

# ===== Save Results (with shard suffix; keep newline handling) =====
df_res = pd.DataFrame(results)

# Fixed column order
cols_order = [
    "cid","qid","question",
    "topk","topk_ranked_context","strategy",
    "answer_clean","answer_for_eval",
    "rouge1","rouge2","rougeL","bleu","bertscore"
]
df_res = df_res[cols_order]

# Convert \n in topk_ranked_context to actual newlines (will auto-wrap in Excel/tables)
df_res["topk_ranked_context"] = df_res["topk_ranked_context"].apply(lambda x: x.replace("\n", "\r\n"))

suf = f"_shard{SHARD}of{NUM_SHARDS}"
df_res.to_csv(f"{ROOT}/final_results_stage1_bm25only{suf}.csv", index=False)
avg_m = df_res.groupby("strategy")[["rouge1","rouge2","rougeL","bleu","bertscore"]].mean().reset_index()
avg_m.to_csv(f"{ROOT}/average_metrics_stage1_bm25only{suf}.csv", index=False)

print("✅ Done. Saved:")
print("   -", f"{ROOT}/final_results_stage1_bm25only{suf}.csv")
print("   -", f"{ROOT}/average_metrics_stage1_bm25only{suf}.csv")

Using GPU_VISIBLE=1 | SHARD 1/2
>>> Loading NLQuAD...
>>> Building BM25 Index...
>>> Loading LLM...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

>>> Model ready!
>>> Shard 1/2 groups = 117 | Subpart 3/4 = 29


  0%|          | 0/29 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense

✅ Done. Saved:
   - /gz-data/nlquad_colbert/final_results_stage1_bm25only_shard1of2_part3of4.csv
   - /gz-data/nlquad_colbert/average_metrics_stage1_bm25only_shard1of2_part3of4.csv
