In [6]:
!pip -q install sentence-transformers
import os, re, unicodedata, numpy as np, pandas as pd, torch
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score


# --- Reproducibility: seed NumPy and torch (CPU, plus every visible GPU) ------
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# --- Run configuration -------------------------------------------------------
TRAIN_PATH = "/kaggle/input/jigsaw-agile-community-rules/train.csv"
TEST_PATH = "/kaggle/input/jigsaw-agile-community-rules/test.csv"
MODEL_NAME = "BAAI/bge-m3"  # swap to test others
USE_E5_PREFIX = False  # assigned again further down, next to USE_PROMPT_FMT
BATCH = 256  # chunk size and batch_size handed to the embedder


# Load the competition CSVs and keep only the columns this pipeline reads.
# Columns absent from a file are skipped silently by the list comprehensions.
train = pd.read_csv(TRAIN_PATH)
needed = ["body", "rule", "rule_violation",
          "positive_example_1", "positive_example_2",
          "negative_example_1", "negative_example_2"]
train = train[[c for c in needed if c in train.columns]].copy()

test = pd.read_csv(TEST_PATH)
# "id" has to be in the kept columns: without it the selection below dropped the
# real ids, so the fallback always fired and replaced them with 0..n-1.
test_needed = ["body", "rule", "positive_example_1", "positive_example_2",
               "negative_example_1", "negative_example_2", "id"]
test = test[[c for c in test_needed if c in test.columns]].copy()
if "id" not in test.columns:
    # The file really has no id column: fall back to positional row ids.
    test["id"] = np.arange(len(test))
# NOTE(review): "subreddit" is kept in neither frame, so every later
# `"subreddit" in train.columns` check is False - confirm that is intended.


# Master switch read by CLEAN_FN: when False the text is returned without any cleaning.
USE_CLEANING = True          
# Key into _VARIANTS; CLEAN_FN falls back to clean_minimal when the key is not found.
CLEAN_VARIANT = "no_stop"    # one of: "minimal","norm_only","strip_punct","keep_punct","no_stop","num_mask"

import re, unicodedata, string
# Regex sources for the spans that are replaced by placeholder tokens.
URL = r'https?://\S+'
EMAIL = r'\S+@\S+'
USER = r'@\w+'


def clean_minimal(x: str) -> str:
    """Return *x* NFKC-normalised with URLs, e-mails and @mentions masked.

    The substitutions run in the order URL -> EMAIL -> USER, after which every
    run of whitespace is collapsed to one space and the ends are stripped.
    """
    text = unicodedata.normalize("NFKC", x)
    for pattern, token in ((URL, "<URL>"), (EMAIL, "<EMAIL>"), (USER, "<USER>")):
        text = re.sub(pattern, token, text)
    return re.sub(r"\s+", " ", text).strip()

def clean_norm_only(x: str) -> str:
    """Apply NFKC normalisation and whitespace collapsing only (nothing is masked)."""
    normalised = unicodedata.normalize("NFKC", x)
    return re.sub(r"\s+", " ", normalised).strip()

def clean_strip_punct(x: str) -> str:
    """Run clean_minimal, then delete every character in string.punctuation.

    The placeholder tokens contain "<" and ">", so they come out as bare words
    (for example "<URL>" becomes "URL").
    """
    masked = clean_minimal(x)
    drop_table = str.maketrans('', '', string.punctuation)
    return masked.translate(drop_table)

def clean_keep_punct(x: str) -> str:
    """Mask URLs, e-mails and @mentions while leaving punctuation untouched.

    The steps (NFKC, the three maskings, whitespace collapse) are the ones
    clean_minimal performs, so the work is delegated to it.
    """
    return clean_minimal(x)

# Optional dependency: NLTK's English stop-word list.  Only the failures that
# mean "the list is not available here" are tolerated, and the fallback is
# announced, because with an empty list the "no_stop" variant degrades to
# lower-cased clean_minimal output.
try:
    from nltk.corpus import stopwords
    _STOP = set(stopwords.words("english"))
except (ImportError, LookupError, OSError) as exc:
    # ImportError: nltk is not installed; LookupError: the corpus has not been
    # downloaded; OSError: the corpus files could not be read.
    print(f"[clean_no_stop] NLTK stop words unavailable ({type(exc).__name__}); "
          "stop-word removal is disabled.")
    _STOP = set()


def clean_no_stop(x: str) -> str:
    """Run clean_minimal, lower-case the result and drop English stop words.

    Lower-casing also affects the placeholder tokens ("<URL>" becomes "<url>").
    When the stop-word list is empty the lower-cased text is returned as is;
    otherwise the surviving whitespace-separated tokens are re-joined with
    single spaces.
    """
    x = clean_minimal(x).lower()
    if not _STOP:
        return x
    return " ".join(w for w in x.split() if w not in _STOP)

def clean_num_mask(x: str) -> str:
    """Run clean_minimal, then replace each run of digits with "<NUM>"."""
    return re.sub(r"\d+", "<NUM>", clean_minimal(x))

# Dispatch table used by CLEAN_FN: CLEAN_VARIANT value -> cleaning function.
_VARIANTS = dict(
    minimal=clean_minimal,
    norm_only=clean_norm_only,
    strip_punct=clean_strip_punct,
    keep_punct=clean_keep_punct,
    no_stop=clean_no_stop,
    num_mask=clean_num_mask,
)

def CLEAN_FN(x):
    """Coerce *x* to text and clean it with the variant named by CLEAN_VARIANT.

    None becomes "", any other non-string goes through str().  With
    USE_CLEANING off the coerced text is returned unchanged; an unknown
    CLEAN_VARIANT falls back to clean_minimal.
    """
    if isinstance(x, str):
        text = x
    elif x is None:
        text = ""
    else:
        text = str(x)

    if USE_CLEANING:
        cleaner = _VARIANTS.get(CLEAN_VARIANT, clean_minimal)
        return cleaner(text)
    return text

# Clean every text column in place, on both frames.
cols = [c for c in ["body","rule","positive_example_1","positive_example_2",
                    "negative_example_1","negative_example_2"] if c in train.columns]
for df in (train, test):
    for c in cols:
        # Fill missing cells BEFORE astype(str): astype(str) alone turns NaN into
        # the literal text "nan", which would then be cleaned and embedded like a
        # real comment (and makes every later .fillna("") a no-op).
        df[c] = df[c].fillna("").astype(str).map(CLEAN_FN)

# Target labels cast to int, one per training row.
y = train["rule_violation"].astype(int).values
# Rule text per training row (after the cleaning pass above).
rules_text = train["rule"].values

def join2(a, b):
    """Join two optional strings with a single space and strip the result.

    A falsy part (None or "") counts as the empty string.
    """
    left = a if a else ""
    right = b if b else ""
    return (left + " " + right).strip()
def _joined_examples(frame, first_col, second_col):
    """join2 over two example columns of *frame*, row by row.

    A missing column falls back to "" through .get(), and zip() over that
    empty string yields no rows, so the result is then an empty list.
    """
    firsts = frame.get(first_col, "")
    seconds = frame.get(second_col, "")
    return [join2(first, second) for first, second in zip(firsts, seconds)]


train_pos = _joined_examples(train, "positive_example_1", "positive_example_2")
train_neg = _joined_examples(train, "negative_example_1", "negative_example_2")
test_pos = _joined_examples(test, "positive_example_1", "positive_example_2")
test_neg = _joined_examples(test, "negative_example_1", "negative_example_2")



# Run the encoder on the GPU when torch can see one, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the sentence-transformers checkpoint named by MODEL_NAME onto that device.
embedder = SentenceTransformer(MODEL_NAME, device=device)

import gc, torch
# Collect garbage and release cached CUDA memory before the encoding passes start.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

def encode_in_chunks(texts, bs):
    """Embed *texts* with the global `embedder`, `bs` items per encode() call.

    The per-chunk arrays are stacked row-wise with np.vstack.  When CUDA is
    available its cache is emptied after every chunk.
    """
    pieces = []
    for start in range(0, len(texts), bs):
        chunk = texts[start:start + bs]
        vectors = embedder.encode(
            chunk,
            batch_size=bs,
            convert_to_numpy=True,
            show_progress_bar=False,
        )
        pieces.append(vectors)
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    return np.vstack(pieces)

# ---- PROMPT FORMATTING ----
USE_PROMPT_FMT = False     # toggle True/False to A/B test
USE_E5_PREFIX = False      # set True for E5 models, keep False for BGE/Qwen


def fmt_query(body, rule):
    """Build the query text for one comment.

    With USE_PROMPT_FMT the rule and the comment are combined into one prompt,
    otherwise only the comment body (as str) is used.  USE_E5_PREFIX prepends
    "query: " in either case.
    """
    core = f"RULE: {rule} || COMMENT: {body}" if USE_PROMPT_FMT else str(body)
    prefix = "query: " if USE_E5_PREFIX else ""
    return prefix + core

def fmt_passage(text):
    """Return *text* as str, prefixed with "passage: " when USE_E5_PREFIX is set."""
    prefix = "passage: " if USE_E5_PREFIX else ""
    return prefix + str(text)



def _example_passages(frame, first_col, second_col):
    """fmt_passage over the row-wise "first second" concatenation of two columns."""
    firsts = frame.get(first_col, "").fillna("")
    seconds = frame.get(second_col, "").fillna("")
    return [fmt_passage(a + " " + b) for a, b in zip(firsts, seconds)]


# Training texts: query per comment, passage per rule, passages per example pair.
q = list(map(fmt_query, train["body"], train["rule"]))
r = list(map(fmt_passage, train["rule"]))
pp = _example_passages(train, "positive_example_1", "positive_example_2")
nn = _example_passages(train, "negative_example_1", "negative_example_2")

# The same four lists for the test frame.
tq = list(map(fmt_query, test["body"], test["rule"]))
tr = list(map(fmt_passage, test["rule"]))
tp = _example_passages(test, "positive_example_1", "positive_example_2")
tn = _example_passages(test, "negative_example_1", "negative_example_2")



# Embed every text list.  All eight calls now sit inside the same no_grad block;
# previously only the first call was indented under the `with`, so the other
# seven ran outside it.
# NOTE(review): encode() presumably disables gradients on its own, in which case
# this is a consistency fix rather than a memory fix - confirm against the
# sentence-transformers documentation.
with torch.no_grad():
    E_body = encode_in_chunks(q, BATCH)
    E_rule = encode_in_chunks(r, BATCH)
    E_ppos = encode_in_chunks(pp, BATCH)
    E_pneg = encode_in_chunks(nn, BATCH)
    TE_body = encode_in_chunks(tq, BATCH)
    TE_rule = encode_in_chunks(tr, BATCH)
    TE_ppos = encode_in_chunks(tp, BATCH)
    TE_pneg = encode_in_chunks(tn, BATCH)

# L2 normalize (sklearn's normalize with its defaults), so that a row-wise dot
# product of two of these matrices is a cosine similarity.
E_body = normalize(E_body)
E_rule = normalize(E_rule)
E_ppos = normalize(E_ppos)
E_pneg = normalize(E_pneg)
TE_body = normalize(TE_body)
TE_rule = normalize(TE_rule)
TE_ppos = normalize(TE_ppos)
TE_pneg = normalize(TE_pneg)

def tok(s):
    """Return the set of lower-cased, whitespace-separated tokens of *s*."""
    pieces = s.lower().split()
    return set(pieces)


def _char_len_column(frame, col):
    """Character length of every value in frame[col], shaped as an (n, 1) column."""
    lengths = frame[col].str.len()
    return lengths.to_numpy().reshape(-1, 1)


train_body_len = _char_len_column(train, "body")
train_rule_len = _char_len_column(train, "rule")
test_body_len = _char_len_column(test, "body")
test_rule_len = _char_len_column(test, "rule")


def jaccard_series(a, b):
    """Row-wise Jaccard overlap of the tok() token sets of *a* and *b*.

    Returns a float32 array of shape (len(a), 1); a pair whose union is empty
    scores 0.0.
    """
    scores = np.zeros((len(a), 1), dtype=np.float32)
    for row, (left, right) in enumerate(zip(a, b)):
        left_tokens = tok(left)
        right_tokens = tok(right)
        union_size = len(left_tokens | right_tokens)
        if union_size:
            scores[row, 0] = len(left_tokens & right_tokens) / union_size
    return scores

# Token-set Jaccard overlap between each comment body and its rule, one (n, 1) column per frame.
train_jacc = jaccard_series(train["body"], train["rule"])
test_jacc  = jaccard_series(test["body"],  test["rule"])

def zscore_fit_transform(a):
    """Standardise the columns of *a* and return (z, mean, std).

    mean and std keep a leading axis of length 1 (keepdims=True); 1e-9 is added
    to std so that a constant column does not divide by zero.
    """
    col_mean = a.mean(axis=0, keepdims=True)
    col_std = a.std(axis=0, keepdims=True) + 1e-9
    standardised = (a - col_mean) / col_std
    return standardised, col_mean, col_std
def zscore_transform(a, m, s):
    """Standardise *a* with a previously fitted mean *m* and std *s*."""
    centred = a - m
    return centred / s

# Numeric block: body length, rule length, body/rule Jaccard.  The mean and std
# are fitted on the training rows and reused unchanged for the test rows.
num_train = np.concatenate(
    (train_body_len, train_rule_len, train_jacc), axis=1
).astype(np.float32)
num_train_z, m, s = zscore_fit_transform(num_train)

num_test = np.concatenate(
    (test_body_len, test_rule_len, test_jacc), axis=1
).astype(np.float32)
num_test_z = zscore_transform(num_test, m, s)



# Rule text per training row (as str) and the distinct rules among them.
rules_train = train["rule"].astype(str).values
uniq_rules = np.unique(rules_train)

# rule text -> (1, dim) mean embedding of that rule's positive / negative example
# pairs; both dicts are filled by the loop over uniq_rules that follows.
pos_cent = {}
neg_cent = {}

def passage_fmt(s):
    """Prefix *s* with "passage: " when USE_E5_PREFIX is set, else return it unchanged."""
    if USE_E5_PREFIX:
        return "passage: " + s
    return s

# Per-rule centroids of the example-pair embeddings.
#
# The example text of training row i ("example_1 example_2", with the optional
# "passage: " prefix) is the passage that was already embedded and L2-normalised
# into E_ppos[i] / E_pneg[i].  The rows are therefore sliced from those matrices
# instead of running the encoder over the whole training set two more times.
# NOTE(review): values may differ from a fresh encode by floating-point noise,
# because batch composition is not the same.
for r in uniq_rules:
    idx = (rules_train == r)  # boolean mask of the rows that carry this rule
    E_pos = E_ppos[idx]
    E_neg = E_pneg[idx]
    # Mean of unit vectors; the centroid itself is not re-normalised.
    pos_cent[r] = E_pos.mean(axis=0, keepdims=True) if len(E_pos) else np.zeros((1, E_body.shape[1]), dtype=np.float32)
    neg_cent[r] = E_neg.mean(axis=0, keepdims=True) if len(E_neg) else np.zeros((1, E_body.shape[1]), dtype=np.float32)


def _centroid_rows(table, rule_names):
    """Stack table[rule] for every name; a rule without a centroid gets a zero row."""
    blank = np.zeros((1, E_body.shape[1]), np.float32)
    return np.vstack([table.get(name, blank) for name in rule_names])


# One centroid row per training row, looked up by that row's rule.
E_posg_train = _centroid_rows(pos_cent, rules_train)
E_negg_train = _centroid_rows(neg_cent, rules_train)

# The same lookup for the test rows.
rules_test = test["rule"].astype(str).values
E_posg_test = _centroid_rows(pos_cent, rules_test)
E_negg_test = _centroid_rows(neg_cent, rules_test)

# Subreddit meta features: a standardised frequency column plus a one-hot over the
# TOPK most frequent training subreddits.  Without a "subreddit" column the
# frequency column is all zeros and the one-hot block has zero columns.
# NOTE(review): the column selection applied to `train` right after read_csv does
# not include "subreddit", so as the file stands only the else branch runs.
if "subreddit" in train.columns:
    sub_train = train["subreddit"].astype(str)
    # NOTE(review): raises KeyError when test has no "subreddit" column.
    sub_test  = test["subreddit"].astype(str)
    sub_freq  = sub_train.value_counts()

    # Training-set count of each row's subreddit; subreddits unseen in train get 1.
    sub_train_freq = sub_train.map(sub_freq).fillna(1).to_numpy(np.float32).reshape(-1,1)
    sub_test_freq  = sub_test.map(sub_freq).fillna(1).to_numpy(np.float32).reshape(-1,1)

    TOPK = 30
    # value_counts() sorts by count, most frequent first, so this takes the top TOPK.
    top_subs = list(sub_freq.index[:TOPK])
    # subreddit name -> one-hot column index (rebinds the name `idx`).
    idx = {c:i for i,c in enumerate(top_subs)}
    sub_train_oh = np.zeros((len(sub_train), len(top_subs)), np.float32)
    sub_test_oh  = np.zeros((len(sub_test),  len(top_subs)), np.float32)
    for i, v in enumerate(sub_train):
        if v in idx: sub_train_oh[i, idx[v]] = 1.0
    for i, v in enumerate(sub_test):
        if v in idx: sub_test_oh[i, idx[v]] = 1.0

    # Standardise with training statistics only (rebinds the names `m` and `s`).
    m,s = sub_train_freq.mean(0, keepdims=True), sub_train_freq.std(0, keepdims=True)+1e-9
    sub_train_freq_z = (sub_train_freq - m)/s
    sub_test_freq_z  = (sub_test_freq  - m)/s
else:
    sub_train_freq_z = np.zeros((len(train),1), np.float32)
    sub_test_freq_z  = np.zeros((len(test),1),  np.float32)
    sub_train_oh = np.zeros((len(train),0), np.float32)
    sub_test_oh  = np.zeros((len(test),0),  np.float32)

# How often each rule occurs in the training rows.
rule_freq = pd.Series(rules_train).value_counts()


def _rule_count_column(rule_names):
    """Training-set count of each rule as an (n, 1) float32 column; unseen rules get 1."""
    counts = pd.Series(rule_names).map(rule_freq).fillna(1)
    return counts.to_numpy(np.float32).reshape(-1, 1)


r_train_freq = _rule_count_column(rules_train)
r_test_freq = _rule_count_column(rules_test)

# Standardise with training statistics only.
m = r_train_freq.mean(0, keepdims=True)
s = r_train_freq.std(0, keepdims=True) + 1e-9
r_train_freq_z = (r_train_freq - m) / s
r_test_freq_z = (r_test_freq - m) / s

# Meta block: rule frequency, subreddit frequency, subreddit one-hot.
meta_train = np.hstack([r_train_freq_z, sub_train_freq_z, sub_train_oh]).astype(np.float32)
meta_test = np.hstack([r_test_freq_z, sub_test_freq_z, sub_test_oh]).astype(np.float32)


def cos(A, B):
    """Row-wise dot product of A and B, returned as an (n, 1) column.

    This equals the cosine similarity only when the rows of both inputs have
    unit norm.
    """
    products = np.multiply(A, B)
    return products.sum(axis=1, keepdims=True)

def _similarity_block(body, rule, pos, neg, pos_group, neg_group):
    """Seven similarity columns for one frame.

    Columns, in order: body.rule, body.pos, -body.neg, body.(pos - neg) for the
    row's own example pairs, then body.pos, -body.neg, body.(pos - neg) for the
    per-rule centroids.
    """
    return np.hstack([
        cos(body, rule),
        cos(body, pos),
        -cos(body, neg),
        cos(body, (pos - neg)),
        cos(body, pos_group),
        -cos(body, neg_group),
        cos(body, (pos_group - neg_group)),
    ])


X_sim_train = _similarity_block(E_body, E_rule, E_ppos, E_pneg,
                                E_posg_train, E_negg_train)
X_sim_test = _similarity_block(TE_body, TE_rule, TE_ppos, TE_pneg,
                               E_posg_test, E_negg_test)

# Similarities plus the standardised numeric block.
X = np.hstack([X_sim_train, num_train_z]).astype(np.float32)
Xtest = np.hstack([X_sim_test, num_test_z]).astype(np.float32)

sim_cols = X_sim_train.shape[1]

# Standardise the similarity columns separately inside every rule, and remember
# each rule's (mean, std) so that test rows of the same rule can reuse them.
X_sim_train_z = np.zeros_like(X_sim_train, dtype=np.float32)
rule_stats = {}
for rule_name in uniq_rules:
    rows = rules_train == rule_name
    block = X_sim_train[rows]
    mu = block.mean(axis=0, keepdims=True)
    sd = block.std(axis=0, keepdims=True) + 1e-9
    rule_stats[rule_name] = (mu, sd)
    X_sim_train_z[rows] = (block - mu) / sd

# Test rows are handled rule by rule: a rule seen in training uses that rule's
# statistics, any other rule keeps its raw similarities.
# NOTE(review): those raw values are on a different scale from the z-scores the
# classifier is trained on - confirm this fallback is what is wanted.
X_sim_test_z = np.zeros_like(X_sim_test, dtype=np.float32)
for rule_name in np.unique(rules_test):
    rows = rules_test == rule_name
    if rule_name in rule_stats:
        mu, sd = rule_stats[rule_name]
        X_sim_test_z[rows] = (X_sim_test[rows] - mu) / sd
    else:
        X_sim_test_z[rows] = X_sim_test[rows]

# Final design matrices: per-rule standardised similarities, then the standardised
# numeric block (when it exists), then the meta features.  At module level
# locals() is the global namespace, so the test only asks whether num_train_z
# has been defined; the else branch builds the matrices without that block.
if 'num_train_z' in locals():
    X     = np.hstack([X_sim_train_z, num_train_z, meta_train]).astype(np.float32)
    Xtest = np.hstack([X_sim_test_z,  num_test_z,  meta_test ]).astype(np.float32)
else:
    X     = np.hstack([X_sim_train_z,               meta_train]).astype(np.float32)
    Xtest = np.hstack([X_sim_test_z,                meta_test ]).astype(np.float32)


from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# 5-fold stratified CV of a logistic regression on X plus two fold-local
# target-encoding columns (positive rate per rule and per subreddit).
#
# Besides the out-of-fold predictions `oof`, every fold model now also scores the
# test matrix; the fold average is stored in `test_probs`, which the save cell
# further down reads but which was never assigned before.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
oof = np.zeros(len(X), dtype=np.float32)
test_probs = np.zeros(len(Xtest), dtype=np.float32)

rules_all = train["rule"].astype(str).values
has_sub   = "subreddit" in train.columns
subs_all  = train["subreddit"].astype(str).values if has_sub else np.array(["_NA_"]*len(train))
# Test-side subreddit keys; a placeholder key never matches a training rate and
# therefore falls back to the fold's global rate.
subs_test = (test["subreddit"].astype(str).values
             if has_sub and "subreddit" in test.columns
             else np.array(["_NA_"]*len(test)))


def _encode_rate(keys, rate, fallback):
    """Map each key to its fold-train positive rate; unseen keys get *fallback*."""
    return pd.Series(keys).map(rate).fillna(fallback).to_numpy(np.float32).reshape(-1, 1)


for tr, va in skf.split(X, y):
    global_rate = y[tr].mean()

    # NOTE(review): the rates come from the fold's training rows, so those rows
    # see an encoding that contains their own label while the validation and
    # test rows do not.
    rule_rate = pd.DataFrame({"r": rules_all[tr], "y": y[tr]}).groupby("r")["y"].mean()
    te_rule_tr = _encode_rate(rules_all[tr], rule_rate, global_rate)
    te_rule_va = _encode_rate(rules_all[va], rule_rate, global_rate)
    te_rule_te = _encode_rate(rules_test, rule_rate, global_rate)

    if has_sub:
        sub_rate = pd.DataFrame({"s": subs_all[tr], "y": y[tr]}).groupby("s")["y"].mean()
        te_sub_tr = _encode_rate(subs_all[tr], sub_rate, global_rate)
        te_sub_va = _encode_rate(subs_all[va], sub_rate, global_rate)
        te_sub_te = _encode_rate(subs_test, sub_rate, global_rate)
    else:
        te_sub_tr = np.zeros((len(tr),1), dtype=np.float32)
        te_sub_va = np.zeros((len(va),1), dtype=np.float32)
        te_sub_te = np.zeros((len(Xtest),1), dtype=np.float32)

    X_tr = np.hstack([X[tr], te_rule_tr, te_sub_tr]).astype(np.float32)
    X_va = np.hstack([X[va], te_rule_va, te_sub_va]).astype(np.float32)
    X_te = np.hstack([Xtest, te_rule_te, te_sub_te]).astype(np.float32)

    clf = LogisticRegression(max_iter=300, class_weight="balanced")
    clf.fit(X_tr, y[tr])
    oof[va] = clf.predict_proba(X_va)[:, 1]
    # Equal-weight average of the fold models on the test rows.
    test_probs += clf.predict_proba(X_te)[:, 1].astype(np.float32) / skf.get_n_splits()

# ROC AUC of the out-of-fold predictions inside every rule, then the unweighted
# mean over rules.  Rules with fewer than two rows or a single class are skipped.
per_rule_auc = {}
for rname in np.unique(rules_all):
    idx = rules_all == rname
    too_small = idx.sum() < 2
    single_class = len(np.unique(y[idx])) <= 1
    if too_small or single_class:
        continue
    per_rule_auc[rname] = roc_auc_score(y[idx], oof[idx])

if per_rule_auc:
    macro_auc = float(np.mean(list(per_rule_auc.values())))
else:
    macro_auc = float("nan")
print(f"Column-averaged AUC (CV): {macro_auc:.4f}")





huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Column-averaged AUC (CV): 0.8413


In [None]:
# after CV loop produced oof and computed test_probs
# Persist this run for the ensemble cell: OOF predictions, test predictions, and
# the rule / label arrays used to check that runs are aligned.
# NOTE(review): requires both `oof` and `test_probs` to exist in the session -
# confirm test_probs has been computed before running this cell.
# NOTE(review): the file name says "minimal"; check it matches the CLEAN_VARIANT
# that was actually used for the run.
np.savez("/kaggle/working/run_bge_m3_minimal.npz", # swap with different models after rerunning the finetuning cell
         oof=oof.astype(np.float32),
         test=test_probs.astype(np.float32),
         rules=train["rule"].astype(str).values,
         y=train["rule_violation"].astype(int).values)


In [None]:
# Ensemble
import numpy as np, pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Point these at the saved runs to blend; each .npz must contain the arrays
# "oof", "y" and "rules" (the keys load_run reads).
RUN_PATHS = [
    "/kaggle/working/run_e5_base.npz",
    "/kaggle/working/run_Alibaba_NLP.npz",
    # "/kaggle/working/run_e5_large.npz",
    # "/kaggle/working/run_bge_m3.npz",
]

def load_run(path):
    """Load one saved run and return its OOF predictions, labels and rules.

    Args:
        path: location of an .npz archive holding the arrays "oof", "y" and
            "rules".

    Returns:
        dict with "oof" (float32), "y" (int) and "rules" (str) arrays.

    The archive is opened in a `with` block so its file handle is closed as
    soon as the three arrays have been copied out.
    """
    # SECURITY: allow_pickle=True unpickles object arrays and can execute
    # arbitrary code - only load archives written by this notebook.
    with np.load(path, allow_pickle=True) as z:
        return dict(oof=z["oof"].astype(np.float32),
                    y=z["y"].astype(int),
                    rules=z["rules"].astype(str))

# Load every run and make sure all of them describe the same rows in the same
# order.  np.array_equal also returns False for arrays of different length,
# where an element-wise `==` followed by `.all()` can fail with an unrelated
# error.  The checks raise instead of using `assert`, which `python -O` strips.
runs = [load_run(p) for p in RUN_PATHS]
if not runs:
    raise ValueError("RUN_PATHS is empty; nothing to ensemble")
for run in runs[1:]:
    if not np.array_equal(run["y"], runs[0]["y"]):
        raise ValueError("y mismatch")
    if not np.array_equal(run["rules"], runs[0]["rules"]):
        raise ValueError("rules mismatch")

y = runs[0]["y"]
rules = runs[0]["rules"]
# One column of out-of-fold predictions per run.
O = np.column_stack([r["oof"] for r in runs])

def macro_auc_by_rule(y, rules, preds):
    """Unweighted mean over rules of the ROC AUC of *preds* within each rule.

    A rule is skipped when it has fewer than two rows or only one label value.
    Returns NaN when no rule can be scored.
    """
    scores = []
    for rule_name in np.unique(rules):
        sel = rules == rule_name
        labels = y[sel]
        if sel.sum() < 2 or len(np.unique(labels)) <= 1:
            continue
        scores.append(roc_auc_score(labels, preds[sel]))
    if not scores:
        return float("nan")
    return float(np.mean(scores))
 
# Running best of the weight grid search: highest macro AUC seen so far and the
# weight tuple that produced it.
best_auc, best_w = -1.0, None
# Largest integer weight tried for any single run.
W_MAX = 4  
# Number of runs being blended (columns of O).
M = O.shape[1]
def enum_weights(m, wmax):
    """Yield candidate integer weight tuples of length *m*.

    For a single model the weights 1..wmax are yielded as 1-tuples.  Otherwise
    every tuple over 0..wmax is yielded in itertools.product order, except the
    all-zero tuple.
    """
    from itertools import product

    if m == 1:
        yield from ((weight,) for weight in range(1, wmax + 1))
        return
    # The entries are non-negative, so any(combo) is the same as sum(combo) > 0.
    yield from (combo for combo in product(range(wmax + 1), repeat=m) if any(combo))

# Exhaustive search over the integer weights: score every weighted average of the
# OOF columns and keep the first tuple that reaches the highest macro AUC.
for w in enum_weights(M, W_MAX):
    weight_row = np.array(w, dtype=np.float32)
    denom = max(1, sum(w))
    blend = (O * weight_row).sum(axis=1) / denom
    auc = macro_auc_by_rule(y, rules, blend)
    if auc > best_auc:
        best_auc = auc
        best_w = w

print(f"[Grid] Best CV macro AUC: {best_auc:.6f}  weights: {best_w}")

# Alternative blend: a logistic-regression stacker fitted on the OOF columns and
# itself scored out-of-fold with 5-fold stratified CV.  Best effort: on any
# exception the reason is printed and stack_auc stays None, so the grid blend
# is used.
stack_auc = None
try:
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(5, shuffle=True, random_state=42)
    oof_stack = np.zeros(len(y), np.float32)
    for tr, va in skf.split(O, y):
        clf = LogisticRegression(max_iter=1000, class_weight="balanced")
        clf.fit(O[tr], y[tr])
        oof_stack[va] = clf.predict_proba(O[va])[:,1]
    stack_auc = macro_auc_by_rule(y, rules, oof_stack)
    print(f"[Stack] CV macro AUC: {stack_auc:.6f}")
except Exception as e:
    print("Stacking skipped:", e)

# Keep whichever blend scored higher out-of-fold; stacking wins ties, and the
# grid blend is used whenever stacking did not produce a score.
use_stacking = stack_auc is not None and stack_auc >= best_auc
if not use_stacking:
    print(">> Using GRID-WEIGHTED average.")
    w = np.array(best_w, np.float32)
    oof_final = (O * w).sum(axis=1) / max(1, w.sum())
else:
    print(">> Using STACKED ensemble.")
    oof_final = oof_stack

final_auc = macro_auc_by_rule(y, rules, oof_final)
print(f"Final chosen CV macro AUC: {final_auc:.6f}")
