In [None]:
# ---------------- Config ----------------
# Paths and naming shared by the later cells (data loading, topic cache, checkpoints).
# NOTE: despite the "CSV" in its name, the default dataset path below is an Excel workbook (.xlsx).
DATA_CSV_PATH = "./Datasets/CS_Summ.xlsx"     # <-- set this to your CS-Summ dataset file
SAVE_DIR      = "./Models"                    # where to save checkpoints
TOPICS_DF_DIR = "./Keywords"                  # where keyword cache is stored
DS_TAG        = "_cs"                         # dataset tag in saved folder names

In [None]:
# --- Thread / log silencing: run this BEFORE numpy, torch or tokenizers are imported ---
import os

os.environ.update({
    # Turns off the tokenizers thread pool, which also silences its fork warning.
    "TOKENIZERS_PARALLELISM": "false",
    # Pin every BLAS/OpenMP backend to a single thread (intended to avoid the
    # OpenBLAS warning and potential hangs).
    "OMP_NUM_THREADS": "1",
    "OPENBLAS_NUM_THREADS": "1",
    "MKL_NUM_THREADS": "1",
    "VECLIB_MAXIMUM_THREADS": "1",
    "NUMEXPR_NUM_THREADS": "1",
    # Cosmetic only: hides OpenBLAS log chatter, does not fix anything.
    "OPENBLAS_VERBOSE": "0",
})

# Optional (cosmetic): only show errors from the Hugging Face libraries and
# suppress every Python warning for the rest of the session.
import logging, warnings
logging.getLogger("transformers").setLevel(logging.ERROR)
logging.getLogger("datasets").setLevel(logging.ERROR)
warnings.filterwarnings("ignore")
print("Silencing set: TOKENIZERS_PARALLELISM=false, *NUM_THREADS=1, OPENBLAS_VERBOSE=0")


In [None]:
# Imports this cell needs. Nothing earlier in the notebook provides `random`,
# `np`, `torch` or `set_seed`, so without these lines a fresh-kernel
# "Restart & Run All" stops here with a NameError. They are placed after the
# silencing cell on purpose: the thread env vars must be set before numpy/torch load.
import random

import numpy as np
import torch
from transformers import set_seed

# Fair/equal hyperparameters across models
SEED = 42
NUM_EPOCHS = 10
TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE  = 8
LEARNING_RATE = 5e-5
MAX_SOURCE_LEN = 512   # truncation limit (tokens) for model inputs
MAX_TARGET_LEN = 128   # truncation limit (tokens) for summaries
GRAD_ACCUM_STEPS = 1
WARMUP_RATIO = 0.03
ES_PATIENCE = 2        # early-stopping patience passed to EarlyStoppingCallback

# Keyword special tokens
SPECIAL_TOKENS = {"additional_special_tokens": ["<TEXT>", "<TOPIC>"]}
USE_KEYWORD_CONFIGS = [False, True]  # train noKW and KW variants

# ---------------- Repro ----------------
# Seed every RNG the pipeline touches (Python, NumPy, PyTorch, transformers).
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
set_seed(SEED)

os.makedirs(SAVE_DIR, exist_ok=True)
print("Torch:", torch.__version__, "| CUDA:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA device:", torch.cuda.get_device_name(0))

# ---------------- spaCy NER ----------------
import spacy
try:
    # Keep the pipeline lean; we only need NER for keyword cues
    nlp = spacy.load("en_core_web_sm", exclude=["parser","attribute_ruler","lemmatizer","tagger","senter"])
    nlp.max_length = 2_000_000
    print("spaCy loaded:", nlp.pipe_names)
except Exception as e:
    # Any load failure is reported as a missing model; the original error stays chained.
    raise RuntimeError("spaCy model not installed. Run: python -m spacy download en_core_web_sm") from e

In [None]:
# ===============================
# B) Load & split data
# ===============================
# `pd` and `train_test_split` are not imported by any earlier cell, so import
# them here to keep the notebook runnable on a fresh kernel.
import pandas as pd
from sklearn.model_selection import train_test_split

# Choose the reader from the file extension: the default path is an Excel
# workbook, but the config name/comment invite pointing it at a CSV.
if os.path.splitext(DATA_CSV_PATH)[1].lower() in (".xlsx", ".xls"):
    df = pd.read_excel(DATA_CSV_PATH)
else:
    df = pd.read_csv(DATA_CSV_PATH)
assert {"Text","Abstractive"}.issubset(df.columns), \
    "Dataset must have 'Text' and 'Abstractive' columns."

# basic cleanup: drop rows missing either field and renumber rows 0..n-1
# (later cells key the topic cache on these row labels)
df = df.dropna(subset=["Text","Abstractive"]).reset_index(drop=True)

# 90/10 split; train_df/val_df keep df's row labels as their index
train_df, val_df = train_test_split(df, test_size=0.10, random_state=SEED, shuffle=True)
print(f"Train size: {len(train_df)} | Val size: {len(val_df)}")

In [None]:
# ===============================
# C) Topic cache settings (no file re-read)
# ===============================
import os, numpy as np, pandas as pd

os.makedirs(TOPICS_DF_DIR, exist_ok=True)

# Both cache files share one stem and differ only by extension.
TOPICS_CACHE_PARQUET, TOPICS_CACHE_CSV = (
    os.path.join(TOPICS_DF_DIR, "topics_cache{}.{}".format(DS_TAG, ext))
    for ext in ("parquet", "csv")
)

# True  -> always re-extract topics (the parquet cache is never read).
# False -> reuse the parquet cache when it exists and passes validation.
FORCE_RECOMPUTE_TOPICS = True
# Maximum number of keyword entries kept per document.
MAX_KW = 10


In [None]:
# ===============================
# D) Topic extraction (once) and caching — NO re-reading of XLS/CSV
# ===============================
from tqdm.auto import tqdm

def _extract_keywords_spacy(text: str, nlp, max_kw: int = MAX_KW) -> str:
    """
    Build a keyword cue string from the named entities that `nlp` finds in `text`.

    Entities are taken in document order. An entity is skipped when its stripped
    text is empty or contains any digit, and duplicates are removed
    case-insensitively (the first spelling seen is kept). Collection stops once
    `max_kw` entries have been gathered; the cap is only checked after an entity
    has been considered, so at least one usable entity is returned even for
    `max_kw <= 0`.

    Returns the kept entities joined as 'k1 ; k2 ; ...' ('' when none qualify).
    """
    # Insertion-ordered mapping: lowercase key -> original surface form.
    chosen = {}
    for entity in nlp(text).ents:
        surface = entity.text.strip()
        if surface == "" or any(ch.isdigit() for ch in surface):
            continue
        folded = surface.lower()
        if folded not in chosen:
            chosen[folded] = surface
        if len(chosen) >= max_kw:
            break
    return " ; ".join(chosen.values())

# Work strictly from the df you already loaded earlier (no file re-read).

# Guard against re-running this cell on already-processed splits: after one run
# train_df/val_df carry a 'topics' column and a reset index, so a second merge
# would key on the wrong row labels and produce topics_x/topics_y columns.
if "topics" in train_df.columns or "topics" in val_df.columns:
    raise RuntimeError(
        "train_df/val_df already contain a 'topics' column. "
        "Re-run section B (load & split) before re-running this cell."
    )

df_orig = df.copy()
df_orig = df_orig.dropna(subset=["Text","Abstractive"])

# we'll key the cache by the **original row index** (orig_idx)
df_orig = df_orig.reset_index().rename(columns={"index": "orig_idx"})

use_cache = False
if (not FORCE_RECOMPUTE_TOPICS) and os.path.exists(TOPICS_CACHE_PARQUET):
    try:
        cached = pd.read_parquet(TOPICS_CACHE_PARQUET)
        # The cache is usable only if it has exactly one row per current row label.
        # Duplicate keys would silently duplicate rows in the left merge below.
        cache_ok = (
            {"orig_idx","topics"}.issubset(cached.columns)
            and cached["orig_idx"].is_unique
            and set(cached["orig_idx"]) == set(df_orig["orig_idx"])
        )
        if cache_ok:
            topics_df = cached[["orig_idx","topics"]].copy()
            use_cache = True
            print("► Using cached topics.")
        else:
            print("► Topics cache does not match the current data; recomputing.")
    except Exception as cache_err:
        # Best effort: an unreadable cache is reported and then rebuilt below.
        print(f"► Could not read topics cache ({cache_err!r}); recomputing.")

if not use_cache:
    print("► Extracting topics with spaCy NER (one-time)...")
    topics = []
    for txt in tqdm(df_orig["Text"].tolist(), total=len(df_orig)):
        topics.append(_extract_keywords_spacy(str(txt), nlp, max_kw=MAX_KW))
    topics_df = pd.DataFrame({"orig_idx": df_orig["orig_idx"], "topics": topics})
    topics_df.to_parquet(TOPICS_CACHE_PARQUET, index=False)
    topics_df.to_csv(TOPICS_CACHE_CSV, index=False)
    print(f"Saved topics cache: {TOPICS_CACHE_PARQUET}")

# Attach topics to your current split frames by **matching original indexes**
n_train_rows, n_val_rows = len(train_df), len(val_df)

train_df = train_df.copy()
val_df   = val_df.copy()

train_df["orig_idx"] = train_df.index
val_df["orig_idx"]   = val_df.index

train_df = train_df.merge(topics_df, on="orig_idx", how="left")
val_df   = val_df.merge(topics_df,   on="orig_idx", how="left")

# Sanity checks: the merge must neither add rows nor leave a row without topics.
assert len(train_df) == n_train_rows and len(val_df) == n_val_rows, \
    "Topic merge changed the number of rows (duplicate orig_idx in topics_df?)."
assert train_df["topics"].notna().all() and val_df["topics"].notna().all(), \
    "Some rows received no topics; the split index no longer matches orig_idx."

# tidy
train_df = train_df.drop(columns=["orig_idx"]).reset_index(drop=True)
val_df   = val_df.drop(columns=["orig_idx"]).reset_index(drop=True)

In [None]:
# ===============================
# E) Build inputs (KW vs noKW)  — <TOPIC> first, then <TEXT>
# ===============================
from datasets import Dataset

def build_input(text: str, topics: str, use_keywords: bool) -> str:
    """
    Format one model input.

    With `use_keywords` the result is "<TOPIC> k1 ; k2 ; ... <TEXT> original"
    (topics are stripped, a falsy `topics` becomes an empty string, and the
    whole string is stripped). Without it, `text` is returned untouched.
    """
    if use_keywords:
        cleaned_topics = topics.strip() if topics else ""
        return "<TOPIC> {} <TEXT> {}".format(cleaned_topics, text).strip()
    return text

def df_to_hf_dataset(df_: pd.DataFrame, use_keywords: bool) -> Dataset:
    """
    Convert a frame with 'Text' and 'Abstractive' columns into a Dataset with
    'source' and 'target' string columns.

    When `use_keywords` is True the 'topics' column is prepended to each source
    via `build_input`. A missing topic (None/NaN, e.g. from an unmatched left
    merge) is treated as "no keywords" instead of being stringified into the
    literal keyword "nan"/"None". When `use_keywords` is False the 'topics'
    column is not read, so it does not need to exist.
    """
    texts = df_["Text"].tolist()
    targets = df_["Abstractive"].tolist()
    topic_cells = df_["topics"].tolist() if use_keywords else [""] * len(texts)

    src, tgt = [], []
    for s, t, k in zip(texts, targets, topic_cells):
        topics = "" if pd.isna(k) else str(k)
        src.append(build_input(str(s), topics, use_keywords=use_keywords))
        tgt.append(str(t))
    return Dataset.from_dict({"source": src, "target": tgt})

# Plain-text variants first, then the keyword-augmented ones (train before val each time).
train_ds_no_kw, val_ds_no_kw = (
    df_to_hf_dataset(train_df, use_keywords=False),
    df_to_hf_dataset(val_df, use_keywords=False),
)
train_ds_kw, val_ds_kw = (
    df_to_hf_dataset(train_df, use_keywords=True),
    df_to_hf_dataset(val_df, use_keywords=True),
)

In [None]:
# ===============================
# F) Tokenization helpers
# ===============================
from dataclasses import dataclass
from typing import Dict, List
from transformers import AutoTokenizer

@dataclass
class TokenizeConfig:
    """Bundle of a tokenizer and the truncation limits handed to `tokenize_function`."""
    tokenizer: AutoTokenizer   # tokenizer instance called on both source and target text
    max_source_len: int        # intended truncation length for the source side
    max_target_len: int        # intended truncation length for the target side

def tokenize_function(samples: Dict[str, List[str]], cfg: "TokenizeConfig"):
    """
    Tokenize one batch for seq2seq training.

    Args:
        samples: batch with "source" and "target" lists of strings.
        cfg: object exposing `tokenizer`, `max_source_len` and `max_target_len`.

    Returns:
        The tokenizer output for the sources, with an added "labels" entry
        holding the target token ids. No padding is applied here.

    The truncation limits are taken from `cfg` rather than from the module-level
    MAX_SOURCE_LEN / MAX_TARGET_LEN constants, so the values stored in the
    config are the ones actually applied.
    """
    tok = cfg.tokenizer(
        samples["source"],
        padding=False,
        truncation=True,
        max_length=cfg.max_source_len,
    )
    # `text_target=` makes the tokenizer encode the summaries as decoder targets.
    labels = cfg.tokenizer(
        text_target=samples["target"],
        padding=False,
        truncation=True,
        max_length=cfg.max_target_len,
    )
    tok["labels"] = labels["input_ids"]
    return tok

In [None]:
# ===============================
# G) Trainer with early stopping on eval_loss
# ===============================
import torch, os
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    EncoderDecoderModel, BertTokenizer,
    DataCollatorForSeq2Seq, TrainingArguments, Trainer, EarlyStoppingCallback
)

PATIENCE = 2  # epochs with no improvement; NOTE(review): not read by the visible cells — EarlyStoppingCallback is given ES_PATIENCE instead

def train_and_save_seq2seq(
    model_name: str,
    pretrained_id: str,
    tokenizer_cls,
    model_cls,
    train_ds: Dataset,
    val_ds: Dataset,
    add_special_tokens: bool,
    save_subdir: str,
):
    """
    Fine-tune one seq2seq model with per-epoch evaluation and early stopping on
    eval_loss, then save the best model and its tokenizer to `save_subdir`.

    Args:
        model_name: "t5", "bart" or "bert2bert"; selects how the model is built.
        pretrained_id: checkpoint id for the tokenizer and model. For
            "bert2bert" it is used for both the encoder and the decoder.
        tokenizer_cls: tokenizer class to load with; None means AutoTokenizer
            (BertTokenizer for "bert2bert").
        model_cls: accepted for signature compatibility; not used.
        train_ds, val_ds: datasets with string columns "source" and "target".
        add_special_tokens: when True, SPECIAL_TOKENS are added to the tokenizer
            and the embedding matrices are resized to match.
        save_subdir: output directory (created if missing); Trainer checkpoints
            go to its "hf_runs" sub-folder.

    Raises:
        ValueError: if `model_name` is not one of the supported names.
    """
    # Fail fast, before anything is loaded or created on disk.
    if model_name not in ("t5", "bart", "bert2bert"):
        raise ValueError("Unknown model_name")

    os.makedirs(save_subdir, exist_ok=True)

    # 1) Tokenizer (loaded exactly once)
    if tokenizer_cls is None:
        tokenizer_cls = BertTokenizer if model_name == "bert2bert" else AutoTokenizer
    tokenizer = tokenizer_cls.from_pretrained(pretrained_id, use_fast=True)

    # 2) Model
    if model_name == "bert2bert":
        model = EncoderDecoderModel.from_encoder_decoder_pretrained(pretrained_id, pretrained_id)
        # The encoder-decoder wrapper needs its special-token ids set explicitly.
        model.config.decoder_start_token_id = tokenizer.cls_token_id
        model.config.eos_token_id = tokenizer.sep_token_id
        model.config.pad_token_id = tokenizer.pad_token_id
        # Generation defaults stored on the model config.
        model.config.max_length = MAX_TARGET_LEN
        model.config.min_length = 5
        model.config.no_repeat_ngram_size = 3
        model.config.early_stopping = True
        model.config.length_penalty = 2.0
        model.config.num_beams = 4
    else:
        model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_id)

    # 3) Special tokens for keyword format
    if add_special_tokens:
        tokenizer.add_special_tokens(SPECIAL_TOKENS)
        if model_name == "bert2bert":
            # Encoder and decoder each own an embedding matrix; resize both.
            new_size = len(tokenizer)
            model.encoder.resize_token_embeddings(new_size)
            model.decoder.resize_token_embeddings(new_size)
        else:
            model.resize_token_embeddings(len(tokenizer))

    # 4) Tokenize
    def _tokenize_batch(batch):
        """Encode sources and targets of one batch; padding is left to the collator."""
        encoded = tokenizer(
            batch["source"], padding=False, truncation=True, max_length=MAX_SOURCE_LEN,
        )
        # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
        target = tokenizer(
            text_target=batch["target"], padding=False, truncation=True, max_length=MAX_TARGET_LEN,
        )
        encoded["labels"] = target["input_ids"]
        return encoded

    tokenized_train = train_ds.map(_tokenize_batch, batched=True, remove_columns=train_ds.column_names)
    tokenized_val   = val_ds.map(_tokenize_batch, batched=True, remove_columns=val_ds.column_names)

    # 5) Trainer with EVAL EACH EPOCH + EARLY STOPPING + EPOCH LOGGING
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
    args = TrainingArguments(
        output_dir=os.path.join(save_subdir, "hf_runs"),
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=EVAL_BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        weight_decay=0.01,
        gradient_accumulation_steps=GRAD_ACCUM_STEPS,
        warmup_ratio=WARMUP_RATIO,

        # Evaluate, log and checkpoint once per epoch; keep the checkpoint with
        # the lowest eval_loss and reload it when training ends.
        evaluation_strategy="epoch",
        logging_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        save_total_limit=2,

        # NOTE(review): T5 checkpoints are reported to be unstable under fp16;
        # if eval_loss turns NaN, consider bf16 or fp32 — confirm on your hardware.
        fp16=torch.cuda.is_available(),
        report_to="none",
        seed=SEED,
        dataloader_num_workers=0,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        data_collator=data_collator,
        tokenizer=tokenizer,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=ES_PATIENCE, early_stopping_threshold=0.0)],
    )

    trainer.train()
    model.save_pretrained(save_subdir)
    tokenizer.save_pretrained(save_subdir)
    print(f"✓ Saved: {save_subdir}")

In [None]:
# BART base is safer VRAM-wise; switch to "facebook/bart-large-cnn" if your GPU can handle it.
# This id is passed as `pretrained_id` to the BART runs, and its basename becomes part of the save-folder name.
BART_ID = "facebook/bart-base"

In [None]:
# ===============================
# H) Run training per model (isolated try/except per run)
# ===============================

# ---- T5-base: one run without keywords, one with ----
for use_kw, tag in ((False, "noKW"), (True, "KW")):
    try:
        # Pick the dataset pair that matches this variant.
        if use_kw:
            ds_train, ds_val = train_ds_kw, val_ds_kw
        else:
            ds_train, ds_val = train_ds_no_kw, val_ds_no_kw
        subdir = os.path.join(SAVE_DIR, "t5-base{}_{}_NEW".format(DS_TAG, tag))
        train_and_save_seq2seq(
            model_name="t5",
            pretrained_id="t5-base",
            tokenizer_cls=None,
            model_cls=AutoModelForSeq2SeqLM,   # unused by the helper; kept for its signature
            train_ds=ds_train,
            val_ds=ds_val,
            add_special_tokens=use_kw,
            save_subdir=subdir,
        )
    except Exception as e:
        # A failed variant is reported and the loop moves on to the next one.
        print(f"❌ T5 {tag} failed:", repr(e))

In [None]:
# ===============================
# Two more T5 variants — KW+ and Prefix
# ===============================
import re
from typing import List
import spacy

# ---- (1) A "full" spaCy pipeline for KW+ (needs parser for noun_chunks)
try:
    # Full pipeline (parser included), because noun_chunks needs the dependency parse.
    nlp_kwplus = spacy.load("en_core_web_sm")
    nlp_kwplus.max_length = 2_000_000
    # Add a cheap sentence splitter only when neither splitter is already present.
    if not ({"sentencizer", "senter"} & set(nlp_kwplus.pipe_names)):
        nlp_kwplus.add_pipe("sentencizer")
except Exception as e:
    install_hint = (
        "spaCy full model not available. Install with:\n"
        "python -m spacy download en_core_web_sm"
    )
    raise RuntimeError(install_hint) from e

def normalize_kw(s: str) -> str:
    """Collapse every run of whitespace in `s` to one space and trim both ends."""
    return re.sub(r"\s+", " ", s).strip()

def extract_keywords_spacy_plus(text: str, max_kw: int = MAX_KW) -> str:
    """
    KW+ cues: named entities of selected types followed by noun chunks.

    Candidates are whitespace-normalized with `normalize_kw`; empty ones and any
    containing a digit are skipped. Duplicates are removed case-insensitively,
    keeping the first spelling seen (the output itself is not lowercased).
    Collection stops once `max_kw` entries are gathered; the cap is checked only
    after a candidate has been considered.

    Returns the kept phrases joined as 'k1 ; k2 ; ...'.
    """
    doc = nlp_kwplus(str(text))

    # Entity types treated as salient; all other entity labels are ignored.
    salient_labels = {"PERSON","ORG","GPE","LOC","NORP","FAC","PRODUCT","EVENT","WORK_OF_ART","LAW","LANGUAGE"}
    entity_texts = [ent.text for ent in doc.ents if ent.label_ in salient_labels]
    chunk_texts = [chunk.text for chunk in doc.noun_chunks]

    # Insertion-ordered mapping: lowercase key -> cleaned phrase.
    chosen = {}
    for raw in entity_texts + chunk_texts:
        phrase = normalize_kw(raw)
        if not phrase or any(ch.isdigit() for ch in phrase):
            continue
        folded = phrase.lower()
        if folded not in chosen:
            chosen[folded] = phrase
        if len(chosen) >= max_kw:
            break
    return " ; ".join(chosen.values())

# ---- (2) T5-style prefix formatter with keyword dropout (training only)
def keyword_dropout(topics: str, p_drop: float = 0.3, rng=None) -> str:
    """
    Randomly drop keywords from a ';'-separated topic string.

    Args:
        topics: keywords separated by ';' (None or '' yields '').
        p_drop: probability of dropping each keyword. With `p_drop <= 0` every
            keyword is kept and no random numbers are drawn, so evaluation
            inputs are exact and the RNG state is left untouched.
        rng: optional object with a `random()` method (e.g. `random.Random(seed)`)
            for reproducible dropout; defaults to the global `random` module.

    Returns:
        The surviving keywords joined with ' ; '. If every keyword would be
        dropped, the first half of the list (at least one keyword) is kept.

    Note: the dropout is applied once per call. Applied while a dataset is
    being built, the same dropped set is reused for every epoch.
    """
    toks = [t.strip() for t in (topics or "").split(";") if t.strip()]
    if not toks:
        return ""
    if p_drop <= 0:
        return " ; ".join(toks)
    import random
    draw = (rng if rng is not None else random).random
    kept = [t for t in toks if draw() > p_drop]
    if not kept:
        kept = toks[: max(1, len(toks)//2)]
    return " ; ".join(kept)

def build_input_kwplus(text: str, topics_plus: str) -> str:
    """Format a KW+ input with the same <TOPIC>/<TEXT> control tokens as the KW baseline."""
    framed = "<TOPIC> {} <TEXT> {}".format(topics_plus, text)
    return framed.strip()

def build_input_prefix(text: str, topics: str, train_mode: bool) -> str:
    """
    Format a plain-text, T5-style prompt: "summarize: topics: ...  context: ...".

    No control tokens are used, so runs on this format are started with
    add_special_tokens=False. Keyword dropout (rate 0.3) is applied only when
    `train_mode` is True; otherwise the rate is 0.0.
    """
    if train_mode:
        drop_rate = 0.3
    else:
        drop_rate = 0.0
    t = keyword_dropout(topics, drop_rate)
    return "summarize: topics: {}  context: {}".format(t, text).strip()

# -----------------------------------------
# Compute KW+ topics for CURRENT splits only
# (does not touch your original NER cache)
# -----------------------------------------
print("► Building KW+ topics for train/val …")
train_df = train_df.copy()
val_df   = val_df.copy()

train_df["topics_plus"] = [extract_keywords_spacy_plus(x, MAX_KW) for x in tqdm(train_df["Text"].tolist())]
val_df["topics_plus"]   = [extract_keywords_spacy_plus(x, MAX_KW) for x in tqdm(val_df["Text"].tolist())]

# Fallback if the base 'topics' column is missing or incomplete in EITHER split
# (checking train_df alone would let missing validation topics slip through).
if any(
    ("topics" not in frame.columns) or frame["topics"].isna().any()
    for frame in (train_df, val_df)
):
    print("► Base 'topics' not found — recomputing simple NER topics for Prefix variant …")
    train_df["topics"] = [_extract_keywords_spacy(str(x), nlp, MAX_KW) for x in tqdm(train_df["Text"].tolist())]
    val_df["topics"]   = [_extract_keywords_spacy(str(x), nlp, MAX_KW) for x in tqdm(val_df["Text"].tolist())]

# -----------------------------------------
# Build HF datasets for the two new variants
# -----------------------------------------
from datasets import Dataset

# KW+ datasets (use control tokens; will require SPECIAL_TOKENS)
def df_to_hf_kwplus(df_):
    """
    Build a Dataset of ('source', 'target') strings for the KW+ variant.

    Each source is `build_input_kwplus` applied to the row's 'Text' and
    'topics_plus'; each target is the row's 'Abstractive' as a string.
    """
    rows = zip(df_["Text"].tolist(), df_["Abstractive"].tolist(), df_["topics_plus"].tolist())
    pairs = [
        (build_input_kwplus(str(body), str(cues)), str(summary))
        for body, summary, cues in rows
    ]
    return Dataset.from_dict({
        "source": [source for source, _ in pairs],
        "target": [target for _, target in pairs],
    })

# Prefix datasets (T5 prompt, no special tokens)
def df_to_hf_prefix(df_, train_mode: bool):
    """
    Build a Dataset of ('source', 'target') strings for the prefix variant.

    Each source is `build_input_prefix` applied to the row's 'Text' and
    'topics' (with `train_mode` forwarded); each target is the row's
    'Abstractive' as a string. Rows are processed in frame order.
    """
    rows = zip(df_["Text"].tolist(), df_["Abstractive"].tolist(), df_["topics"].tolist())
    pairs = [
        (build_input_prefix(str(body), str(cues), train_mode=train_mode), str(summary))
        for body, summary, cues in rows
    ]
    return Dataset.from_dict({
        "source": [source for source, _ in pairs],
        "target": [target for _, target in pairs],
    })

# KW+ pair, then the prefix pair (train before val each time).
train_ds_kwplus, val_ds_kwplus = df_to_hf_kwplus(train_df), df_to_hf_kwplus(val_df)

# Keyword dropout is active only for the training split.
train_ds_prefix, val_ds_prefix = (
    df_to_hf_prefix(train_df, train_mode=True),
    df_to_hf_prefix(val_df, train_mode=False),
)

# -----------------------------------------
# Train the two extra T5-base variants:
#   KWplus   -> <TOPIC>/<TEXT> control tokens, so SPECIAL_TOKENS are required
#   KWprefix -> plain-text T5 prompt, no extra tokens
# -----------------------------------------
for variant, ds_train, ds_val, needs_tokens in (
    ("KWplus",   train_ds_kwplus, val_ds_kwplus, True),
    ("KWprefix", train_ds_prefix, val_ds_prefix, False),
):
    try:
        subdir = os.path.join(SAVE_DIR, "t5-base{}_{}".format(DS_TAG, variant))
        train_and_save_seq2seq(
            model_name="t5",
            pretrained_id="t5-base",
            tokenizer_cls=None,
            model_cls=AutoModelForSeq2SeqLM,
            train_ds=ds_train,
            val_ds=ds_val,
            add_special_tokens=needs_tokens,
            save_subdir=subdir,
        )
    except Exception as e:
        # A failed variant is reported and the next one still runs.
        print(f"❌ T5 {variant} failed:", repr(e))

print("Done training new T5 variants: KWplus & KWprefix.")

In [None]:
# ---- BART: one run without keywords, one with ----
for use_kw, tag in ((False, "noKW"), (True, "KW")):
    try:
        # Pick the dataset pair that matches this variant.
        if use_kw:
            ds_train, ds_val = train_ds_kw, val_ds_kw
        else:
            ds_train, ds_val = train_ds_no_kw, val_ds_no_kw
        # Folder name starts with the last path component of the checkpoint id.
        subdir = os.path.join(SAVE_DIR, "{}{}_{}_NEW".format(os.path.basename(BART_ID), DS_TAG, tag))
        train_and_save_seq2seq(
            model_name="bart",
            pretrained_id=BART_ID,
            tokenizer_cls=None,
            model_cls=AutoModelForSeq2SeqLM,
            train_ds=ds_train,
            val_ds=ds_val,
            add_special_tokens=use_kw,
            save_subdir=subdir,
        )
    except Exception as e:
        # A failed variant is reported and the loop moves on to the next one.
        print(f"❌ BART {tag} failed:", repr(e))

In [None]:
# ---- BERT2BERT: one run without keywords, one with ----
for use_kw, tag in ((False, "noKW"), (True, "KW")):
    try:
        # Pick the dataset pair that matches this variant.
        if use_kw:
            ds_train, ds_val = train_ds_kw, val_ds_kw
        else:
            ds_train, ds_val = train_ds_no_kw, val_ds_no_kw
        subdir = os.path.join(SAVE_DIR, "bert2bert{}_{}_NEW".format(DS_TAG, tag))
        train_and_save_seq2seq(
            model_name="bert2bert",
            pretrained_id="bert-base-uncased",
            tokenizer_cls=BertTokenizer,            # ensures the BERT tokenizer is used
            model_cls=EncoderDecoderModel,          # unused by the helper; kept for its signature
            train_ds=ds_train,
            val_ds=ds_val,
            add_special_tokens=use_kw,              # KW run resizes encoder and decoder embeddings
            save_subdir=subdir,
        )
    except Exception as e:
        # A failed variant is reported and the loop moves on to the next one.
        print(f"❌ BERT2BERT {tag} failed:", repr(e))