In [None]:
# Sanity check: confirm PyTorch can see a CUDA device before doing any GPU work.
import torch
torch.cuda.is_available()


True

In [None]:
# Mount Google Drive at /content/drive (Colab-only); later cells read and write under this path.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
from transformers import AutoTokenizer

In [None]:
# Load the cleaned NewsSumm CSV from the mounted Drive into `df`.
file_to_check = "/content/drive/MyDrive/NewsSumm_perfect_clean.csv"
df=pd.read_csv(file_to_check)

In [None]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,newspaper_name,published_date,headline,article_text,human_summary,news_category,article_clean,summary_clean,article_tokens,summary_tokens,compression_ratio,cluster_id
0,Indian Express,2023-10-08,When Congress fell short of a majority despite...,As India-Canada relations are strained in the ...,"In summary, ""I am not thinking about the India...",International News,As India-Canada relations are strained in the ...,"In summary, ""I am not thinking about the India...",308,95,0.308442,75523
1,Indian Express,2023-10-16,"""I am not thinking about the India-Canada rift...",Isro chairman S Somanath on Sunday said expert...,"In summary, ""I am not thinking about the India...",Science and Technology,Isro chairman S Somanath on Sunday said expert...,"In summary, ""I am not thinking about the India...",434,95,0.218894,75523
2,Financial Express,2018-09-06,Andhra Pradesh to punish tax officials for unr...,In an attempt to minimize the harassment faced...,Andhra Pradesh has formed four nodal committee...,Business and Finance,In an attempt to minimize the harassment faced...,Andhra Pradesh has formed four nodal committee...,297,86,0.289562,20930
3,Indian Express,2011-04-03,The World At Our Feet,The headline of the Times of India on April ...,India defeated Sri Lanka in the final of the C...,Sports,"The headline of the Times of India on April 2,...",India defeated Sri Lanka in the final of the C...,175,63,0.36,79366
4,Hindustan Times,2001-01-01,2001 Parliament attack: ‘A shot missed me and ...,"On December 13, 2001, a group of five armed m...",The 2001 Parliament attack was a major terrori...,terrorist attack,"On December 13, 2001, a group of five armed me...",The 2001 Parliament attack was a major terrori...,319,118,0.369906,143292


In [None]:
# Remove leading/trailing whitespace from the column names (mutates `df` in place),
# then print the resulting column index.
df.columns = df.columns.str.strip()
print(df.columns)


Index(['newspaper_name', 'published_date', 'headline', 'article_text',
       'human_summary', 'news_category', 'article_clean', 'summary_clean',
       'article_tokens', 'summary_tokens', 'compression_ratio', 'cluster_id'],
      dtype='object')


In [None]:
# Keep only clusters that contain at least two articles (multi-document setting).
cluster_sizes = df.groupby('cluster_id').size()
valid_clusters = cluster_sizes.index[cluster_sizes.to_numpy() >= 2]

keep_mask = df['cluster_id'].isin(valid_clusters)
df_multi = df.loc[keep_mask].reset_index(drop=True)

# Summary statistics of the filtered frame.
docs_per_cluster = df_multi.groupby('cluster_id').size()
print("Filtered rows:", len(df_multi))
print("Filtered clusters:", df_multi['cluster_id'].nunique())
print("Avg docs per cluster:", docs_per_cluster.mean())


Filtered rows: 4335
Filtered clusters: 2060
Avg docs per cluster: 2.104368932038835


In [None]:
from sklearn.model_selection import train_test_split

# Split at cluster level (not row level) so that all articles of one cluster
# end up in the same partition: 80% train, then the remaining 20% halved
# into validation and test.
clusters = df_multi['cluster_id'].unique()
train_clusters, temp_clusters = train_test_split(
    clusters, test_size=0.2, random_state=42
)
val_clusters, test_clusters = train_test_split(
    temp_clusters, test_size=0.5, random_state=42
)

# Materialise one frame per partition from its cluster ids.
cluster_col = df_multi['cluster_id']
train_df, val_df, test_df = (
    df_multi[cluster_col.isin(ids)]
    for ids in (train_clusters, val_clusters, test_clusters)
)

for caption, split_df in (
    ("Train clusters:", train_df),
    ("Val clusters:", val_df),
    ("Test clusters:", test_df),
):
    print(caption, split_df['cluster_id'].nunique())


Train clusters: 1648
Val clusters: 206
Test clusters: 206


In [None]:
# Persist each partition to Drive as CSV (row index omitted).
for split_df, csv_path in (
    (train_df, "/content/drive/MyDrive/NewsSumm_multi_train.csv"),
    (val_df, "/content/drive/MyDrive/NewsSumm_multi_val.csv"),
    (test_df, "/content/drive/MyDrive/NewsSumm_multi_test.csv"),
):
    split_df.to_csv(csv_path, index=False)


In [None]:
def build_cluster_samples(df):
    """Collapse every cluster of articles into one multi-document sample.

    Args:
        df: DataFrame with at least the columns ``cluster_id``,
            ``article_clean`` and ``summary_clean``. If a ``published_date``
            column is present, the articles of a cluster are ordered by it.

    Returns:
        A list with one dict per cluster (in ``groupby`` key order), each
        holding:
          * ``cluster_id`` - the cluster key,
          * ``source``     - the cluster's articles, each prefixed with
                             ``"[DOC]\\n"`` and joined by newlines,
          * ``summary``    - ``summary_clean`` of the first row of the
                             (date-ordered) cluster.
    """
    samples = []
    for cid, group in df.groupby("cluster_id"):
        if "published_date" in group.columns:
            try:
                # Stable sort: rows sharing a date keep their original order,
                # so the chosen summary and document order are deterministic.
                group = group.sort_values("published_date", kind="stable")
            except TypeError:
                # Date values of mixed, non-comparable types: fall back to
                # the original row order instead of hiding every error.
                pass

        # Skip missing articles; formatting NaN would inject the text "nan".
        docs = [f"[DOC]\n{text}" for text in group["article_clean"].dropna()]

        samples.append({
            "cluster_id": cid,
            "source": "\n".join(docs),
            "summary": group.iloc[0]["summary_clean"],
        })
    return samples


In [None]:
# Build one multi-document sample per cluster for every partition.
train_samples, val_samples, test_samples = (
    build_cluster_samples(split) for split in (train_df, val_df, test_df)
)

for caption, split_samples in (
    ("Train samples:", train_samples),
    ("Val samples:", val_samples),
    ("Test samples:", test_samples),
):
    print(caption, len(split_samples))


Train samples: 1648
Val samples: 206
Test samples: 206


In [None]:
!pip install transformers datasets accelerate evaluate rouge-score sentencepiece


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=34f573a770181cc6ae4465c3918aedf5e7799af4ba1818e72242c3abdb3bfed1
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score, evaluate
Successfully installed evaluate-0.4.6 rouge-score-0.1.2


In [None]:
pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [None]:
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
)
from datasets import Dataset
import numpy as np
import evaluate
import torch

# NOTE(review): a later cell in this notebook redefines MODEL_NAME, the length
# limits, tokenizer, model, datasets and `preprocess` (with MAX_INPUT_LEN = 512).
# Apart from loading the metrics, this cell's results are overwritten there.
MODEL_NAME = "allenai/PRIMERA"
MAX_INPUT_LEN = 1000
MAX_TARGET_LEN = 256
LR = 2e-5

# Evaluation metrics.
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# memory saver
model.gradient_checkpointing_enable()

# train_samples = [{"source": "...", "summary": "..."}]
# val_samples   = [{"source": "...", "summary": "..."}]

train_ds = Dataset.from_list(train_samples)
val_ds   = Dataset.from_list(val_samples)

def preprocess(batch):
    """Tokenize a batch of samples for seq2seq training.

    Args:
        batch: mapping with the list-valued keys ``"source"`` and ``"summary"``.

    Returns:
        The tokenizer output for the sources (truncated to ``MAX_INPUT_LEN``)
        with an added ``"labels"`` entry holding the summary token ids
        (truncated to ``MAX_TARGET_LEN``).
    """
    inputs = tokenizer(
        batch["source"],
        max_length=MAX_INPUT_LEN,
        truncation=True
    )

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context.
    labels = tokenizer(
        text_target=batch["summary"],
        max_length=MAX_TARGET_LEN,
        truncation=True
    )

    # No padding is applied here, so there are no pad ids to mask;
    # DataCollatorForSeq2Seq pads the labels with -100 at batch time.
    inputs["labels"] = labels["input_ids"]
    return inputs

train_data = train_ds.map(
    preprocess, batched=True, remove_columns=train_ds.column_names
)
val_data = val_ds.map(
    preprocess, batched=True, remove_columns=val_ds.column_names
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/20.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/283 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.79G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.79G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/197 [00:00<?, ?B/s]

Map:   0%|          | 0/1648 [00:00<?, ? examples/s]



Map:   0%|          | 0/206 [00:00<?, ? examples/s]

In [None]:
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
)
from datasets import Dataset
import torch

# =========================
# CONFIG
# =========================
MODEL_NAME = "allenai/PRIMERA"
MAX_INPUT_LEN = 512
MAX_TARGET_LEN = 256
LR = 2e-5
EPOCHS = 2
BATCH_SIZE = 1
GRAD_ACC = 4  # effective batch = 4
LOGGING_STEPS = 50

# =========================
# DATASET
# =========================
# train_samples = [{"source": "...", "summary": "..."}]
# val_samples = [{"source": "...", "summary": "..."}]

train_ds = Dataset.from_list(train_samples)
val_ds   = Dataset.from_list(val_samples)

# =========================
# TOKENIZER + MODEL
# =========================
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model.gradient_checkpointing_enable()  # save VRAM

# =========================
# PREPROCESS FUNCTION
# =========================
def preprocess(batch):
    """Tokenize a batch of samples for seq2seq training.

    Args:
        batch: mapping with the list-valued keys ``"source"`` and ``"summary"``.

    Returns:
        The tokenizer output for the sources (truncated to ``MAX_INPUT_LEN``)
        with an added ``"labels"`` entry holding the summary token ids
        (truncated to ``MAX_TARGET_LEN``).
    """
    inputs = tokenizer(
        batch["source"],
        max_length=MAX_INPUT_LEN,
        truncation=True
    )

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context.
    labels = tokenizer(
        text_target=batch["summary"],
        max_length=MAX_TARGET_LEN,
        truncation=True
    )

    # No padding is applied here, so there are no pad ids to mask;
    # DataCollatorForSeq2Seq pads the labels with -100 at batch time.
    inputs["labels"] = labels["input_ids"]
    return inputs

train_data = train_ds.map(preprocess, batched=True, remove_columns=train_ds.column_names)
# NOTE(review): val_data is tokenized here but this cell's Trainer does not evaluate on it.
val_data = val_ds.map(preprocess, batched=True, remove_columns=val_ds.column_names)

# =========================
# DATA COLLATOR
# =========================
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# =========================
# TRAINING ARGUMENTS
# =========================
training_args = TrainingArguments(
    output_dir="./primera_train",
    overwrite_output_dir=True,

    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC,
    num_train_epochs=EPOCHS,

    learning_rate=LR,
    lr_scheduler_type="linear",
    warmup_ratio=0.05,

    fp16=True,
    logging_steps=LOGGING_STEPS,       # print every 50 steps
    log_level="info",
    save_strategy="no",                # no intermediate checkpoints; saved once below
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    data_collator=data_collator,
    # `processing_class` is the non-deprecated name of the former `tokenizer` argument.
    processing_class=tokenizer,
)

print("🚀 Starting training...")
trainer.train()

# Save model and tokenizer side by side so the folder can be reloaded with from_pretrained.
trainer.save_model("./primera_finetuned")
tokenizer.save_pretrained("./primera_finetuned")
print("✅ Training finished and model saved!")


Map:   0%|          | 0/1648 [00:00<?, ? examples/s]

Map:   0%|          | 0/206 [00:00<?, ? examples/s]

  trainer = Trainer(
Using auto half precision backend


🚀 Starting training...


***** Running training *****
  Num examples = 1,648
  Num Epochs = 2
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 824
  Number of trainable parameters = 447,219,712


Step,Training Loss
50,1.601
100,1.4744
150,1.5113
200,1.3476
250,1.3374
300,1.216
350,1.3421
400,1.2705
450,1.0492
500,1.0108


Input ids are automatically padded from 329 to 512 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 456 to 512 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 428 to 512 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 375 to 512 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 484 to 512 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 307 to 512 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 498 to 512 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 492 to 512 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 461 to 512 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 353 to 512 to be a multiple of `config.att

✅ Training finished and model saved!


In [None]:
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
)
from datasets import Dataset

# =========================
# CONFIG
# =========================
MODEL_PATH = "./primera_finetuned"   # load trained model
MAX_INPUT_LEN = 512
MAX_TARGET_LEN = 256
LR = 2e-5
EPOCHS = 1
BATCH_SIZE = 1
GRAD_ACC = 4
LOGGING_STEPS = 50

# =========================
# DATASET
# =========================
# train_samples = [{"source": "...", "summary": "..."}]

train_ds = Dataset.from_list(train_samples)

# =========================
# LOAD TOKENIZER + MODEL
# =========================
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)
model.gradient_checkpointing_enable()

# =========================
# PREPROCESS
# =========================
def preprocess(batch):
    """Tokenize a batch of samples for seq2seq training.

    Args:
        batch: mapping with the list-valued keys ``"source"`` and ``"summary"``.

    Returns:
        The tokenizer output for the sources (truncated to ``MAX_INPUT_LEN``)
        with an added ``"labels"`` entry holding the summary token ids
        (truncated to ``MAX_TARGET_LEN``).
    """
    inputs = tokenizer(
        batch["source"],
        max_length=MAX_INPUT_LEN,
        truncation=True
    )

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context.
    labels = tokenizer(
        text_target=batch["summary"],
        max_length=MAX_TARGET_LEN,
        truncation=True
    )

    # No padding is applied here, so there are no pad ids to mask;
    # DataCollatorForSeq2Seq pads the labels with -100 at batch time.
    inputs["labels"] = labels["input_ids"]
    return inputs

train_data = train_ds.map(
    preprocess, batched=True, remove_columns=train_ds.column_names
)

# =========================
# COLLATOR
# =========================
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# =========================
# TRAINING ARGUMENTS
# =========================
# A fresh TrainingArguments/Trainer means the linear schedule (including
# warmup) starts over for this run; optimizer state is not carried across.
training_args = TrainingArguments(
    output_dir="./primera_continue",
    overwrite_output_dir=True,

    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC,
    num_train_epochs=EPOCHS,

    learning_rate=LR,
    lr_scheduler_type="linear",
    warmup_ratio=0.05,

    fp16=True,
    logging_steps=LOGGING_STEPS,
    save_strategy="no",
    report_to="none"
)

# =========================
# TRAINER
# =========================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    data_collator=data_collator,
    # `processing_class` is the non-deprecated name of the former `tokenizer` argument.
    processing_class=tokenizer,
)

# =========================
# CONTINUE TRAINING
# =========================
# Report the configured epoch count rather than a hard-coded number.
print(f"🚀 Continuing training for {EPOCHS} more epoch(s)...")
trainer.train()

# Save updated model (overwrites the folder the model was loaded from)
trainer.save_model("./primera_finetuned")
tokenizer.save_pretrained("./primera_finetuned")

print("✅ Training continued and model updated!")


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file chat_template.jinja
loading configuration file ./primera_finetuned/config.json
Model config LEDConfig {
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "LEDForConditionalGeneration"
  ],
  "attention_dilation": [
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1
  ],
  "attention_dropout": 0.0,
  "attention_mode": "sliding_chunks",
  "attention_window": [
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "autoregressive": false,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "

Map:   0%|          | 0/1648 [00:00<?, ? examples/s]

PyTorch: setting up devices
  trainer = Trainer(
Using auto half precision backend


🚀 Continuing training for 3 more epochs...


***** Running training *****
  Num examples = 1,648
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 412
  Number of trainable parameters = 447,219,712


Step,Training Loss
50,0.0424
100,0.0385
150,0.0503
200,0.0313
250,0.034
300,0.0309
350,0.0323
400,0.0326




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./primera_finetuned
Configuration saved in ./primera_finetuned/config.json
Configuration saved in ./primera_finetuned/generation_config.json
Model weights saved in ./primera_finetuned/model.safetensors
tokenizer config file saved in ./primera_finetuned/tokenizer_config.json
Special tokens file saved in ./primera_finetuned/special_tokens_map.json
tokenizer config file saved in ./primera_finetuned/tokenizer_config.json
Special tokens file saved in ./primera_finetuned/special_tokens_map.json


✅ Training continued and model updated!


In [None]:
# Destination folder on the mounted Drive for the fine-tuned model (note the space in the folder name).
drive_model_path = "/content/drive/MyDrive/PRIMERA FINAL"

In [None]:
# Copy the trained model and its tokenizer files to Drive so they outlive the Colab session.
trainer.save_model(drive_model_path)
tokenizer.save_pretrained(drive_model_path)


Saving model checkpoint to /content/drive/MyDrive/PRIMERA FINAL
Configuration saved in /content/drive/MyDrive/PRIMERA FINAL/config.json
Configuration saved in /content/drive/MyDrive/PRIMERA FINAL/generation_config.json
Model weights saved in /content/drive/MyDrive/PRIMERA FINAL/model.safetensors
tokenizer config file saved in /content/drive/MyDrive/PRIMERA FINAL/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/PRIMERA FINAL/special_tokens_map.json
tokenizer config file saved in /content/drive/MyDrive/PRIMERA FINAL/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/PRIMERA FINAL/special_tokens_map.json


('/content/drive/MyDrive/PRIMERA FINAL/tokenizer_config.json',
 '/content/drive/MyDrive/PRIMERA FINAL/special_tokens_map.json',
 '/content/drive/MyDrive/PRIMERA FINAL/vocab.json',
 '/content/drive/MyDrive/PRIMERA FINAL/merges.txt',
 '/content/drive/MyDrive/PRIMERA FINAL/added_tokens.json',
 '/content/drive/MyDrive/PRIMERA FINAL/tokenizer.json')

In [None]:
# Second copy of the same model/tokenizer under a different Drive folder.
trainer.save_model("/content/drive/MyDrive/primer_models/saved_primera")
tokenizer.save_pretrained("/content/drive/MyDrive/primer_models/saved_primera")


('/content/drive/MyDrive/primer_models/saved_primera/tokenizer_config.json',
 '/content/drive/MyDrive/primer_models/saved_primera/special_tokens_map.json',
 '/content/drive/MyDrive/primer_models/saved_primera/vocab.json',
 '/content/drive/MyDrive/primer_models/saved_primera/merges.txt',
 '/content/drive/MyDrive/primer_models/saved_primera/added_tokens.json',
 '/content/drive/MyDrive/primer_models/saved_primera/tokenizer.json')

In [None]:
# Wrap the validation samples in a `datasets.Dataset`.
# NOTE: this rebinds `val_samples` from a list of dicts to a Dataset; the
# following cells call Dataset methods (`select`, `map`) on it.
from datasets import Dataset
val_samples = Dataset.from_list(val_samples)


In [None]:
# Evaluate on at most the first 200 validation samples; clamping to the dataset
# size avoids an IndexError when fewer than 200 samples are available.
val_samples = val_samples.select(range(min(200, len(val_samples))))


In [None]:
# The model was fine-tuned on the raw "[DOC]"-joined sources without any task
# prefix, so evaluation must not prepend one (the former "summarize: " prefix
# is a T5 convention and made the evaluation inputs differ from training).
PREFIX = ""

# NOTE(review): the training cells truncate inputs to 512 tokens while
# evaluation uses 2048 - confirm this difference is intended.
EVAL_MAX_INPUT_LEN = 2048
EVAL_MAX_TARGET_LEN = 256

def preprocess(batch):
    """Tokenize a batch of validation samples.

    Args:
        batch: mapping with the list-valued keys ``"source"`` and ``"summary"``.

    Returns:
        The tokenizer output for the sources (truncated to
        ``EVAL_MAX_INPUT_LEN``) with an added ``"labels"`` entry holding the
        summary token ids (truncated to ``EVAL_MAX_TARGET_LEN``).
    """
    texts = [PREFIX + x for x in batch["source"]]

    # No padding here: DataCollatorForSeq2Seq pads every batch to its longest
    # member (labels with -100), instead of padding each input to 2048 tokens.
    model_inputs = tokenizer(
        texts,
        max_length=EVAL_MAX_INPUT_LEN,
        truncation=True
    )

    labels = tokenizer(
        text_target=batch["summary"],
        max_length=EVAL_MAX_TARGET_LEN,
        truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

val_data = val_samples.map(preprocess, batched=True, remove_columns=val_samples.column_names)


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
import numpy as np
import evaluate

# Load the ROUGE and BERTScore metric implementations via the `evaluate` library.
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")


In [None]:
from transformers import (
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    Trainer,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

def compute_metrics(eval_pred):
    """Score predicted summaries against the references.

    Args:
        eval_pred: ``(predictions, label_ids)`` pair supplied by the trainer.
            ``predictions`` is expected to hold generated token ids; a tuple
            of model outputs or a 3-D logits array is reduced to token ids
            defensively.

    Returns:
        Dict with ``rouge1``, ``rouge2``, ``rougeL`` and the mean BERTScore F1
        under the key ``bertscore``.
    """
    preds, labels = eval_pred

    # Model outputs can arrive as a tuple; the first element holds the predictions.
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = np.asarray(preds)
    # Raw logits (batch, seq, vocab) are not token ids; take the arg-max.
    if preds.ndim == 3:
        preds = preds.argmax(axis=-1)

    # -100 marks padding/ignored positions and cannot be decoded.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    preds_text = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels_text = tokenizer.batch_decode(labels, skip_special_tokens=True)

    r = rouge.compute(predictions=preds_text, references=labels_text)
    b = bertscore.compute(predictions=preds_text, references=labels_text, lang="en")

    return {
        "rouge1": r["rouge1"],
        "rouge2": r["rouge2"],
        "rougeL": r["rougeL"],
        "bertscore": float(np.mean(b["f1"]))
    }

# A plain `Trainer` passes teacher-forced logits to `compute_metrics`, which
# are not generated summaries. `Seq2SeqTrainer` with `predict_with_generate`
# runs `model.generate` and hands over real generated token ids.
eval_args = Seq2SeqTrainingArguments(
    output_dir="./primera_eval",
    predict_with_generate=True,
    generation_max_length=256,   # same cap as the reference summaries
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=eval_args,
    eval_dataset=val_data,
    # `processing_class` is the non-deprecated name of the former `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [None]:
# Run evaluation on the trainer's eval dataset and print the returned metrics dict.
metrics = trainer.evaluate()
print(metrics)


{'eval_loss': 1.23781418800354, 'eval_runtime': 89.52791595458984, 'eval_samples_per_second': 8.714, 'eval_steps_per_second': 0.742, 'rouge1': 0.4438, 'rouge2': 0.2094, 'rougeL': 0.4061, 'bertscore': 0.8736}


In [None]:
# List the files in the Drive model folder to confirm the save produced them.
import os
os.listdir("/content/drive/MyDrive/primer_models/saved_primera")


['config.json',
 'generation_config.json',
 'model.safetensors',
 'tokenizer_config.json',
 'special_tokens_map.json',
 'added_tokens.json',
 'vocab.json',
 'merges.txt',
 'tokenizer.json',
 'training_args.bin']