In [1]:
import torch
torch.cuda.is_available()


True

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
from transformers import AutoTokenizer

In [4]:
file_to_check = "/content/drive/MyDrive/NewsSumm_perfect_clean.csv"
df=pd.read_csv(file_to_check)

In [5]:
cluster_sizes = df.groupby('cluster_id').size()

valid_clusters = cluster_sizes[cluster_sizes >= 2].index

df_multi = df[df['cluster_id'].isin(valid_clusters)].reset_index(drop=True)

print("Filtered rows:", len(df_multi))
print("Filtered clusters:", df_multi['cluster_id'].nunique())
print("Avg docs per cluster:",
      df_multi.groupby('cluster_id').size().mean())


Filtered rows: 4335
Filtered clusters: 2060
Avg docs per cluster: 2.104368932038835


In [6]:
from sklearn.model_selection import train_test_split

clusters = df_multi['cluster_id'].unique()

train_clusters, temp_clusters = train_test_split(
    clusters, test_size=0.2, random_state=42
)

val_clusters, test_clusters = train_test_split(
    temp_clusters, test_size=0.5, random_state=42
)

train_df = df_multi[df_multi['cluster_id'].isin(train_clusters)]
val_df   = df_multi[df_multi['cluster_id'].isin(val_clusters)]
test_df  = df_multi[df_multi['cluster_id'].isin(test_clusters)]

print("Train clusters:", train_df['cluster_id'].nunique())
print("Val clusters:", val_df['cluster_id'].nunique())
print("Test clusters:", test_df['cluster_id'].nunique())


Train clusters: 1648
Val clusters: 206
Test clusters: 206


In [7]:
def build_cluster_samples(df):
    samples = []
    for cid, group in df.groupby("cluster_id"):
        docs = []

        # sort only if column exists
        if "published_date" in group.columns:
            try:
                group = group.sort_values("published_date")
            except Exception:
                pass

        for _, row in group.iterrows():
            docs.append(f"[DOC]\n{row['article_clean']}")

        samples.append({
            "cluster_id": cid,
            "source": "\n".join(docs),
            "summary": group.iloc[0]['summary_clean']
        })
    return samples


In [8]:
train_samples = build_cluster_samples(train_df)
val_samples   = build_cluster_samples(val_df)
test_samples  = build_cluster_samples(test_df)

print("Train samples:", len(train_samples))
print("Val samples:", len(val_samples))
print("Test samples:", len(test_samples))


Train samples: 1648
Val samples: 206
Test samples: 206


In [22]:
pip install -U bert-score




In [10]:
!pip install transformers datasets accelerate evaluate rouge-score sentencepiece


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=f50f4d72f1e74f717634f1c297431c5fe0fe67dc4602d872626ec1c5405a27eb
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score, evaluate
Successfully installed evaluate-0.4.6 rouge-score-0.1.2


In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

MODEL_PATH = "/content/drive/MyDrive/longt5_finetuned"   # üî• CHANGE THIS

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

print("‚úÖ Loaded:", model.config._name_or_path)

Loading weights:   0%|          | 0/295 [00:01<?, ?it/s]

LongT5ForConditionalGeneration LOAD REPORT from: /content/drive/MyDrive/longt5_finetuned
Key                         | Status  | 
----------------------------+---------+-
encoder.embed_tokens.weight | MISSING | 
decoder.embed_tokens.weight | MISSING | 

Notes:
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


‚úÖ Loaded: /content/drive/MyDrive/longt5_finetuned


In [14]:
test_sources = [s["source"] for s in test_samples]
test_references = [s["summary"] for s in test_samples]


In [18]:
import random

N = 100
random.seed(42)

idx = random.sample(range(len(test_sources)), min(N, len(test_sources)))

eval_sources = [test_sources[i] for i in idx]
eval_references = [test_references[i] for i in idx]

print("Evaluation samples:", len(eval_sources))


Evaluation samples: 100


In [35]:
def generate_summaries_longt5(
    texts,
    batch_size=2,
    max_input_len=4096,
    max_output_len=256
):
    predictions = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]

        inputs = tokenizer(
            batch_texts,
            max_length=max_input_len,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_output_len,
                min_length=50,          # üî• IMPORTANT
                num_beams=4,            # üî• force generation
                length_penalty=1.0,
                no_repeat_ngram_size=3
            )

        decoded = tokenizer.batch_decode(
            outputs,
            skip_special_tokens=True
        )
        print(f"Processed {min(i + batch_size, len(texts))}/{len(texts)}")
        predictions.extend(decoded)

    return predictions


In [36]:
predictions = generate_summaries_longt5(
    eval_sources,
    batch_size=2,
    max_input_len=4096,
    max_output_len=256
)


Processed 2/100
Processed 4/100
Processed 6/100
Processed 8/100
Processed 10/100
Processed 12/100
Processed 14/100
Processed 16/100
Processed 18/100
Processed 20/100
Processed 22/100
Processed 24/100
Processed 26/100
Processed 28/100
Processed 30/100
Processed 32/100
Processed 34/100
Processed 36/100
Processed 38/100
Processed 40/100
Processed 42/100
Processed 44/100
Processed 46/100
Processed 48/100
Processed 50/100
Processed 52/100
Processed 54/100
Processed 56/100
Processed 58/100
Processed 60/100
Processed 62/100
Processed 64/100
Processed 66/100
Processed 68/100
Processed 70/100
Processed 72/100
Processed 74/100
Processed 76/100
Processed 78/100
Processed 80/100
Processed 82/100
Processed 84/100
Processed 86/100
Processed 88/100
Processed 90/100
Processed 92/100
Processed 94/100
Processed 96/100
Processed 98/100
Processed 100/100


In [37]:
for i in range(3):
    print("PRED:", predictions[i][:200])
    print("REF :", eval_references[i][:200])
    print("-" * 80)


PRED: sss thessass:ss andss toss ofss fillssess insstss-ss isss dess forss‚Äôssi,,,s,, the,,a,,:,, and,, to,, of,, fill,,e,, in,,t,,-,, is,, de,, for,,‚Äô,,i,ss,s the,sa,s:,s and,s to,s of,s fill,se,s in,st,s-,
REF : The October 2002 votes authorizing the Iraq War were pivotal in American history. Michigan Sen. Stabenow, unconvinced by the evidence, opposed the resolution, questioning its connection to 9 11. Sen. 
--------------------------------------------------------------------------------
PRED: sss thessass:ss andss toss ofss fillssess insstss-ss isss dess forss‚Äôssi,,,s,, the,,a,,:,, and,, to,, of,, fill,,e,, in,,t,,-,, is,, de,, for,,‚Äô,,i,ss,s the,sa,s:,s and,s to,s of,s fill,se,s in,st,s-,
REF : After nearly five days of intense firefighting by the Indian Air Force, Army, and Central Reserve Police Force (CRPF), the forest fire around Mount Abu was extinguished. The fire caused significant de
--------------------------------------------------------------------------------
PR

In [38]:
import evaluate

rouge = evaluate.load("rouge")

r = rouge.compute(
    predictions=predictions,
    references=eval_references
)


In [39]:
from sentence_transformers import SentenceTransformer, util

sbert = SentenceTransformer(
    "all-MiniLM-L6-v2",
    device="cuda" if torch.cuda.is_available() else "cpu"
)

pred_emb = sbert.encode(predictions, convert_to_tensor=True, show_progress_bar=True)
ref_emb  = sbert.encode(eval_references, convert_to_tensor=True, show_progress_bar=True)

semantic_similarity = util.cos_sim(pred_emb, ref_emb).diag().mean().item()


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

In [40]:
print("\nüìä LONG-T5 EVALUATION RESULTS\n")
print(f"ROUGE-1 : {r['rouge1']:.4f}")
print(f"ROUGE-2 : {r['rouge2']:.4f}")
print(f"ROUGE-L : {r['rougeL']:.4f}")
print(f"Semantic Similarity (SBERT) : {semantic_similarity:.4f}")



üìä LONG-T5 EVALUATION RESULTS (100 samples)

ROUGE-1 : 0.1401
ROUGE-2 : 0.0001
ROUGE-L : 0.0910
Semantic Similarity (SBERT) : 0.0008


In [41]:
import numpy as np

pred_lengths = [len(p.split()) for p in predictions]
ref_lengths = [len(r.split()) for r in eval_references]

print("Avg prediction length:", np.mean(pred_lengths))
print("Avg reference length:", np.mean(ref_lengths))


Avg prediction length: 51.0
Avg reference length: 78.23
