In [1]:
import torch
# Sanity check: report whether PyTorch can see a CUDA device in this runtime.
torch.cuda.is_available()


True

In [3]:
# Mount Google Drive at /content/drive; the dataset and model paths used by
# later cells live under that mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
from transformers import AutoTokenizer

In [5]:
# Location of the cleaned NewsSumm CSV on the mounted Drive.
file_to_check = "/content/drive/MyDrive/NewsSumm_perfect_clean.csv"

# Read the full CSV into memory; downstream cells refer to it as `df`.
df = pd.read_csv(file_to_check)

In [6]:
# Number of articles in each cluster.
cluster_sizes = df.groupby('cluster_id').size()

# Multi-document summarisation needs at least two articles per cluster,
# so single-article clusters are dropped.
valid_clusters = cluster_sizes.loc[cluster_sizes.ge(2)].index
keep_row = df['cluster_id'].isin(valid_clusters)
df_multi = df.loc[keep_row].reset_index(drop=True)

# Report what survived the filter.
docs_per_cluster = df_multi.groupby('cluster_id').size()
print("Filtered rows:", len(df_multi))
print("Filtered clusters:", df_multi['cluster_id'].nunique())
print("Avg docs per cluster:", docs_per_cluster.mean())


Filtered rows: 4335
Filtered clusters: 2060
Avg docs per cluster: 2.104368932038835


In [7]:
from sklearn.model_selection import train_test_split

# Split on cluster ids rather than rows so that every article of a cluster
# lands in the same partition.
clusters = df_multi['cluster_id'].unique()

# 80% of clusters for training; the remaining 20% is halved into val and test.
train_clusters, temp_clusters = train_test_split(clusters, test_size=0.2, random_state=42)
val_clusters, test_clusters = train_test_split(temp_clusters, test_size=0.5, random_state=42)

# Select the rows belonging to each partition's clusters.
train_df, val_df, test_df = (
    df_multi[df_multi['cluster_id'].isin(cluster_ids)]
    for cluster_ids in (train_clusters, val_clusters, test_clusters)
)

for label, split_df in (
    ("Train clusters:", train_df),
    ("Val clusters:", val_df),
    ("Test clusters:", test_df),
):
    print(label, split_df['cluster_id'].nunique())


Train clusters: 1648
Val clusters: 206
Test clusters: 206


In [8]:
def build_cluster_samples(df):
    """Collapse each cluster of articles into one multi-document sample.

    Rows are grouped by ``cluster_id``. Within a cluster the articles are put
    in ``published_date`` order when that column exists and its values can be
    sorted; articles with equal dates keep their original row order. Every
    article is prefixed with a ``[DOC]`` marker line and the articles are
    joined with newlines.

    Args:
        df: DataFrame with ``cluster_id``, ``article_clean`` and
            ``summary_clean`` columns. ``published_date`` is optional.

    Returns:
        A list with one dict per cluster, holding ``cluster_id``, ``source``
        (the concatenated articles) and ``summary`` (the ``summary_clean``
        value of the first article after ordering).

    Warns:
        RuntimeWarning: if sorting a cluster by ``published_date`` fails; the
            cluster is then kept in its original row order.
    """
    # Local import so the function does not depend on a notebook-level import.
    import warnings

    has_dates = "published_date" in df.columns
    samples = []
    for cid, group in df.groupby("cluster_id"):
        if has_dates:
            try:
                # mergesort is stable, so articles sharing a date keep a
                # deterministic (original) relative order.
                group = group.sort_values("published_date", kind="mergesort")
            except Exception as exc:
                # Sorting is best-effort (e.g. mixed-type dates cannot be
                # compared); report it instead of hiding it, then carry on.
                warnings.warn(
                    "Could not sort cluster articles by published_date "
                    f"({type(exc).__name__}); keeping original row order.",
                    RuntimeWarning,
                    stacklevel=2,
                )

        docs = [f"[DOC]\n{article}" for article in group["article_clean"]]

        samples.append({
            "cluster_id": cid,
            "source": "\n".join(docs),
            "summary": group["summary_clean"].iloc[0]
        })
    return samples


In [9]:
# Turn each split's rows into one multi-document sample per cluster.
train_samples, val_samples, test_samples = (
    build_cluster_samples(split_df) for split_df in (train_df, val_df, test_df)
)

for label, split_samples in (
    ("Train samples:", train_samples),
    ("Val samples:", val_samples),
    ("Test samples:", test_samples),
):
    print(label, len(split_samples))


Train samples: 1648
Val samples: 206
Test samples: 206


In [10]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Directory on the mounted Drive from which both the tokenizer and the
# model weights are loaded.
MODEL_PATH = "/content/drive/MyDrive/LED BASE"

# Load the tokenizer and the seq2seq model from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)

# Put the model in evaluation mode. As the cell's last expression, the call's
# return value is also what the notebook displays (the module structure).
model.eval()


Loading weights:   0%|          | 0/296 [00:00<?, ?it/s]

LEDForConditionalGeneration(
  (led): LEDModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): LEDEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): LEDLearnedPositionalEmbedding(16384, 768)
      (layers): ModuleList(
        (0-5): 6 x LEDEncoderLayer(
          (self_attn): LEDEncoderAttention(
            (longformer_self_attn): LEDEncoderSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
              (value_global): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): Linear(in_features=768, out_features=768, bias=True)
          )
     

In [11]:
import torch

# Parallel lists: pred_texts[i] is the generated summary for ref_texts[i].
pred_texts = []
ref_texts = []

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Settings shared by every test sample: inputs are truncated to 4096 tokens,
# summaries are decoded with 4-beam search up to 256 tokens.
encode_kwargs = dict(return_tensors="pt", truncation=True, max_length=4096)
generate_kwargs = dict(
    max_length=256,
    num_beams=4,
    length_penalty=1.0,
    early_stopping=True,
)

# NOTE(review): no `global_attention_mask` is passed to generate(). The
# Hugging Face LED documentation recommends global attention on the first
# token for summarisation -- confirm how this checkpoint was trained before
# changing it, since that would alter the reported scores.
for sample in test_samples:
    inputs = tokenizer(sample["source"], **encode_kwargs).to(device)

    # Generation needs no gradients.
    with torch.no_grad():
        output_ids = model.generate(**inputs, **generate_kwargs)

    # Only the first returned sequence is decoded.
    pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    pred_texts.append(pred)
    ref_texts.append(sample["summary"])


Input ids are automatically padded from 1911 to 2048 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 547 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 1289 to 2048 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 967 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2851 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 1842 to 2048 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 1053 to 2048 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 371 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 561 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 909 to 1024 to be a

In [19]:
pip install rouge-score bert-score evaluate


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=6bd8edde134f73ae9cac3ca3aa35520de59502de6e07a19231f27416af8f0ee6
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score, bert-score
Successfully installed bert-score-0.3.13 rouge-score-0.1.2


In [20]:
import evaluate
import numpy as np

# Load the ROUGE and BERTScore metric implementations via `evaluate`.
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

# Both metrics score the same prediction/reference pairs.
text_pairs = dict(predictions=pred_texts, references=ref_texts)

# ROUGE over the generated vs. reference summaries.
r = rouge.compute(**text_pairs)

# BERTScore using the English roberta-large model.
b = bertscore.compute(**text_pairs, lang="en", model_type="roberta-large")

# Collect the headline numbers; BERTScore F1 is averaged over b["f1"].
metrics = {name: r[name] for name in ("rouge1", "rouge2", "rougeL")}
metrics["bertscore_f1"] = float(np.mean(b["f1"]))

print("📊 FINAL EVALUATION (Loaded Model)")
for metric_name, score in metrics.items():
    print(f"{metric_name}: {score:.4f}")


Downloading builder script: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/389 [00:00<?, ?it/s]

RobertaModel LOAD REPORT from: roberta-large
Key                             | Status     | 
--------------------------------+------------+-
lm_head.bias                    | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
pooler.dense.bias               | MISSING    | 
pooler.dense.weight             | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


📊 FINAL EVALUATION (Loaded Model)
rouge1: 0.4459
rouge2: 0.2548
rougeL: 0.3413
bertscore_f1: 0.8908


In [15]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6
