In [4]:
import torch
torch.cuda.is_available()


True

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
from transformers import AutoTokenizer
file_to_check = "/content/drive/MyDrive/NewsSumm_perfect_clean.csv"
df=pd.read_csv(file_to_check)
cluster_sizes = df.groupby('cluster_id').size()

valid_clusters = cluster_sizes[cluster_sizes >= 2].index

df_multi = df[df['cluster_id'].isin(valid_clusters)].reset_index(drop=True)

print("Filtered rows:", len(df_multi))
print("Filtered clusters:", df_multi['cluster_id'].nunique())
print("Avg docs per cluster:",
      df_multi.groupby('cluster_id').size().mean())
from sklearn.model_selection import train_test_split

clusters = df_multi['cluster_id'].unique()

train_clusters, temp_clusters = train_test_split(
    clusters, test_size=0.2, random_state=42
)

val_clusters, test_clusters = train_test_split(
    temp_clusters, test_size=0.5, random_state=42
)

train_df = df_multi[df_multi['cluster_id'].isin(train_clusters)]
val_df   = df_multi[df_multi['cluster_id'].isin(val_clusters)]
test_df  = df_multi[df_multi['cluster_id'].isin(test_clusters)]

print("Train clusters:", train_df['cluster_id'].nunique())
print("Val clusters:", val_df['cluster_id'].nunique())
print("Test clusters:", test_df['cluster_id'].nunique())
def build_cluster_samples(df):
    samples = []
    for cid, group in df.groupby("cluster_id"):
        docs = []

        # sort only if column exists
        if "published_date" in group.columns:
            try:
                group = group.sort_values("published_date")
            except Exception:
                pass

        for _, row in group.iterrows():
            docs.append(f"[DOC]\n{row['article_clean']}")

        samples.append({
            "cluster_id": cid,
            "source": "\n".join(docs),
            "summary": group.iloc[0]['summary_clean']
        })
    return samples
train_samples = build_cluster_samples(train_df)
val_samples   = build_cluster_samples(val_df)
test_samples  = build_cluster_samples(test_df)

print("Train samples:", len(train_samples))
print("Val samples:", len(val_samples))
print("Test samples:", len(test_samples))



Filtered rows: 4335
Filtered clusters: 2060
Avg docs per cluster: 2.104368932038835
Train clusters: 1648
Val clusters: 206
Test clusters: 206
Train samples: 1648
Val samples: 206
Test samples: 206


In [5]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_PATH = "/content/drive/MyDrive/LLMA"  # your model

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    use_fast=False
)

# IMPORTANT for LLaMA
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

print("✅ Loaded model:", model.config._name_or_path)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/88 [00:00<?, ?it/s]

✅ Loaded model: TinyLlama/TinyLlama-1.1B-Chat-v1.0


In [7]:
test_sources = [s["source"] for s in test_samples]
test_references = [s["summary"] for s in test_samples]


In [10]:
import random

N = 100
random.seed(42)

idx = random.sample(range(len(test_sources)), min(N, len(test_sources)))

eval_sources = [test_sources[i] for i in idx]
eval_references = [test_references[i] for i in idx]

print("Evaluation samples:", len(eval_sources))


Evaluation samples: 100


In [12]:
def generate_summaries_llama(
    texts,
    batch_size=2,
    max_input_len=512,
    max_new_tokens=128
):
    predictions = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]

        prompts = [
            f"Summarize the following text:\n\n{text}\n\nSummary:"
            for text in batch_texts
        ]

        inputs = tokenizer(
            prompts,
            max_length=max_input_len,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,   # ✅ correct for LLaMA
                do_sample=False,
                num_beams=1,                     # 🔥 fast
                pad_token_id=tokenizer.eos_token_id
            )

        decoded = tokenizer.batch_decode(
            outputs,
            skip_special_tokens=True
        )

        # Remove prompt part
        for d in decoded:
            predictions.append(
                d.split("Summary:")[-1].strip()
            )

        print(f"Processed {min(i + batch_size, len(texts))}/{len(texts)}")

    return predictions


In [13]:
predictions = generate_summaries_llama(
    eval_sources,
    batch_size=2,
    max_input_len=512,
    max_new_tokens=128
)


Processed 2/100


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed 4/100
Processed 6/100
Processed 8/100
Processed 10/100
Processed 12/100
Processed 14/100


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed 16/100
Processed 18/100
Processed 20/100
Processed 22/100
Processed 24/100
Processed 26/100
Processed 28/100


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed 30/100
Processed 32/100
Processed 34/100
Processed 36/100
Processed 38/100
Processed 40/100
Processed 42/100
Processed 44/100


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed 46/100
Processed 48/100
Processed 50/100
Processed 52/100


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed 54/100
Processed 56/100
Processed 58/100
Processed 60/100
Processed 62/100
Processed 64/100
Processed 66/100
Processed 68/100


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed 70/100


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed 72/100
Processed 74/100
Processed 76/100
Processed 78/100
Processed 80/100
Processed 82/100
Processed 84/100
Processed 86/100
Processed 88/100
Processed 90/100
Processed 92/100
Processed 94/100
Processed 96/100
Processed 98/100
Processed 100/100


In [18]:
import evaluate
import numpy as np

rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

# ROUGE
r = rouge.compute(
    predictions=predictions,
    references=eval_references
)

# BERTScore
b = bertscore.compute(
    predictions=predictions,
    references=eval_references,
    lang="en",
    model_type="roberta-large"
)

print("\n📊 LLaMA EVALUATION RESULTS \n")
print(f"ROUGE-1 : {r['rouge1']:.4f}")
print(f"ROUGE-2 : {r['rouge2']:.4f}")
print(f"ROUGE-L : {r['rougeL']:.4f}")
print(f"BERTScore-F1 : {np.mean(b['f1']):.4f}")


Downloading builder script: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/389 [00:00<?, ?it/s]

RobertaModel LOAD REPORT from: roberta-large
Key                             | Status     | 
--------------------------------+------------+-
lm_head.dense.weight            | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
pooler.dense.bias               | MISSING    | 
pooler.dense.weight             | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.



📊 LLaMA EVALUATION RESULTS 

ROUGE-1 : 0.2629
ROUGE-2 : 0.1670
ROUGE-L : 0.2110
BERTScore-F1 : 0.8602
