In [None]:
# Sanity check: report whether PyTorch can use a CUDA device in this runtime.
import torch
torch.cuda.is_available()


True

In [None]:
# Mount Google Drive at /content/drive; the dataset is read from there and the
# fine-tuned model is written back there later in the notebook.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
from transformers import AutoTokenizer

In [None]:
# Load the dataset CSV from the mounted Drive. Later cells read the
# `cluster_id`, `article_clean` and `summary_clean` columns from this frame.
file_to_check = "/content/drive/MyDrive/NewsSumm_perfect_clean.csv"
df=pd.read_csv(file_to_check)

In [None]:
# Number of articles in every cluster.
cluster_sizes = df.groupby('cluster_id').size()

# A cluster only counts as multi-document when it holds two or more articles.
valid_clusters = cluster_sizes.loc[cluster_sizes.ge(2)].index

# Keep just the rows that belong to those clusters, with a fresh 0..n-1 index.
df_multi = df.loc[df['cluster_id'].isin(valid_clusters)].reset_index(drop=True)

# Quick summary of what survived the filter.
print("Filtered rows:", df_multi.shape[0])
print("Filtered clusters:", df_multi['cluster_id'].nunique())
print("Avg docs per cluster:", df_multi.groupby('cluster_id').size().mean())


Filtered rows: 4335
Filtered clusters: 2060
Avg docs per cluster: 2.104368932038835


In [None]:
from sklearn.model_selection import train_test_split

# Split on cluster ids rather than on rows, so every article of a cluster ends
# up in the same partition.
clusters = df_multi['cluster_id'].unique()

# 80% of the clusters go to train; the remaining 20% are halved into val/test.
train_clusters, temp_clusters = train_test_split(clusters, test_size=0.2, random_state=42)
val_clusters, test_clusters = train_test_split(temp_clusters, test_size=0.5, random_state=42)

# Materialise the row-level frames for each partition.
train_df = df_multi.loc[df_multi['cluster_id'].isin(train_clusters)]
val_df   = df_multi.loc[df_multi['cluster_id'].isin(val_clusters)]
test_df  = df_multi.loc[df_multi['cluster_id'].isin(test_clusters)]

print("Train clusters:", train_df['cluster_id'].nunique())
print("Val clusters:", val_df['cluster_id'].nunique())
print("Test clusters:", test_df['cluster_id'].nunique())


Train clusters: 1648
Val clusters: 206
Test clusters: 206


In [None]:
def build_cluster_samples(df):
    """Collapse each article cluster into one multi-document sample.

    Args:
        df: DataFrame with ``cluster_id``, ``article_clean`` and
            ``summary_clean`` columns. A ``published_date`` column is
            optional; when present, the articles of a cluster are ordered
            by it before being concatenated.

    Returns:
        A list with one dict per cluster (in ``groupby`` key order) holding:
        ``cluster_id``; ``source``, every article prefixed by a ``[DOC]``
        line and joined with newlines; and ``summary``, the
        ``summary_clean`` value of the cluster's first row after ordering.
        An empty frame yields an empty list.
    """
    import warnings  # local import keeps this cell runnable on its own

    has_dates = "published_date" in df.columns
    samples = []
    for cid, group in df.groupby("cluster_id"):
        if has_dates:
            try:
                # Stable sort: articles sharing a date keep their original
                # relative order, so the concatenation is deterministic.
                group = group.sort_values("published_date", kind="mergesort")
            except (TypeError, ValueError) as exc:
                # Best effort: values that cannot be compared (e.g. mixed
                # str/int) leave the cluster in its original row order, but
                # the problem is reported instead of silently ignored.
                warnings.warn(
                    f"cluster {cid!r}: could not sort by published_date "
                    f"({exc}); keeping original row order",
                    RuntimeWarning,
                    stacklevel=2,
                )

        docs = [f"[DOC]\n{article}" for article in group["article_clean"]]

        samples.append({
            "cluster_id": cid,
            "source": "\n".join(docs),
            "summary": group.iloc[0]["summary_clean"],
        })
    return samples


In [None]:
# Turn each partition into a list of one-sample-per-cluster dicts.
train_samples, val_samples, test_samples = (
    build_cluster_samples(split_df) for split_df in (train_df, val_df, test_df)
)

# Each count should match the number of clusters in its partition.
print("Train samples:", len(train_samples))
print("Val samples:", len(val_samples))
print("Test samples:", len(test_samples))


Train samples: 1648
Val samples: 206
Test samples: 206


In [None]:
!pip install transformers datasets accelerate evaluate rouge-score sentencepiece


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=573e42da3ea90173441dc972efb7ec10d2212f6b56ddb59ff350f84d3b0fac95
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score, evaluate
Successfully installed evaluate-0.4.6 rouge-score-0.1.2


In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
)
from datasets import Dataset
import torch

# =========================
# CONFIG
# =========================
MODEL_NAME = "allenai/led-base-16384"  # LED-base for long documents
MAX_INPUT_LEN = 4096                   # sources are truncated/padded to this many tokens
MAX_TARGET_LEN = 256                   # summaries are truncated/padded to this many tokens
LR = 2e-5
EPOCHS = 7
BATCH_SIZE = 1
GRAD_ACC = 4                           # effective train batch size = BATCH_SIZE * GRAD_ACC
LOGGING_STEPS = 50

# =========================
# DATASET
# =========================
# train_samples = [{"source": "...", "summary": "..."}]
# val_samples = [{"source": "...", "summary": "..."}]

train_ds = Dataset.from_list(train_samples)
val_ds   = Dataset.from_list(val_samples)

# =========================
# TOKENIZER + MODEL
# =========================
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model.gradient_checkpointing_enable()  # save VRAM
model.config.use_cache = False          # needed for gradient checkpointing

# =========================
# PREPROCESS FUNCTION
# =========================
def preprocess(batch):
    """Tokenize a batch of cluster samples into fixed-length model inputs.

    Args:
        batch: Mapping with "source" and "summary" lists of strings, as
            handed over by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenized sources (padded/truncated to MAX_INPUT_LEN) with an
        added "labels" entry: the summary token ids padded/truncated to
        MAX_TARGET_LEN, where every padding id is replaced by -100.
    """
    inputs = tokenizer(
        batch["source"],
        max_length=MAX_INPUT_LEN,
        truncation=True,
        padding="max_length"
    )
    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=batch["summary"],
        max_length=MAX_TARGET_LEN,
        truncation=True,
        padding="max_length"
    )
    # -100 is the label padding value used by DataCollatorForSeq2Seq; it keeps
    # padded positions out of the loss.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [tok if tok != pad_id else -100 for tok in seq]
        for seq in labels["input_ids"]
    ]
    # NOTE(review): no `global_attention_mask` is built here. The LED docs
    # advise global attention on the first token for summarization - confirm
    # whether leaving it out is intentional (inference must match training).
    return inputs

train_data = train_ds.map(preprocess, batched=True, remove_columns=train_ds.column_names)
val_data   = val_ds.map(preprocess, batched=True, remove_columns=val_ds.column_names)

# =========================
# DATA COLLATOR
# =========================
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# =========================
# TRAINING ARGUMENTS
# =========================
training_args = TrainingArguments(
    output_dir="./led_finetuned",
    overwrite_output_dir=True,

    per_device_train_batch_size=BATCH_SIZE,
    # The Trainer's default eval batch size is larger than 1; match the train
    # batch size so 4096-token inputs fit in memory.
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC,
    num_train_epochs=EPOCHS,

    learning_rate=LR,
    lr_scheduler_type="linear",
    warmup_ratio=0.05,

    fp16=True,
    logging_steps=LOGGING_STEPS,
    log_level="info",
    # Evaluate on the validation split once per epoch; without an eval
    # strategy the `eval_dataset` passed to the Trainer is never used.
    eval_strategy="epoch",
    save_strategy="epoch",
    report_to="none"
)

# =========================
# TRAINER
# =========================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
    # `processing_class` supersedes the deprecated `tokenizer` argument.
    processing_class=tokenizer,
)

# =========================
# START TRAINING
# =========================
print("🚀 Starting LED-base training...")
trainer.train()

# =========================



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/648M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/648M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Map:   0%|          | 0/1648 [00:00<?, ? examples/s]



Map:   0%|          | 0/206 [00:00<?, ? examples/s]

  trainer = Trainer(
Using auto half precision backend


🚀 Starting LED-base training...


***** Running training *****
  Num examples = 1,648
  Num Epochs = 7
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 2,884
  Number of trainable parameters = 161,844,480


Step,Training Loss
50,1.9252
100,1.7
150,1.7149
200,1.469
250,1.4405
300,1.3779
350,1.4896
400,1.4037
450,1.2457
500,1.2029


Saving model checkpoint to ./led_finetuned/checkpoint-412
Configuration saved in ./led_finetuned/checkpoint-412/config.json
Configuration saved in ./led_finetuned/checkpoint-412/generation_config.json
Model weights saved in ./led_finetuned/checkpoint-412/model.safetensors
tokenizer config file saved in ./led_finetuned/checkpoint-412/tokenizer_config.json
Special tokens file saved in ./led_finetuned/checkpoint-412/special_tokens_map.json
Saving model checkpoint to ./led_finetuned/checkpoint-824
Configuration saved in ./led_finetuned/checkpoint-824/config.json
Configuration saved in ./led_finetuned/checkpoint-824/generation_config.json
Model weights saved in ./led_finetuned/checkpoint-824/model.safetensors
tokenizer config file saved in ./led_finetuned/checkpoint-824/tokenizer_config.json
Special tokens file saved in ./led_finetuned/checkpoint-824/special_tokens_map.json
Saving model checkpoint to ./led_finetuned/checkpoint-1236
Configuration saved in ./led_finetuned/checkpoint-1236/conf

TrainOutput(global_step=2884, training_loss=0.929253615221931, metrics={'train_runtime': 9411.3186, 'train_samples_per_second': 1.226, 'train_steps_per_second': 0.306, 'total_flos': 3.1149562974437376e+16, 'train_loss': 0.929253615221931, 'epoch': 7.0})

In [None]:
# Write the fine-tuned weights/config and the tokenizer files to a local
# directory of the runtime. The tuple displayed below the cell is the return
# value of the tokenizer's save_pretrained call (the written file paths).
model.save_pretrained("./LED_FINETUNED")
tokenizer.save_pretrained("./LED_FINETUNED")

Configuration saved in ./LED_FINETUNED/config.json
Configuration saved in ./LED_FINETUNED/generation_config.json
Model weights saved in ./LED_FINETUNED/model.safetensors
tokenizer config file saved in ./LED_FINETUNED/tokenizer_config.json
Special tokens file saved in ./LED_FINETUNED/special_tokens_map.json


('./LED_FINETUNED/tokenizer_config.json',
 './LED_FINETUNED/special_tokens_map.json',
 './LED_FINETUNED/vocab.json',
 './LED_FINETUNED/merges.txt',
 './LED_FINETUNED/added_tokens.json',
 './LED_FINETUNED/tokenizer.json')

In [None]:
# Make sure Drive is mounted before copying the model there. Drive is already
# mounted by an earlier cell, so in a single session this call only reports
# "Drive already mounted".
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Destination directory on the mounted Drive for the fine-tuned model and tokenizer.
SAVE_PATH = "/content/drive/MyDrive/models/LED_FINETUNED"


In [None]:
# Persist the model and tokenizer under SAVE_PATH on Drive; the earlier save
# went to a local directory of the runtime instead.
model.save_pretrained(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)


Configuration saved in /content/drive/MyDrive/models/LED_FINETUNED/config.json
Configuration saved in /content/drive/MyDrive/models/LED_FINETUNED/generation_config.json
Model weights saved in /content/drive/MyDrive/models/LED_FINETUNED/model.safetensors
tokenizer config file saved in /content/drive/MyDrive/models/LED_FINETUNED/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/models/LED_FINETUNED/special_tokens_map.json


('/content/drive/MyDrive/models/LED_FINETUNED/tokenizer_config.json',
 '/content/drive/MyDrive/models/LED_FINETUNED/special_tokens_map.json',
 '/content/drive/MyDrive/models/LED_FINETUNED/vocab.json',
 '/content/drive/MyDrive/models/LED_FINETUNED/merges.txt',
 '/content/drive/MyDrive/models/LED_FINETUNED/added_tokens.json',
 '/content/drive/MyDrive/models/LED_FINETUNED/tokenizer.json')