In [None]:
import torch
# Sanity check: evaluates to True only when PyTorch can see a usable CUDA device.
torch.cuda.is_available()


True

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
from transformers import AutoTokenizer

# Read the cleaned NewsSumm table from disk.
file_to_check = "./NewsSumm_perfect_clean.csv"
df = pd.read_csv(file_to_check)

# Row count per cluster, indexed by cluster_id.
cluster_sizes = df.groupby("cluster_id").size()

# Cluster ids that contain at least two rows (i.e. are multi-document).
valid_clusters = cluster_sizes.loc[cluster_sizes.ge(2)].index

# Keep only rows that belong to a multi-document cluster.
is_multi_doc = df["cluster_id"].isin(valid_clusters)
df_multi = df.loc[is_multi_doc].reset_index(drop=True)

# Quick report on what survived the filter.
docs_per_cluster = df_multi.groupby("cluster_id").size()
print("Filtered rows:", len(df_multi))
print("Filtered clusters:", df_multi["cluster_id"].nunique())
print("Avg docs per cluster:", docs_per_cluster.mean())


Filtered rows: 4335
Filtered clusters: 2060
Avg docs per cluster: 2.104368932038835


In [None]:
from sklearn.model_selection import train_test_split

# Split on cluster ids (not on rows) so every document of a cluster lands in
# exactly one partition: 80% train, then the remaining 20% halved into val/test.
clusters = df_multi["cluster_id"].unique()

train_clusters, temp_clusters = train_test_split(
    clusters, random_state=42, test_size=0.2
)
val_clusters, test_clusters = train_test_split(
    temp_clusters, random_state=42, test_size=0.5
)

# Materialise the row-level frames for each partition.
cluster_col = df_multi["cluster_id"]
train_df = df_multi.loc[cluster_col.isin(train_clusters)]
val_df = df_multi.loc[cluster_col.isin(val_clusters)]
test_df = df_multi.loc[cluster_col.isin(test_clusters)]

for split_label, split_df in (("Train", train_df), ("Val", val_df), ("Test", test_df)):
    print(f"{split_label} clusters:", split_df["cluster_id"].nunique())


Train clusters: 1648
Val clusters: 206
Test clusters: 206


In [None]:
def build_cluster_samples(df):
    """Collapse each news cluster into one multi-document training sample.

    Rows are grouped by ``cluster_id``. Within a cluster the rows are ordered
    by ``published_date`` when that column exists (stable sort, so rows with
    equal dates keep their input order). Every article is prefixed with a
    ``[DOC]`` marker line and the pieces are joined with newlines. The
    ``summary_clean`` of the first row after ordering is used as the target.

    Args:
        df: DataFrame with ``cluster_id``, ``article_clean`` and
            ``summary_clean`` columns; ``published_date`` is optional.

    Returns:
        A list with one dict per cluster (in ``groupby`` iteration order) with
        the keys ``cluster_id``, ``source`` and ``summary``. An empty frame
        yields an empty list.
    """
    import warnings  # local import: keeps the cell runnable on its own

    samples = []
    can_sort = "published_date" in df.columns

    for cid, group in df.groupby("cluster_id"):
        if can_sort:
            try:
                # mergesort is stable: ties in published_date no longer
                # produce an arbitrary document order / arbitrary summary.
                group = group.sort_values("published_date", kind="mergesort")
            except Exception as exc:
                # Sorting stays best-effort (e.g. mixed-type dates), but the
                # failure is reported instead of being silently swallowed.
                warnings.warn(
                    f"Could not sort cluster {cid!r} by published_date "
                    f"({exc}); keeping original row order."
                )

        # Iterate the column directly; no need to build a Series per row.
        docs = [f"[DOC]\n{article}" for article in group["article_clean"]]

        samples.append({
            "cluster_id": cid,
            "source": "\n".join(docs),
            "summary": group.iloc[0]['summary_clean']
        })
    return samples


In [None]:
# Turn every split into a list of per-cluster samples (train, val, test order).
train_samples, val_samples, test_samples = (
    build_cluster_samples(split_frame)
    for split_frame in (train_df, val_df, test_df)
)

# Report how many samples each split produced.
for split_label, split_samples in (
    ("Train", train_samples),
    ("Val", val_samples),
    ("Test", test_samples),
):
    print(f"{split_label} samples:", len(split_samples))


Train samples: 1648
Val samples: 206
Test samples: 206


In [None]:
pip install -U transformers datasets accelerate peft bitsandbytes




In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from datasets import Dataset
from peft import LoraConfig, get_peft_model
import torch

# =========================
# CONFIG
# =========================
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
OUTPUT_DIR = "./llama_finetuned"

MAX_LEN = 1024        # token length every example is truncated/padded to
LR = 1e-4
EPOCHS = 2
BATCH_SIZE = 2        # per-device train batch size
GRAD_ACC = 4          # gradient accumulation steps
LOGGING_STEPS = 50
SEED = 42             # seeds the random LoRA initialisation below

# =========================
# DATA (cluster samples built earlier in the notebook)
# =========================


train_ds = Dataset.from_list(train_samples)
val_ds   = Dataset.from_list(val_samples)

# =========================
# TOKENIZER
# =========================
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse EOS as the padding token so padding="max_length" has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token

# =========================
# 4-BIT CONFIG
# =========================
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# =========================
# MODEL
# =========================
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto"
)

# =========================
# LoRA CONFIG
# =========================
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# The adapter weights are created here with random initialisation; seeding
# first makes the starting point identical across kernel restarts.
# NOTE(review): PEFT's quantization guide also suggests
# prepare_model_for_kbit_training() before this call - consider adding it.
torch.manual_seed(SEED)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# =========================
# PROMPT FORMAT
# =========================
def format_prompt(source, summary):
    """Render one training text: instruction, document and summary sections.

    The three sections are separated by a blank line; ``source`` fills the
    ``### Document:`` section and ``summary`` follows ``### Summary:``.
    """
    sections = [
        "### Instruction:\nSummarize the following document.",
        f"### Document:\n{source}",
        f"### Summary:\n{summary}",
    ]
    return "\n\n".join(sections)

# =========================
# PREPROCESS
# =========================
def preprocess(batch):
    """Tokenize a batch of (source, summary) pairs into fixed-length examples.

    The summary is the last part of the prompt, and the tokenizer call below
    truncates on the right (the tokenizer default), so a long source would
    push the summary out of the ``MAX_LEN`` window. To keep the target
    visible, an over-long source is shortened first so that the prompt
    scaffold, the source and the summary fit together. Sources that already
    fit are left untouched.

    Args:
        batch: mapping with parallel ``source`` and ``summary`` lists, as
            passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output (padded/truncated to ``MAX_LEN``) with an added
        ``labels`` entry that is a copy of ``input_ids``.
    """
    # Head-room for token-count drift when a trimmed source is decoded back to
    # text and re-tokenized inside the full prompt.
    reserve = 8

    sources = list(batch["source"])
    summaries = list(batch["summary"])

    # Tokens needed by everything except the source (scaffold + summary).
    overhead_ids = tokenizer([format_prompt("", t) for t in summaries])["input_ids"]
    source_ids = tokenizer(sources, add_special_tokens=False)["input_ids"]

    texts = []
    for source, summary, overhead, ids in zip(sources, summaries, overhead_ids, source_ids):
        budget = max(MAX_LEN - len(overhead) - reserve, 0)
        if len(ids) > budget:
            # Drop the tail of the source rather than the summary.
            source = tokenizer.decode(ids[:budget], skip_special_tokens=True)
        texts.append(format_prompt(source, summary))

    tokenized = tokenizer(
        texts,
        max_length=MAX_LEN,
        truncation=True,   # safety net; the source trimming above does the real work
        padding="max_length"
    )

    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Tokenize both splits; the raw text columns are dropped afterwards.
train_data = train_ds.map(preprocess, batched=True, remove_columns=train_ds.column_names)
val_data   = val_ds.map(preprocess, batched=True, remove_columns=val_ds.column_names)

# =========================
# DATA COLLATOR
# =========================
# mlm=False selects causal-LM batching.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# =========================
# TRAINING ARGUMENTS (LATEST TRANSFORMERS)
# =========================
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,   # keep eval memory in line with training
    gradient_accumulation_steps=GRAD_ACC,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    fp16=True,
    logging_steps=LOGGING_STEPS,
    eval_strategy="epoch",                   # report validation loss after every epoch
    save_strategy="no",
    report_to="none"
)

# =========================
# TRAINER (NO tokenizer ARG)
# =========================
# val_data was tokenized but never evaluated before; it is now the eval set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator
)

# =========================
# TRAIN
# =========================
print("🚀 Training started...")
trainer.train()

# =========================
# SAVE
# =========================
# Writes the model (adapter) files and the tokenizer files into OUTPUT_DIR.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("✅ DONE. Model saved.")


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

trainable params: 2,252,800 || all params: 1,102,301,184 || trainable%: 0.2044


Map:   0%|          | 0/1648 [00:00<?, ? examples/s]

Map:   0%|          | 0/206 [00:00<?, ? examples/s]

🚀 Training started...


Step,Training Loss
50,1.566038
100,1.536581
150,1.482328
200,1.510573
250,1.486085
300,1.461407
350,1.494395
400,1.480805


✅ DONE. Model saved.


In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from datasets import Dataset
from peft import PeftModel
import torch

# =========================
# CONFIG
# =========================
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
LORA_DIR = "./llama_finetuned"   # where LoRA was saved

MAX_LEN = 1024
LR = 1e-4
# Number of ADDITIONAL epochs. The Trainer below is created fresh (no
# resume_from_checkpoint), so it counts from epoch 0; 2 here on top of the
# 2 earlier epochs gives the intended 4 in total.
EPOCHS = 2
BATCH_SIZE = 2
GRAD_ACC = 4
LOGGING_STEPS = 50

# =========================
# DATA
# =========================
train_ds = Dataset.from_list(train_samples)
val_ds   = Dataset.from_list(val_samples)

# =========================
# TOKENIZER
# =========================
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Reuse EOS as the padding token so padding="max_length" has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token

# =========================
# 4-BIT CONFIG
# =========================
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# =========================
# LOAD BASE MODEL
# =========================
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto"
)

# =========================
# 🔥 LOAD LoRA ADAPTER (THIS IS THE KEY FIX)
# =========================
# Attach the previously saved adapter to the freshly loaded base model
# instead of creating a new, randomly initialised one.
model = PeftModel.from_pretrained(
    base_model,
    LORA_DIR,
    is_trainable=True   # 🔥 REQUIRED to continue training
)

model.print_trainable_parameters()

# =========================
# PROMPT FORMAT
# =========================
def format_prompt(source, summary):
    """Build the instruction / document / summary text for one example.

    ``source`` is placed under ``### Document:`` and ``summary`` under
    ``### Summary:``; sections are separated by a blank line.
    """
    header = "### Instruction:\nSummarize the following document.\n\n"
    document = f"### Document:\n{source}\n\n"
    target = f"### Summary:\n{summary}"
    return header + document + target

# =========================
# PREPROCESS
# =========================
def preprocess(batch):
    """Tokenize a batch of (source, summary) pairs into fixed-length examples.

    The summary is the last part of the prompt, and the tokenizer call below
    truncates on the right (the tokenizer default), so a long source would
    push the summary out of the ``MAX_LEN`` window. To keep the target
    visible, an over-long source is shortened first so that the prompt
    scaffold, the source and the summary fit together. Sources that already
    fit are left untouched.

    Args:
        batch: mapping with parallel ``source`` and ``summary`` lists, as
            passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output (padded/truncated to ``MAX_LEN``) with an added
        ``labels`` entry that is a copy of ``input_ids``.
    """
    # Head-room for token-count drift when a trimmed source is decoded back to
    # text and re-tokenized inside the full prompt.
    reserve = 8

    sources = list(batch["source"])
    summaries = list(batch["summary"])

    # Tokens needed by everything except the source (scaffold + summary).
    overhead_ids = tokenizer([format_prompt("", t) for t in summaries])["input_ids"]
    source_ids = tokenizer(sources, add_special_tokens=False)["input_ids"]

    texts = []
    for source, summary, overhead, ids in zip(sources, summaries, overhead_ids, source_ids):
        budget = max(MAX_LEN - len(overhead) - reserve, 0)
        if len(ids) > budget:
            # Drop the tail of the source rather than the summary.
            source = tokenizer.decode(ids[:budget], skip_special_tokens=True)
        texts.append(format_prompt(source, summary))

    tokenized = tokenizer(
        texts,
        max_length=MAX_LEN,
        truncation=True,   # safety net; the source trimming above does the real work
        padding="max_length"
    )

    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Tokenize both splits; the raw text columns are dropped afterwards.
train_data = train_ds.map(preprocess, batched=True, remove_columns=train_ds.column_names)
val_data   = val_ds.map(preprocess, batched=True, remove_columns=val_ds.column_names)

# =========================
# DATA COLLATOR
# =========================
# mlm=False selects causal-LM batching.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# =========================
# TRAINING ARGS
# =========================
training_args = TrainingArguments(
    output_dir=LORA_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,   # keep eval memory in line with training
    gradient_accumulation_steps=GRAD_ACC,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    fp16=True,
    logging_steps=LOGGING_STEPS,
    eval_strategy="epoch",                   # report validation loss after every epoch
    save_strategy="no",
    report_to="none"
)

# =========================
# TRAINER
# =========================
# val_data was tokenized but never evaluated before; it is now the eval set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator
)

print("🚀 Resuming LoRA training correctly...")
trainer.train()

# =========================
# SAVE UPDATED LoRA
# =========================
# Overwrites the adapter and tokenizer files previously stored in LORA_DIR.
model.save_pretrained(LORA_DIR)
tokenizer.save_pretrained(LORA_DIR)

print("✅ DONE. Training resumed successfully.")


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

trainable params: 2,252,800 || all params: 1,102,301,184 || trainable%: 0.2044


Map:   0%|          | 0/1648 [00:00<?, ? examples/s]

Map:   0%|          | 0/206 [00:00<?, ? examples/s]

🚀 Resuming LoRA training correctly...


Step,Training Loss
50,1.481051
100,1.504873
150,1.459525
200,1.493566
250,1.466819
300,1.443842
350,1.475943
400,1.464469
450,1.479508
500,1.460639


✅ DONE. Training resumed successfully.
