In [1]:
!python -m pip install --upgrade pip
!pip install datasets accelerate transformers evaluate rouge_score safetensors --quiet



In [2]:
from datasets import Dataset,DatasetDict,load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


In [21]:
train_df = Dataset.load_from_disk("chunked_train")
test_df  = Dataset.load_from_disk("chunked_test")
validation_df = Dataset.load_from_disk("chunked_validation")

print(train_df)
print(test_df)
print(validation_df)

Dataset({
    features: ['article_chunk', 'summary_chunk'],
    num_rows: 1057334
})
Dataset({
    features: ['article_chunk', 'summary_chunk'],
    num_rows: 43560
})
Dataset({
    features: ['article_chunk', 'summary_chunk'],
    num_rows: 53480
})


In [22]:
splits = train_df.train_test_split(test_size=0.25, seed=42)

train_df = splits["test"]
 
print(f"Number of examples in the 25% split: {len(train_df)}")


Number of examples in the 25% split: 264334


In [23]:
splits = test_df.train_test_split(test_size=0.25, seed=42)

test_df = splits["test"]
 
print(f"Number of examples in the 25% split: {len(test_df)}")


Number of examples in the 25% split: 10890


In [5]:
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [20]:

def preprocess_function(examples,  prefix="summarize: ", max_context_length=512):
    """
    Preprocesses the dataset to ensure the total tokens (inputs + labels) fit 
    within max_context_length, allocating 25% to the summary and 75% to the article.

    Args:
        examples (dict): A batch of examples containing "document" and "summary".
        tokenizer: The tokenizer to tokenize inputs and labels.
        prefix (str): The prefix to add before the document text (e.g., "summarize: ").
        max_context_length (int): The total allowed context length.

    Returns:
        dict: A dictionary containing tokenized inputs and labels.
    """
    # Calculate the max lengths for inputs and labels
    max_target_length = int(max_context_length * 0.3)  # 30% for summary  128
    max_source_length = max_context_length - max_target_length  # 75% for article

    # Add prefix and tokenize the document (article)
    inputs = [prefix + doc for doc in examples["article_chunk"]]
    model_inputs = tokenizer(
        inputs,
        max_length=max_source_length,
        padding="max_length",
        truncation=True
    )

    # Tokenize the summary (labels)
    labels = tokenizer(
        examples["summary_chunk"],
        max_length=max_target_length,
        padding="max_length",
        truncation=True
    )

    # Add the tokenized labels to the model inputs
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs



In [None]:
tokenized_train = train_df.map(preprocess_function, batched=True, remove_columns=["article_chunk", "summary_chunk"])
tokenized_test = test_df.map(preprocess_function, batched=True, remove_columns=["article_chunk", "summary_chunk"])
print(tokenized_train)
print(tokenized_test)
tokenized_train.save_to_disk("tokenized_train")
tokenized_test.save_to_disk("tokenized_test")


In [24]:
tokenized_validation= validation_df.map(preprocess_function, batched=True, remove_columns=["article_chunk", "summary_chunk"])
tokenized_validation .save_to_disk("tokenized_validation")


Map:   0%|          | 0/53480 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/53480 [00:00<?, ? examples/s]

In [6]:

tokenized_train = Dataset .load_from_disk("tokenized_train")
tokenized_test = Dataset .load_from_disk("tokenized_test")

In [9]:
tokenized_train

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 264334
})

In [11]:

print(f"Train count: {train_count}")
print(f"Test count: {test_count}")

Train count: 0
Test count: 0


# Model Training

In [3]:
from evaluate import load
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import numpy as np
from nltk.tokenize import sent_tokenize
import nltk
nltk.download("punkt")
nltk.download('punkt_tab')
metric  = load("rouge")

  warn(f"Failed to load image Python extension: {e}")
[nltk_data] Downloading package punkt to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:

# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]
    
    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    
    # decode preds and labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    
    
   # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    # print("Predictions: ",decoded_preds)
    # print("Reference: ",decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  
    return result


from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=tokenizer.pad_token_id,
    pad_to_multiple_of=8
)


In [17]:
import os
root = "beta"
logs_dir = root+"./logs"
os.makedirs(logs_dir, exist_ok=True)

checkpoint_dir = root+"./checkpoints"

os.makedirs(checkpoint_dir, exist_ok=True)

# Global Parameters
L_RATE = 3e-4

PER_DEVICE_EVAL_BATCH = 24
PER_DEVICE_TRAIN_BATCH = 24
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 3
GEN_MAX_LEN = 200
LOG_FREQ = 1

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=checkpoint_dir,
    eval_strategy="epoch",
    logging_strategy="epoch",
    logging_steps=LOG_FREQ,   
    learning_rate=L_RATE,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
    weight_decay=WEIGHT_DECAY,
    generation_max_length  = GEN_MAX_LEN,
    save_total_limit=SAVE_TOTAL_LIM,
    num_train_epochs=NUM_EPOCHS,

    predict_with_generate=True,
    push_to_hub=False
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [23]:
resume_checkpoint_dir="beta./checkpoints/checkpoint-11000"
trainer.train(resume_from_checkpoint=resume_checkpoint_dir)

# trainer.train()
final_model_dir = root+"./model"
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.2316,0.220237,0.435483,0.25877,0.397995,0.398204
2,0.2199,0.216597,0.445193,0.267272,0.405745,0.40582
3,0.2024,0.215625,0.444601,0.265641,0.40513,0.405015




('beta./model/tokenizer_config.json',
 'beta./model/special_tokens_map.json',
 'beta./model/spiece.model',
 'beta./model/added_tokens.json',
 'beta./model/tokenizer.json')

In [None]:
# final_model_dir = root+"./model"
# model.save_pretrained(final_model_dir)
# tokenizer.save_pretrained(final_model_dir)

# Evaluation

In [7]:
final_model_dir = "beta./model"

In [8]:
model= AutoModelForSeq2SeqLM.from_pretrained(final_model_dir)
tokenizer = AutoTokenizer.from_pretrained(final_model_dir)
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [26]:

tokenized_validation = Dataset .load_from_disk("tokenized_validation")

def sample_fraction_of_dataset(split, fraction=0.1):
    return split.shuffle(seed=42).select(range(int(len(split) * fraction)))

validation_dataset_subset = sample_fraction_of_dataset(tokenized_validation, fraction=0.1) 
validation_dataset_subset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 5348
})

In [27]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print(device)

cuda


In [29]:
import time
import torch
preds =[]

for example in test_subset:
    start = time.time()
    input_ids = example["input_ids"]
    attention_mask = example["attention_mask"]
    inputs = {"input_ids": input_ids, "attention_mask": attention_mask}

    with torch.no_grad():
        output = model.generate(
            input_ids  = torch.tensor([input_ids]).to(device),
            attention_mask = torch.tensor([attention_mask]).to(device),
            max_length = 200,
            num_beams = 4,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            do_sample=True,
            early_stopping=True)
    pred_summary = tokenizer.decode(output[0], skip_special_tokens = True, clean_up_tokenization_spaces=True)
    end = time .time()
    # print("Done, time " ,end-start)
    preds .append(pred_summary)

preds[:3]

['Also used was a photo of a green smoothie - a much loved favourite of those involved in the ‘clean living’ movement.',
 'A new study of ancient stone tools suggests human weapons were no better than those created by Neanderthals.',
 'She says the Supreme Court seems to have become a place for partisan theatrics.']

In [30]:
reference = [tokenizer.decode(label, skip_special_tokens = True, clean_up_tokenization_spaces = True) for label in test_subset ["labels"]]
reference[:3]

["Paleo preacher Pete Evans's photo was also used in the article.",
 'Japanese researchers studied ancient stone weapons created by humans.',
 'Sally Kohn: Supreme Court seems to have increasingly become a place for partisan theatrics.']

In [31]:
metric  = load("rouge")
def compute_rouge(decoded_preds, decoded_labels):
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return result


rouge_scores = compute_rouge(preds, reference)
print("ROUGE Scores:", rouge_scores)

ROUGE Scores: {'rouge1': 0.4345137685923526, 'rouge2': 0.2703196501029389, 'rougeL': 0.398388581455548, 'rougeLsum': 0.398493401584058}
