In [1]:
# Install required packages
!pip install -q transformers datasets accelerate evaluate rouge_score sentencepiece torch


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import os
import re
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
import evaluate

# Check GPU
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

CUDA available: True
GPU: NVIDIA H100 80GB HBM3
GPU Memory: 85.02 GB


## Step 1: Load and Explore Dataset

In [3]:
# Load dataset
dataset = load_dataset("kdave/Indian_Financial_News")

# Check structure
print(dataset)
print(f"\nTotal rows: {len(dataset['train'])}")
print("\nColumns:", dataset['train'].column_names)
print("\nSample row:")
print(f"Content: {dataset['train'][0]['Content'][:200]}...")
print(f"Summary: {dataset['train'][0]['Summary']}")

README.md: 0.00B [00:00, ?B/s]

training_data_26000.csv:   0%|          | 0.00/115M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/26961 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['URL', 'Content', 'Summary', 'Sentiment'],
        num_rows: 26961
    })
})

Total rows: 26961

Columns: ['URL', 'Content', 'Summary', 'Sentiment']

Sample row:
Content: US consumer spending dropped by a record in April as the COVID-19 pandemic undercut demand, buttressing expectations that the economy could contract in the second quarter at its steepest pace since th...
Summary: consumer spending plunges 13.6 percent in April. that was the biggest drop since the government started tracking series in 1959. consumer spending accounts for more than two-thirds of economic activity. economists polled by Reuters had forecast consumer spending plummeting 12.6 percent. a spokesman for the u.s. government said the data was not available.


## Step 2: Clean and Prepare Dataset

In [4]:
def clean_text(example):
    """Clean Content and Summary: remove extra spaces, newlines, special chars"""
    # Clean content
    text = example['Content']
    text = re.sub(r'\s+', ' ', text)  # Multiple spaces to single
    text = re.sub(r'[^a-zA-Z0-9\s.,!?%-]', '', text)  # Keep basic punctuation
    example['Content'] = text.strip()

    # Clean summary
    summary = example['Summary']
    summary = re.sub(r'\s+', ' ', summary)
    example['Summary'] = summary.strip()
    
    return example

# Clean dataset
dataset = dataset.map(clean_text)

# Split into train (95%) and test (5%) for final evaluation
# We use all train data for training (no validation during training)
dataset = dataset['train'].train_test_split(test_size=0.05, seed=42)
print(f"\nTrain: {len(dataset['train'])} samples")
print(f"Test (for final evaluation): {len(dataset['test'])} samples")

Map:   0%|          | 0/26961 [00:00<?, ? examples/s]


Train: 25612 samples
Test (for final evaluation): 1349 samples


## Step 3: Load FLAN-T5 Model and Tokenizer

In [5]:
# Load FLAN-T5 base model (you can also use flan-t5-large for better quality)
model_name = "google/flan-t5-base"
# model_name = "google/flan-t5-large"  # Uncomment for larger model

print(f"Loading {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

print(f"\nModel loaded: {model_name}")
print(f"Model parameters: {model.num_parameters() / 1e6:.2f}M")

Loading google/flan-t5-base...


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]


Model loaded: google/flan-t5-base
Model parameters: 247.58M


## Step 4: Tokenize Dataset

In [6]:
# FLAN-T5 specific preprocessing
# Add task prefix for better performance
def preprocess_flan_t5(examples):
    """Preprocess for FLAN-T5 with task prefix"""
    # Add summarization task prefix
    inputs = [f"summarize: {doc}" for doc in examples['Content']]
    
    # Tokenize inputs
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding=False  # Dynamic padding in data collator
    )
    
    # Tokenize targets
    labels = tokenizer(
        examples['Summary'],
        max_length=128,
        truncation=True,
        padding=False  # Dynamic padding in data collator
    )
    
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Tokenize dataset
print("Tokenizing dataset...")
tokenized_data = dataset.map(
    preprocess_flan_t5,
    batched=True,
    remove_columns=['Content', 'Summary', 'URL', 'Sentiment'],
    desc="Tokenizing"
)

print("\nTokenization complete!")
print(f"Train samples: {len(tokenized_data['train'])}")
print(f"Test samples: {len(tokenized_data['test'])}")

Tokenizing dataset...


Tokenizing:   0%|          | 0/25612 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/1349 [00:00<?, ? examples/s]


Tokenization complete!
Train samples: 25612
Test samples: 1349


## Step 5: Setup Training Arguments (Optimized for H100)

In [7]:
# Data collator for dynamic padding
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True
)

# Training arguments optimized for H100
training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-financial",
    
    # Training strategy
    eval_strategy="no",  # No evaluation during training
    save_strategy="epoch",  # Save after each epoch
    
    # Hyperparameters
    learning_rate=5e-5,  # Higher LR for FLAN-T5
    num_train_epochs=3,
    weight_decay=0.01,
    
    # Batch sizes (H100 can handle larger batches)
    per_device_train_batch_size=16,  # Adjust based on H100 memory
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,  # Effective batch size = 32
    
    # Performance optimization
    bf16=True,  # H100 supports BF16
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    gradient_checkpointing=False,  # H100 has enough memory
    
    # Logging
    logging_steps=100,
    logging_first_step=True,
    report_to="none",  # Disable wandb/tensorboard
    
    # Saving
    save_total_limit=2,  # Keep only last 2 checkpoints
    
    # Generation
    predict_with_generate=True,
    generation_max_length=128,
    generation_num_beams=4,
    
    # Misc
    push_to_hub=False,
    load_best_model_at_end=False,
)

print("Training arguments configured!")
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"Total training steps: {len(tokenized_data['train']) * training_args.num_train_epochs // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps)}")

Training arguments configured!
Effective batch size: 32
Total training steps: 2401


## Step 6: Setup Evaluation Metrics

In [12]:
# Load ROUGE metric
rouge = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Compute ROUGE metrics for evaluation"""
    preds, labels = eval_preds
    
    # Replace -100 in labels (used for padding)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Compute ROUGE scores
    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )
    
    return {
        "rouge1": result["rouge1"],
        "rouge2": result["rouge2"],
        "rougeL": result["rougeL"],
    }

print("Evaluation metrics configured!")

Downloading builder script: 0.00B [00:00, ?B/s]

Evaluation metrics configured!


In [13]:
# Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=None,  # No evaluation during training
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("Trainer initialized!")
print(f"Model device: {next(model.parameters()).device}")

  trainer = Seq2SeqTrainer(
Detected kernel version 4.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Trainer initialized!
Model device: cuda:0


## Step 8: Train the Model

In [14]:
print("Starting training...\n")
print("=" * 60)

# Train
train_result = trainer.train()

print("\n" + "=" * 60)
print("Training completed!")
print(f"Training time: {train_result.metrics['train_runtime']:.2f} seconds")
print(f"Training loss: {train_result.metrics['train_loss']:.4f}")
print(f"Samples per second: {train_result.metrics['train_samples_per_second']:.2f}")

Starting training...



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss
1,1.3249
100,0.7912
200,0.6706
300,0.6246
400,0.6075
500,0.5944
600,0.5766
700,0.5724
800,0.555
900,0.5283


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


Training completed!
Training time: 552.93 seconds
Training loss: 0.5407
Samples per second: 138.96


## Step 9: Final Evaluation on Test Set

In [16]:
import numpy as np
import evaluate

print("\n" + "=" * 60)
print("Starting final evaluation on test set...")
print("=" * 60)

# Disable metrics during evaluation
trainer.compute_metrics = None

# Run evaluation to get loss
eval_results = trainer.evaluate(
    eval_dataset=tokenized_data["test"],
    max_length=128,
    num_beams=2,
)

print(f"\n✅ Evaluation Loss: {eval_results['eval_loss']:.4f}")

# Generate predictions
print("\n⏳ Generating predictions for ROUGE scores...")
predictions = trainer.predict(
    test_dataset=tokenized_data["test"],
    max_length=128,
    num_beams=2,
)

preds = predictions.predictions
if isinstance(preds, tuple):
    preds = preds[0]

labels = predictions.label_ids

# FIX: Clip to valid vocab range (0 to 32127 for FLAN-T5)
vocab_size = tokenizer.vocab_size
preds = np.clip(preds, 0, vocab_size - 1).astype(np.int64)
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
labels = np.clip(labels, 0, vocab_size - 1).astype(np.int64)

# Decode
decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

# Compute ROUGE
rouge = evaluate.load("rouge")
rouge_results = rouge.compute(
    predictions=decoded_preds,
    references=decoded_labels,
    use_stemmer=True
)

# Print results
print("\n" + "=" * 60)
print("🎯 FINAL EVALUATION RESULTS")
print("=" * 60)
print(f"Evaluation Loss: {eval_results['eval_loss']:.4f}")
print(f"ROUGE-1: {rouge_results['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_results['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_results['rougeL']:.4f}")
print("=" * 60)


Starting final evaluation on test set...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


✅ Evaluation Loss: 0.3855

⏳ Generating predictions for ROUGE scores...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


🎯 FINAL EVALUATION RESULTS
Evaluation Loss: 0.3855
ROUGE-1: 0.5688
ROUGE-2: 0.4397
ROUGE-L: 0.5002


## Step 10: Generate Sample Predictions

In [17]:
# Generate predictions on a few test samples
print("\nGenerating sample predictions...\n")

test_samples = dataset['test'].select(range(3))

for i, sample in enumerate(test_samples):
    print(f"\n{'='*60}")
    print(f"SAMPLE {i+1}")
    print(f"{'='*60}")
    
    # Prepare input
    input_text = f"summarize: {sample['Content']}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(model.device)
    
    # Generate summary
    outputs = model.generate(
        **inputs,
        max_length=128,
        num_beams=4,
        early_stopping=True
    )
    
    predicted_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    print(f"\nOriginal Content (first 200 chars):\n{sample['Content'][:200]}...")
    print(f"\nReference Summary:\n{sample['Summary']}")
    print(f"\nGenerated Summary:\n{predicted_summary}")
    print(f"\n{'='*60}")


Generating sample predictions...


SAMPLE 1

Original Content (first 200 chars):
The consumer durables market in India was valued at 90,000 crore in 2019, according to industry estimates, but is expected to decline to 80,000 crore due to the loss in sales during the lockdown perio...

Reference Summary:
over 56 million dishwashers were sold in india in 2019, and this is expected to rise to 70 million this year. the microwave category — which has been witnessing a flat retail volume since 2017 — is expected to touch 1,548 million this year. in-store tactics Lloyd (part of Havells) forayed into the refrigerator segment in September with 25 models priced between 10,000 and 84,990.

Generated Summary:
consumer durables market in india was valued at 90,000 crore in 2019, but is expected to decline to 80,000 crore due to the loss in sales during the lockdown period. over 56 million units of dishwashers were sold in india in 2019, and this is expected to rise to 70 million this year. microwa

In [18]:
import os

SAVE_PATH = "/mnt/models/flan-t5-financial-final"
os.makedirs(SAVE_PATH, exist_ok=True)

trainer.save_model(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)

print(f"✅ Model saved to {SAVE_PATH}")

✅ Model saved to /mnt/models/flan-t5-financial-final


In [1]:
import shutil
from pathlib import Path

MODEL_DIR = Path("/mnt/models/flan-t5-financial-final")
ZIP_PATH = Path("/mnt/models/flan-t5-financial-final.zip")

# Create zip
shutil.make_archive(
    base_name=str(ZIP_PATH).replace(".zip", ""),
    format="zip",
    root_dir=MODEL_DIR
)

ZIP_PATH


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/models/flan-t5-financial-final'

In [2]:
import os

os.listdir("/mnt")


['my-flan-t5-volume']

In [3]:
import os

os.listdir("/mnt/my-flan-t5-volume")


[]

In [4]:
import os

model_paths = []

for root, dirs, files in os.walk("/"):
    if "config.json" in files and ("pytorch_model.bin" in files or "model.safetensors" in files):
        model_paths.append(root)

model_paths


[]

In [5]:
SAVE_PATH = "/mnt/my-flan-t5-volume/flan-t5-financial-final"

model.save_pretrained(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)

SAVE_PATH


NameError: name 'model' is not defined

In [6]:
import os

for root, dirs, files in os.walk("/"):
    for d in dirs:
        if d.startswith("checkpoint"):
            print(os.path.join(root, d))


/usr/local/lib/python3.12/site-packages/openai/resources/fine_tuning/checkpoints
/usr/local/lib/python3.12/site-packages/openai/types/fine_tuning/checkpoints
/usr/local/lib/python3.12/site-packages/orbax/checkpoint
/usr/local/lib/python3.12/site-packages/orbax/checkpoint/_src/checkpoint_managers
/usr/local/lib/python3.12/site-packages/orbax/checkpoint/_src/checkpointers
/usr/local/lib/python3.12/site-packages/sonnet/src/conformance/checkpoints
/usr/local/lib/python3.12/site-packages/torch/distributed/checkpoint
/usr/local/lib/python3.12/site-packages/torch/distributed/_shard/checkpoint
