In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.optim import AdamW
import pickle
import os

# Load and preprocess data
df = pd.read_csv("/kaggle/input/news-dataset/News_Category_Dataset.csv")
df = df[['headline', 'category']].dropna()
df = df.sample(frac=1, random_state=42)  # shuffle

# Encode category labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])
labels_list = label_encoder.classes_

# Split dataset
split_idx = int(0.8 * len(df))
train_df = df.iloc[:split_idx]
test_df = df.iloc[split_idx:]

# Tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Custom Dataset
class T5HeadlineDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=64):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        headline = self.data.iloc[idx]['headline']
        label = self.data.iloc[idx]['category']

        input_text = f"Classify headline: {headline}"
        target_text = label

        input_encoding = self.tokenizer(
            input_text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        target_encoding = self.tokenizer(
            target_text,
            max_length=10,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            'input_ids': input_encoding['input_ids'].squeeze(),
            'attention_mask': input_encoding['attention_mask'].squeeze(),
            'labels': target_encoding['input_ids'].squeeze()
        }

# Loaders
train_dataset = T5HeadlineDataset(train_df, tokenizer)
test_dataset = T5HeadlineDataset(test_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=3e-5)

# Training
print("Training...")
model.train()
for epoch in range(10):
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

# Evaluation
print("Evaluating...")
model.eval()
y_preds = []
y_true = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=10)
        preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        true_labels = tokenizer.batch_decode(batch['labels'], skip_special_tokens=True)

        y_preds.extend(preds)
        y_true.extend(true_labels)

# Label encoding for classification report
y_preds_encoded = label_encoder.transform(y_preds)
y_true_encoded = label_encoder.transform(y_true)

report = classification_report(y_true_encoded, y_preds_encoded, target_names=labels_list)
acc = accuracy_score(y_true_encoded, y_preds_encoded)

# Save report and model
os.makedirs("t5_outputs", exist_ok=True)
with open("t5_outputs/classification_report.txt", "w") as f:
    f.write(f"Accuracy: {acc:.4f}\n\n")
    f.write(report)

with open("t5_outputs/T5_classifier.pkl", "wb") as f:
    pickle.dump(model, f)

print("Model saved to: t5_outputs/T5_classifier.pkl")
print("Classification report saved to: t5_outputs/classification_report.txt")


2025-05-12 08:38:32.396717: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747039112.553276      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747039112.596857      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Training...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1, Loss: 4065.0119
Epoch 2, Loss: 2074.3857
Epoch 3, Loss: 1849.5567
Epoch 4, Loss: 1724.3882
Epoch 5, Loss: 1634.0587
Epoch 6, Loss: 1566.8645
Epoch 7, Loss: 1510.0959
Epoch 8, Loss: 1461.7314
Epoch 9, Loss: 1418.1502
Epoch 10, Loss: 1379.9075
Evaluating...
Model saved to: t5_outputs/T5_classifier.pkl
Classification report saved to: t5_outputs/classification_report.txt
