In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration
from torch.optim import AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import pickle
import os

# Load and prepare data
df = pd.read_csv("/kaggle/input/news-dataset/News_Category_Dataset.csv")
df = df[['headline', 'category']].dropna()
df = df.sample(frac=1, random_state=42)

# Encode labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])
labels_list = label_encoder.classes_

# Split data
split = int(0.8 * len(df))
train_df = df.iloc[:split]
test_df = df.iloc[split:]

# Tokenizer and model
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

# Dataset class
class BARTHeadlineDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=64):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        headline = self.data.iloc[idx]['headline']
        category = self.data.iloc[idx]['category']

        input_text = f"Classify: {headline}"
        target_text = category

        inputs = self.tokenizer(input_text, max_length=self.max_len, padding='max_length', truncation=True, return_tensors="pt")
        targets = self.tokenizer(target_text, max_length=10, padding='max_length', truncation=True, return_tensors="pt")

        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': targets['input_ids'].squeeze()
        }

# DataLoaders
train_dataset = BARTHeadlineDataset(train_df, tokenizer)
test_dataset = BARTHeadlineDataset(test_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=3e-5)

# Training
model.train()
print("Training...")
for epoch in range(10):
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# Evaluation
model.eval()
print("Evaluating...")
y_true = []
y_pred = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']

        generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=10)
        preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        true_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        y_pred.extend(preds)
        y_true.extend(true_labels)

# Encode string labels to integers
y_pred_encoded = label_encoder.transform(y_pred)
y_true_encoded = label_encoder.transform(y_true)

# Report
report = classification_report(y_true_encoded, y_pred_encoded, target_names=labels_list)
acc = accuracy_score(y_true_encoded, y_pred_encoded)

# Save results
os.makedirs("bart_outputs", exist_ok=True)

with open("bart_outputs/classification_report.txt", "w") as f:
    f.write(f"Accuracy: {acc:.4f}\n\n")
    f.write(report)

with open("bart_outputs/BART_classifier.pkl", "wb") as f:
    pickle.dump(model, f)

print("Model saved to: bart_outputs/BART_classifier.pkl")
print("Report saved to: bart_outputs/classification_report.txt")

2025-05-12 08:42:26.346992: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747039346.595214      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747039346.670461      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Training...
Epoch 1, Loss: 1997.8259
Epoch 2, Loss: 1300.4318
Epoch 3, Loss: 1113.4660
Epoch 4, Loss: 962.5325
Epoch 5, Loss: 824.5777
Epoch 6, Loss: 701.2471
Epoch 7, Loss: 588.4131
Epoch 8, Loss: 490.4932
Epoch 9, Loss: 412.3317
Epoch 10, Loss: 345.1997
Evaluating...
Model saved to: bart_outputs/BART_classifier.pkl
Report saved to: bart_outputs/classification_report.txt
