In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.optim import AdamW
import os
import numpy as np
from tqdm import tqdm

# --- Active Learning Configuration ---
INITIAL_LABELED_SIZE = 1000  # Start with a seed set of labeled data
QUERY_SIZE = 400             # Number of samples to query in each round
NUM_QUERY_ROUNDS = 10        # Number of active learning rounds
EPOCHS_PER_ROUND = 3         # Number of training epochs in each round
BATCH_SIZE = 32              # Batch size for training and inference

# --- 1. Load and Preprocess Data ---
print("Loading and preparing data...")
df = pd.read_csv("/kaggle/input/news-dataset/News_Category_Dataset.csv")
df = df[['headline', 'category']].dropna().drop_duplicates()
# Shuffle the entire dataset once at the beginning
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Encode category labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])
labels_list = label_encoder.classes_

# --- 2. Active Learning Data Split (Stratified Initial Set) ---
print("Creating data splits with a stratified initial labeled set...")
# Split into a fixed test set first (20% of the data)
test_split_idx = int(0.8 * len(df))
train_val_df = df.iloc[:test_split_idx]
test_df = df.iloc[test_split_idx:]

# Create a stratified initial labeled set from the training data pool
num_classes = len(labels_list)
# Determine how many samples to take from each class for the initial set
samples_per_class = max(1, INITIAL_LABELED_SIZE // num_classes)
print(f"Aiming for {samples_per_class} initial samples per class.")

# Group by category and take a balanced sample. This prevents initial bias.
labeled_df = train_val_df.groupby('category', group_keys=False).apply(
    lambda x: x.sample(min(len(x), samples_per_class))
)
print(f"Actual Initial Labeled Set Size (stratified): {len(labeled_df)}")

# The unlabeled pool is everything in the training pool that is NOT in the new labeled set
unlabeled_df = train_val_df.drop(labeled_df.index).reset_index(drop=True)
print(f"Unlabeled Pool Size: {len(unlabeled_df)}")
print(f"Test Set Size: {len(test_df)}")
print("-" * 30)


# --- 3. Initialize Model, Tokenizer, and Device ---
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")

# Custom Dataset
class T5HeadlineDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=64):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        headline = row['headline']
        category = row['category']

        input_text = f"Classify headline: {headline}"
        target_text = category

        input_encoding = self.tokenizer(
            input_text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        target_encoding = self.tokenizer(
            target_text,
            max_length=10,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            'input_ids': input_encoding['input_ids'].squeeze(0),
            'attention_mask': input_encoding['attention_mask'].squeeze(0),
            'labels': target_encoding['input_ids'].squeeze(0)
        }


# --- STABLE Helper function for calculating uncertainty ---
def get_uncertainty_scores(model, tokenizer, unlabeled_loader):
    """
    Calculates uncertainty scores by inspecting the model's logits for the first
    decoded token. This is a stable method that prevents length mismatch errors.
    Uncertainty = 1 - confidence (max probability of the first potential token).
    """
    model.eval()
    uncertainties = []
    with torch.no_grad():
        for batch in tqdm(unlabeled_loader, desc="Acquiring from Unlabeled Pool"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Perform a direct forward pass to get the logits
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids) # T5 needs some labels for this pass

            # Logits for the very first potential output token are at sequence position 0.
            first_token_logits = outputs.logits[:, 0, :]
            probabilities = torch.nn.functional.softmax(first_token_logits, dim=-1)
            max_probs, _ = torch.max(probabilities, dim=-1)

            # Calculate uncertainty and append
            batch_uncertainties = (1 - max_probs).cpu().numpy()
            uncertainties.extend(batch_uncertainties)
    return uncertainties

# --- 4. The Active Learning Loop ---
for round_num in range(NUM_QUERY_ROUNDS):
    print(f"\n--- Starting Active Learning Round {round_num + 1}/{NUM_QUERY_ROUNDS} ---")
    print(f"Current Labeled Set Size: {len(labeled_df)}")

    # --- TRAIN on current labeled data ---
    model.train()
    optimizer = AdamW(model.parameters(), lr=3e-5)
    train_dataset = T5HeadlineDataset(labeled_df, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

    for epoch in range(EPOCHS_PER_ROUND):
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1} Avg Loss: {total_loss / len(train_loader):.4f}")

    # --- ACQUIRE & SELECT with DIVERSITY for Imbalanced Data ---
    if len(unlabeled_df) < QUERY_SIZE:
        print("Unlabeled pool is smaller than query size. Finishing training.")
        break

    unlabeled_dataset = T5HeadlineDataset(unlabeled_df, tokenizer)
    unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # 1. Get uncertainty scores for the entire unlabeled pool
    uncertainty_scores = get_uncertainty_scores(model, tokenizer, unlabeled_loader)
    unlabeled_df['uncertainty'] = uncertainty_scores

    # 2. Get model predictions for each sample to enable grouping by class
    print("Getting predictions for diversity sampling...")
    preds = []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(unlabeled_loader, desc="Getting Predictions"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=10)
            batch_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            preds.extend(batch_preds)

    unlabeled_df['predicted_category'] = preds

    # 3. Select the most uncertain samples from each predicted category
    print("Selecting diverse samples across predicted classes...")
    queried_indices = []
    unlabeled_df_sorted = unlabeled_df.sort_values('uncertainty', ascending=False)
    grouped = unlabeled_df_sorted.groupby('predicted_category')
    num_classes_predicted = unlabeled_df['predicted_category'].nunique()

    samples_per_class = max(1, QUERY_SIZE // num_classes_predicted) if num_classes_predicted > 0 else QUERY_SIZE

    for group_name, group_df in grouped:
        queried_indices.extend(group_df.head(samples_per_class).index)

    # 4. If we don't have enough samples, fill with the most uncertain ones overall
    if len(queried_indices) < QUERY_SIZE:
        remaining_needed = QUERY_SIZE - len(queried_indices)
        remaining_indices = unlabeled_df_sorted.index.difference(queried_indices)
        queried_indices.extend(remaining_indices[:remaining_needed])

    # Ensure unique indices
    queried_indices = list(dict.fromkeys(queried_indices))
    queried_samples = unlabeled_df.loc[queried_indices]

    # --- UPDATE the datasets ---
    print(f"Querying {len(queried_samples)} new samples...")
    labeled_df = pd.concat([labeled_df, queried_samples.drop(columns=['uncertainty', 'predicted_category'])])
    unlabeled_df = unlabeled_df.drop(index=queried_indices).drop(columns=['uncertainty', 'predicted_category'])

    # Reset indices
    labeled_df = labeled_df.reset_index(drop=True)
    unlabeled_df = unlabeled_df.reset_index(drop=True)

    print(f"New Labeled Set Size: {len(labeled_df)}")
    print(f"Remaining Unlabeled Pool Size: {len(unlabeled_df)}")
    print("-" * 30)

# --- 5. Final Evaluation on the Held-out Test Set ---
model.eval()
print("\nPerforming final evaluation on the held-out test set...")
y_true = []
y_pred = []

test_dataset = T5HeadlineDataset(test_df, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Final Evaluation"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']

        generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=10)
        preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        true_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        y_pred.extend(preds)
        y_true.extend(true_labels)

# Filter out invalid labels before encoding
valid_labels = set(label_encoder.classes_)
y_true_filtered, y_pred_filtered = [], []
for true, pred in zip(y_true, y_pred):
    if true in valid_labels and pred in valid_labels:
        y_true_filtered.append(true)
        y_pred_filtered.append(pred)

# Encode string labels to integers for metrics
y_true_encoded = label_encoder.transform(y_true_filtered)
y_pred_encoded = label_encoder.transform(y_pred_filtered)

# --- 6. Save Final Results ---
print("Saving final model and report...")
report_labels = np.unique(np.concatenate((y_true_encoded, y_pred_encoded)))
target_names_for_report = label_encoder.classes_[report_labels]

# Use zero_division=0 to prevent errors if a class has no predicted samples
report = classification_report(
    y_true_encoded, y_pred_encoded,
    labels=report_labels,
    target_names=target_names_for_report,
    zero_division=0
)
acc = accuracy_score(y_true_encoded, y_pred_encoded)

output_dir = "t5_active_learning_final"
os.makedirs(output_dir, exist_ok=True)
report_path = os.path.join(output_dir, "classification_report.txt")
model_path = os.path.join(output_dir, "T5_classifier_state_dict.pth")

with open(report_path, "w") as f:
    f.write(f"Final Labeled Dataset Size after {NUM_QUERY_ROUNDS} rounds: {len(labeled_df)}\n")
    f.write(f"Accuracy: {acc:.4f}\n\n")
    f.write(report)

# Saving the model's state_dict is more robust than pickling
torch.save(model.state_dict(), model_path)

print(f"\nModel state dict saved to: {model_path}")
print(f"Report saved to: {report_path}")

2025-06-11 04:34:47.917288: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749616488.124777      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749616488.189909      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading and preparing data...
Creating data splits with a stratified initial labeled set...
Aiming for 23 initial samples per class.
Actual Initial Labeled Set Size (stratified): 966
Unlabeled Pool Size: 165515
Test Set Size: 41621
------------------------------


  labeled_df = train_val_df.groupby('category', group_keys=False).apply(


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Using device: cuda

--- Starting Active Learning Round 1/10 ---
Current Labeled Set Size: 966


Training Epoch 1:   0%|          | 0/31 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Training Epoch 1: 100%|██████████| 31/31 [00:05<00:00,  5.64it/s]


Epoch 1 Avg Loss: 6.6219


Training Epoch 2: 100%|██████████| 31/31 [00:04<00:00,  6.98it/s]


Epoch 2 Avg Loss: 3.9312


Training Epoch 3: 100%|██████████| 31/31 [00:04<00:00,  6.95it/s]


Epoch 3 Avg Loss: 3.1582


Acquiring from Unlabeled Pool: 100%|██████████| 5173/5173 [09:32<00:00,  9.03it/s]


Getting predictions for diversity sampling...


Getting Predictions: 100%|██████████| 5173/5173 [13:08<00:00,  6.56it/s]


Selecting diverse samples across predicted classes...
Querying 2212 new samples...
New Labeled Set Size: 3178
Remaining Unlabeled Pool Size: 163303
------------------------------

--- Starting Active Learning Round 2/10 ---
Current Labeled Set Size: 3178


Training Epoch 1: 100%|██████████| 100/100 [00:17<00:00,  5.86it/s]


Epoch 1 Avg Loss: 2.0077


Training Epoch 2: 100%|██████████| 100/100 [00:16<00:00,  5.93it/s]


Epoch 2 Avg Loss: 1.1692


Training Epoch 3: 100%|██████████| 100/100 [00:16<00:00,  6.04it/s]


Epoch 3 Avg Loss: 0.8776


Acquiring from Unlabeled Pool: 100%|██████████| 5104/5104 [09:34<00:00,  8.88it/s]


Getting predictions for diversity sampling...


Getting Predictions: 100%|██████████| 5104/5104 [10:47<00:00,  7.89it/s]


Selecting diverse samples across predicted classes...
Querying 400 new samples...
New Labeled Set Size: 3578
Remaining Unlabeled Pool Size: 162903
------------------------------

--- Starting Active Learning Round 3/10 ---
Current Labeled Set Size: 3578


Training Epoch 1: 100%|██████████| 112/112 [00:19<00:00,  5.86it/s]


Epoch 1 Avg Loss: 0.6738


Training Epoch 2: 100%|██████████| 112/112 [00:18<00:00,  5.91it/s]


Epoch 2 Avg Loss: 0.5480


Training Epoch 3: 100%|██████████| 112/112 [00:18<00:00,  6.04it/s]


Epoch 3 Avg Loss: 0.4782


Acquiring from Unlabeled Pool: 100%|██████████| 5091/5091 [09:33<00:00,  8.88it/s]


Getting predictions for diversity sampling...


Getting Predictions: 100%|██████████| 5091/5091 [12:14<00:00,  6.93it/s]


Selecting diverse samples across predicted classes...
Querying 400 new samples...
New Labeled Set Size: 3978
Remaining Unlabeled Pool Size: 162503
------------------------------

--- Starting Active Learning Round 4/10 ---
Current Labeled Set Size: 3978


Training Epoch 1: 100%|██████████| 125/125 [00:21<00:00,  5.82it/s]


Epoch 1 Avg Loss: 0.4385


Training Epoch 2: 100%|██████████| 125/125 [00:20<00:00,  5.96it/s]


Epoch 2 Avg Loss: 0.4097


Training Epoch 3: 100%|██████████| 125/125 [00:20<00:00,  6.07it/s]


Epoch 3 Avg Loss: 0.3920


Acquiring from Unlabeled Pool: 100%|██████████| 5079/5079 [09:33<00:00,  8.86it/s]


Getting predictions for diversity sampling...


Getting Predictions: 100%|██████████| 5079/5079 [12:34<00:00,  6.73it/s]


Selecting diverse samples across predicted classes...
Querying 400 new samples...
New Labeled Set Size: 4378
Remaining Unlabeled Pool Size: 162103
------------------------------

--- Starting Active Learning Round 5/10 ---
Current Labeled Set Size: 4378


Training Epoch 1: 100%|██████████| 137/137 [00:23<00:00,  5.85it/s]


Epoch 1 Avg Loss: 0.3700


Training Epoch 2: 100%|██████████| 137/137 [00:23<00:00,  5.94it/s]


Epoch 2 Avg Loss: 0.3539


Training Epoch 3: 100%|██████████| 137/137 [00:22<00:00,  6.05it/s]


Epoch 3 Avg Loss: 0.3424


Acquiring from Unlabeled Pool: 100%|██████████| 5066/5066 [09:31<00:00,  8.87it/s]


Getting predictions for diversity sampling...


Getting Predictions: 100%|██████████| 5066/5066 [12:38<00:00,  6.68it/s]


Selecting diverse samples across predicted classes...
Querying 400 new samples...
New Labeled Set Size: 4778
Remaining Unlabeled Pool Size: 161703
------------------------------

--- Starting Active Learning Round 6/10 ---
Current Labeled Set Size: 4778


Training Epoch 1: 100%|██████████| 150/150 [00:25<00:00,  5.86it/s]


Epoch 1 Avg Loss: 0.3269


Training Epoch 2: 100%|██████████| 150/150 [00:25<00:00,  6.00it/s]


Epoch 2 Avg Loss: 0.3145


Training Epoch 3: 100%|██████████| 150/150 [00:24<00:00,  6.08it/s]


Epoch 3 Avg Loss: 0.3067


Acquiring from Unlabeled Pool: 100%|██████████| 5054/5054 [09:29<00:00,  8.87it/s]


Getting predictions for diversity sampling...


Getting Predictions: 100%|██████████| 5054/5054 [12:31<00:00,  6.72it/s]


Selecting diverse samples across predicted classes...
Querying 400 new samples...
New Labeled Set Size: 5178
Remaining Unlabeled Pool Size: 161303
------------------------------

--- Starting Active Learning Round 7/10 ---
Current Labeled Set Size: 5178


Training Epoch 1: 100%|██████████| 162/162 [00:27<00:00,  5.86it/s]


Epoch 1 Avg Loss: 0.2991


Training Epoch 2: 100%|██████████| 162/162 [00:26<00:00,  6.00it/s]


Epoch 2 Avg Loss: 0.2884


Training Epoch 3: 100%|██████████| 162/162 [00:26<00:00,  6.00it/s]


Epoch 3 Avg Loss: 0.2823


Acquiring from Unlabeled Pool: 100%|██████████| 5041/5041 [09:27<00:00,  8.88it/s]


Getting predictions for diversity sampling...


Getting Predictions: 100%|██████████| 5041/5041 [12:52<00:00,  6.53it/s]


Selecting diverse samples across predicted classes...
Querying 400 new samples...
New Labeled Set Size: 5578
Remaining Unlabeled Pool Size: 160903
------------------------------

--- Starting Active Learning Round 8/10 ---
Current Labeled Set Size: 5578


Training Epoch 1: 100%|██████████| 175/175 [00:30<00:00,  5.82it/s]


Epoch 1 Avg Loss: 0.2752


Training Epoch 2: 100%|██████████| 175/175 [00:29<00:00,  6.02it/s]


Epoch 2 Avg Loss: 0.2676


Training Epoch 3: 100%|██████████| 175/175 [00:28<00:00,  6.05it/s]


Epoch 3 Avg Loss: 0.2583


Acquiring from Unlabeled Pool: 100%|██████████| 5029/5029 [09:27<00:00,  8.86it/s]


Getting predictions for diversity sampling...


Getting Predictions: 100%|██████████| 5029/5029 [13:07<00:00,  6.38it/s]


Selecting diverse samples across predicted classes...
Querying 400 new samples...
New Labeled Set Size: 5978
Remaining Unlabeled Pool Size: 160503
------------------------------

--- Starting Active Learning Round 9/10 ---
Current Labeled Set Size: 5978


Training Epoch 1: 100%|██████████| 187/187 [00:31<00:00,  5.86it/s]


Epoch 1 Avg Loss: 0.2560


Training Epoch 2: 100%|██████████| 187/187 [00:31<00:00,  5.98it/s]


Epoch 2 Avg Loss: 0.2499


Training Epoch 3: 100%|██████████| 187/187 [00:31<00:00,  5.98it/s]


Epoch 3 Avg Loss: 0.2459


Acquiring from Unlabeled Pool: 100%|██████████| 5016/5016 [09:27<00:00,  8.84it/s]


Getting predictions for diversity sampling...


Getting Predictions: 100%|██████████| 5016/5016 [13:14<00:00,  6.31it/s]


Selecting diverse samples across predicted classes...
Querying 400 new samples...
New Labeled Set Size: 6378
Remaining Unlabeled Pool Size: 160103
------------------------------

--- Starting Active Learning Round 10/10 ---
Current Labeled Set Size: 6378


Training Epoch 1: 100%|██████████| 200/200 [00:34<00:00,  5.85it/s]


Epoch 1 Avg Loss: 0.2412


Training Epoch 2: 100%|██████████| 200/200 [00:33<00:00,  6.03it/s]


Epoch 2 Avg Loss: 0.2368


Training Epoch 3: 100%|██████████| 200/200 [00:33<00:00,  5.96it/s]


Epoch 3 Avg Loss: 0.2316


Acquiring from Unlabeled Pool: 100%|██████████| 5004/5004 [09:25<00:00,  8.85it/s]


Getting predictions for diversity sampling...


Getting Predictions: 100%|██████████| 5004/5004 [13:11<00:00,  6.32it/s]


Selecting diverse samples across predicted classes...
Querying 400 new samples...
New Labeled Set Size: 6778
Remaining Unlabeled Pool Size: 159703
------------------------------

Performing final evaluation on the held-out test set...


Final Evaluation: 100%|██████████| 1301/1301 [04:12<00:00,  5.15it/s]


Saving final model and report...

Model state dict saved to: t5_active_learning_final/T5_classifier_state_dict.pth
Report saved to: t5_active_learning_final/classification_report.txt
