In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import os
import numpy as np
from tqdm import tqdm

# --- Active Learning Configuration ---
INITIAL_LABELED_SIZE = 5000  # Start with a seed set of labeled data
QUERY_SIZE = 840             # Number of samples to query in each round (e.g., ~10 per class for 40 classes)
NUM_QUERY_ROUNDS = 10        # Number of active learning rounds
EPOCHS_PER_ROUND = 3         # Number of training epochs in each round
BATCH_SIZE = 64              # Batch size for training and inference

# --- 1. Load and Prepare Data ---
print("Loading and preparing data...")
# Make sure the path to your dataset is correct
df = pd.read_csv("/kaggle/input/news-dataset/News_Category_Dataset.csv")
df = df[['headline', 'category']].dropna().drop_duplicates()
# Shuffle the entire dataset once at the beginning
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Encode labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])
labels_list = label_encoder.classes_
num_classes = len(labels_list)

# --- 2. Active Learning Data Split (Stratified Initial Set) ---
print("Creating data splits with a stratified initial labeled set...")
# Split into a fixed test set first (20% of the data)
test_split_idx = int(0.8 * len(df))
train_val_df = df.iloc[:test_split_idx]
test_df = df.iloc[test_split_idx:]

# Determine how many samples to take from each class for the initial set
samples_per_class = max(1, INITIAL_LABELED_SIZE // num_classes)
print(f"Aiming for {samples_per_class} initial samples per class.")

# Group by category and take a balanced sample. This prevents initial bias.
labeled_df = train_val_df.groupby('category', group_keys=False).apply(
    lambda x: x.sample(min(len(x), samples_per_class))
)
print(f"Actual Initial Labeled Set Size (stratified): {len(labeled_df)}")

# The unlabeled pool is everything in the training pool that is NOT in the new labeled set
unlabeled_df = train_val_df.drop(labeled_df.index).reset_index(drop=True)
print(f"Unlabeled Pool Size: {len(unlabeled_df)}")
print(f"Test Set Size: {len(test_df)}")
print("-" * 30)


# --- 3. Initialize Model, Tokenizer, and Device ---
# Use BertForSequenceClassification and BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
print(f"Using device: {device}")


# Dataset class adapted for BERT
class BERTHeadlineDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=64):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        headline = row['headline']
        # The label is now a simple integer
        label = row['label']

        inputs = self.tokenizer.encode_plus(
            headline,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            # Return the label as a tensor
            'labels': torch.tensor(label, dtype=torch.long)
        }

# --- STABLE Helper function for calculating uncertainty with BERT ---
def get_uncertainty_scores(model, unlabeled_loader):
    """
    Calculates uncertainty scores using the classification logits from BERT.
    Uncertainty = 1 - confidence (max probability).
    """
    model.eval()
    uncertainties = []
    with torch.no_grad():
        for batch in tqdm(unlabeled_loader, desc="Acquiring from Unlabeled Pool"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Get logits from the classification model
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Convert logits to probabilities
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            max_probs, _ = torch.max(probabilities, dim=-1)

            # Calculate uncertainty and append
            batch_uncertainties = (1 - max_probs).cpu().numpy()
            uncertainties.extend(batch_uncertainties)
    return uncertainties


# --- 4. The Active Learning Loop ---
for round_num in range(NUM_QUERY_ROUNDS):
    print(f"\n--- Starting Active Learning Round {round_num + 1}/{NUM_QUERY_ROUNDS} ---")
    print(f"Current Labeled Set Size: {len(labeled_df)}")

    # --- TRAIN on current labeled data ---
    model.train()
    optimizer = AdamW(model.parameters(), lr=3e-5)
    # Use the new BERT Dataset
    train_dataset = BERTHeadlineDataset(labeled_df, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

    for epoch in range(EPOCHS_PER_ROUND):
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass is simpler for BERT classifier
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1} Avg Loss: {total_loss / len(train_loader):.4f}")

    # --- ACQUIRE & SELECT with DIVERSITY for Imbalanced Data ---
    if len(unlabeled_df) < QUERY_SIZE:
        print("Unlabeled pool is smaller than query size. Finishing training.")
        break

    unlabeled_dataset = BERTHeadlineDataset(unlabeled_df, tokenizer)
    unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # 1. Get uncertainty scores for the entire unlabeled pool
    uncertainty_scores = get_uncertainty_scores(model, unlabeled_loader)
    unlabeled_df['uncertainty'] = uncertainty_scores

    # 2. Get model predictions for each sample to enable grouping by class
    print("Getting predictions for diversity sampling...")
    preds_int = []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(unlabeled_loader, desc="Getting Predictions"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Get integer predictions by taking the argmax of the logits
            predictions = torch.argmax(outputs.logits, dim=1)
            preds_int.extend(predictions.cpu().numpy())

    # Convert integer predictions back to string category names
    unlabeled_df['predicted_category'] = label_encoder.inverse_transform(preds_int)

    # 3. Select the most uncertain samples from each predicted category
    print("Selecting diverse samples across predicted classes...")
    queried_indices = []
    unlabeled_df_sorted = unlabeled_df.sort_values('uncertainty', ascending=False)
    grouped = unlabeled_df_sorted.groupby('predicted_category')
    num_classes_predicted = unlabeled_df['predicted_category'].nunique()

    samples_per_class = max(1, QUERY_SIZE // num_classes_predicted) if num_classes_predicted > 0 else QUERY_SIZE

    for group_name, group_df in grouped:
        queried_indices.extend(group_df.head(samples_per_class).index)

    # 4. If we don't have enough samples, fill with the most uncertain ones overall
    if len(queried_indices) < QUERY_SIZE:
        remaining_needed = QUERY_SIZE - len(queried_indices)
        remaining_indices = unlabeled_df_sorted.index.difference(queried_indices)
        queried_indices.extend(remaining_indices[:remaining_needed])

    queried_indices = list(dict.fromkeys(queried_indices))
    queried_samples = unlabeled_df.loc[queried_indices]

    # --- UPDATE the datasets ---
    print(f"Querying {len(queried_samples)} new samples...")
    labeled_df = pd.concat([labeled_df, queried_samples.drop(columns=['uncertainty', 'predicted_category'])])
    unlabeled_df = unlabeled_df.drop(index=queried_indices).drop(columns=['uncertainty', 'predicted_category'])

    labeled_df = labeled_df.reset_index(drop=True)
    unlabeled_df = unlabeled_df.reset_index(drop=True)

    print(f"New Labeled Set Size: {len(labeled_df)}")
    print(f"Remaining Unlabeled Pool Size: {len(unlabeled_df)}")
    print("-" * 30)

# --- 5. Final Evaluation on the Held-out Test Set ---
model.eval()
print("\nPerforming final evaluation on the held-out test set...")
y_true = []
y_pred = []
test_dataset = BERTHeadlineDataset(test_df, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Final Evaluation"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)

        y_pred.extend(predictions.cpu().numpy())
        y_true.extend(labels.cpu().numpy())

# The y_true and y_pred are already integer encoded, no further transformation needed
y_true_encoded = np.array(y_true)
y_pred_encoded = np.array(y_pred)

# --- 6. Save Final Results ---
print("Saving final model and report...")
report_labels = np.unique(np.concatenate((y_true_encoded, y_pred_encoded)))
target_names_for_report = label_encoder.classes_[report_labels]

report = classification_report(
    y_true_encoded, y_pred_encoded,
    labels=report_labels,
    target_names=target_names_for_report,
    zero_division=0
)
acc = accuracy_score(y_true_encoded, y_pred_encoded)

output_dir = "bert_active_learning_final"
os.makedirs(output_dir, exist_ok=True)
report_path = os.path.join(output_dir, "classification_report.txt")
model_path = os.path.join(output_dir, "BERT_classifier_state_dict.pth")

with open(report_path, "w") as f:
    f.write(f"Final Labeled Dataset Size after {NUM_QUERY_ROUNDS} rounds: {len(labeled_df)}\n")
    f.write(f"Accuracy: {acc:.4f}\n\n")
    f.write(report)

torch.save(model.state_dict(), model_path)

print(f"\nModel state dict saved to: {model_path}")
print(f"Report saved to: {report_path}")

2025-06-11 09:07:13.020239: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749632833.215402      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749632833.270545      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading and preparing data...
Creating data splits with a stratified initial labeled set...
Aiming for 119 initial samples per class.
Actual Initial Labeled Set Size (stratified): 4998
Unlabeled Pool Size: 161483
Test Set Size: 41621
------------------------------


  labeled_df = train_val_df.groupby('category', group_keys=False).apply(


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda

--- Starting Active Learning Round 1/10 ---
Current Labeled Set Size: 4998


Training Epoch 1: 100%|██████████| 79/79 [00:47<00:00,  1.66it/s]


Epoch 1 Avg Loss: 3.6895


Training Epoch 2: 100%|██████████| 79/79 [00:51<00:00,  1.55it/s]


Epoch 2 Avg Loss: 2.9962


Training Epoch 3: 100%|██████████| 79/79 [00:56<00:00,  1.40it/s]


Epoch 3 Avg Loss: 2.2435


Acquiring from Unlabeled Pool: 100%|██████████| 2524/2524 [10:46<00:00,  3.90it/s]


Getting predictions for diversity sampling...


Getting Predictions: 100%|██████████| 2524/2524 [10:47<00:00,  3.90it/s]


Selecting diverse samples across predicted classes...
Querying 840 new samples...
New Labeled Set Size: 5838
Remaining Unlabeled Pool Size: 160643
------------------------------

--- Starting Active Learning Round 2/10 ---
Current Labeled Set Size: 5838


Training Epoch 1: 100%|██████████| 92/92 [01:04<00:00,  1.43it/s]


Epoch 1 Avg Loss: 1.9317


Training Epoch 2: 100%|██████████| 92/92 [01:04<00:00,  1.43it/s]


Epoch 2 Avg Loss: 1.4488


Training Epoch 3: 100%|██████████| 92/92 [01:04<00:00,  1.43it/s]


Epoch 3 Avg Loss: 1.0745


Acquiring from Unlabeled Pool: 100%|██████████| 2511/2511 [10:44<00:00,  3.90it/s]


Getting predictions for diversity sampling...


Getting Predictions: 100%|██████████| 2511/2511 [10:44<00:00,  3.90it/s]


Selecting diverse samples across predicted classes...
Querying 840 new samples...
New Labeled Set Size: 6678
Remaining Unlabeled Pool Size: 159803
------------------------------

--- Starting Active Learning Round 3/10 ---
Current Labeled Set Size: 6678


Training Epoch 1: 100%|██████████| 105/105 [01:13<00:00,  1.43it/s]


Epoch 1 Avg Loss: 1.1251


Training Epoch 2: 100%|██████████| 105/105 [01:13<00:00,  1.42it/s]


Epoch 2 Avg Loss: 0.7589


Training Epoch 3: 100%|██████████| 105/105 [01:13<00:00,  1.43it/s]


Epoch 3 Avg Loss: 0.5348


Acquiring from Unlabeled Pool: 100%|██████████| 2497/2497 [10:40<00:00,  3.90it/s]


Getting predictions for diversity sampling...


Getting Predictions: 100%|██████████| 2497/2497 [10:40<00:00,  3.90it/s]


Selecting diverse samples across predicted classes...
Querying 840 new samples...
New Labeled Set Size: 7518
Remaining Unlabeled Pool Size: 158963
------------------------------

--- Starting Active Learning Round 4/10 ---
Current Labeled Set Size: 7518


Training Epoch 1: 100%|██████████| 118/118 [01:22<00:00,  1.42it/s]


Epoch 1 Avg Loss: 0.7456


Training Epoch 2: 100%|██████████| 118/118 [01:23<00:00,  1.42it/s]


Epoch 2 Avg Loss: 0.4416


Training Epoch 3: 100%|██████████| 118/118 [01:22<00:00,  1.42it/s]


Epoch 3 Avg Loss: 0.2966


Acquiring from Unlabeled Pool: 100%|██████████| 2484/2484 [10:36<00:00,  3.90it/s]


Getting predictions for diversity sampling...


Getting Predictions: 100%|██████████| 2484/2484 [10:36<00:00,  3.90it/s]


Selecting diverse samples across predicted classes...
Querying 840 new samples...
New Labeled Set Size: 8358
Remaining Unlabeled Pool Size: 158123
------------------------------

--- Starting Active Learning Round 5/10 ---
Current Labeled Set Size: 8358


Training Epoch 1: 100%|██████████| 131/131 [01:32<00:00,  1.42it/s]


Epoch 1 Avg Loss: 0.5536


Training Epoch 2: 100%|██████████| 131/131 [01:32<00:00,  1.42it/s]


Epoch 2 Avg Loss: 0.3186


Training Epoch 3: 100%|██████████| 131/131 [01:32<00:00,  1.42it/s]


Epoch 3 Avg Loss: 0.1820


Acquiring from Unlabeled Pool: 100%|██████████| 2471/2471 [10:33<00:00,  3.90it/s]


Getting predictions for diversity sampling...


Getting Predictions: 100%|██████████| 2471/2471 [10:34<00:00,  3.90it/s]


Selecting diverse samples across predicted classes...
Querying 840 new samples...
New Labeled Set Size: 9198
Remaining Unlabeled Pool Size: 157283
------------------------------

--- Starting Active Learning Round 6/10 ---
Current Labeled Set Size: 9198


Training Epoch 1: 100%|██████████| 144/144 [01:41<00:00,  1.42it/s]


Epoch 1 Avg Loss: 0.4629


Training Epoch 2: 100%|██████████| 144/144 [01:41<00:00,  1.42it/s]


Epoch 2 Avg Loss: 0.2443


Training Epoch 3: 100%|██████████| 144/144 [01:41<00:00,  1.42it/s]


Epoch 3 Avg Loss: 0.1409


Acquiring from Unlabeled Pool: 100%|██████████| 2458/2458 [10:31<00:00,  3.89it/s]


Getting predictions for diversity sampling...


Getting Predictions: 100%|██████████| 2458/2458 [10:31<00:00,  3.89it/s]


Selecting diverse samples across predicted classes...
Querying 840 new samples...
New Labeled Set Size: 10038
Remaining Unlabeled Pool Size: 156443
------------------------------

--- Starting Active Learning Round 7/10 ---
Current Labeled Set Size: 10038


Training Epoch 1: 100%|██████████| 157/157 [01:50<00:00,  1.42it/s]


Epoch 1 Avg Loss: 0.4028


Training Epoch 2: 100%|██████████| 157/157 [01:50<00:00,  1.42it/s]


Epoch 2 Avg Loss: 0.2164


Training Epoch 3: 100%|██████████| 157/157 [01:50<00:00,  1.42it/s]


Epoch 3 Avg Loss: 0.1225


Acquiring from Unlabeled Pool: 100%|██████████| 2445/2445 [10:27<00:00,  3.90it/s]


Getting predictions for diversity sampling...


Getting Predictions: 100%|██████████| 2445/2445 [10:26<00:00,  3.90it/s]


Selecting diverse samples across predicted classes...
Querying 840 new samples...
New Labeled Set Size: 10878
Remaining Unlabeled Pool Size: 155603
------------------------------

--- Starting Active Learning Round 8/10 ---
Current Labeled Set Size: 10878


Training Epoch 1: 100%|██████████| 170/170 [02:00<00:00,  1.41it/s]


Epoch 1 Avg Loss: 0.3678


Training Epoch 2: 100%|██████████| 170/170 [01:59<00:00,  1.42it/s]


Epoch 2 Avg Loss: 0.1849


Training Epoch 3: 100%|██████████| 170/170 [01:59<00:00,  1.42it/s]


Epoch 3 Avg Loss: 0.1010


Acquiring from Unlabeled Pool: 100%|██████████| 2432/2432 [10:23<00:00,  3.90it/s]


Getting predictions for diversity sampling...


Getting Predictions: 100%|██████████| 2432/2432 [10:24<00:00,  3.90it/s]


Selecting diverse samples across predicted classes...
Querying 840 new samples...
New Labeled Set Size: 11718
Remaining Unlabeled Pool Size: 154763
------------------------------

--- Starting Active Learning Round 9/10 ---
Current Labeled Set Size: 11718


Training Epoch 1: 100%|██████████| 184/184 [02:09<00:00,  1.42it/s]


Epoch 1 Avg Loss: 0.3360


Training Epoch 2: 100%|██████████| 184/184 [02:08<00:00,  1.43it/s]


Epoch 2 Avg Loss: 0.1657


Training Epoch 3: 100%|██████████| 184/184 [02:08<00:00,  1.43it/s]


Epoch 3 Avg Loss: 0.0929


Acquiring from Unlabeled Pool: 100%|██████████| 2419/2419 [10:20<00:00,  3.90it/s]


Getting predictions for diversity sampling...


Getting Predictions: 100%|██████████| 2419/2419 [10:20<00:00,  3.90it/s]


Selecting diverse samples across predicted classes...
Querying 840 new samples...
New Labeled Set Size: 12558
Remaining Unlabeled Pool Size: 153923
------------------------------

--- Starting Active Learning Round 10/10 ---
Current Labeled Set Size: 12558


Training Epoch 1: 100%|██████████| 197/197 [02:18<00:00,  1.43it/s]


Epoch 1 Avg Loss: 0.3143


Training Epoch 2: 100%|██████████| 197/197 [02:18<00:00,  1.42it/s]


Epoch 2 Avg Loss: 0.1448


Training Epoch 3: 100%|██████████| 197/197 [02:18<00:00,  1.42it/s]


Epoch 3 Avg Loss: 0.0749


Acquiring from Unlabeled Pool: 100%|██████████| 2406/2406 [10:17<00:00,  3.90it/s]


Getting predictions for diversity sampling...


Getting Predictions: 100%|██████████| 2406/2406 [10:17<00:00,  3.90it/s]


Selecting diverse samples across predicted classes...
Querying 840 new samples...
New Labeled Set Size: 13398
Remaining Unlabeled Pool Size: 153083
------------------------------

Performing final evaluation on the held-out test set...


Final Evaluation: 100%|██████████| 651/651 [02:46<00:00,  3.90it/s]


Saving final model and report...

Model state dict saved to: bert_active_learning_final/BERT_classifier_state_dict.pth
Report saved to: bert_active_learning_final/classification_report.txt
