In [22]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false" 

from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "distilbert-base-uncased"  # or "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [23]:
import pandas as pd

df = pd.read_csv("gov_docs.csv")
df["label"] = df["category"] + "|" + df["subcategory"]  # e.g., "healthcare|medicaid"

# Map labels to IDs
labels = df["label"].unique().tolist()
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for i, l in enumerate(labels)}

In [24]:
print(df.head())

                                                text    category subcategory  \
0  FDADrug Safety Communication FDA cautions agai...  healthcare         FDA   
1  Field Alert Report Submission Questions and An...  healthcare         FDA   
2  Acceptability of Draft Labeling to Support AND...  healthcare         FDA   
3  Contains Nonbinding Recommendations The Least ...  healthcare         FDA   
4  Center for Devices and Radiological Health Int...  healthcare         FDA   

                                         source_file           label  
0     Updated.DSC-Hydroxychloroquine.chloroquine.txt  healthcare|FDA  
1                                   24740676_FAR.txt  healthcare|FDA  
2  Acceptability-of-Draft-Labeling-to-Support-Abb...  healthcare|FDA  
3                                           1332.txt  healthcare|FDA  
4  CDRH_International_Harmonization_Draft_Strateg...  healthcare|FDA  


In [25]:
# Clean text column (run this before tokenization)
df['text'] = df['text'].fillna('')  # Replace NaN
df['text'] = df['text'].astype(str)  # Force string type
df['text'] = df['text'].str.strip()  # Remove whitespace

# Remove empty texts if needed
df = df[df['text'] != '']

In [26]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
print(f"Train samples: {len(train_df)}, Validation samples: {len(val_df)}")

Train samples: 207, Validation samples: 52


In [27]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Test tokenization on first row
try:
    test_text = df.iloc[0]['text']
    print("\nTokenizing test text:", test_text[:50], "...")
    tokens = tokenizer(test_text, truncation=True)
    print("Success! Tokenized output keys:", tokens.keys())
except Exception as e:
    print(f"Failed: {type(e).__name__}: {e}")
    print("Problem text:", test_text)


Tokenizing test text: FDADrug Safety Communication FDA cautions against  ...
Success! Tokenized output keys: dict_keys(['input_ids', 'attention_mask'])


In [28]:
import numpy as np
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")  # Automatically uses fast tokenizer

# Tokenize in batches (CPU-friendly)
def tokenize(batch):
    # Tokenize the text
    tokenized = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="np"  # Keep NumPy arrays for CPU efficiency
    )
    
    # Convert text labels to numerical IDs using your label2id mapping
    tokenized["labels"] = np.array([label2id[label] for label in batch["label"]])
    
    return tokenized
# def tokenize(batch):
#     return tokenizer(
#         batch["text"],
#         truncation=True,
#         padding="max_length",  # Pad to max_length for static shapes (better CPU performance)
#         max_length=512,        # DistilBERT's limit
#         return_tensors="np"    # NumPy arrays for CPU (smaller memory footprint)
#     )

# Apply to datasets
from datasets import Dataset
dataset = Dataset.from_pandas(df)
dataset = dataset.map(tokenize, batched=True, batch_size=100)  # Process 100 texts at once



train_dataset = Dataset.from_pandas(train_df).map(tokenize, batched=True, batch_size=100)
val_dataset = Dataset.from_pandas(val_df).map(tokenize, batched=True, batch_size=100)

columns_to_remove = ['text', 'category', 'subcategory', 'source_file', 'label', '__index_level_0__']
train_dataset = train_dataset.remove_columns(columns_to_remove)
val_dataset = val_dataset.remove_columns(columns_to_remove)

Map:   0%|          | 0/259 [00:00<?, ? examples/s]

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

Map:   0%|          | 0/52 [00:00<?, ? examples/s]

In [29]:
label_counts = df['label'].value_counts()  # Count occurrences of each label
print("Label distribution:\n", label_counts)

Label distribution:
 label
healthcare|FDA                     30
healthcare|medicaid                30
healthcare|medicare                28
education|k12funding               28
defense|cybersecurity              28
defense|procurement                26
education|studentloans             24
finance|budgets                    24
education|highereducationpolicy    23
finance|tax_policies               18
Name: count, dtype: int64


In [30]:
print("Sample training labels:", train_dataset[0]["labels"])  # Should be an integer
print("Label mapping:", label2id)  # Verify your mapping is correct

Sample training labels: 9
Label mapping: {'healthcare|FDA': 0, 'healthcare|medicaid': 1, 'healthcare|medicare': 2, 'education|highereducationpolicy': 3, 'education|k12funding': 4, 'education|studentloans': 5, 'finance|tax_policies': 6, 'finance|budgets': 7, 'defense|cybersecurity': 8, 'defense|procurement': 9}


In [31]:
# Verify all labels exist in your mapping
missing_labels = set(df['label']) - set(label2id.keys())
if missing_labels:
    raise ValueError(f"Labels missing from mapping: {missing_labels}")

# Check tokenized datasets
print("First training sample:", train_dataset[0].keys())
print("First validation sample:", val_dataset[0].keys())

First training sample: dict_keys(['input_ids', 'attention_mask', 'labels'])
First validation sample: dict_keys(['input_ids', 'attention_mask', 'labels'])


In [32]:
# After loading CSV
df = df.dropna(subset=['text', 'label'])  # Remove rows with missing text or label
df = df[df['text'].str.strip() != '']     # Remove empty texts

In [33]:
from transformers import AutoModelForSequenceClassification, Trainer
import torch

# Get class weights (critical for imbalanced data)
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=df['label'].unique(),
    y=df['label']
)
weights = torch.tensor(class_weights, dtype=torch.float32)

# Model with weighted loss
# model = AutoModelForSequenceClassification.from_pretrained(
#     "distilbert-base-uncased",
#     num_labels=len(label_counts),
#     id2label={i: l for i, l in enumerate(label_counts.index)},
#     label2id={l: i for i, l in enumerate(label_counts.index)}
# )

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label2id),  # Use label2id count
    id2label=id2label,         # Use your original mapping
    label2id=label2id          # Use your original mapping
)

# class CustomTrainer(Trainer):
#     def compute_loss(self, model, inputs, return_outputs=False):
#         labels = inputs.pop("labels")
#         outputs = model(**inputs)
#         logits = outputs.logits
#         loss_fct = torch.nn.CrossEntropyLoss(weight=weights.to(model.device))
#         loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
#         return (loss, outputs) if return_outputs else loss
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        
        # Ensure weights are on correct device
        current_weights = weights.to(model.device)
        
        loss_fct = torch.nn.CrossEntropyLoss(weight=current_weights)
        loss = loss_fct(logits.view(-1, model.config.num_labels), 
                      labels.view(-1))
        return (loss, outputs) if return_outputs else loss

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./cpu_training_results",  # Save directory
    # Batch sizes (CPU-optimized)
    per_device_train_batch_size=4,       # Small batches for CPU (4-8)
    per_device_eval_batch_size=8,        # Slightly larger for eval
    gradient_accumulation_steps=2,       # Accumulate gradients to simulate larger batches
    
    # Logging/Eval (Balanced for CPU)
    logging_strategy="steps",
    logging_steps=50,                    # Log every 50 steps (frequent but not excessive)
    eval_strategy="steps",               # Evaluate periodically
    eval_steps=200,                      # Evaluate every 200 steps
    save_strategy="epoch",               # Save checkpoints per epoch
    
    # Training Hyperparams
    learning_rate=2e-5,                  # Standard for fine-tuning
    num_train_epochs=5,                  # Train longer (adjust based on loss plateau)
    max_steps=-1,                        # Let epochs control training (not steps)
    
    # CPU-Specific
    fp16=False,                          # Disable mixed-precision (CPU-only)
    no_cuda=True,                        # Force CPU mode
    dataloader_pin_memory=False,         # Disable pin_memory (no GPU)
    dataloader_num_workers=2,            # Light multiprocessing (avoid CPU overload)
    
    # Misc
    report_to="none",                    # Disable external logging
    load_best_model_at_end=True,         # Keep best model
    metric_for_best_model="eval_loss",   # Track validation loss
)

In [36]:
# Check the first sample in train_dataset
print("Train sample keys:", train_dataset[0].keys())
print("Train 'labels' type:", type(train_dataset[0]["labels"]))  # Should be `int` or `numpy.int64`

# Check the first sample in val_dataset
print("Validation sample keys:", val_dataset[0].keys())
print("Validation 'labels' type:", type(val_dataset[0]["labels"]))  # Should be `int` or `numpy.int64`

Train sample keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
Train 'labels' type: <class 'int'>
Validation sample keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
Validation 'labels' type: <class 'int'>


In [41]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Use tokenized training data
    eval_dataset=val_dataset,     # Use tokenized validation data
)

trainer.train()  # Start training!

Step,Training Loss,Validation Loss
1,2.0466,2.291556
2,2.2513,2.290128
3,2.2668,2.2886


TrainOutput(global_step=3, training_loss=2.188241958618164, metrics={'train_runtime': 237.1028, 'train_samples_per_second': 0.101, 'train_steps_per_second': 0.013, 'total_flos': 3179671142400.0, 'train_loss': 2.188241958618164, 'epoch': 0.11538461538461539})

In [43]:
# # After training (BEFORE kernel restart)
# model.save_pretrained("./my_finetuned_model")
# tokenizer.save_pretrained("./my_finetuned_model")

# # Save label mappings
# import json
# with open("./my_finetuned_model/label_mappings.json", "w") as f:
#     json.dump({
#         "label2id": label2id,  # Your existing mapping dict
#         "id2label": id2label   # Your existing mapping dict
#     }, f)

# import os
from datetime import datetime

# Generate a unique folder name (e.g., "model_20240515_153022")
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_dir = f"models/model_{timestamp}"

# Save model, tokenizer, and mappings
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

with open(f"{model_dir}/label_mappings.json", "w") as f:
    json.dump({"label2id": label2id, "id2label": id2label}, f)

print(f"Model saved to: {model_dir}")

Model saved to: models/model_20250411_110144


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_path = "./my_finetuned_model"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

import numpy as np

# Example: Classify new text
text = "Medicaid budget 2024"
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
outputs = model(**inputs)
pred_label_id = np.argmax(outputs.logits.detach().numpy())
pred_label = id2label[pred_label_id]  # Use your `id2label` mapping
print(f"Predicted: {pred_label}")

Predicted: education|k12funding


In [None]:
# Get predictions
predictions = trainer.predict(val_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

# Classification report
from sklearn.metrics import classification_report
print(classification_report(predictions.label_ids, preds, target_names=list(label2id.keys())))

                                 precision    recall  f1-score   support

                 healthcare|FDA       0.14      0.17      0.15         6
            healthcare|medicaid       0.20      0.17      0.18         6
            healthcare|medicare       0.00      0.00      0.00         6
education|highereducationpolicy       0.00      0.00      0.00         4
           education|k12funding       0.12      0.83      0.22         6
         education|studentloans       0.00      0.00      0.00         5
           finance|tax_policies       0.00      0.00      0.00         3
                finance|budgets       0.00      0.00      0.00         5
          defense|cybersecurity       0.00      0.00      0.00         6
            defense|procurement       0.00      0.00      0.00         5

                       accuracy                           0.13        52
                      macro avg       0.05      0.12      0.06        52
                   weighted avg       0.05      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
print("Sample labels before conversion:")
print(train_df['label'].head())

print("\nLabel to ID mapping:")
print(label2id)

Sample labels before conversion:
267      defense|procurement
247      defense|procurement
245      defense|procurement
227    defense|cybersecurity
12            healthcare|FDA
Name: label, dtype: object

Label to ID mapping:
{'healthcare|FDA': 0, 'healthcare|medicaid': 1, 'healthcare|medicare': 2, 'education|highereducationpolicy': 3, 'education|k12funding': 4, 'education|studentloans': 5, 'finance|tax_policies': 6, 'finance|budgets': 7, 'defense|cybersecurity': 8, 'defense|procurement': 9}
