In [16]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false" 

from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "distilbert-base-uncased"  # or "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [17]:
import pandas as pd

df = pd.read_csv("gov_docs.csv")
df["label"] = df["category"] + "|" + df["subcategory"]  # e.g., "healthcare|medicaid"

# Map labels to IDs
labels = df["label"].unique().tolist()
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for i, l in enumerate(labels)}

In [18]:
print(df.head())

                                                text    category subcategory  \
0  FDADrug Safety Communication FDA cautions agai...  healthcare         FDA   
1  Field Alert Report Submission Questions and An...  healthcare         FDA   
2  Acceptability of Draft Labeling to Support AND...  healthcare         FDA   
3  Contains Nonbinding Recommendations The Least ...  healthcare         FDA   
4  Center for Devices and Radiological Health Int...  healthcare         FDA   

                                         source_file           label  
0     Updated.DSC-Hydroxychloroquine.chloroquine.txt  healthcare|FDA  
1                                   24740676_FAR.txt  healthcare|FDA  
2  Acceptability-of-Draft-Labeling-to-Support-Abb...  healthcare|FDA  
3                                           1332.txt  healthcare|FDA  
4  CDRH_International_Harmonization_Draft_Strateg...  healthcare|FDA  


In [30]:
# Clean text column (run this before tokenization)
df['text'] = df['text'].fillna('')  # Replace NaN
df['text'] = df['text'].astype(str)  # Force string type
df['text'] = df['text'].str.strip()  # Remove whitespace

# Remove empty texts if needed
df = df[df['text'] != '']

In [31]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
print(f"Train samples: {len(train_df)}, Validation samples: {len(val_df)}")

Train samples: 207, Validation samples: 52


In [21]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Test tokenization on first row
try:
    test_text = df.iloc[0]['text']
    print("\nTokenizing test text:", test_text[:50], "...")
    tokens = tokenizer(test_text, truncation=True)
    print("Success! Tokenized output keys:", tokens.keys())
except Exception as e:
    print(f"Failed: {type(e).__name__}: {e}")
    print("Problem text:", test_text)


Tokenizing test text: FDADrug Safety Communication FDA cautions against  ...
Success! Tokenized output keys: dict_keys(['input_ids', 'attention_mask'])


In [39]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")  # Automatically uses fast tokenizer

# Tokenize in batches (CPU-friendly)
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",  # Pad to max_length for static shapes (better CPU performance)
        max_length=512,        # DistilBERT's limit
        return_tensors="np"    # NumPy arrays for CPU (smaller memory footprint)
    )

# Apply to datasets
from datasets import Dataset
dataset = Dataset.from_pandas(df)
dataset = dataset.map(tokenize, batched=True, batch_size=100)  # Process 100 texts at once
train_dataset = Dataset.from_pandas(train_df).map(tokenize, batched=True, batch_size=100)
val_dataset = Dataset.from_pandas(val_df).map(tokenize, batched=True, batch_size=100)

Map:   0%|          | 0/259 [00:00<?, ? examples/s]

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

Map:   0%|          | 0/52 [00:00<?, ? examples/s]

In [None]:
# label_counts = df['label'].value_counts()  # Count occurrences of each label
# print("Label distribution:\n", label_counts)

Label distribution:
 label
healthcare|FDA                     30
healthcare|medicaid                30
healthcare|medicare                28
education|k12funding               28
defense|cybersecurity              28
defense|procurement                26
education|studentloans             24
finance|budgets                    24
education|highereducationpolicy    23
finance|tax_policies               18
Name: count, dtype: int64


In [34]:
from transformers import AutoModelForSequenceClassification, Trainer
import torch

# Get class weights (critical for imbalanced data)
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=df['label'].unique(),
    y=df['label']
)
weights = torch.tensor(class_weights, dtype=torch.float32)

# Model with weighted loss
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_counts),
    id2label={i: l for i, l in enumerate(label_counts.index)},
    label2id={l: i for i, l in enumerate(label_counts.index)}
)

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss(weight=weights.to(model.device))
        loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./cpu_results",
    per_device_train_batch_size=4,  # Reduce batch size for CPU (typical: 4-8)
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,   # Simulate larger batches by accumulating gradients
    eval_strategy="steps",
    eval_steps=500,                 # Check validation less frequently
    save_strategy="epoch",
    logging_steps=100,
    learning_rate=2e-5,
    num_train_epochs=3,             # Fewer epochs if CPU is slow
    fp16=False,                  # Disable mixed-precision (CPU doesn't support it)
    no_cuda=True,                   # Ensure no GPU is accidentally used
)

In [40]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Use tokenized training data
    eval_dataset=val_dataset,     # Use tokenized validation data
)

trainer.train()  # Start training!

ValueError: too many dimensions 'str'

In [41]:
print("Sample labels before conversion:")
print(train_df['label'].head())

print("\nLabel to ID mapping:")
print(label2id)

Sample labels before conversion:
267      defense|procurement
247      defense|procurement
245      defense|procurement
227    defense|cybersecurity
12            healthcare|FDA
Name: label, dtype: object

Label to ID mapping:
{'healthcare|FDA': 0, 'healthcare|medicaid': 1, 'healthcare|medicare': 2, 'education|highereducationpolicy': 3, 'education|k12funding': 4, 'education|studentloans': 5, 'finance|tax_policies': 6, 'finance|budgets': 7, 'defense|cybersecurity': 8, 'defense|procurement': 9}
