In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false" 

from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "distilbert-base-uncased"  # or "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
import pandas as pd

df = pd.read_csv("gov_docs.csv")
df["label"] = df["category"] + "|" + df["subcategory"]  # e.g., "healthcare|medicaid"

# Map labels to IDs
labels = df["label"].unique().tolist()
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for i, l in enumerate(labels)}

In [4]:
print(df.head())

                                                text    category subcategory  \
0  FDADrug Safety Communication FDA cautions agai...  healthcare         FDA   
1  Field Alert Report Submission Questions and An...  healthcare         FDA   
2  Acceptability of Draft Labeling to Support AND...  healthcare         FDA   
3  Contains Nonbinding Recommendations The Least ...  healthcare         FDA   
4  Center for Devices and Radiological Health Int...  healthcare         FDA   

                                         source_file           label  
0     Updated.DSC-Hydroxychloroquine.chloroquine.txt  healthcare|FDA  
1                                   24740676_FAR.txt  healthcare|FDA  
2  Acceptability-of-Draft-Labeling-to-Support-Abb...  healthcare|FDA  
3                                           1332.txt  healthcare|FDA  
4  CDRH_International_Harmonization_Draft_Strateg...  healthcare|FDA  


In [5]:
# Clean text column (run this before tokenization)
df['text'] = df['text'].fillna('')  # Replace NaN
df['text'] = df['text'].astype(str)  # Force string type
df['text'] = df['text'].str.strip()  # Remove whitespace

# Remove empty texts if needed
df = df[df['text'] != '']

In [6]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
print(f"Train samples: {len(train_df)}, Validation samples: {len(val_df)}")

Train samples: 207, Validation samples: 52


In [7]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Test tokenization on first row
try:
    test_text = df.iloc[0]['text']
    print("\nTokenizing test text:", test_text[:50], "...")
    tokens = tokenizer(test_text, truncation=True)
    print("Success! Tokenized output keys:", tokens.keys())
except Exception as e:
    print(f"Failed: {type(e).__name__}: {e}")
    print("Problem text:", test_text)


Tokenizing test text: FDADrug Safety Communication FDA cautions against  ...
Success! Tokenized output keys: dict_keys(['input_ids', 'attention_mask'])


In [8]:
import numpy as np
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")  # Automatically uses fast tokenizer

# Tokenize in batches (CPU-friendly)
def tokenize(batch):
    # Tokenize the text
    tokenized = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="np"  # Keep NumPy arrays for CPU efficiency
    )
    
    # Convert text labels to numerical IDs using your label2id mapping
    tokenized["labels"] = np.array([label2id[label] for label in batch["label"]])
    
    return tokenized
# def tokenize(batch):
#     return tokenizer(
#         batch["text"],
#         truncation=True,
#         padding="max_length",  # Pad to max_length for static shapes (better CPU performance)
#         max_length=512,        # DistilBERT's limit
#         return_tensors="np"    # NumPy arrays for CPU (smaller memory footprint)
#     )

# Apply to datasets
from datasets import Dataset
dataset = Dataset.from_pandas(df)
dataset = dataset.map(tokenize, batched=True, batch_size=100)  # Process 100 texts at once



train_dataset = Dataset.from_pandas(train_df).map(tokenize, batched=True, batch_size=100)
val_dataset = Dataset.from_pandas(val_df).map(tokenize, batched=True, batch_size=100)

columns_to_remove = ['text', 'category', 'subcategory', 'source_file', 'label', '__index_level_0__']
train_dataset = train_dataset.remove_columns(columns_to_remove)
val_dataset = val_dataset.remove_columns(columns_to_remove)

Map:   0%|          | 0/259 [00:00<?, ? examples/s]

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

Map:   0%|          | 0/52 [00:00<?, ? examples/s]

In [9]:
label_counts = df['label'].value_counts()  # Count occurrences of each label
print("Label distribution:\n", label_counts)

Label distribution:
 label
healthcare|FDA                     30
healthcare|medicaid                30
healthcare|medicare                28
education|k12funding               28
defense|cybersecurity              28
defense|procurement                26
education|studentloans             24
finance|budgets                    24
education|highereducationpolicy    23
finance|tax_policies               18
Name: count, dtype: int64


In [10]:
print("Sample training labels:", train_dataset[0]["labels"])  # Should be an integer
print("Label mapping:", label2id)  # Verify your mapping is correct

Sample training labels: 9
Label mapping: {'healthcare|FDA': 0, 'healthcare|medicaid': 1, 'healthcare|medicare': 2, 'education|highereducationpolicy': 3, 'education|k12funding': 4, 'education|studentloans': 5, 'finance|tax_policies': 6, 'finance|budgets': 7, 'defense|cybersecurity': 8, 'defense|procurement': 9}


In [11]:
# Verify all labels exist in your mapping
missing_labels = set(df['label']) - set(label2id.keys())
if missing_labels:
    raise ValueError(f"Labels missing from mapping: {missing_labels}")

# Check tokenized datasets
print("First training sample:", train_dataset[0].keys())
print("First validation sample:", val_dataset[0].keys())

First training sample: dict_keys(['input_ids', 'attention_mask', 'labels'])
First validation sample: dict_keys(['input_ids', 'attention_mask', 'labels'])


In [12]:
# After loading CSV
df = df.dropna(subset=['text', 'label'])  # Remove rows with missing text or label
df = df[df['text'].str.strip() != '']     # Remove empty texts

In [13]:
from transformers import AutoModelForSequenceClassification, Trainer
import torch

# Get class weights (critical for imbalanced data)
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=df['label'].unique(),
    y=df['label']
)
weights = torch.tensor(class_weights, dtype=torch.float32)

# Model with weighted loss
# model = AutoModelForSequenceClassification.from_pretrained(
#     "distilbert-base-uncased",
#     num_labels=len(label_counts),
#     id2label={i: l for i, l in enumerate(label_counts.index)},
#     label2id={l: i for i, l in enumerate(label_counts.index)}
# )

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label2id),  # Use label2id count
    id2label=id2label,         # Use your original mapping
    label2id=label2id          # Use your original mapping
)

# class CustomTrainer(Trainer):
#     def compute_loss(self, model, inputs, return_outputs=False):
#         labels = inputs.pop("labels")
#         outputs = model(**inputs)
#         logits = outputs.logits
#         loss_fct = torch.nn.CrossEntropyLoss(weight=weights.to(model.device))
#         loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
#         return (loss, outputs) if return_outputs else loss
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        
        # Ensure weights are on correct device
        current_weights = weights.to(model.device)
        
        loss_fct = torch.nn.CrossEntropyLoss(weight=current_weights)
        loss = loss_fct(logits.view(-1, model.config.num_labels), 
                      labels.view(-1))
        return (loss, outputs) if return_outputs else loss

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./cpu_results",
    per_device_train_batch_size=4,  # Reduce batch size for CPU (typical: 4-8)
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,   # Simulate larger batches by accumulating gradients
    eval_strategy="steps",
    eval_steps=500,                 # Check validation less frequently
    save_strategy="epoch",
    logging_steps=100,
    learning_rate=2e-5,
    num_train_epochs=3,             # Fewer epochs if CPU is slow
    fp16=False,                  # Disable mixed-precision (CPU doesn't support it)
    no_cuda=True,                   # Ensure no GPU is accidentally used
)



In [15]:
# Check the first sample in train_dataset
print("Train sample keys:", train_dataset[0].keys())
print("Train 'labels' type:", type(train_dataset[0]["labels"]))  # Should be `int` or `numpy.int64`

# Check the first sample in val_dataset
print("Validation sample keys:", val_dataset[0].keys())
print("Validation 'labels' type:", type(val_dataset[0]["labels"]))  # Should be `int` or `numpy.int64`

Train sample keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
Train 'labels' type: <class 'int'>
Validation sample keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
Validation 'labels' type: <class 'int'>


In [16]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Use tokenized training data
    eval_dataset=val_dataset,     # Use tokenized validation data
)

trainer.train()  # Start training!

Step,Training Loss,Validation Loss


TrainOutput(global_step=10, training_loss=2.2968854904174805, metrics={'train_runtime': 228.6527, 'train_samples_per_second': 0.35, 'train_steps_per_second': 0.044, 'total_flos': 10598903808000.0, 'train_loss': 2.2968854904174805, 'epoch': 0.38461538461538464})

In [17]:
# After training (BEFORE kernel restart)
model.save_pretrained("./my_finetuned_model")
tokenizer.save_pretrained("./my_finetuned_model")

# Save label mappings
import json
with open("./my_finetuned_model/label_mappings.json", "w") as f:
    json.dump({
        "label2id": label2id,  # Your existing mapping dict
        "id2label": id2label   # Your existing mapping dict
    }, f)

In [18]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_path = "./my_finetuned_model"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

import numpy as np

# Example: Classify new text
text = "Medicaid budget 2024"
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
outputs = model(**inputs)
pred_label_id = np.argmax(outputs.logits.detach().numpy())
pred_label = id2label[pred_label_id]  # Use your `id2label` mapping
print(f"Predicted: {pred_label}")

Predicted: education|k12funding


In [19]:
# Get predictions
predictions = trainer.predict(val_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

# Classification report
from sklearn.metrics import classification_report
print(classification_report(predictions.label_ids, preds, target_names=list(label2id.keys())))

                                 precision    recall  f1-score   support

                 healthcare|FDA       0.14      0.17      0.15         6
            healthcare|medicaid       0.20      0.17      0.18         6
            healthcare|medicare       0.00      0.00      0.00         6
education|highereducationpolicy       0.00      0.00      0.00         4
           education|k12funding       0.12      0.83      0.22         6
         education|studentloans       0.00      0.00      0.00         5
           finance|tax_policies       0.00      0.00      0.00         3
                finance|budgets       0.00      0.00      0.00         5
          defense|cybersecurity       0.00      0.00      0.00         6
            defense|procurement       0.00      0.00      0.00         5

                       accuracy                           0.13        52
                      macro avg       0.05      0.12      0.06        52
                   weighted avg       0.05      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [20]:
print("Sample labels before conversion:")
print(train_df['label'].head())

print("\nLabel to ID mapping:")
print(label2id)

Sample labels before conversion:
267      defense|procurement
247      defense|procurement
245      defense|procurement
227    defense|cybersecurity
12            healthcare|FDA
Name: label, dtype: object

Label to ID mapping:
{'healthcare|FDA': 0, 'healthcare|medicaid': 1, 'healthcare|medicare': 2, 'education|highereducationpolicy': 3, 'education|k12funding': 4, 'education|studentloans': 5, 'finance|tax_policies': 6, 'finance|budgets': 7, 'defense|cybersecurity': 8, 'defense|procurement': 9}


In [21]:
print(train_dataset[0])  # Should ONLY show: input_ids, attention_mask, labels

{'input_ids': [101, 3189, 2053, 1012, 26489, 8004, 1514, 2355, 1514, 5511, 2629, 1057, 1012, 1055, 1012, 2533, 1997, 3639, 1045, 1050, 1055, 1052, 1041, 1039, 2000, 1054, 1043, 1041, 1050, 1041, 1054, 1037, 1048, 2258, 2756, 1010, 2355, 1996, 2250, 2486, 6194, 2005, 10439, 22046, 2250, 2486, 2166, 5402, 2968, 2415, 2309, 1514, 2400, 25617, 1514, 6959, 25617, 1514, 11712, 8311, 2342, 7620, 11109, 8122, 17842, 8012, 18447, 13910, 15780, 8122, 17842, 8012, 25481, 2256, 3260, 2003, 2000, 3073, 2981, 1010, 7882, 1010, 1998, 23259, 15709, 1997, 1996, 2533, 1997, 3639, 2008, 6753, 1996, 2162, 20027, 1025, 14067, 17842, 1010, 11109, 1010, 1998, 8122, 1025, 25453, 1996, 3187, 1997, 3639, 1998, 3519, 1025, 1998, 15670, 1996, 2270, 1012, 4432, 2256, 4432, 2003, 2000, 2022, 1037, 2944, 15709, 3029, 1999, 1996, 2976, 2231, 2011, 2877, 2689, 1010, 4092, 3606, 1010, 1998, 7694, 8012, 1517, 1037, 7578, 3029, 1010, 2551, 2362, 2004, 2028, 2658, 2136, 1010, 3858, 2004, 4177, 1999, 2256, 2492, 1012, 9861