In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false" 

from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "distilbert-base-uncased"  # or "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [2]:
import pandas as pd

df = pd.read_csv("gov_docs.csv")
df["label"] = df["category"] + "|" + df["subcategory"]  # e.g., "healthcare|medicaid"

# Map labels to IDs
labels = df["label"].unique().tolist()
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for i, l in enumerate(labels)}

In [3]:
print(df.head())

                                                text    category subcategory  \
0  FDADrug Safety Communication FDA cautions agai...  healthcare         FDA   
1  Field Alert Report Submission Questions and An...  healthcare         FDA   
2  Acceptability of Draft Labeling to Support AND...  healthcare         FDA   
3  Contains Nonbinding Recommendations The Least ...  healthcare         FDA   
4  Center for Devices and Radiological Health Int...  healthcare         FDA   

                                         source_file           label  
0     Updated.DSC-Hydroxychloroquine.chloroquine.txt  healthcare|FDA  
1                                   24740676_FAR.txt  healthcare|FDA  
2  Acceptability-of-Draft-Labeling-to-Support-Abb...  healthcare|FDA  
3                                           1332.txt  healthcare|FDA  
4  CDRH_International_Harmonization_Draft_Strateg...  healthcare|FDA  


In [4]:
# Clean text column (run this before tokenization)
df['text'] = df['text'].fillna('')  # Replace NaN
df['text'] = df['text'].astype(str)  # Force string type
df['text'] = df['text'].str.strip()  # Remove whitespace

# Remove empty texts if needed
df = df[df['text'] != '']

In [5]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
print(f"Train samples: {len(train_df)}, Validation samples: {len(val_df)}")

Train samples: 207, Validation samples: 52


In [6]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Test tokenization on first row
try:
    test_text = df.iloc[0]['text']
    print("\nTokenizing test text:", test_text[:50], "...")
    tokens = tokenizer(test_text, truncation=True)
    print("Success! Tokenized output keys:", tokens.keys())
except Exception as e:
    print(f"Failed: {type(e).__name__}: {e}")
    print("Problem text:", test_text)


Tokenizing test text: FDADrug Safety Communication FDA cautions against  ...
Success! Tokenized output keys: dict_keys(['input_ids', 'attention_mask'])


In [7]:
import numpy as np
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")  # Automatically uses fast tokenizer

# Tokenize in batches (CPU-friendly)
def tokenize(batch):
    # Tokenize the text
    tokenized = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="np"  # Keep NumPy arrays for CPU efficiency
    )
    
    # Convert text labels to numerical IDs using your label2id mapping
    tokenized["labels"] = np.array([label2id[label] for label in batch["label"]])
    
    return tokenized
# def tokenize(batch):
#     return tokenizer(
#         batch["text"],
#         truncation=True,
#         padding="max_length",  # Pad to max_length for static shapes (better CPU performance)
#         max_length=512,        # DistilBERT's limit
#         return_tensors="np"    # NumPy arrays for CPU (smaller memory footprint)
#     )

# Apply to datasets
from datasets import Dataset
dataset = Dataset.from_pandas(df)
dataset = dataset.map(tokenize, batched=True, batch_size=100)  # Process 100 texts at once



train_dataset = Dataset.from_pandas(train_df).map(tokenize, batched=True, batch_size=100)
val_dataset = Dataset.from_pandas(val_df).map(tokenize, batched=True, batch_size=100)

columns_to_remove = ['text', 'category', 'subcategory', 'source_file', 'label', '__index_level_0__']
train_dataset = train_dataset.remove_columns(columns_to_remove)
val_dataset = val_dataset.remove_columns(columns_to_remove)

Map:   0%|          | 0/259 [00:00<?, ? examples/s]

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

Map:   0%|          | 0/52 [00:00<?, ? examples/s]

In [8]:
label_counts = df['label'].value_counts()  # Count occurrences of each label
print("Label distribution:\n", label_counts)

Label distribution:
 label
healthcare|FDA                     30
healthcare|medicaid                30
healthcare|medicare                28
education|k12funding               28
defense|cybersecurity              28
defense|procurement                26
education|studentloans             24
finance|budgets                    24
education|highereducationpolicy    23
finance|tax_policies               18
Name: count, dtype: int64


In [9]:
print("Sample training labels:", train_dataset[0]["labels"])  # Should be an integer
print("Label mapping:", label2id)  # Verify your mapping is correct

Sample training labels: 9
Label mapping: {'healthcare|FDA': 0, 'healthcare|medicaid': 1, 'healthcare|medicare': 2, 'education|highereducationpolicy': 3, 'education|k12funding': 4, 'education|studentloans': 5, 'finance|tax_policies': 6, 'finance|budgets': 7, 'defense|cybersecurity': 8, 'defense|procurement': 9}


In [10]:
# Verify all labels exist in your mapping
missing_labels = set(df['label']) - set(label2id.keys())
if missing_labels:
    raise ValueError(f"Labels missing from mapping: {missing_labels}")

# Check tokenized datasets
print("First training sample:", train_dataset[0].keys())
print("First validation sample:", val_dataset[0].keys())

First training sample: dict_keys(['input_ids', 'attention_mask', 'labels'])
First validation sample: dict_keys(['input_ids', 'attention_mask', 'labels'])


In [11]:
# After loading CSV
df = df.dropna(subset=['text', 'label'])  # Remove rows with missing text or label
df = df[df['text'].str.strip() != '']     # Remove empty texts

In [12]:
from transformers import AutoModelForSequenceClassification, Trainer
import torch

# Get class weights (critical for imbalanced data)
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=df['label'].unique(),
    y=df['label']
)
weights = torch.tensor(class_weights, dtype=torch.float32)

# Model with weighted loss
# model = AutoModelForSequenceClassification.from_pretrained(
#     "distilbert-base-uncased",
#     num_labels=len(label_counts),
#     id2label={i: l for i, l in enumerate(label_counts.index)},
#     label2id={l: i for i, l in enumerate(label_counts.index)}
# )

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label2id),  # Use label2id count
    id2label=id2label,         # Use your original mapping
    label2id=label2id          # Use your original mapping
)

# class CustomTrainer(Trainer):
#     def compute_loss(self, model, inputs, return_outputs=False):
#         labels = inputs.pop("labels")
#         outputs = model(**inputs)
#         logits = outputs.logits
#         loss_fct = torch.nn.CrossEntropyLoss(weight=weights.to(model.device))
#         loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
#         return (loss, outputs) if return_outputs else loss
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        
        # Ensure weights are on correct device
        current_weights = weights.to(model.device)
        
        loss_fct = torch.nn.CrossEntropyLoss(weight=current_weights)
        loss = loss_fct(logits.view(-1, model.config.num_labels), 
                      labels.view(-1))
        return (loss, outputs) if return_outputs else loss

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
training_args = TrainingArguments(
    output_dir="./cpu_training_results",
    # Batch sizes
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    
    # Logging/Eval/Save (Aligned)
    logging_strategy="steps",
    logging_steps=50,            # Log metrics every 50 steps
    eval_strategy="steps",
    eval_steps=200,              # Evaluate every 200 steps
    save_strategy="steps",
    save_steps=200,              # Must equal eval_steps or be a multiple
    
    # Training
    learning_rate=2e-5,
    num_train_epochs=5,
    max_steps=-1,
    
    # CPU-specific
    fp16=False,
    no_cuda=True,
    dataloader_pin_memory=False,
    dataloader_num_workers=2,
    
    # Model selection
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    
    # Misc
    report_to="none"
)

In [16]:
# Check the first sample in train_dataset
print("Train sample keys:", train_dataset[0].keys())
print("Train 'labels' type:", type(train_dataset[0]["labels"]))  # Should be `int` or `numpy.int64`

# Check the first sample in val_dataset
print("Validation sample keys:", val_dataset[0].keys())
print("Validation 'labels' type:", type(val_dataset[0]["labels"]))  # Should be `int` or `numpy.int64`

Train sample keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
Train 'labels' type: <class 'int'>
Validation sample keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
Validation 'labels' type: <class 'int'>


In [17]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Use tokenized training data
    eval_dataset=val_dataset,     # Use tokenized validation data
)

trainer.train()  # Start training!



Step,Training Loss,Validation Loss




TrainOutput(global_step=130, training_loss=1.7449928577129656, metrics={'train_runtime': 3179.7836, 'train_samples_per_second': 0.325, 'train_steps_per_second': 0.041, 'total_flos': 137123318016000.0, 'train_loss': 1.7449928577129656, 'epoch': 5.0})

In [19]:
# # After training (BEFORE kernel restart)
# model.save_pretrained("./my_finetuned_model")
# tokenizer.save_pretrained("./my_finetuned_model")

# # Save label mappings
# import json
# with open("./my_finetuned_model/label_mappings.json", "w") as f:
#     json.dump({
#         "label2id": label2id,  # Your existing mapping dict
#         "id2label": id2label   # Your existing mapping dict
#     }, f)

# import os
import json
from datetime import datetime

# Generate a unique folder name (e.g., "model_20240515_153022")
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_dir = f"models/model_{timestamp}"

# Save model, tokenizer, and mappings
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

with open(f"{model_dir}/label_mappings.json", "w") as f:
    json.dump({"label2id": label2id, "id2label": id2label}, f)

print(f"Model saved to: {model_dir}")

Model saved to: models/model_20250411_122546


In [None]:
# from transformers import AutoModelForSequenceClassification, AutoTokenizer

# model_path = "/Users/gwin/Documents/Post Undergrad Work/Tax Search/models/model_20250411_122546"
# model = AutoModelForSequenceClassification.from_pretrained(model_path)
# tokenizer = AutoTokenizer.from_pretrained(model_path)

# import numpy as np

# # Example: Classify new text
# text = "Medicaid budget 2024"
# inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
# outputs = model(**inputs)
# pred_label_id = np.argmax(outputs.logits.detach().numpy())
# pred_label = id2label[pred_label_id]  # Use your `id2label` mapping
# print(f"Predicted: {pred_label}")

Predicted: healthcare|medicaid


In [21]:
# Get predictions
predictions = trainer.predict(val_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

# Classification report
from sklearn.metrics import classification_report
print(classification_report(predictions.label_ids, preds, target_names=list(label2id.keys())))



                                 precision    recall  f1-score   support

                 healthcare|FDA       0.86      1.00      0.92         6
            healthcare|medicaid       1.00      0.83      0.91         6
            healthcare|medicare       0.83      0.83      0.83         6
education|highereducationpolicy       1.00      0.50      0.67         4
           education|k12funding       1.00      1.00      1.00         6
         education|studentloans       1.00      1.00      1.00         5
           finance|tax_policies       1.00      1.00      1.00         3
                finance|budgets       1.00      0.20      0.33         5
          defense|cybersecurity       0.71      0.83      0.77         6
            defense|procurement       0.50      1.00      0.67         5

                       accuracy                           0.83        52
                      macro avg       0.89      0.82      0.81        52
                   weighted avg       0.88      0

In [28]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
import json

# Load model and tokenizer
model_path = "/Users/gwin/Documents/Post Undergrad Work/Tax Search/models/model_20250411_122546"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load label mappings - FIXED VERSION
with open(f"{model_path}/label_mappings.json", "r") as f:
    mappings = json.load(f)  # Load ONCE
    label2id = mappings["label2id"]
    id2label = {int(k): v for k, v in mappings["id2label"].items()}  # Convert keys to int

print("Successfully loaded:")
print(f"label2id: {list(label2id.items())[:3]}...")  # Print first 3 as sample
print(f"id2label: {list(id2label.items())[:3]}...")

Successfully loaded:
label2id: [('healthcare|FDA', 0), ('healthcare|medicaid', 1), ('healthcare|medicare', 2)]...
id2label: [(0, 'healthcare|FDA'), (1, 'healthcare|medicaid'), (2, 'healthcare|medicare')]...


In [None]:
from sklearn.metrics import classification_report

# Tokenize validation data
val_encodings = tokenizer(
    val_df["text"].tolist(),
    truncation=True,
    padding=True,
    return_tensors="pt"
)

# Get predictions
with torch.no_grad():
    outputs = model(**val_encodings)
    preds = np.argmax(outputs.logits.numpy(), axis=-1)

# Classification report
print(classification_report(
    val_df["label"].map(label2id).values,  # True labels (numeric)
    preds,
    target_names=list(label2id.keys())
))

In [None]:
# import os

# model_path = "/Users/gwin/Documents/Post Undergrad Work/Tax Search/models/model_20250411_122546"  # Replace with your actual path
# mapping_file = f"{model_path}/label_mappings.json"

# # Check if file exists and has content
# if not os.path.exists(mapping_file):
#     raise FileNotFoundError(f"Label mapping file not found at {mapping_file}")

# if os.path.getsize(mapping_file) == 0:
#     raise ValueError("Label mapping file is empty")

# import json

# try:
#     with open(mapping_file, "r") as f:
#         mappings = json.load(f)
#         label2id = mappings["label2id"]
#         id2label = {int(k): v for k, v in mappings["id2label"].items()}  # Ensure keys are int
# except json.JSONDecodeError as e:
#     print(f"Invalid JSON in {mapping_file}: {e}")
#     print("File content:", open(mapping_file).read())
#     raise