In [11]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import json
from pathlib import Path
import pandas as pd

# 1. Load Model and Tokenizer
# The same directory is used for the model weights, the tokenizer files and
# the label_mappings.json read below.
model_path = "models/model_20250411_122546"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# 2. Load Label Mappings
# Read explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding (matters as soon as a label contains non-ASCII text).
with open(f"{model_path}/label_mappings.json", "r", encoding="utf-8") as f:
    mappings = json.load(f)

# JSON object keys are always strings; convert them back to int so the
# mapping can be indexed by the integer class position used at prediction time.
id2label = {int(k): v for k, v in mappings["id2label"].items()}

# 3. Enhanced Prediction Function (Returns Full Distribution)
def predict_with_distribution(text, top_k=3):
    """Classify ``text`` and return its highest-scoring labels.

    The text is encoded with the module-level ``tokenizer`` (truncation and
    padding enabled), passed through the module-level ``model`` with gradient
    tracking disabled, and the logits are soft-maxed along dim 1. Only the
    first row of the resulting probabilities is used.

    Args:
        text: Input handed straight to the tokenizer.
        top_k: Upper slice bound applied to the ranked list (default 3).

    Returns:
        A list of ``{"label": ..., "score": float}`` dicts ordered by
        descending score, where each label is looked up in ``id2label``.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    with torch.no_grad():
        logits = model(**encoded).logits
        first_row = torch.softmax(logits, dim=1).numpy()[0]

    # Pair every class index with its probability, then rank best-first.
    # float() turns the numpy scalar into a plain Python float.
    ranked = sorted(
        (
            {"label": id2label[idx], "score": float(p)}
            for idx, p in enumerate(first_row)
        ),
        key=lambda entry: entry["score"],
        reverse=True,
    )

    return ranked[:top_k]

# 4. Process Files with Distribution
# NOTE(review): absolute, machine-specific path — consider making this
# configurable before sharing the notebook.
test_dir = Path("/Users/gwin/Documents/Post Undergrad Work/Tax Search/test_data/test_clean")

# Fail early with a clear message instead of silently producing an empty
# result set when the directory is missing on this machine.
if not test_dir.is_dir():
    raise FileNotFoundError(f"Test data directory not found: {test_dir}")

# One record per non-empty .txt file; turned into a DataFrame later.
results = []

# glob() yields paths in filesystem-dependent order; sorting makes the row
# order of the saved CSV/JSON reproducible across runs and machines.
for txt_file in sorted(test_dir.glob("*.txt")):
    text = txt_file.read_text(encoding="utf-8").strip()
    if not text:
        # Nothing to classify in an empty / whitespace-only file.
        continue

    # Ranked label/score dicts, best first (default top_k of the helper).
    predictions = predict_with_distribution(text)

    # Format for readable output, e.g. "label_a: 28.9% | label_b: 20.1%"
    dist_str = " | ".join([f"{p['label']}: {p['score']:.1%}" for p in predictions])
    top_pred = predictions[0]

    results.append({
        "file": txt_file.name,
        "top_label": top_pred["label"],
        "top_score": top_pred["score"],
        "distribution": dist_str,
        # Unformatted label/score dicts exactly as returned by the helper
        # (i.e. limited to its top_k, not every class).
        "full_distribution": predictions
    })



In [12]:
# 5. Save Results
# Passing the column names explicitly keeps the frame well-formed even when
# `results` is empty (no .txt files, or all of them blank); without it the
# column selection below raises KeyError on an empty frame.
RESULT_COLUMNS = ["file", "top_label", "top_score", "distribution", "full_distribution"]
df = pd.DataFrame(results, columns=RESULT_COLUMNS)

print(df[["file", "top_label", "top_score", "distribution"]])  # Preview

# CSV for quick inspection; the list-valued "full_distribution" column is
# written using its Python string representation.
df.to_csv("predictions_with_distribution.csv", index=False)

# JSON keeps the nested label/score records as structured data.
df.to_json("predictions_full.json", orient="records", indent=2)  # Full data

                                 file            top_label  top_score  \
0  2022_DPC_Year_in_Review_Report.txt  defense|procurement    0.28893   

                                        distribution  
0  defense|procurement: 28.9% | defense|cybersecu...  
