In [None]:
# import torch
# from transformers import AutoModelForSequenceClassification, AutoTokenizer
# import json
# from pathlib import Path

# class TaxonomyPredictor:
#     def __init__(self, model_path="models/model_20250411_122546"):
#         """Initialize model, tokenizer, and label mappings."""
#         self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
#         self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        
#         # Load label mappings
#         with open(f"{model_path}/label_mappings.json", "r") as f:
#             mappings = json.load(f)
#             self.id2label = {int(k): v for k, v in mappings["id2label"].items()}
#             self.label_hierarchy = mappings.get("hierarchy", {})  # Optional: If you have parent-child relationships

#     def predict_taxonomy(self, text, top_k=3):
#         """Predict L1/L2 categories with probabilities.
#         Returns:
#             dict: {
#                 'l1_category': str,
#                 'l2_category': str,
#                 'l1_prob': float,
#                 'l2_prob': float,
#                 'full_distribution': list  # All predictions (sorted)
#             }
#         """
#         inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        
#         with torch.no_grad():
#             outputs = self.model(**inputs)
#             probs = torch.softmax(outputs.logits, dim=1).numpy()[0]
        
#         # Get all predictions
#         predictions = [
#             {"label": self.id2label[label_id], "score": float(prob)}
#             for label_id, prob in enumerate(probs)
#         ]
#         predictions.sort(key=lambda x: x["score"], reverse=True)
        
#         # Extract L1/L2 (assuming labels follow "L1 > L2" format)
#         top_pred = predictions[0]['label']
#         if ' > ' in top_pred:
#             l1, l2 = top_pred.split(' > ', 1)
#         else:
#             l1, l2 = top_pred, top_pred  # Fallback if no hierarchy
        
#         return {
#             'l1_category': l1,
#             'l2_category': l2,
#             'l1_prob': predictions[0]['score'],
#             'l2_prob': predictions[0]['score'],  # Or customize logic for L2
#             'full_distribution': predictions[:top_k]
#         }

# # --- Usage Example ---
# if __name__ == "__main__":
#     # Initialize predictor
#     predictor = TaxonomyPredictor()
    
#     # Test prediction
#     text = "someone needs to look after my grandma"
#     result = predictor.predict_taxonomy(text)
#     print(f"L1: {result['l1_category']} (P={result['l1_prob']:.2f})")
#     print(f"L2: {result['l2_category']} (P={result['l2_prob']:.2f})")

L1: education|k12funding (P=0.12)
L2: education|k12funding (P=0.12)


In [54]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import json
from pathlib import Path

class TaxonomyPredictor:
    def __init__(self, model_path="models/model_20250411_122546"):
        """Initialize model, tokenizer, and label mappings."""
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        
        # Load label mappings
        with open(f"{model_path}/label_mappings.json", "r") as f:
            mappings = json.load(f)
            self.id2label = {int(k): v for k, v in mappings["id2label"].items()}
            self.label_hierarchy = mappings.get("hierarchy", {})  # Optional: If you have parent-child relationships

    def predict_taxonomy(self, text, top_k=3):
        """Predict L1/L2 categories with probabilities.
        Returns:
            dict: {
                'l1_category': str,       # Main category (e.g., "healthcare")
                'l2_category': str,       # Subcategory (e.g., "medicare")
                'combined_category': str, # Combined format "healthcare|medicare"
                'l1_prob': float,
                'l2_prob': float,
                'full_distribution': list  # All predictions (sorted)
            }
        """
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        
        with torch.no_grad():
            outputs = self.model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1).numpy()[0]
        
        # Get all predictions
        predictions = [
            {"label": self.id2label[label_id], "score": float(prob)}
            for label_id, prob in enumerate(probs)
        ]
        predictions.sort(key=lambda x: x["score"], reverse=True)
        
        # Extract L1/L2 from the predicted label
        top_pred = predictions[0]['label']
        if ' > ' in top_pred:
            l1, l2 = top_pred.split(' > ', 1)
            combined = f"{l1.lower()}|{l2.lower()}"  # Format as "healthcare|medicare"
        else:
            l1 = l2 = top_pred
            combined = f"{l1.lower()}|{l2.lower()}"  # Fallback format
        
        return {
            'l1_category': l1.lower(),
            'l2_category': l2.lower(),
            'combined_category': combined,  # New field with pipeline format
            'l1_prob': predictions[0]['score'],
            'l2_prob': predictions[0]['score'],
            'full_distribution': predictions[:top_k]
        }

# --- Usage Example ---
if __name__ == "__main__":
    # Initialize predictor
    predictor = TaxonomyPredictor()
    
    # Test prediction
    texts = [
        "someone needs to look after my grandma",
        "medicare coverage for seniors",
        "FDA drug approval process"
    ]
    
    for text in texts:
        result = predictor.predict_taxonomy(text)
        print(f"\nText: '{text}'")
        print(f"→ Combined: {result['combined_category']}")
        print(f"→ L1: {result['l1_category']} (P={result['l1_prob']:.2f})")
        print(f"→ L2: {result['l2_category']} (P={result['l2_prob']:.2f})")


Text: 'someone needs to look after my grandma'
→ Combined: education|k12funding|education|k12funding
→ L1: education|k12funding (P=0.12)
→ L2: education|k12funding (P=0.12)

Text: 'medicare coverage for seniors'
→ Combined: healthcare|medicare|healthcare|medicare
→ L1: healthcare|medicare (P=0.28)
→ L2: healthcare|medicare (P=0.28)

Text: 'FDA drug approval process'
→ Combined: healthcare|fda|healthcare|fda
→ L1: healthcare|fda (P=0.35)
→ L2: healthcare|fda (P=0.35)


In [55]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

# Load your cleaned data (CSV with 'text' column)
df = pd.read_csv("/Users/gwin/Documents/Post Undergrad Work/Tax Search/test_data/junk/gov_docs.csv")  

# Use tiny-but-mighty model (CPU-friendly)
model = SentenceTransformer("all-MiniLM-L6-v2")  # 384-dim vectors


In [56]:
# Drop rows with missing text
df = df.dropna(subset=['text']) 

# Or fill empty values with empty string
df['text'] = df['text'].fillna('')
df["vector"] = df["text"].apply(lambda x: model.encode(x).tolist())

In [None]:
# from elasticsearch import Elasticsearch

# # Start local Elasticsearch (Docker)
# # docker run -p 9200:9200 -e "discovery.type=single-node" elasticsearch:8.12.0

# es = Elasticsearch("http://localhost:9200")



# es.indices.create(index="docs",mappings={
#         "properties": {
#             "text": {"type": "text"},
#             "vector": {"type": "dense_vector", "dims": 384, "similarity": "cosine"},
#             "l1_category": {"type": "keyword"},  
#             "l2_category": {"type": "keyword"},
#             "l1_prob": {"type": "float"},
#             "l2_prob": {"type": "float"}
#         }
#     },
#     ignore=400  # Ignore "index already exists" errors
# )
# predictor = TaxonomyPredictor() 
# # Index documents with vectors + taxonomy
# for _, row in df.iterrows():
#     taxonomy = predictor.predict_taxonomy(row["text"])
    
#     es.index(
#         index="docs",
#         document={
#             "text": row["text"],
#             "vector": row["vector"],
#             "l1_category": taxonomy["l1_category"],
#             "l2_category": taxonomy["l2_category"],
#             "l1_prob": taxonomy["l1_prob"],
#             "l2_prob": taxonomy["l2_prob"],
#             # Optional: Store full distribution for debugging
#             "metadata": {
#                 "taxonomy_distribution": taxonomy["full_distribution"]
#             }
#         }
#     )

  es.indices.create(index="docs",mappings={


In [82]:
import pandas as pd

# Create a tiny test dataframe with 4 sample documents
test_df = pd.DataFrame([
    {"text": "Medicare benefits for elderly patients", "id": "doc1"},
    {"text": "FDA approves new diabetes medication", "id": "doc2"},
    {"text": "Budget proposal for defense spending", "id": "doc3"},
    {"text": "Student loan forgiveness program updates", "id": "doc4"}
])

# Generate vectors (if needed)
test_df["vector"] = test_df["text"].apply(lambda x: model.encode(x).tolist())
df = test_df

In [88]:
from elasticsearch import Elasticsearch

# Initialize Elasticsearch client
es = Elasticsearch("http://localhost:9200")

# Delete old index if exists
es.indices.delete(index="docs", ignore_unavailable=True)

# Create index with triple-category storage
es.indices.create(
    index="docs",
    mappings={
        "properties": {
            "text": {"type": "text"},
            "vector": {
                "type": "dense_vector",
                "dims": 384,
                "similarity": "cosine"
            },
            # Triple storage for maximum flexibility
            "category": {"type": "keyword"},      # "healthcare|medicare" (original format)
            "l1": {"type": "keyword"},           # "healthcare" (extracted)
            "l2": {"type": "keyword"},           # "medicare" (extracted)
            # Confidence metrics
            "confidence": {"type": "float"},
            "l1_confidence": {"type": "float"},
            "l2_confidence": {"type": "float"},
            # Full distribution for debugging
            "metadata": {"type": "object", "enabled": False}
        }
    }
)

# Initialize predictor
predictor = TaxonomyPredictor()

def index_documents(df):
    for _, row in df.iterrows():
        taxonomy = predictor.predict_taxonomy(row["text"])
        
        # Auto-split combined category
        l1, l2 = taxonomy["combined_category"].split("|", 1)
        
        es.index(
            index="docs",
            document={
                "text": row["text"],
                "vector": row["vector"],
                # Triple category storage
                "category": taxonomy["combined_category"],
                "l1": l1,
                "l2": l2,
                # Confidence metrics
                "confidence": taxonomy["l1_prob"],
                "l1_confidence": taxonomy["l1_prob"],
                "l2_confidence": taxonomy["l2_prob"],
                # Debug info
                "metadata": {
                    "full_distribution": taxonomy["full_distribution"],
                    "original_prediction": taxonomy
                }
            }
        )
    print(f"Indexed {len(df)} documents")

# Usage
index_documents(df)

print("Indexing complete!")

Indexed 4 documents
Indexing complete!


In [87]:
# Create a dedicated test index
test_index = "test_docs"
es.indices.delete(index=test_index, ignore_unavailable=True)

es.indices.create(
    index=test_index,
    mappings={
        "properties": {
            "text": {"type": "text"},
            "vector": {"type": "dense_vector", "dims": 384},
            "category": {"type": "keyword"},
            "l1": {"type": "keyword"},
            "l2": {"type": "keyword"},
            "confidence": {"type": "float"}
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'test_docs'})

In [89]:
# Check all test documents
test_docs = es.search(index=test_index, query={"match_all": {}}, size=10)
print("Indexed Test Documents:")
for hit in test_docs["hits"]["hits"]:
    doc = hit["_source"]
    print(f"\nID: {hit['_id']}")
    print(f"Text: {doc['text']}")
    print(f"Category: {doc['category']}")
    print(f"L1: {doc['l1']} | L2: {doc['l2']}")
    print(f"Confidence: {doc['confidence']:.2f}")

Indexed Test Documents:


In [None]:
for _, row in test_df.iterrows():
    taxonomy = predictor.predict_taxonomy(row["text"])
    l1, l2 = taxonomy["combined_category"].split("|", 1)
    
    es.index(
        index=test_index,
        id=row["id"],  # Use our test IDs for easy reference
        document={
            "text": row["text"],
            "vector": row["vector"],
            "category": taxonomy["combined_category"],
            "l1": l1,
            "l2": l2,
            "confidence": taxonomy["l1_prob"]
        }
    )
print("Test documents indexed!")

In [63]:
# Get exact mapping for your fields
mapping = es.indices.get_mapping(index="docs")
print("Current mapping for l1_category:")
print(mapping['docs']['mappings']['properties']['l1_category'])
print("\nCurrent mapping for l2_category:")
print(mapping['docs']['mappings']['properties']['l2_category'])

Current mapping for l1_category:
{'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}

Current mapping for l2_category:
{'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}


In [67]:
# Get all unique L1 categories (using the correct .keyword field)
resp = es.search(
    index="docs",
    aggs={
        "l1_categories": {
            "terms": {
                "field": "l1_category.keyword",
                "size": 100  # Increase if you have many categories
            }
        }
    },
    size=0
)

print("Existing L1 Categories and Counts:")
for bucket in resp['aggregations']['l1_categories']['buckets']:
    print(f"{bucket['key']}: {bucket['doc_count']} documents")

Existing L1 Categories and Counts:
defense|procurement: 108 documents
education|k12funding: 90 documents
healthcare|medicare: 90 documents
healthcare|medicaid: 81 documents
defense|cybersecurity: 75 documents
education|studentloans: 72 documents
healthcare|fda: 62 documents
education|highereducationpolicy: 60 documents
finance|budgets: 54 documents
finance|tax_policies: 54 documents
healthcare|FDA: 31 documents


In [68]:
# Check for similar categories
similar_categories = ["Health", "Medical", "Care", "Elderly", "Senior"]
for cat in similar_categories:
    count = es.count(
        index="docs",
        query={"wildcard": {"l1_category.keyword": f"*{cat}*"}}
    )['count']
    print(f"Documents with '{cat}' in L1 category: {count}")

Documents with 'Health' in L1 category: 0
Documents with 'Medical' in L1 category: 0
Documents with 'Care' in L1 category: 0
Documents with 'Elderly' in L1 category: 0
Documents with 'Senior' in L1 category: 0


In [69]:
# Find documents that mention healthcare terms but aren't categorized
healthcare_terms = ["healthcare", "hospital", "elderly", "grandma", "senior"]
query = {
    "bool": {
        "must_not": {"exists": {"field": "l1_category.keyword"}},  # Or use specific bad category
        "should": [{"match": {"text": term}} for term in healthcare_terms],
        "minimum_should_match": 1
    }
}

potential_health_docs = es.search(
    index="docs",
    query=query,
    size=5
)

print("\nPotentially misclassified documents:")
for hit in potential_health_docs['hits']['hits']:
    print(f"\nID: {hit['_id']}")
    print(f"Current L1: {hit['_source'].get('l1_category', 'UNCATEGORIZED')}")
    print(f"Text: {hit['_source']['text'][:200]}...")


Potentially misclassified documents:

ID: FNRpO5YBmBfTKM6rC3Nj
Current L1: UNCATEGORIZED
Text: Senate Budget and Fiscal Review Committee The Legislature’s Version of the 2020-21 State Budget Summary Unprecedented for this stage of the budget process, the Assembly and Senate are in agreement on ...

ID: ktRpO5YBmBfTKM6rB3I6
Current L1: UNCATEGORIZED
Text: 1 401 Hathaway Building • Cheyenne, WY 82002 Phone (307) 777-7656 • 1-866-571-0944 Fax (307) 777-7439 • www.health.wyo.gov Stefan Johansson Mark Gordon Director Governor February 5, 2025 Dear Medicaid...

ID: ndRpO5YBmBfTKM6rB3KZ
Current L1: UNCATEGORIZED
Text: Module: 11 Medicare Advantage Plans and Other Medicare Plans Inside front cover Module 11: Medicare Advantage Plans and Other Medicare Plans Contents Introduction ........................................

ID: V9RpO5YBmBfTKM6rBHKs
Current L1: UNCATEGORIZED
Text: FDADrug Safety Communication FDA cautions against use of hydroxychloroquine or chloroquine for COVID-19 outside of the

In [71]:
# Find all Medicare-related documents under Healthcare
results = es.search(
    index="docs",
    query={
        "term": {"combined_category": "healthcare|medicare"}
    }
)

In [72]:
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

# Initialize models and ES client
model = SentenceTransformer("all-MiniLM-L6-v2")
es = Elasticsearch("http://localhost:9200")
predictor = TaxonomyPredictor()  # Your taxonomy classifier

def hybrid_search(query, l1_filter=None, l2_filter=None, top_k=10):
    query_vector = model.encode(query).tolist()
    
    # Build category filter
    category_filter = []
    if l1_filter and l2_filter:
        category_filter.append({"term": {"combined_category": f"{l1_filter}|{l2_filter}"}})
    elif l1_filter:
        category_filter.append({"term": {"l1_category": l1_filter}})
    
    query_body = {
        "query": {
            "bool": {
                "must": category_filter,
                "filter": [{
                    "script_score": {
                        "query": {"match_all": {}},
                        "script": {
                            "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",
                            "params": {"query_vector": query_vector}
                        }
                    }
                }]
            }
        },
        "size": top_k
    }
    
    return es.search(index="docs", body=query_body)

# Example usage:
results = hybrid_search(
    "elderly care benefits",
    l1_filter="healthcare",
    l2_filter="medicare"
)

In [79]:

# 1. Count healthcare documents - THREE OPTIONS:

# Option A: Count using combined field (recommended)
healthcare_count = es.count(
    index="docs",
    query={"prefix": {"combined_category.keyword": "healthcare|"}}
)['count']
print(f"Found {healthcare_count} Healthcare documents (any subcategory)")

# Option B: Count specific subcategory
medicare_count = es.count(
    index="docs",
    query={"term": {"combined_category.keyword": "healthcare|medicare"}}
)['count']
print(f"Found {medicare_count} Medicare documents")

# Option C: Count using separate L1 field
healthcare_l1_count = es.count(
    index="docs",
    query={"term": {"l1_category.keyword": "healthcare"}}
)['count']
print(f"Found {healthcare_l1_count} documents with L1=healthcare")

# 2. Updated hybrid search usage
results = hybrid_search(
    "elderly care benefits",
    l1_filter="healthcare|medicare",  # Now works with either format
    l2_filter="medicare",    # Optional subcategory filter
    top_k=5
)

# 3. Process results with enhanced output
print(f"\nFound {len(results['hits']['hits'])} results:")
for i, hit in enumerate(results["hits"]["hits"], 1):
    source = hit['_source']
    print(f"\nResult #{i}:")
    print(f"Score: {hit['_score']:.3f}")
    print(f"Combined: {source.get('combined_category', 'N/A')}")
    print(f"L1: {source['l1_category']} (P={source['l1_prob']:.2f})")
    print(f"L2: {source['l2_category']} (P={source['l2_prob']:.2f})")
    print(f"Text: {source['text'][:200]}...")
    
    # Show full distribution if available
    if 'metadata' in source and 'taxonomy_distribution' in source['metadata']:
        print("\nTop predictions:")
        for pred in source['metadata']['taxonomy_distribution']:
            print(f"- {pred['label']}: {pred['score']:.2f}")
# results = hybrid_search(
#     "elderly care services",
#     l1_filter="Healthcare",
#     top_k=5
# )

# # 3. Print results
# print(f"\nFound {len(results['hits']['hits'])} results:")
# for hit in results['hits']['hits']:
#     print(f"\nScore: {hit['_score']:.3f}")
#     print(f"L1: {hit['_source']['l1_category']}")
#     print(f"Text: {hit['_source']['text'][:200]}...")

Found 88 Healthcare documents (any subcategory)
Found 0 Medicare documents
Found 0 documents with L1=healthcare

Found 0 results:


In [76]:
# Check total documents in index
count = es.count(index="docs")['count']
print(f"Total documents in index: {count}")

# Check if any documents have your expected categories
sample = es.search(
    index="docs",
    query={"term": {"l1_category": "Healthcare"}},
    size=1
)
print("Sample Healthcare doc:", sample['hits']['hits'][0]['_source']['text'][:100] if sample['hits']['hits'] else "None found")

Total documents in index: 1036
Sample Healthcare doc: None found
