In [21]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import json
from pathlib import Path

class TaxonomyPredictor:
    """Classify text into a two-level (L1/L2) taxonomy with a sequence classifier.

    The model is a flat classifier: every class label encodes both levels in
    one string, e.g. ``"healthcare|medicare"`` (or the older
    ``"healthcare > medicare"``). A label without a separator is used as both
    its L1 and its L2 name.
    """

    # Separators recognised between the L1 and L2 parts of a label, tried in order.
    _LABEL_SEPARATORS = ("|", " > ")

    def __init__(self, model_path="models/model_20250411_122546"):
        """Initialize model, tokenizer, and label mappings.

        Args:
            model_path: Directory holding the fine-tuned model, its tokenizer
                and a ``label_mappings.json`` file with an ``"id2label"``
                object (and optionally a ``"hierarchy"`` object).
        """
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

        # Load label mappings (explicit encoding so the result does not depend
        # on the platform's default locale).
        mappings_file = Path(model_path) / "label_mappings.json"
        with open(mappings_file, "r", encoding="utf-8") as f:
            mappings = json.load(f)

        # JSON object keys are strings; class ids produced by the model are ints.
        self.id2label = {int(k): v for k, v in mappings["id2label"].items()}
        self.label_hierarchy = mappings.get("hierarchy", {})  # Optional parent-child relationships

    @classmethod
    def _split_label(cls, label):
        """Split a raw label into a normalized ``(l1, l2)`` pair (stripped, lower-cased)."""
        for separator in cls._LABEL_SEPARATORS:
            if separator in label:
                l1, l2 = label.split(separator, 1)
                break
        else:
            # No separator: the label stands for both levels.
            l1 = l2 = label
        return l1.strip().lower(), l2.strip().lower()

    def predict_taxonomy(self, text, top_k=3):
        """Predict L1/L2 categories with probabilities.

        Args:
            text: The text to classify.
            top_k: How many of the highest-scoring labels to include in
                ``full_distribution``.

        Returns:
            dict: {
                'l1_category': str,       # Main category (e.g., "healthcare")
                'l2_category': str,       # Subcategory (e.g., "medicare")
                'combined_category': str, # Combined format "healthcare|medicare"
                'l1_prob': float,         # Total probability of all labels sharing the L1
                'l2_prob': float,         # Probability of the single top label
                'full_distribution': list # Top-k {"label", "score"} dicts, best first
            }
        """
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True)

        with torch.no_grad():
            outputs = self.model(**inputs)
        # Only the first row is used; plain Python floats keep the result JSON-friendly.
        probs = torch.softmax(outputs.logits, dim=1)[0].tolist()

        predictions = [
            {"label": self.id2label[label_id], "score": prob}
            for label_id, prob in enumerate(probs)
        ]
        predictions.sort(key=lambda p: p["score"], reverse=True)

        top_prediction = predictions[0]
        l1, l2 = self._split_label(top_prediction["label"])

        # The classifier is flat, so the probability of the L1 category is the
        # sum over every label that belongs to it, not just the top label.
        l1_prob = sum(
            p["score"] for p in predictions if self._split_label(p["label"])[0] == l1
        )
        l1_prob = min(l1_prob, 1.0)  # guard against floating-point overshoot

        return {
            'l1_category': l1,
            'l2_category': l2,
            'combined_category': f"{l1}|{l2}",
            'l1_prob': l1_prob,
            'l2_prob': top_prediction["score"],
            'full_distribution': predictions[:top_k]
        }

# # --- Usage Example ---
# if __name__ == "__main__":
#     # Initialize predictor
#     predictor = TaxonomyPredictor()
    
#     # Test prediction
#     texts = [
#         "someone needs to look after my grandma",
#         "medicare coverage for seniors",
#         "FDA drug approval process"
#     ]
    
#     for text in texts:
#         result = predictor.predict_taxonomy(text)
#         print(f"\nText: '{text}'")
#         print(f"→ Combined: {result['combined_category']}")
#         print(f"→ L1: {result['l1_category']} (P={result['l1_prob']:.2f})")
#         print(f"→ L2: {result['l2_category']} (P={result['l2_prob']:.2f})")

In [22]:
# Peek at the first few label names saved alongside the fine-tuned model.
print("Label mappings inspection:")
with open("models/model_20250411_122546/label_mappings.json", "r") as f:
    mappings = json.load(f)

first_labels = [*mappings["id2label"].values()][:5]
print("Sample labels:", first_labels)

Label mappings inspection:
Sample labels: ['healthcare|FDA', 'healthcare|medicaid', 'healthcare|medicare', 'education|highereducationpolicy', 'education|k12funding']


In [23]:
# Run one sample through the raw model (bypassing predict_taxonomy) to see
# the probability assigned to every label.
predictor = TaxonomyPredictor()
test_text = "FDA drug approval process"
inputs = predictor.tokenizer(test_text, return_tensors="pt")

with torch.no_grad():
    outputs = predictor.model(**inputs)
probs = torch.softmax(outputs.logits, dim=1).numpy()[0]

print("\nRaw predictions:")
for label_id in range(len(probs)):
    prob = probs[label_id]
    print(f"{predictor.id2label[label_id]}: {prob:.4f}")


Raw predictions:
healthcare|FDA: 0.3493
healthcare|medicaid: 0.0889
healthcare|medicare: 0.1224
education|highereducationpolicy: 0.0548
education|k12funding: 0.0553
education|studentloans: 0.0573
finance|tax_policies: 0.0599
finance|budgets: 0.0729
defense|cybersecurity: 0.0736
defense|procurement: 0.0655


In [24]:
# Sanity check: run the same sample through predict_taxonomy and print the
# combined label it reports.
test_text = "FDA drug approval process"
result = predictor.predict_taxonomy(test_text)
print(result['combined_category'])  # Expected: "healthcare|fda" (lower-cased, pipe-separated)

healthcare|fda


In [25]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

# Load the cleaned documents; later cells read the 'text' column of this frame.
# NOTE(review): absolute, machine-specific path — consider a configurable
# data directory so the notebook runs on other machines.
df = pd.read_csv("/Users/gwin/Documents/Post Undergrad Work/Tax Search/test_data/junk/gov_docs.csv")  

# Sentence-embedding model used to vectorize documents and queries.
model = SentenceTransformer("all-MiniLM-L6-v2")  # 384-dim vectors


In [26]:
# Keep only rows that have text. After this no missing values remain in the
# column, so no fill step is needed. Copy so the assignment below writes to an
# independent frame rather than a view of the loaded one.
df = df.dropna(subset=['text']).copy()

# Encode every document in a single batched call (much faster than one model
# call per row). Rebuild the column on df's own index because dropna leaves gaps.
embeddings = model.encode(df["text"].astype(str).tolist())
df["vector"] = pd.Series(embeddings.tolist(), index=df.index)

In [33]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import time

class ElasticsearchTaxonomyIndexer:
    """Index documents into Elasticsearch with embeddings and predicted taxonomy.

    Each stored document carries its text, a dense embedding vector, the
    predicted taxonomy (combined and split into L1/L2 with probabilities) and
    any remaining DataFrame columns under ``metadata``.
    """

    # Vector size used when the embedding model cannot report its own
    # dimension (384 matches "all-MiniLM-L6-v2").
    DEFAULT_EMBEDDING_DIMS = 384

    def __init__(self, es_host="http://localhost:9200", index_name="taxonomy_documents",
                 predictor=None, embedding_model_name="all-MiniLM-L6-v2"):
        """Connect to Elasticsearch and load the classifier and embedding model.

        Args:
            es_host: URL of the Elasticsearch node.
            index_name: Name of the index to create, fill and query.
            predictor: Optional ready-made taxonomy predictor; a default
                ``TaxonomyPredictor()`` is created when omitted.
            embedding_model_name: SentenceTransformer model used for vectors.
        """
        self.es = Elasticsearch(es_host)
        self.index_name = index_name
        self.predictor = predictor if predictor is not None else TaxonomyPredictor()
        self.embedding_model = SentenceTransformer(embedding_model_name)

    def _embedding_dims(self):
        """Return the embedding size reported by the model, or the default."""
        get_dims = getattr(self.embedding_model, "get_sentence_embedding_dimension", None)
        dims = get_dims() if callable(get_dims) else None
        return dims or self.DEFAULT_EMBEDDING_DIMS

    def create_index(self):
        """Create Elasticsearch index with proper mappings for taxonomy and vectors.

        Warning: an existing index with the same name is deleted first.
        """
        mapping = {
            "mappings": {
                "properties": {
                    "text": {"type": "text"},
                    "vector": {
                        "type": "dense_vector",
                        "dims": self._embedding_dims(),  # Must match the embedding model
                        "index": True,
                        "similarity": "cosine"
                    },
                    "taxonomy": {
                        "type": "keyword",
                        "fields": {
                            "analyzed": {"type": "text"}
                        }
                    },
                    "l1_category": {"type": "keyword"},
                    "l2_category": {"type": "keyword"},
                    "l1_prob": {"type": "float"},
                    "l2_prob": {"type": "float"},
                    "metadata": {  # For any additional fields from the CSV
                        "type": "object",
                        "enabled": True
                    }
                }
            }
        }

        # Delete index if it exists
        if self.es.indices.exists(index=self.index_name):
            self.es.indices.delete(index=self.index_name)

        self.es.indices.create(index=self.index_name, body=mapping)

    @staticmethod
    def _is_missing(value):
        """Return True for scalar missing values (None, NaN, NaT, pd.NA)."""
        return bool(pd.api.types.is_scalar(value) and pd.isna(value))

    def process_document(self, row):
        """Process a single document row from DataFrame.

        Args:
            row: Mapping-like row (e.g. a pandas Series) with a ``"text"`` entry.

        Returns:
            dict: The document body to store: text, vector, metadata and the
            taxonomy fields.
        """
        text = row["text"]

        # Missing values are left out: NaN is not valid JSON and would
        # otherwise be sent to Elasticsearch as-is.
        metadata = {
            key: value
            for key, value in row.items()
            if key != "text" and not self._is_missing(value)
        }

        doc = {
            "text": text,
            "vector": self.embedding_model.encode(text).tolist(),
            "metadata": metadata
        }

        # Add taxonomy classification
        taxonomy_result = self.predictor.predict_taxonomy(text)
        doc.update({
            "taxonomy": taxonomy_result["combined_category"],
            "l1_category": taxonomy_result["l1_category"],
            "l2_category": taxonomy_result["l2_category"],
            "l1_prob": taxonomy_result["l1_prob"],
            "l2_prob": taxonomy_result["l2_prob"]
        })

        return doc

    def index_dataframe(self, df, batch_size=100):
        """Index a pandas DataFrame with taxonomy classification and vectors.

        Rows that fail to process are reported and skipped. A batch whose bulk
        request fails inside the loop is reported and discarded (it is not
        re-sent, which could duplicate documents that were already stored).
        An error in the final, partial batch propagates to the caller.

        Args:
            df: DataFrame with a ``"text"`` column.
            batch_size: Number of documents per bulk request.
        """
        actions = []
        indexed = 0

        for _, row in df.iterrows():
            try:
                doc = self.process_document(row)
            except Exception as e:
                print(f"Error processing document: {e}")
                continue

            actions.append({
                "_index": self.index_name,
                "_source": doc
            })

            # Bulk insert in batches
            if len(actions) >= batch_size:
                # Detach the batch first so a failure cannot leave it queued.
                batch, actions = actions, []
                try:
                    bulk(self.es, batch)
                except Exception as e:
                    print(f"Error indexing batch of {len(batch)} documents: {e}")
                    continue
                indexed += len(batch)
                print(f"Indexed {len(batch)} documents...")

        # Insert remaining documents
        if actions:
            bulk(self.es, actions)
            indexed += len(actions)
            print(f"Indexed final {len(actions)} documents")

        print(f"Finished indexing {indexed} of {len(df)} documents")
        self.es.indices.refresh(index=self.index_name)

    @staticmethod
    def _taxonomy_query(taxonomy_filter):
        """Build the document-selection query for an optional taxonomy filter.

        Accepts ``"l1"`` or ``"l1|l2"``; parts are stripped and lower-cased
        (the same normalization applied to the indexed categories), and empty
        parts are ignored.
        """
        if not taxonomy_filter:
            return {"match_all": {}}

        l1, _, l2 = taxonomy_filter.partition("|")
        clauses = [
            {"term": {field: part.strip().lower()}}
            for field, part in (("l1_category", l1), ("l2_category", l2))
            if part.strip()
        ]

        if not clauses:
            return {"match_all": {}}
        if len(clauses) == 1:
            return clauses[0]
        return {"bool": {"must": clauses}}

    def search(self, query, taxonomy_filter=None, top_k=10):
        """Search with optional taxonomy filtering.

        Args:
            query: Free-text query; it is embedded and compared to document vectors.
            taxonomy_filter: Optional ``"l1"`` or ``"l1|l2"`` restriction.
            top_k: Maximum number of hits to return.

        Returns:
            list: The ``_source`` of each hit (text and taxonomy fields), best first.
        """
        # Generate query vector
        query_vector = self.embedding_model.encode(query).tolist()

        search_body = {
            "query": {
                "script_score": {
                    "query": self._taxonomy_query(taxonomy_filter),
                    "script": {
                        # +1.0 keeps the score non-negative, as Elasticsearch requires.
                        "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",
                        "params": {"query_vector": query_vector}
                    }
                }
            },
            "size": top_k,
            "_source": ["text", "taxonomy", "l1_category", "l2_category"]
        }

        results = self.es.search(index=self.index_name, body=search_body)
        return [hit["_source"] for hit in results["hits"]["hits"]]

    def smart_search(self, query, use_query_taxonomy=True, top_k=10):
        """Search using query's predicted taxonomy as filter.

        Args:
            query: Free-text query.
            use_query_taxonomy: When False, the predicted taxonomy is only
                printed and the search runs unfiltered.
            top_k: Maximum number of hits to return.
        """
        # Classify the query itself
        query_taxonomy = self.predictor.predict_taxonomy(query)['combined_category']

        print(f"Query classified as: {query_taxonomy}")  # Debug output

        if use_query_taxonomy:
            return self.search(query, taxonomy_filter=query_taxonomy, top_k=top_k)
        return self.search(query, top_k=top_k)
        
# if __name__ == "__main__":
#     # Load your data
#     df = pd.read_csv("/Users/gwin/Documents/Post Undergrad Work/Tax Search/test_data/junk/gov_docs.csv")
#     df = df.dropna(subset=['text'])
#     df['text'] = df['text'].fillna('')
    
#     # LIMIT TO 4 ENTRIES FOR TESTING (choose one method)
#     df = df.head(4)  # First 4 rows (deterministic)
#     # OR 
#     # df = df.sample(4, random_state=42)  # Random 4 rows (reproducible)
    
#     print(f"TESTING WITH {len(df)} DOCUMENTS:")
#     print(df['text'].head())  # Verify the subset
    
#     # Initialize and create index
#     indexer = ElasticsearchTaxonomyIndexer()
#     indexer.create_index()
    
#     # Index only the 4 test documents
#     indexer.index_dataframe(df)
    
#     # Example searches (unchanged)
#     print("\nSearch results for 'elderly care':")
#     for result in indexer.search("i need schooling"):
#         print(f"{result['taxonomy']}: {result['text'][:100]}...")
    
#     print("\nSearch results for 'healthcare' filtered by taxonomy:")
#     for result in indexer.search("healthcare", taxonomy_filter="healthcare|medicare"):
#         print(f"{result['taxonomy']}: {result['text'][:100]}...")
# --- Usage Example ---


In [34]:
if __name__ == "__main__":
    # Load the documents and keep only rows that have text to index
    # (after dropna no missing text remains, so no fill step is needed).
    df = pd.read_csv("/Users/gwin/Documents/Post Undergrad Work/Tax Search/test_data/junk/gov_docs.csv")
    df = df.dropna(subset=['text'])

    # Initialize and create index (drops any existing index of the same name)
    indexer = ElasticsearchTaxonomyIndexer()
    indexer.create_index()

    # Index your data (this may take some time)
    indexer.index_dataframe(df)

    # Example searches — headings name the query that is actually run
    print("\nSearch results for 'elderly care':")
    for result in indexer.search("elderly care"):
        print(f"{result['taxonomy']}: {result['text'][:100]}...")

    print("\nSearch results for 'money' filtered by taxonomy 'healthcare|medicare':")
    for result in indexer.search("money", taxonomy_filter="healthcare|medicare"):
        print(f"{result['taxonomy']}: {result['text'][:100]}...")

Indexed 100 documents...
Indexed 100 documents...
Indexed final 59 documents
Finished indexing 259 documents

Search results for 'elderly care':
healthcare|medicare: MAKINGMEDICAREFORBENEFICIARIESBETTERNEWMEDICAREADVANTAGESUPPLEMENTALBENEFITSBeneficiaries in Medicar...
healthcare|medicaid: DEPARTMENTOFHEALTH & HUMANSERVICESCenters for Medicare & Medicaid Services 7500 Security Boulevard B...
healthcare|medicaid: Department of Health and Hu December 1, 2024 The Honorable Jim Pillen Governor of Nebraska P.O. Box ...
healthcare|medicaid: Department of Health and Hu December 1, 2024 The Honorable Jim Pillen Governor of Nebraska P.O. Box ...
healthcare|medicaid: Medicaid_10_11_Cvr.indd 1 6/27/2012 11:03:34 AM...
healthcare|medicaid: DEPARTMENTOFHEALTH & HUMANSERVICESCenters for Medicare & Medicaid Services 7500 Security Boulevard, ...
healthcare|medicare: 11 CHAPTEROptions for slowing the growth of Medicare fee-for-service spending for emergency departme...
healthcare|medicaid: Chapter 10: 

In [30]:
 print("\nSearch results for 'elderly care':")
for result in indexer.search("money"):
    print(f"{result['taxonomy']}: {result['text'][:100]}...")

print("\nSearch results for 'healthcare' filtered by taxonomy:")
for result in indexer.search("money", taxonomy_filter="healthcare|medicare"):
    print(f"{result['taxonomy']}: {result['text'][:100]}...")


Search results for 'elderly care':
finance|budgets: 1 March 20, 2024 Fellow Americans, Our nation’s fiscal house is in ruin, the President of the United...
finance|budgets: Enacted State Budget Fiscal Year 2023-24 Areas of Interest in the Intersection of Criminal Justice &...
education|k12funding: R00A03 Funding for Educational Organizations Maryland State Department of Education Executive Summar...
finance|budgets: Senate Budget and Fiscal Review Committee The Legislature’s Version of the 2020-21 State Budget Summ...
education|highereducationpolicy: Closing the College Affordability Gap December 2023 A report for The Institute for College Access an...
education|k12funding: STATEFACTSHEET: House Republican Proposals Hurt Children, Borrowers, and Undermine Education in New ...
education|k12funding: OCTOBER 2023 INVESTMENTSINSTUDENTRECOVERYA Review of School Districts’ Use of American Rescue Plan F...
education|highereducationpolicy: Glossary CFRDCLCost of Attendance 2 CHAPTER (Budget) 

In [29]:
# Spot-check a few stored documents to confirm the taxonomy field was populated.
print("\nDEBUG: Taxonomy values in index:")

# search() limits the number of hits through its `top_k` argument.
sample_docs = indexer.search("", top_k=4)
for doc in sample_docs:
    print(f"Taxonomy: {doc['taxonomy']} | Text: {doc['text'][:50]}...")


DEBUG: Taxonomy values in index:
Taxonomy: healthcare|fda | Text: Product Name Placement, Size, and Prominence in Pr...
Taxonomy: education|k12funding | Text: 2024-2025 FISCALYEARANALYSISOFTHENEWJERSEYBUDGETDD...
Taxonomy: education|k12funding | Text: 2024-2025 FISCALYEARANALYSISOFTHENEWJERSEYBUDGETDD...
Taxonomy: defense|cybersecurity | Text: UNCLASSIFIED//FOROFFICIALUSEONLYPre-Decisional Dra...


In [None]:
# Let the classifier pick the taxonomy filter from the query itself, then
# print the predicted taxonomy and a 100-character preview of each hit.
results = indexer.smart_search("medicare benefits")
for result in results:
    print(f"{result['taxonomy']}: {result['text'][:100]}...")

Query classified as: healthcare|medicare
