1. Normalize the Records:
   - Create a consistent schema from the raw fields (~60-70 per record)
   - Group fields logically (e.g., address, valuation, contractor)
   - Handle nulls and edge cases gracefully
   - Final output: a normalized JSON structure per record

In [1]:
import json

def _date_only(value):
    """Return the first 10 characters of a timestamp string, or "" if unusable.

    ``dict.get(key, "")`` only falls back to "" when the key is *missing*; a key
    that is present with a null value returns ``None``, and slicing ``None``
    raises ``TypeError``. Non-string values are therefore mapped to "".
    """
    return value[:10] if isinstance(value, str) else ""


def _contractor_address(permit):
    """Join the contractor street/city/zip parts that are present with ", ".

    Missing or blank parts are skipped so the result never contains dangling
    separators (", ,") or the literal text "None".
    """
    candidates = (
        permit.get("contractor_address2"),
        permit.get("contractor_city"),
        permit.get("contractor_zip"),
    )
    present = [str(part).strip() for part in candidates if part is not None]
    return ", ".join(part for part in present if part)


def _link_url(link):
    """Extract the URL from the raw ``link`` field.

    A dict yields its "url" entry, a plain string is passed through unchanged,
    and anything else (including a null value) yields ``None``.
    """
    if isinstance(link, dict):
        return link.get("url")
    if isinstance(link, str):
        return link
    return None


def normalize_permit(permit):
    """Map one raw permit dict onto the grouped, normalized schema.

    Args:
        permit: a raw permit record (dict) as loaded from the raw JSON file.

    Returns:
        A dict with the groups ``permit_id``, ``type``, ``status``,
        ``location``, ``contractor`` (``None`` when the record has no
        contractor name), ``details`` and ``project``.
    """
    if permit.get("contractor_full_name"):
        contractor = {
            "name": permit.get("contractor_full_name"),
            "company": permit.get("contractor_company_name"),
            "phone": permit.get("contractor_phone"),
            "address": _contractor_address(permit),
        }
    else:
        contractor = None

    return {
        "permit_id": permit.get("permit_number"),
        "type": {
            "code": permit.get("permittype"),
            "description": permit.get("permit_type_desc"),
            "class": permit.get("permit_class_mapped"),
            "work_class": permit.get("work_class"),
        },
        "status": {
            "current": permit.get("status_current"),
            "applied_date": _date_only(permit.get("applieddate")),
            "issue_date": _date_only(permit.get("issue_date")),
            "expires_date": _date_only(permit.get("expiresdate")),
        },
        "location": {
            # Fall back to the permit location when no original address is given.
            "address": permit.get("original_address1") or permit.get("permit_location"),
            "city": permit.get("original_city", ""),
            "state": permit.get("original_state", ""),
            "zip": permit.get("original_zip", ""),
            "latitude": permit.get("latitude"),
            "longitude": permit.get("longitude"),
            "jurisdiction": permit.get("jurisdiction"),
            "council_district": permit.get("council_district"),
        },
        "contractor": contractor,
        "details": {
            "description": permit.get("description"),
            # First non-empty valuation wins, from the total down to the trades.
            "valuation": permit.get("total_valuation_remodel") or
                         permit.get("building_valuation_remodel") or
                         permit.get("electrical_valuation_remodel") or
                         permit.get("mechanical_valuation_remodel") or
                         permit.get("plumbing_valuation_remodel"),
            "sqft": permit.get("remodel_repair_sqft") or permit.get("total_new_add_sqft"),
            "housing_units": permit.get("housing_units"),
            "floors": permit.get("number_of_floors"),
        },
        "project": {
            "id": permit.get("project_id"),
            "link": _link_url(permit.get("link")),
        },
    }


# Explicit UTF-8 so the files are read/written the same way on every platform
# instead of depending on the locale's default encoding.
with open("sample_permits_raw.json", "r", encoding="utf-8") as f:
    raw_permits = json.load(f)

normalized_permits = [normalize_permit(permit) for permit in raw_permits]

with open("sample_permits_normalized.json", "w", encoding="utf-8") as f:
    json.dump(normalized_permits, f, indent=2)


In [2]:
import pprint

# Pretty-print the first normalized record as a quick visual check of the schema.
first_permit = normalized_permits[0]
print(pprint.pformat(first_permit))


{'contractor': {'address': '101  DRY CREEK ROAD, MANOR, 78653',
                'company': 'Tierra Electric',
                'name': 'Juan P. Jimenez',
                'phone': '5127448813'},
 'details': {'description': 'This application is to activate a permit '
                            'requested by the City of Austin',
             'floors': '1',
             'housing_units': '1',
             'sqft': '0',
             'valuation': '2620'},
 'location': {'address': '8808 SLAYTON DR',
              'city': 'AUSTIN',
              'council_district': '4',
              'jurisdiction': 'AUSTIN FULL PURPOSE',
              'latitude': '30.35577184',
              'longitude': '-97.69806189',
              'state': 'TX',
              'zip': '78753'},
 'permit_id': '2025-094161 EP',
 'project': {'id': '13556858',
             'link': 'https://abc.austintexas.gov/web/permit/public-search-other?t_detail=1&t_selected_folderrsn=13556858'},
 'status': {'applied_date': '2025-07-28',
      

 2. Embedding and Indexing:
 - Use OpenAI's text-embedding-3-small
 - Choose relevant fields for embedding (e.g., description, permit_type, work_class)
 - Preprocess into a single text block per record
 - Index into a vector DB of your choice: pgvector, Chroma, Pinecone, or Weaviate

In [3]:
# Build one " | "-delimited text block per normalized record for embedding.
embedding_texts = []

for record in normalized_permits:
    parts = [
        record["type"]["description"],
        record["type"]["class"],
        record["type"]["work_class"],
        record["details"]["description"],
        record["location"]["address"],
        record["location"]["city"],
        record["status"]["current"],
    ]
    # Strip *before* filtering so whitespace-only fields do not leave empty
    # segments (" |  | ") in the text; str() keeps a non-string value from
    # crashing on .strip().
    segments = [str(part).strip() for part in parts if part]
    text = " | ".join(segment for segment in segments if segment)
    embedding_texts.append(text)


In [11]:
! pip install openai python-dotenv





[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\DELL\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [26]:
import os
from dotenv import load_dotenv
import openai

# Load variables from a local .env file into the process environment, then hand
# the key to the OpenAI client. The value is None when OPENAI_API_KEY is unset.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")


In [19]:
# Embed the texts in batches: the embeddings endpoint accepts a list of inputs,
# so one request per batch replaces one request per record.
EMBEDDING_MODEL = "text-embedding-3-small"
EMBEDDING_BATCH_SIZE = 100  # inputs per request; tune to stay within API limits

embedding_vectors = []

for start in range(0, len(embedding_texts), EMBEDDING_BATCH_SIZE):
    batch = embedding_texts[start:start + EMBEDDING_BATCH_SIZE]
    response = openai.embeddings.create(
        input=batch,
        model=EMBEDDING_MODEL
    )
    # Order by the returned index so vectors stay aligned with embedding_texts.
    for item in sorted(response.data, key=lambda entry: entry.index):
        embedding_vectors.append(item.embedding)


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

Because the free tier of the OpenAI API is not working (quota exceeded, see the error above), I used GPT to produce embeddings like the ones text-embedding-3-small would generate.

format 




# Hard-coded sample of embedding input texts: five " | "-delimited strings.
# NOTE(review): assigning this replaces any `embedding_texts` built earlier from
# `normalized_permits` — confirm whether this is meant as an example or as input.
embedding_texts = [
  "Plumbing Permit | Residential | Irrigation | Irrigation for lawn | 13108 GEARY DR | AUSTIN | Active",
  "Plumbing Permit | Residential | Irrigation | ETJ Installation of New Irrigation | 2712 LEAFY LN | AUSTIN | Active",
  "Plumbing Permit | Residential | Irrigation | Irrigation for lawn | 13102 GEARY DR | AUSTIN | Active",
  "Plumbing Permit | Residential | Irrigation | 7 zones | 1701 MISTYWOOD DR | AUSTIN | Active",
  "Plumbing Permit | Residential | Irrigation | Install Sprinkler System | 9216 FLATBUSH DR | AUSTIN | Final"
]


In [3]:
! pip install sentence-transformers


Collecting sentence-transformers
  Using cached sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.55.0-py3-none-any.whl.metadata (39 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.7.1-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting scikit-learn (from sentence-transformers)
  Using cached scikit_learn-1.7.1-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Using cached scipy-1.16.1-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Using cached huggingface_hub-0.34.3-py3-none-any.whl.metadata (14 kB)
Collecting filelock (from huggingface-hub>=0.20.0->sentence-transformers)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub>=0.20.0->sentence-transformers)
  Using cached fsspec-20


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\DELL\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [6]:
from sentence_transformers import SentenceTransformer

# Encode every text with a local sentence-transformers model and keep the
# result as a NumPy array.
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)
embedding_vectors = model.encode(
    embedding_texts,
    convert_to_numpy=True,
)


In [7]:
import numpy as np

# Persist the embedding matrix so later sessions can reload it without re-encoding.
np.save(file="permit_embeddings.npy", arr=embedding_vectors)


In [4]:
import numpy as np

# Reload the saved embedding matrix from disk (pickle loading stays disabled,
# which is NumPy's default).
embedding_vectors = np.load(file="permit_embeddings.npy", allow_pickle=False)
