In [None]:
import json
import openai
import os
from copy import deepcopy

# ---------------------------
# Set your OpenAI API key here.
# The key is read from the OPENAI_API_KEY environment variable; os.getenv
# returns None when that variable is not set, so export it before running.
openai.api_key = os.getenv("OPENAI_API_KEY")  # or just set it directly
# Alternative (avoid committing a real key to source control):
# openai.api_key = "sk-..."

# ---------------------------
# Step 1: Recursively extract all 'address' or 'location' keys with their paths

def extract_locations(obj, key_names=("address", "location"), path=None):
    """Collect every string stored under a location-like key in nested data.

    Walks ``obj`` depth-first through dicts and lists. A dict entry is
    reported when its key, lower-cased, is in ``key_names`` and its value is
    a string; every other entry is descended into instead (so a dict stored
    under ``"address"`` is still searched).

    Args:
        obj: Arbitrarily nested structure of dicts, lists and scalars.
        key_names: Key names to look for. Dict keys are lower-cased before
            the comparison, so the names given here must be lower-case.
        path: Keys/indices leading from the root to ``obj``; defaults to the
            empty path.

    Returns:
        A list of ``(path, value)`` tuples in traversal order, where ``path``
        is the list of dict keys and list indices that reaches ``value``.
    """
    if path is None:
        path = []
    found = []
    if isinstance(obj, dict):
        for key, value in obj.items():
            child_path = path + [key]
            # Non-string keys (ints, tuples, ...) have no .lower(); they can
            # never name a location, but their values may still contain one.
            is_location_key = isinstance(key, str) and key.lower() in key_names
            if is_location_key and isinstance(value, str):
                found.append((child_path, value))
            else:
                found.extend(extract_locations(value, key_names, child_path))
    elif isinstance(obj, list):
        for index, item in enumerate(obj):
            found.extend(extract_locations(item, key_names, path + [index]))
    return found

# ---------------------------
# Step 2: Replace a value in a nested dict given a path

def set_value_at_path(obj, path, new_value):
    """Write ``new_value`` into ``obj`` at the slot described by ``path``.

    ``path`` is a sequence of dict keys and/or list indices. Every step but
    the last is followed with plain subscripting to reach the container that
    holds the target slot; the last step is the key/index assigned to.
    ``obj`` is modified in place and nothing is returned.
    """
    parent_steps, final_step = path[:-1], path[-1]
    container = obj
    for step in parent_steps:
        container = container[step]
    container[final_step] = new_value

# ---------------------------
# Step 3: Enrich location strings using OpenAI GPT

def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if any."""
    text = text.strip()
    if text.startswith("```"):
        text = text[3:]
        # The opening fence may carry a language tag such as ```json.
        if text[:4].lower() == "json":
            text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def group_and_enrich_location_with_get(locations):
    """Normalize raw location strings with an OpenAI chat model.

    Sends all ``locations`` in a single chat request and parses the reply as
    JSON. This is best-effort: any failure (API error, unparsable reply, or a
    reply that is not a list with one entry per input) is printed and
    replaced by placeholder records, so the result always lines up
    position-by-position with the input.

    Args:
        locations: Iterable of unstructured location strings.

    Returns:
        A list with one entry per input location. On success the entries are
        whatever objects the model returned (requested keys: ``name``,
        ``address``, ``city``, ``match``). On failure each entry is
        ``{"name": "", "address": "", "city": "", "match": <input>}``.
    """
    locations = list(locations)
    if not locations:
        # Nothing to normalize: avoid a pointless (and billable) API call.
        return []

    system_prompt = (
        "You are a location normalization and enrichment expert. "
        "Given unstructured location strings, return structured and standardized versions "
        "with as much detail as possible."
    )

    # The final instruction matters: the reply is parsed as raw JSON and the
    # caller pairs results with inputs by position.
    user_prompt = f"""
Normalize the following locations. For each, return a JSON object with:

- "name": a human-friendly short name (e.g., "New York", "NY")
- "address": full address if available
- "city": city name
- "match": the original input string

Respond with only a JSON array containing exactly one object per input
location, in the same order as the input.

Locations:
{json.dumps(locations, indent=2)}
"""

    try:
        # NOTE(review): ChatCompletion.create is the pre-1.0 openai SDK
        # interface — confirm the installed openai version still provides it.
        response = openai.ChatCompletion.create(
            model="gpt-4o",  # or "gpt-4" / "gpt-3.5-turbo"
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.3
        )

        reply = response['choices'][0]['message']['content']

        # Models often wrap JSON in ``` or ```json fences; remove them.
        enriched = json.loads(_strip_code_fence(reply))

        # A dict, or a list of the wrong length, cannot be paired with the
        # inputs by position, so treat it like any other failure.
        if not isinstance(enriched, list) or len(enriched) != len(locations):
            raise ValueError(
                "response is not a JSON array with one entry per location"
            )

        return enriched

    except Exception as e:
        # Deliberate best-effort boundary: report and fall back to
        # placeholders rather than crash the caller.
        print("❌ Error calling OpenAI or parsing response:", e)
        return [{"name": "", "address": "", "city": "", "match": loc} for loc in locations]

# ---------------------------
# Step 4: Main function to tie everything together

def normalize_and_enrich_json_locations(input_json):
    """Return a copy of ``input_json`` with its location strings enriched.

    Works on a deep copy, so ``input_json`` itself is never modified. Every
    string found by ``extract_locations`` is sent to
    ``group_and_enrich_location_with_get`` in one batch, and each string is
    then replaced in the copy by the corresponding enriched entry.

    Args:
        input_json: Nested dict/list structure (e.g. parsed JSON).

    Returns:
        The deep copy with location strings replaced by enriched entries. If
        no locations are found, or the enrichment result cannot be paired
        one-to-one with the extracted locations, the copy is returned
        unchanged.
    """
    enriched_data = deepcopy(input_json)
    found_locations = extract_locations(enriched_data)
    if not found_locations:
        # Nothing to enrich: skip the model call entirely.
        return enriched_data

    original_values = [value for _, value in found_locations]
    enriched_locations = group_and_enrich_location_with_get(original_values)

    # zip() stops at the shorter input and would iterate a dict's keys, so a
    # malformed result would silently drop or corrupt values. Refuse to apply
    # anything that is not a same-length list.
    if (
        not isinstance(enriched_locations, list)
        or len(enriched_locations) != len(found_locations)
    ):
        print(
            "❌ Enrichment result does not match the extracted locations; "
            "leaving data unchanged."
        )
        return enriched_data

    for (path, _), enriched_value in zip(found_locations, enriched_locations):
        set_value_at_path(enriched_data, path, enriched_value)

    return enriched_data

# ---------------------------
# Example usage

if __name__ == "__main__":
    # Demo payload: location strings at several nesting depths, including a
    # misspelled one, an unrecognizable one and a whitespace-only one.
    past_sites = [
        {"address": "tokyo, jp - minato-ku"},
        {"meta": {"site": {"location": "SFO intl airprt"}}},
        {"location": "!!!unknownplace???"},
    ]
    employee_record = {
        "name": "John Doe",
        "contact": {"location": "nyc, 5th ave"},
        "history": past_sites,
    }
    demo_payload = {
        "employee": employee_record,
        "meta": {"address": "   "},
    }

    # Run the full pipeline and pretty-print the enriched structure.
    result = normalize_and_enrich_json_locations(demo_payload)
    print(json.dumps(result, indent=2))
