In [5]:
!pip install openai dotenv

Collecting dotenv
  Downloading dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Collecting python-dotenv (from dotenv)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading dotenv-0.9.9-py2.py3-none-any.whl (1.9 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv, dotenv
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [dotenv]
[1A[2KSuccessfully installed dotenv-0.9.9 python-dotenv-1.1.0


In [None]:
import os
import json
import base64
from openai import OpenAI

# Set your OpenAI API key
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

SCRAPED_DIR = "dataset"
OUTPUT_FILENAME = "labels.json"

def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

def call_gpt_labeler(parcel_info, image_path):
    parcel_text = "\n".join(f"{k}: {v}" for k, v in parcel_info.items())
    encoded_image = encode_image(image_path)

    print(f"📸 Using image: {image_path}")

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are an energy efficiency expert. Given a photo of a house and its parce information, rate the need for the following retrofit interventions from 0.0 (not needed) to 1.0 (very high priority): \n"

                        "1. Insulation upgrade\n"
                        "2. HVAC replacement\n"
                        "3. Air sealing\n"
                        "4. Moisture mitigation\n"
                        
                        "Base your response on both the **visual appearance** of the home and the **parcel data**. Return only a raw JSON list of 4 float values in this order. Do not explain or wrap in Markdown."
                    )
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": f"Parcel data:\n{parcel_text}"
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{encoded_image}"
                            }
                        }
                    ]
                }
            ],
            max_tokens=50,
            temperature=0
        )

        message = response.choices[0].message
        content = message.content.strip() if message else ""
        print(f"\n🔍 GPT raw output: {content}\n")

        # Strip Markdown ```json blocks if present
        if content.startswith("```"):
            content = content.strip("` \n").split("\n", 1)[-1].strip()
            if content.endswith("```"):
                content = content[:-3].strip()

        parsed = json.loads(content)
        if isinstance(parsed, list) and len(parsed) == 4:
            return parsed
        else:
            print("⚠️ GPT returned unexpected structure.")
            return [0.0, 0.0, 0.0, 0.0]

    except Exception as e:
        print(f"❌ Error with GPT call: {e}")
        return [0.0, 0.0, 0.0, 0.0]

def label_all():
    labeled_count = 0
    for folder_name in os.listdir(SCRAPED_DIR):
        folder_path = os.path.join(SCRAPED_DIR, folder_name)
        if not os.path.isdir(folder_path):
            continue

        data_path = os.path.join(folder_path, "data.json")
        label_path = os.path.join(folder_path, OUTPUT_FILENAME)

        if os.path.exists(label_path):
            continue

        if not os.path.exists(data_path):
            print(f"Missing data.json in {folder_path}")
            continue

        try:
            with open(data_path, "r") as f:
                parcel_info = json.load(f)
        except Exception as e:
            print(f"Failed to read {data_path}: {e}")
            continue

        image_file = next(
            (os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.lower().endswith((".jpg", ".jpeg", ".png"))),
            None
        )

        if not image_file:
            print(f"No image found in {folder_path}")
            continue

        label_vector = call_gpt_labeler(parcel_info, image_file)
        with open(label_path, "w") as f:
            json.dump(label_vector, f)

        labeled_count += 1
        print(f"✅ Labeled {folder_name}: {label_vector}")

    print(f"\n🏁 Total labeled: {labeled_count}")

if __name__ == "__main__":
    label_all()

📸 Using image: scraped_properties/RAMBEAU_RD_11/photo_1.jpg

🔍 GPT raw output: [0.6, 0.5, 0.4, 0.3]

✅ Labeled RAMBEAU_RD_11: [0.6, 0.5, 0.4, 0.3]
📸 Using image: scraped_properties/RAMBEAU_RD_16/photo_1.jpg

🔍 GPT raw output: [0.6, 0.4, 0.5, 0.3]

✅ Labeled RAMBEAU_RD_16: [0.6, 0.4, 0.5, 0.3]
📸 Using image: scraped_properties/SANTEE_DR_15/photo_1.jpg

🔍 GPT raw output: [0.7, 0.6, 0.5, 0.4]

✅ Labeled SANTEE_DR_15: [0.7, 0.6, 0.5, 0.4]
📸 Using image: scraped_properties/RAMBEAU_RD_29/photo_1.jpg

🔍 GPT raw output: [0.7, 0.6, 0.5, 0.4]

✅ Labeled RAMBEAU_RD_29: [0.7, 0.6, 0.5, 0.4]
📸 Using image: scraped_properties/FAIRLEE_CT_3/photo_1.jpg

🔍 GPT raw output: [0.6, 0.4, 0.5, 0.3]

✅ Labeled FAIRLEE_CT_3: [0.6, 0.4, 0.5, 0.3]
📸 Using image: scraped_properties/LENAIRE_RD_4/photo_1.jpg

🔍 GPT raw output: [0.7, 0.5, 0.6, 0.4]

✅ Labeled LENAIRE_RD_4: [0.7, 0.5, 0.6, 0.4]
📸 Using image: scraped_properties/LENAIRE_RD_3/photo_1.jpg

🔍 GPT raw output: [0.7, 0.6, 0.8, 0.5]

✅ Labeled LENAIRE_RD_3: 

In [11]:
os.listdir(SCRAPED_DIR)

['LENAIRE_RD_2',
 'LENAIRE_RD_5',
 'RAMBEAU_RD_20',
 'SANTEE_DR_23',
 'RAMBEAU_RD_27',
 'RAMBEAU_RD_18',
 'SANTEE_DR_12',
 'RAMBEAU_RD_11',
 'RAMBEAU_RD_16',
 'SANTEE_DR_15',
 'RAMBEAU_RD_29',
 'FAIRLEE_CT_3',
 'LENAIRE_RD_4',
 'LENAIRE_RD_3',
 'SANTEE_DR_14',
 'RAMBEAU_RD_17',
 'RAMBEAU_RD_28',
 'RAMBEAU_RD_10',
 'SANTEE_DR_13',
 'RAMBEAU_RD_26',
 'RAMBEAU_RD_19',
 'SANTEE_DR_22',
 'RAMBEAU_RD_21',
 'SANTEE_DR_6',
 'SANTEE_RD_39',
 'SANTEE_DR_1',
 'RAMBEAU_RD_5',
 'SANTEE_DR_8',
 'RAMBEAU_RD_2',
 'SANTEE_RD_30',
 'IRONSTONE_RD_11',
 'RAMBEAU_RD_3',
 'SANTEE_RD_31',
 'SANTEE_DR_9',
 'RAMBEAU_RD_4',
 'SANTEE_DR_0',
 'SANTEE_RD_38',
 'SANTEE_DR_7',
 'IRONSTONE_RD_10',
 'SANTEE_RD_40',
 'RIDON_CT_2',
 'RIDON_CT_5',
 'SANTEE_RD_25',
 'RIDON_CT_14',
 'SANTEE_RD_8',
 'SANTEE_RD_22',
 'FORNANCE_RD_8',
 'RIDON_CT_13',
 'SANTEE_RD_14',
 'FORNANCE_RD_1',
 'SANTEE_RD_1',
 'SANTEE_RD_13',
 'FORNANCE_RD_6',
 'SANTEE_RD_6',
 'RIDON_CT_4',
 'RIDON_CT_3',
 'SANTEE_RD_41',
 'FORNANCE_RD_7',
 'SANTEE_RD

In [16]:
import os
import json
import base64

# Config
SCRAPED_DIR = "dataset"
OUTPUT_JSONL = "training_data_gpt4o.jsonl"

# Helper to flatten parcel info
def format_parcel_text(parcel_info):
    return "\n".join(f"{k}: {v}" for k, v in parcel_info.items())

# Helper to encode image as base64
def encode_image_base64(image_path):
    with open(image_path, "rb") as img_file:
        return base64.b64encode(img_file.read()).decode("utf-8")

# Collect training samples
samples = []

for folder_name in os.listdir(SCRAPED_DIR):
    folder_path = os.path.join(SCRAPED_DIR, folder_name)
    if not os.path.isdir(folder_path):
        continue

    data_path = os.path.join(folder_path, "data.json")
    label_path = os.path.join(folder_path, "labels.json")
    image_path = next(
        (os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.lower().endswith((".jpg", ".jpeg", ".png"))),
        None
    )

    if not (os.path.exists(data_path) and os.path.exists(label_path) and image_path):
        print(f"⛔ Skipping {folder_name}: missing required files.")
        continue

    try:
        with open(data_path, "r") as f:
            parcel_info = json.load(f)
        with open(label_path, "r") as f:
            label_vector = json.load(f)
        image_base64 = encode_image_base64(image_path)
    except Exception as e:
        print(f"❌ Error processing {folder_name}: {e}")
        continue

    parcel_text = format_parcel_text(parcel_info)

    sample = {
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"Parcel information:\n{parcel_text}\n\nProvide retrofit priority scores (floats between 0.0 and 1.0) for:\n1. Insulation upgrade\n2. HVAC replacement\n3. Air sealing\n4. Moisture mitigation\nReturn only a JSON list of 4 floats."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{image_base64}"
                        }
                    }
                ]
            },
            {
                "role": "assistant",
                "content": json.dumps(label_vector)
            }
        ]
    }

    samples.append(sample)
    print(f"✅ Added: {folder_name}")

# Write to JSONL
with open(OUTPUT_JSONL, "w") as f:
    for sample in samples:
        f.write(json.dumps(sample) + "\n")

print(f"\n🏁 Wrote {len(samples)} examples to {OUTPUT_JSONL}")

✅ Added: LENAIRE_RD_2
✅ Added: LENAIRE_RD_5
✅ Added: RAMBEAU_RD_20
✅ Added: SANTEE_DR_23
✅ Added: RAMBEAU_RD_27
✅ Added: RAMBEAU_RD_18
✅ Added: SANTEE_DR_12
✅ Added: RAMBEAU_RD_11
✅ Added: RAMBEAU_RD_16
✅ Added: SANTEE_DR_15
✅ Added: RAMBEAU_RD_29
✅ Added: FAIRLEE_CT_3
✅ Added: LENAIRE_RD_4
✅ Added: LENAIRE_RD_3
✅ Added: SANTEE_DR_14
✅ Added: RAMBEAU_RD_17
✅ Added: RAMBEAU_RD_28
✅ Added: RAMBEAU_RD_10
✅ Added: SANTEE_DR_13
✅ Added: RAMBEAU_RD_26
✅ Added: RAMBEAU_RD_19
✅ Added: SANTEE_DR_22
✅ Added: RAMBEAU_RD_21
✅ Added: SANTEE_DR_6
✅ Added: SANTEE_RD_39
✅ Added: SANTEE_DR_1
✅ Added: RAMBEAU_RD_5
✅ Added: SANTEE_DR_8
✅ Added: RAMBEAU_RD_2
✅ Added: SANTEE_RD_30
✅ Added: IRONSTONE_RD_11
✅ Added: RAMBEAU_RD_3
✅ Added: SANTEE_RD_31
✅ Added: SANTEE_DR_9
✅ Added: RAMBEAU_RD_4
✅ Added: SANTEE_DR_0
✅ Added: SANTEE_RD_38
✅ Added: SANTEE_DR_7
✅ Added: IRONSTONE_RD_10
✅ Added: SANTEE_RD_40
✅ Added: RIDON_CT_2
✅ Added: RIDON_CT_5
✅ Added: SANTEE_RD_25
✅ Added: RIDON_CT_14
✅ Added: SANTEE_RD_8
✅ Ad