# Room Description Generator (Company-Aligned)

This notebook:
1. **Loads** the prepared two-column datasets (companies & job postings).
2. **Builds** a structured **space description** for each company by calling the **OpenAI API** (step 1).
3. **Augments** the description with plausible, company-aligned details that fit the company but were **not** explicitly present in the source text or in the step 1 output (step 2).
4. **Saves** the merged descriptions as JSONL, plus a flattened Parquet table of the top-level brand fields (step 3).

> The output is a JSON structure per company that follows the required schema.

## Configuration

In [None]:
from pathlib import Path

# --- Inputs: every dataset is an ID column plus a 'text' column -------------
COMPANIES_PARQUET = Path("companies_two_columns").with_suffix(".parquet")
COMPANIES_CSV = Path("companies_two_columns").with_suffix(".csv")

JOBS_PARQUET = Path("jobs_two_columns").with_suffix(".parquet")
JOBS_CSV = Path("jobs_two_columns").with_suffix(".csv")

# --- Column names (change these if the datasets use different headers) ------
COMPANY_ID_COL = "prsId"
COMPANY_TEXT_COL = "text"

JOB_ID_COL = "jobId"
JOB_TEXT_COL = "text"
# Foreign key that links a job posting to its company.
JOB_COMPANY_ID_COL = "prsId"

# --- Models: one for generation, one for augmentation (may be identical) ----
OPENAI_MODEL_PRIMARY = "gpt-4o-mini"
OPENAI_MODEL_AUGMENT = "gpt-4o-mini"

# --- Inference parameters ---------------------------------------------------
# Upper bound, in characters, on the combined profile + postings context.
MAX_INPUT_CHARS_PER_COMPANY = 20_000
TEMPERATURE_PRIMARY = 0.3
TEMPERATURE_AUGMENT = 0.2

# --- Outputs ----------------------------------------------------------------
OUTPUT_JSONL = Path("space_descriptions").with_suffix(".jsonl")
OUTPUT_PARQUET = Path("space_descriptions").with_suffix(".parquet")

## Load Data

In [None]:
import pandas as pd

def _load_two_col(
    path_parquet: Path,
    path_csv: Path,
    id_col: str,
    text_col: str,
    extra_cols=(),
) -> pd.DataFrame:
    """Load a dataset from Parquet (preferred) or CSV and keep its ID/text columns.

    Args:
        path_parquet: Parquet file to read if it exists.
        path_csv: CSV file to read when the Parquet file is absent.
        id_col: Name of the required ID column.
        text_col: Name of the required text column.
        extra_cols: Optional column names to keep in addition to ``id_col`` and
            ``text_col`` when they are present in the file. Missing ones are ignored.

    Returns:
        A frame with ``id_col``, ``text_col`` and any present ``extra_cols``, with rows
        lacking an ID or text removed and exact duplicate rows dropped.

    Raises:
        FileNotFoundError: If neither file exists.
        KeyError: If ``id_col`` or ``text_col`` is missing from the file.
    """
    if path_parquet.exists():
        df = pd.read_parquet(path_parquet)
    elif path_csv.exists():
        df = pd.read_csv(path_csv)
    else:
        raise FileNotFoundError(f"Neither {path_parquet} nor {path_csv} was found.")
    if id_col not in df.columns or text_col not in df.columns:
        raise KeyError(f"Expected columns missing: id_col={id_col}, text_col={text_col}. Got: {df.columns.tolist()}")
    keep = [id_col, text_col]
    for col in extra_cols:
        if col in df.columns and col not in keep:
            keep.append(col)
    # Only the two required columns decide whether a row is usable; with no extra
    # columns this is the same as a plain dropna().
    return df[keep].dropna(subset=[id_col, text_col]).drop_duplicates()

# Possible names of the Jobs -> Company foreign key, in order of preference.
_JOB_COMPANY_KEY_CANDIDATES = ["prsId", "companyId", "company_id", "employerId"]

companies = _load_two_col(COMPANIES_PARQUET, COMPANIES_CSV, COMPANY_ID_COL, COMPANY_TEXT_COL)
# The join key must survive loading; otherwise the check below can never find it.
jobs      = _load_two_col(
    JOBS_PARQUET,
    JOBS_CSV,
    JOB_ID_COL,
    JOB_TEXT_COL,
    extra_cols=[JOB_COMPANY_ID_COL, *_JOB_COMPANY_KEY_CANDIDATES],
)

# Ensure we have a join key from jobs -> company
if JOB_COMPANY_ID_COL not in jobs.columns:
    for cand in _JOB_COMPANY_KEY_CANDIDATES:
        if cand in jobs.columns:
            JOB_COMPANY_ID_COL = cand
            break
    else:
        raise KeyError(f"Join column for Jobs->Company not found. Please set JOB_COMPANY_ID_COL. Available: {jobs.columns.tolist()}")

display(companies.head())
display(jobs.head())

## Build Company Context (Profiles + Job Postings)

In [None]:
def truncate_chars(s: str, max_chars: int) -> str:
    """Return ``s`` cut down to its first ``max_chars`` characters.

    Strings that are already short enough come back unchanged.
    """
    # Slicing past the end of a string is a no-op, so no length check is needed.
    return s[:max_chars]

# Index the posting texts by company key in a single pass, so the jobs frame is not
# re-scanned with a boolean mask for every company.
jobs_by_company = {}
for job_key, job_text in zip(jobs[JOB_COMPANY_ID_COL], jobs[JOB_TEXT_COL]):
    jobs_by_company.setdefault(job_key, []).append(str(job_text).strip())

company_context = {}
for cid, raw_profile in zip(companies[COMPANY_ID_COL], companies[COMPANY_TEXT_COL]):
    profile = str(raw_profile).strip()
    # Blank postings would only contribute empty sections between separators.
    postings = [text for text in jobs_by_company.get(cid, []) if text]
    parts = []
    if profile:
        parts.append("# Company Profile\n" + profile)
    if postings:
        parts.append("# Job Postings\n" + "\n\n---\n\n".join(postings))
    # Cap the combined context so a company with many postings stays within budget.
    company_context[cid] = truncate_chars("\n\n".join(parts), MAX_INPUT_CHARS_PER_COMPANY)

len(company_context)

## Schema

In [None]:
# Example-shaped template of one company's space description.
# build_primary_prompt serializes this dict with json.dumps and embeds the result in the
# prompt, so every key and string value below reaches the model verbatim.
# Values written as "a|b|c" presumably enumerate the allowed options — confirm with the
# schema owner.
# NOTE(review): the trailing "# 1–3"-style remarks are Python comments, not data; they are
# not part of the serialized dict, so the size / sum-to-100 hints they carry are absent
# from that JSON.
SPACE_SCHEMA = {
  "identity_brand": {
    "legal_name": "string",
    "short_tagline": "string (<= 80 chars)",
    "sector": "software|manufacturing|healthcare|mobility|finance|energy|media|research|public",
    "org_maturity": "startup|scaleup|sme|enterprise|ngo|public",
    "geography_scope": "local|regional|global",
    "palette_hex": ["#AABBCC"],  # 1–3
    "shape_language": "angular|rounded|mixed",
    "material_hint": "wood|metal|glass|fabric|mixed|sustainable",
    "logo_usage": "digital_only|freestanding_signage|light_projection|none"
  },
  "value_proposition_map": [
    {
      "type": "product|service|platform|solution",
      "name": "string",
      "audience": "customers|devs|partners|public",
      "proof_tokens": ["keyword"],
      "weight_percent": 0
    }
  ],  # 2–6 items (weights sum to 100)
  "culture_profile": {
    "values_keywords": ["kw1", "kw2", "kw3"],  # 3–6
    "pace": "steady|fast|research|regulated",
    "collaboration_style": "pairing|squad|crossfunctional|individual",
    "work_mode": "on_site|hybrid|remote_first"
  },
  "portfolio_focus": [
    {
      "artifact_type": "device|app|dataset|prototype|case_study|patent|award",
      "display_mode": "live_demo|interactive_mock|static_model|video_loop|artifact_on_plinth",
      "handling": "open_touch|supervised|staff_only",
      "safety_notes": ["optional note"]  # 0–3
    }
  ],  # 1–5
  "narrative_assets": {
    "headline": "string (<= 80)",
    "story_snippets": ["snippet1", "snippet2"],  # 2–4
    "impact_metrics": [{"label": "string", "value": "number|string", "unit": "string"}],  # 0–5
    "external_signals": [{"type": "award|certification|standard|open_source|partnership", "label": "string"}]  # 0–5
  },
  "interaction_flow": {
    "modes": [{"mode": "self_guided|staff_guided|scheduled_talks|open_demo", "weight_percent": 0}],  # 1–4, sum=100
    "visitor_path": ["Zone A", "Zone B"]
  },
  "zones_layout": {
    "pattern": "islands|loop|gallery|theater|clusters",
    "zones": [{"name": "welcome|showcase|hands_on|conversation|media|prototype|coffee", "relative_area_percent": 0}]  # 2–6, sum=100
  },
  "anchor_objects": [
    {"type": "kiosk|demo_table|high_table|seating_cluster|device_rig|brochure_stand|mobile_whiteboard|display_plinth|plant_cluster|charging_station|brand_tower|media_totem|product_plinth",
     "count": 0, "mobility": "fixed|wheeled|portable"}
  ],  # 2–8
  "circulation_accessibility": {
    "aisle_min_width_m": 1.2,
    "entry_points": "single|dual",
    "accessibility_notes": ["optional"]  # 0–3
  },
  "ambience": {
    "lighting_signature": "spotlit|diffuse|mixed|dynamic",
    "acoustic_profile": "quiet|moderate_buzz|demo_audio",
    "music": "none|low",
    "scent": "none|brand_neutral"
  }
}

## Prompt Builders

In [None]:
import json

def build_primary_prompt(company_text: str) -> str:
    """Build the step 1 prompt that asks the model for a schema-shaped JSON object.

    Args:
        company_text: Combined company profile and job-posting text used as context.

    Returns:
        The full prompt: instructions, the context, the serialized ``SPACE_SCHEMA``,
        the array-size / weight-sum constraints, and the output request.
    """
    schema_str = json.dumps(SPACE_SCHEMA, indent=2)
    # The instructions below tell the model to respect cardinalities and weight sums,
    # but those limits exist only as Python comments beside SPACE_SCHEMA, which
    # json.dumps cannot carry. They are therefore spelled out explicitly here.
    cardinality_rules = (
        "- identity_brand.palette_hex: 1-3 hex colors\n"
        "- value_proposition_map: 2-6 items; weight_percent values sum to 100\n"
        "- culture_profile.values_keywords: 3-6 keywords\n"
        "- portfolio_focus: 1-5 items; safety_notes: 0-3 per item\n"
        "- narrative_assets.story_snippets: 2-4; impact_metrics: 0-5; external_signals: 0-5\n"
        "- interaction_flow.modes: 1-4 items; weight_percent values sum to 100\n"
        "- zones_layout.zones: 2-6 items; relative_area_percent values sum to 100\n"
        "- anchor_objects: 2-8 items\n"
        "- circulation_accessibility.accessibility_notes: 0-3\n"
    )
    return (
        "You are a spatial experience designer. Based **only** on the following company profile + job postings, "
        "generate a concise JSON object that fills the fields of the provided schema.\n"
        "- Keep it plausible and conservative; do not hallucinate specifics beyond what is reasonably suggested by the text.\n"
        "- All arrays must respect required cardinalities and weight sums must equal 100 where specified.\n"
        "- Use **English**.\n"
        "- Output **only valid JSON** without extra commentary.\n\n"
        "### CONTEXT\n" + company_text + "\n\n"
        "### JSON SCHEMA (example / shape, not a strict JSON Schema)\n" + schema_str + "\n\n"
        "### CARDINALITIES AND SUMS\n" + cardinality_rules + "\n"
        "### OUTPUT\nReturn a single JSON object adhering to the schema."
    )

def build_augment_prompt(existing_json: dict, company_text: str) -> str:
    """Build the step 2 prompt that asks the model for additions to an existing description.

    Args:
        existing_json: The description produced in step 1; embedded pretty-printed.
        company_text: The same company context that was used in step 1.

    Returns:
        The prompt text: preamble and rules, then the context, the initial JSON and
        the output request, each separated by a blank line.
    """
    preamble = (
        "You are refining a space description JSON for a company. You will propose **additions** that plausibly fit the "
        "company **but are not explicitly covered** in the provided context or initial JSON.\n"
    )
    rules = "\n".join(
        [
            "Rules:",
            "- Stay consistent with the company's domain and tone.",
            "- Do not contradict the initial JSON.",
            "- Keep suggestions compact and realistic.",
            "- Do not repeat fields already well filled.",
            "- Return only a JSON with fields **to add or refine**, mirroring keys from the original schema where relevant.",
        ]
    )
    sections = [
        preamble + rules,
        "### CONTEXT\n" + company_text,
        "### INITIAL_JSON\n" + json.dumps(existing_json, indent=2),
        "### OUTPUT\nReturn a JSON object with only **additional or refined** fields (partial structure allowed). If nothing to add, return {}.",
    ]
    return "\n\n".join(sections)

## OpenAI Helper Functions

In [None]:
import os, json, time

def _init_openai():
    # Try the modern client first; fall back to legacy if needed.
    try:
        from openai import OpenAI
        client = OpenAI()
        return ("responses", client)
    except Exception:
        import openai
        return ("chat", openai)

# Resolve the SDK flavour and client once; call_openai_json reads both module-level names.
API_KIND, OPENAI_CLIENT = _init_openai()

def call_openai_json(prompt: str, model: str, temperature: float = 0.2, max_retries: int = 3) -> dict:
    """Call the OpenAI API and return the response text parsed as a JSON object.

    Args:
        prompt: The user message sent to the model.
        model: Model name passed to the API.
        temperature: Sampling temperature passed to the API.
        max_retries: Number of attempts; values below 1 are treated as 1.

    Returns:
        The parsed JSON object as a dict.

    Raises:
        RuntimeError: If every attempt fails, including responses that are not
            valid JSON or whose top level is not an object. The last error is
            chained as ``__cause__``.
    """
    # Always try at least once; a non-positive max_retries would otherwise skip the
    # loop and return None.
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        try:
            if API_KIND == "responses":
                out = OPENAI_CLIENT.responses.create(
                    model=model,
                    input=[{"role": "user", "content": prompt}],
                    temperature=temperature,
                    max_output_tokens=2000,
                    # The Responses API takes JSON mode through `text.format`;
                    # `response_format` belongs to the Chat Completions API.
                    text={"format": {"type": "json_object"}},
                )
                text = out.output_text
            else:
                resp = OPENAI_CLIENT.ChatCompletion.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=temperature,
                )
                text = resp.choices[0].message["content"]
            parsed = json.loads(text)
            if not isinstance(parsed, dict):
                raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
            return parsed
        except Exception as e:
            if attempt < attempts - 1:
                # Linear back-off between attempts.
                time.sleep(1.5 * (attempt + 1))
                continue
            raise RuntimeError(f"OpenAI call failed after {attempts} attempts: {e}") from e

## Step 1 — Generate Space Description per Company

In [None]:
primary_json_by_company = {}
# Company id -> error message for companies whose generation failed.
primary_failures = {}

for cid, ctx in company_context.items():
    prompt = build_primary_prompt(ctx)
    try:
        primary = call_openai_json(prompt, model=OPENAI_MODEL_PRIMARY, temperature=TEMPERATURE_PRIMARY)
    except RuntimeError as err:
        # One bad company should not abort the run and waste the completed calls;
        # record it and move on so it can be inspected or retried afterwards.
        primary_failures[cid] = str(err)
        print(f"[step 1] company {cid!r} failed: {err}")
        continue
    primary_json_by_company[cid] = primary

if primary_failures:
    print(f"[step 1] {len(primary_failures)} of {len(company_context)} companies failed; see primary_failures.")

len(primary_json_by_company)

## Step 2 — Augment with Plausible, Company-Aligned Info

In [None]:
def deep_merge(a, b):
    """Overlay ``b`` onto ``a`` recursively and return the combined value.

    When both arguments are dicts, the result is a copy of ``a`` in which keys shared
    with ``b`` are merged recursively and keys only in ``b`` are added. In every other
    case ``b`` wins, unless it is ``None``, in which case ``a`` is kept.
    """
    both_dicts = isinstance(a, dict) and isinstance(b, dict)
    if not both_dicts:
        return a if b is None else b

    merged = a.copy()
    for key, incoming in b.items():
        merged[key] = deep_merge(merged[key], incoming) if key in merged else incoming
    return merged

augmented_json_by_company = {}
# Company id -> error message for companies whose augmentation call failed.
augment_failures = {}

for cid, base in primary_json_by_company.items():
    ctx = company_context[cid]
    prompt = build_augment_prompt(base, ctx)
    try:
        augment = call_openai_json(prompt, model=OPENAI_MODEL_AUGMENT, temperature=TEMPERATURE_AUGMENT)
    except Exception as err:
        # Augmentation is best effort: keep the step 1 description, but make the
        # failure visible instead of dropping it silently.
        augment_failures[cid] = str(err)
        print(f"[step 2] company {cid!r} not augmented: {err}")
        augment = {}
    if not isinstance(augment, dict):
        # A non-object reply would otherwise replace the whole base description.
        augment = {}
    merged = deep_merge(base, augment)
    augmented_json_by_company[cid] = merged

len(augmented_json_by_company)

## Save Results

In [None]:
import json, pandas as pd

# Save JSONL (one JSON per line)
with open(OUTPUT_JSONL, "w", encoding="utf-8") as f:
    for cid, js in augmented_json_by_company.items():
        rec = {"company_id": cid, "space_description": js}
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")
print(f"Saved JSONL -> {OUTPUT_JSONL.resolve()}")

# Also flatten a few top-level fields for a quick Parquet table
FLAT_FIELDS = ["legal_name", "short_tagline", "sector", "org_maturity", "geography_scope"]
flat_rows = []
for cid, js in augmented_json_by_company.items():
    ib = js.get("identity_brand") if isinstance(js, dict) else None
    if not isinstance(ib, dict):
        # The model may return something other than an object here; treat it as empty.
        ib = {}
    row = {"company_id": cid}
    row.update({field: ib.get(field) for field in FLAT_FIELDS})
    flat_rows.append(row)

# Fixing the columns keeps the table schema stable even when there are no rows.
df = pd.DataFrame(flat_rows, columns=["company_id", *FLAT_FIELDS])
try:
    df.to_parquet(OUTPUT_PARQUET, index=False)
except ImportError as e:
    print("Skipped Parquet (install pyarrow):", e)
except Exception as e:
    # Any other failure is unrelated to a missing engine; report it as such.
    print("Skipped Parquet:", e)
else:
    # Only claim success when the file was actually written.
    print(f"Saved Parquet -> {OUTPUT_PARQUET.resolve()}")