Cell 1 – Imports, config, constants

In [12]:
# %% [markdown]
# # Travel Content Automation Pipeline
# 
# This notebook:
# - Loads locations from `locations.txt`
# - Uses an AI researcher + web search to build structured factual context
# - Generates travel articles with STRICT non-hallucination rules
# - Runs QA/validation
# - Writes one `.txt` file per location to `output/`

# %% [code]
import os
import json
import re
import textwrap
import pathlib
from typing import Dict, Any, List, Optional

from dotenv import load_dotenv
from tqdm import tqdm
from openai import OpenAI

# Load environment variables (for OPENAI_API_KEY)
load_dotenv()

# Initialize OpenAI client
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# ---- Paths ----
PROJECT_ROOT = pathlib.Path(".").resolve()
INPUT_LOCATIONS_FILE = PROJECT_ROOT / "locations.txt"
OUTPUT_DIR = PROJECT_ROOT / "output"
OUTPUT_DIR.mkdir(exist_ok=True)

# ---- Models (adjust if needed depending on your account) ----
RESEARCH_MODEL = "gpt-4.1-mini"   # AI researcher with web_search
WRITER_MODEL = "gpt-4.1-mini"     # Article generator
VALIDATOR_MODEL = "gpt-4.1"       # Stricter validator

# ---- Unknown placeholder pattern ----
UNKNOWN_PATTERN = "[[UNKNOWN: {description}]]"

print("Project root:", PROJECT_ROOT)
print("Output dir:", OUTPUT_DIR)


Project root: C:\Users\brethm01\travel-content-project
Output dir: C:\Users\brethm01\travel-content-project\output


Helper function

In [13]:
# %% [code]
def extract_text_from_response(response) -> str:
    """
    Robustly extract the textual output from a Responses API call.
    - Prefer output[*].content[*].text where type == 'output_text'
    - Fall back to response.output_text if needed.
    - Strip leading/trailing whitespace.
    """
    pieces = []

    try:
        output_list = getattr(response, "output", None) or []
        for out in output_list:
            content_list = getattr(out, "content", None) or []
            for c in content_list:
                c_type = getattr(c, "type", None)
                if c_type == "output_text":
                    text_part = getattr(c, "text", "") or ""
                    pieces.append(text_part)
    except Exception as e:
        print("[WARN] Failed to extract from response.output:", e)

    # If we found pieces, join them
    if pieces:
        return "\n".join(pieces).strip()

    # Fallback: use response.output_text
    try:
        fallback = (response.output_text or "").strip()
        return fallback
    except Exception as e:
        print("[WARN] Failed to extract response.output_text:", e)
        return ""


Cell 3 – Utility: load locations, slugify

In [14]:
# %% [code]
def load_locations(path: pathlib.Path) -> List[str]:
    """
    Load location names from a text file. One location per line.
    Empty lines are ignored.
    """
    locations: List[str] = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            loc = line.strip()
            if loc:
                locations.append(loc)
    return locations


def slugify(name: str) -> str:
    """
    Turn a location name into a safe file name slug.
    """
    s = name.lower()
    s = re.sub(r"[^a-z0-9]+", "-", s)
    s = s.strip("-")
    return s or "location"


locations = load_locations(INPUT_LOCATIONS_FILE)
print("Loaded locations:", locations)


Loaded locations: ['Marseille, France']


Cell 4 – AI research step: fetch structured context with web search

In [None]:
# %% [code]
def fetch_ai_context(location: str) -> Dict[str, Any]:
    """
    Use an AI researcher with web_search to build structured factual context
    for a location.

    STRICT RULES:
    - Temperature = 0
    - Use ONLY information that can be reasonably obtained from web search.
    - Do NOT invent specific details (prices, exact opening hours, addresses)
      unless clearly supported by the sources.
    - Any missing or uncertain info MUST be marked with [[UNKNOWN: ...]].
    - Output must be valid JSON following the specified schema.
    """

    system_msg = f"""
You are a very careful, fact-focused research assistant for a travel agency.

You MUST:
- Use the web_search tool to gather up-to-date information about travel locations.
- Base your answers ONLY on what you find in web search results.
- If you are not sure about something, you MUST use an UNKNOWN placeholder
  in this exact pattern:
  {UNKNOWN_PATTERN.replace("{description}", "brief explanation")}
- You must AVOID hallucinations and guessing.

When researching:
- Pay attention to: short description of the area, history/architecture,
  why people visit, overall vibe and cultural feel, fun facts about the location.
- Also collect clear, factual information about HOW TO GET THERE:
  main train / metro / tram / bus lines, nearby stations, airports,
  major roads or ferry routes, and typical ways people arrive,
  but only if these details are explicitly supported by your sources.
- Look for luggage storage information ONLY if clearly supported by
  reliable sources (e.g. official station websites, the operator’s pages,
  or recognisable travel information sites).
- When you identify main touristic sites, inspect their official websites
  and trusted travel guides to understand what they offer and why people visit.
- If you cannot find something after a reasonable attempt, leave a clear
  [[UNKNOWN: ...]] placeholder in the relevant field of the JSON.

Return ONLY a JSON object with this schema:

{{
  "location": string,
  "summary": string,
  "about": {{
    "short_description": string,
    "history_or_architecture": string,
    "why_people_visit": string,
    "overall_vibe": string
  }},
  "luggage_storage": {{
    "official_options": string
  }},
  "what_to_do_and_see": {{
    "highlights": string
  }},
  "sources": [
    {{
      "url": string,
      "title": string
    }}
  ],
  "has_unknowns": boolean,
  "low_source_coverage": boolean
}}

Rules:
- Temperature = 0: deterministic and conservative.
- If you have very few or weak sources, set low_source_coverage = true.
- If any field uses UNKNOWN placeholders, set has_unknowns = true.
- Text should be neutral, factual, slightly relaxed but not hyped.
- You MUST output valid JSON and nothing else.
""".strip()

    user_msg = f"""
Location: "{location}"

TASK:
1. Use web_search to collect information specifically about this location
   as a travel destination.
2. Fill the JSON schema carefully and conservatively.
3. Use UNKNOWN placeholders instead of guessing.
4. Include at least 2–3 sources where possible, with URLs and titles.

Output ONLY JSON, no extra commentary.
""".strip()

    response = client.responses.create(
        model=RESEARCH_MODEL,
        input=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
        temperature=0,
        tools=[{"type": "web_search"}],   # ✅ allow web_search, but don't force it as the *final* output
        # ❌ IMPORTANT: NO tool_choice here
        # ❌ IMPORTANT: NO response_format / NO text={"format": ...} here
    )

    # ----- robust extraction of text -----
    raw = extract_text_from_response(response)

    # If the model still insists on wrapping JSON with extra text, try to chop out the JSON part
    raw_stripped = raw.strip()
    if not raw_stripped:
        print(f"[ERROR] Empty output from research model for '{location}'")
    else:
        # Try to isolate the JSON between first '{' and last '}'
        if not raw_stripped.lstrip().startswith("{"):
            start = raw_stripped.find("{")
            end = raw_stripped.rfind("}")
            if start != -1 and end > start:
                candidate = raw_stripped[start:end+1]
            else:
                candidate = raw_stripped
        else:
            candidate = raw_stripped
        raw_stripped = candidate

    try:
        data = json.loads(raw_stripped)
    except json.JSONDecodeError as e:
        print(f"[ERROR] Context JSON parse failed for '{location}': {e}")
        print("Raw output snippet:\n", raw[:800])
        # Fail-safe: return fully UNKNOWN structure
        return {
            "location": location,
            "summary": UNKNOWN_PATTERN.format(description="context summary unavailable (JSON parse error)"),
            "about": {
                "short_description": UNKNOWN_PATTERN.format(description="short description unavailable"),
                "history_or_architecture": UNKNOWN_PATTERN.format(description="history/architecture info unavailable"),
                "why_people_visit": UNKNOWN_PATTERN.format(description="reasons to visit unavailable"),
                "overall_vibe": UNKNOWN_PATTERN.format(description="overall vibe unavailable"),
            },
            "luggage_storage": {
                "official_options": UNKNOWN_PATTERN.format(description="luggage storage info unavailable"),
            },
            "what_to_do_and_see": {
                "highlights": UNKNOWN_PATTERN.format(description="what to do and see info unavailable"),
            },
            "sources": [],
            "has_unknowns": True,
            "low_source_coverage": True,
            "_meta": {
                "error": "json_parse_failed",
                "raw_snippet": raw[:800],
            },
        }

    # Ensure minimal keys exist, even if the model omitted them for some reason
    data.setdefault("location", location)
    data.setdefault("about", {})
    data.setdefault("luggage_storage", {})
    data.setdefault("what_to_do_and_see", {})
    data.setdefault("sources", [])
    data.setdefault("has_unknowns", False)
    data.setdefault("low_source_coverage", False)

    return data


In [16]:
if locations:
    test_loc = locations[0]
    print(f"Testing fetch_ai_context for: {test_loc}")
    ctx_json = fetch_ai_context(test_loc)
    print("Context keys:", ctx_json.keys())
else:
    print("No locations found.")


Testing fetch_ai_context for: Marseille, France
Context keys: dict_keys(['location', 'summary', 'about', 'luggage_storage', 'what_to_do_and_see', 'sources', 'has_unknowns', 'low_source_coverage'])


Cell 5 – Context QA / validation

In [17]:
# %% [code]
def basic_context_schema_check(ctx: Dict[str, Any]) -> List[str]:
    """
    Check that the context JSON has all required fields
    and that types are sensible.
    Returns a list of problems (empty if OK).
    """
    problems: List[str] = []

    # Top-level keys
    required_top = ["location", "summary", "about", "luggage_storage", "what_to_do_and_see", "sources"]
    for key in required_top:
        if key not in ctx:
            problems.append(f"Missing key: {key}")

    # Type checks
    if not isinstance(ctx.get("about", {}), dict):
        problems.append("about should be an object")

    if not isinstance(ctx.get("luggage_storage", {}), dict):
        problems.append("luggage_storage should be an object")

    if not isinstance(ctx.get("what_to_do_and_see", {}), dict):
        problems.append("what_to_do_and_see should be an object")

    # Nested fields
    about = ctx.get("about", {})
    for k in ["short_description", "history_or_architecture", "why_people_visit", "overall_vibe"]:
        if not isinstance(about.get(k, ""), str):
            problems.append(f"about.{k} missing or not string")

    luggage = ctx.get("luggage_storage", {})
    if not isinstance(luggage.get("official_options", ""), str):
        problems.append("luggage_storage.official_options missing or not string")

    todo = ctx.get("what_to_do_and_see", {})
    if not isinstance(todo.get("highlights", ""), str):
        problems.append("what_to_do_and_see.highlights missing or not string")

    # Sources
    sources = ctx.get("sources", [])
    if not isinstance(sources, list):
        problems.append("sources should be a list")
    else:
        for i, s in enumerate(sources):
            if not isinstance(s, dict) or "url" not in s:
                problems.append(f"sources[{i}] invalid (expected object with url)")

    return problems


def count_unknowns_in_context(ctx: Dict[str, Any]) -> int:
    """
    Count how many UNKNOWN placeholders appear in the context JSON.
    """
    text = json.dumps(ctx, ensure_ascii=False)
    return len(re.findall(r"\[\[UNKNOWN:", text))


def qa_context(ctx: Dict[str, Any]) -> Dict[str, Any]:
    """
    Compute a QA summary for the context:
    - unknown_count
    - source_count
    - schema_problems
    - low_source_coverage (from model OR heuristics)
    - needs_manual_review
    """
    schema_problems = basic_context_schema_check(ctx)
    unknown_count = count_unknowns_in_context(ctx)
    sources = ctx.get("sources", [])
    source_count = len(sources)

    low_source_flag = bool(ctx.get("low_source_coverage")) or source_count < 2

    needs_manual_review = bool(schema_problems) or low_source_flag

    return {
        "unknown_count": unknown_count,
        "source_count": source_count,
        "schema_problems": schema_problems,
        "low_source_coverage": low_source_flag,
        "needs_manual_review": needs_manual_review,
    }


# Quick smoke test on first location (will use the API)
if locations:
    test_loc = locations[0]
    print(f"Fetching context for test location: {test_loc}")
    ctx_test = fetch_ai_context(test_loc)
    qa_test = qa_context(ctx_test)
    print("QA summary:", qa_test)


Fetching context for test location: Marseille, France
QA summary: {'unknown_count': 0, 'source_count': 3, 'schema_problems': [], 'low_source_coverage': False, 'needs_manual_review': False}


Cell 6 – Convert context JSON → plain context text block

In [25]:
# %% [code]
def context_json_to_text(ctx: Dict[str, Any]) -> str:
    """
    Convert structured context JSON into a plain text block
    that will be passed to the content-writing model as CONTEXT.
    """
    about = ctx.get("about", {})
    luggage = ctx.get("luggage_storage", {})
    todo = ctx.get("what_to_do_and_see", {})
    sources = ctx.get("sources", [])

    lines: List[str] = []

    lines.append(f"Location: {ctx.get('location', 'UNKNOWN')}")
    lines.append("")
    lines.append("SUMMARY")
    lines.append(ctx.get("summary", "").strip())
    lines.append("")
    lines.append("ABOUT")
    lines.append("Short description (with overall vibe):")
    lines.append(about.get("short_description", "").strip())
    lines.append("")
    lines.append("History or architecture:")
    lines.append(about.get("history_or_architecture", "").strip())
    lines.append("")
    lines.append("Why people visit:")
    lines.append(about.get("why_people_visit", "").strip())
    lines.append("")
    lines.append("Overall vibe:")
    lines.append(about.get("overall_vibe", "").strip())
    lines.append("")
    lines.append("LUGGAGE STORAGE (official options):")
    lines.append(luggage.get("official_options", "").strip())
    lines.append("")
    lines.append("WHAT TO DO AND SEE (high-level highlights):")
    lines.append(todo.get("highlights", "").strip())
    lines.append("")
    lines.append("SOURCES (for internal QA, not shown to customers):")
    for s in sources:
        url = s.get("url", "")
        title = s.get("title", "")
        if title and url:
            lines.append(f"- {title} – {url}")
        elif url:
            lines.append(f"- {url}")
        elif title:
            lines.append(f"- {title}")

    return "\n".join(lines)


# Quick sanity check on context-to-text conversion
if locations:
    example_ctx = fetch_ai_context(locations[0])
    example_text_ctx = context_json_to_text(example_ctx)
    print("Context text preview:\n")
    print(example_text_ctx[:2000], "...\n")


Context text preview:

Location: Marseille, France

SUMMARY
Marseille, France's second-largest city, is a vibrant Mediterranean port known for its rich history, diverse culture, and stunning coastal landscapes. Founded around 600 BC, it seamlessly blends ancient traditions with modern influences, offering visitors a unique and dynamic experience.

ABOUT
Short description (with overall vibe):
Marseille is a bustling port city on France's southeastern coast, celebrated for its historical significance, cultural diversity, and picturesque Mediterranean setting.

History or architecture:
Established circa 600 BC by Greek sailors, Marseille boasts a rich tapestry of history, evident in landmarks like the ancient port, the 17th-century Fort Saint-Jean, and the 18th-century Château Borély. The city's architecture reflects a blend of ancient Greek, Roman, and modern influences, showcasing its evolution over millennia.

Why people visit:
Visitors are drawn to Marseille for its historical sites, 

In [26]:
# Show the sources used in the test generation
if locations:
    test_loc = locations[0]
    print(f"Generating article for test location: {test_loc}")
    test_ctx = fetch_ai_context(test_loc)
    test_ctx_text = context_json_to_text(test_ctx)
    article_test = generate_article(test_loc, test_ctx_text)
    print("Generated article keys:", article_test.keys())

Generating article for test location: Marseille, France
Generated article keys: dict_keys(['short_description', 'why_people_visit', 'how_to_get_there', 'luggage_storage_options', 'five_things_to_do_around', 'location', 'has_unknowns'])


Cell 7 – Content generation (article) using second LLM pass

In [None]:
def generate_article(location: str, context_text: str) -> Dict[str, Any]:
    """
    Use a second LLM pass to generate the final article in a relaxed,
    concrete travel tone.

    STRICT RULES:
    - Temperature = 0
    - Use ONLY the provided context_text.
    - Do NOT invent facts (no general knowledge).
    - Replace missing info with [[UNKNOWN: ...]].
    - Output must be valid JSON, not markdown or prose.
    - JSON schema is fixed (see system prompt).
    """

    system_msg = """
You are a factual, deterministic travel-content writer (temperature = 0).
Your job is to generate short, highly specific, non-hallucinated content for a given location using ONLY the context provided by the research pipeline.

CRITICAL RULES
- Use ONLY the context provided. Never invent details.
- If information is missing, write: [[UNKNOWN: reason]].
- Tone: young, energetic, concrete, specific. No generic travel-guide fluff.
- No marketing clichés. No filler.
- No markdown, no headings in the output.
- Output MUST be valid JSON matching the schema below.

JSON OUTPUT SCHEMA
You must output a JSON object with exactly these fields:

{
  "short_description": "...",
  "why_people_visit": "...",
  "how_to_get_there": "...",
  "luggage_storage_options": "...",
  "five_things_to_do_around": [
    "...",
    "...",
    "...",
    "...",
    "..."
  ]
}

SECTION GUIDELINES

1) short_description
- Max 5 lines total.
- Can include some (not all are required) of:
  - overall vibe / cultural feel
  - history or architectural significance
  - important yearly events or festivals
- Keep it punchy and concrete. Imagine explaining the place quickly to a friend who just arrived from the airport.
- Avoid generic phrases like "offers something for everyone" or "rich in history and culture".

Reference style (example of the tone, NOT content to copy):
"Marseille is France's oldest city, where busy port cranes, salty sea air, and pastel townhouses all meet the Mediterranean. Expect noisy café terraces, football fans in blue and white, and plates of grilled fish that land on your table still smelling of the sea."

2) why_people_visit
- Focus on specific reasons, not abstractions.
- Mention concrete activities, places, food, quirks, and local culture that are explicitly present in the context.
- Bad style (too generic):
  - "Visitors are drawn here for its rich history, architecture, and cultural offerings."
- Good style (what you should aim for):
  - "People come for the old harbour cafés, the local fish markets, the famous regional dishes, the street art, the big football stadium, and the city’s role as a trading port."
- Use short, vivid sentences. Prefer examples over adjectives.

3) how_to_get_there
- Purely factual, transportation-focused.
- Use only what appears in the context: train lines, metro lines, tram lines, buses, airports, major roads, ferries, etc.
- Be clear and practical: which station, which line, which terminal, approximate journey patterns if they are in the context.
- If the context has no useful information about transport, write:
  [[UNKNOWN: insufficient transport information in context]]

4) luggage_storage_options
- Describe:
  - specific luggage storage facilities or services
  - their location (e.g., inside the main hall, near a specific platform, in a nearby street)
  - opening hours if available
  - approximate prices ONLY if the context explicitly provides them
  - any conditions (e.g., max weight, lockers vs staffed desk) if present
- Bad style (too vague):
  - "Luggage storage facilities are available at the station; for details please see the official website."
- Good style:
  - "There is a left-luggage office next to Platform 1, open daily from 07:00 to 23:00. Lockers and staffed counters are available; prices and maximum size limits are listed in the context."
- If luggage storage is not mentioned in the context, write:
  [[UNKNOWN: no luggage storage information in context]]

5) five_things_to_do_around
- Return exactly 5 items, each 1–2 sentences.
- Mix of:
  - attractions, museums, parks, viewpoints
  - good food/drink options
  - fun or slightly quirky experiences
  - must-see spots within short walking distance if mentioned
  - off-the-beaten-path or local neighbourhood favourites if mentioned
- Every item must be supported by the context. If you cannot find enough distinct items, use:
  [[UNKNOWN: not enough points of interest in context]]
- Style: specific, friendly, but still factual. No invented reviews or emotions. You can use light, energetic phrasing like "grab a coffee", "wander through", "check out", as long as what you mention exists in the context.

OVERALL STYLE RULES
- Write as if you’re a young, curious traveller speaking to another traveller.
- Prefer strong nouns and concrete details (street names, local dishes, districts, stadiums, specific streets or squares) over vague adjectives.
- Short to medium-length sentences; avoid very long, academic-style paragraphs.
- Blend facts with light, natural enthusiasm: "You’ll find...", "It’s a favourite spot for...", "Locals come here to..."
- Use examples to make points, e.g., instead of "great food", say "bakeries selling local pastries" or "harbour restaurants serving grilled fish".
- Avoid lists of abstract qualities ("history, architecture, culture") unless immediately grounded with specific examples.
- Never copy or paraphrase generic-sounding examples from other locations in the context; always stay specific to the current place.

HALLUCINATION CONTROL
- You MUST only use information present in the given context text.
- Do NOT rely on general knowledge, even if you are confident it is true.
- When you are missing information for a required field or detail, use a clear placeholder:
  [[UNKNOWN: explain briefly what is missing]]

OUTPUT FORMAT
- Output a single JSON object.
- No extra text before or after the JSON.
- Strings must not contain unescaped line breaks that would break JSON validity.
""".strip()

    user_msg = f"""
Write structured travel content for the following location, using only the context provided.

Location: {location}

Context (the only source of truth):
{context_text}

Remember:
- Do not invent anything that is not present in the context.
- If something is missing, use [[UNKNOWN: …]].
- Follow the style rules and examples in the system message.
- Output valid JSON only, matching the required schema.
""".strip()

    response = client.responses.create(
        model=WRITER_MODEL,
        input=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
        temperature=0.95,
    )

    # Extract text (you can use extract_text_from_response here too if you like)
    try:
        raw = extract_text_from_response(response)
    except NameError:
        # fallback if you didn't define the helper for writer
        raw = response.output_text

    try:
        article = json.loads(raw)
    except json.JSONDecodeError as e:
        print(f"[ERROR] Article JSON parse failed for '{location}': {e}")
        print("Raw output snippet:\n", raw[:800])
        # Fail-safe fully UNKNOWN article
        return {
            "location": location,
            "short_description": UNKNOWN_PATTERN.format(description="short description unavailable"),
            "why_people_visit": UNKNOWN_PATTERN.format(description="reasons to visit unavailable"),
            "how_to_get_there": UNKNOWN_PATTERN.format(description="transport information unavailable"),
            "luggage_storage_options": UNKNOWN_PATTERN.format(description="luggage storage info unavailable"),
            "five_things_to_do_around": [
                UNKNOWN_PATTERN.format(description="things to do info unavailable")
            ],
            "has_unknowns": True,
            "_meta": {
                "error": "article_json_parse_failed",
                "raw_snippet": raw[:800],
            },
        }

    # Ensure minimal structure / defaults
    article.setdefault("location", location)
    article.setdefault("short_description", "")
    article.setdefault("why_people_visit", "")
    article.setdefault("how_to_get_there", "")
    article.setdefault("luggage_storage_options", "")
    if not isinstance(article.get("five_things_to_do_around"), list):
        article["five_things_to_do_around"] = []
    article.setdefault("has_unknowns", False)

    return article


In [24]:
# SHow the context used in the test generation
if locations:
    test_loc = locations[0]
    print(f"Generating article for test location: {test_loc}")
    test_ctx = fetch_ai_context(test_loc)
    test_ctx_text = context_json_to_text(test_ctx)
    article_test = generate_article(test_loc, test_ctx_text)
    print("Generated article keys:", article_test.keys())

    

Generating article for test location: Marseille, France
Generated article keys: dict_keys(['short_description', 'why_people_visit', 'how_to_get_there', 'luggage_storage_options', 'five_things_to_do_around', 'location', 'has_unknowns'])


Cell 8 – Second validation pass: article vs context

In [20]:
def validate_article(location: str, context_text: str, article: Dict[str, Any]) -> Dict[str, Any]:
    """
    Second validation pass:
    - Compare article vs context.
    - Flag sentences that appear unsupported by CONTEXT.
    - If anything suspicious, set overall_assessment = "needs_review".

    Returns a JSON object like:
    {
      "location": "...",
      "overall_assessment": "safe" | "needs_review",
      "unsupported_sentences": [ ... ]
    }
    """
    article_json_str = json.dumps(article, ensure_ascii=False, indent=2)

    system_msg = """
You are a very strict fact-checking assistant.

TASK:
- Compare the ARTICLE against the CONTEXT.
- Identify any sentences that appear to contain specific factual claims
  (dates, numbers, names, prices, opening hours, places, etc.)
  that are NOT clearly supported by the CONTEXT.
- It is better to over-flag than to miss a problem.
- If the CONTEXT is weak or very short, consider most specific claims
  unsupported.

OUTPUT:
Return ONLY a JSON object like:
{
  "location": string,
  "overall_assessment": "safe" | "needs_review",
  "unsupported_sentences": [string, ...]
}
""".strip()

    user_msg = f"""
CONTEXT:
\"\"\"
{context_text}
\"\"\"

ARTICLE (JSON):
\"\"\"
{article_json_str}
\"\"\"
""".strip()

    response = client.responses.create(
        model=VALIDATOR_MODEL,
        input=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
        temperature=0,
    )

    try:
        raw = extract_text_from_response(response)
    except NameError:
        raw = response.output_text

    try:
        val = json.loads(raw)
    except json.JSONDecodeError as e:
        print(f"[ERROR] Validation JSON parse failed for '{location}': {e}")
        print("Raw output snippet:\n", raw[:800])
        # If validation fails, err on the side of caution
        return {
            "location": location,
            "overall_assessment": "needs_review",
            "unsupported_sentences": [],
            "_meta": {
                "error": "validation_json_parse_failed",
                "raw_snippet": raw[:800],
            },
        }

    # Minimal structure
    val.setdefault("location", location)
    val.setdefault("overall_assessment", "needs_review")
    val.setdefault("unsupported_sentences", [])

    return val


Cell 9 – Render article to text & save to file

In [21]:
def render_article(article: Dict[str, Any],
                   validation: Dict[str, Any],
                   context_qa: Dict[str, Any]) -> str:
    """
    Turn the JSON article + validation + context QA into a human-readable
    .txt document following the desired structure:

    1. Short description
    2. Why people visit
    3. How to get there
    4. Luggage storage options
    5. 5 things to do around the place

    Unknown placeholders remain visible to avoid deception.
    """
    loc = article.get("location", "Unknown location")

    # Backwards-compatible access to any legacy nested fields
    about = article.get("about", {}) or {}
    legacy_luggage = article.get("luggage_storage", {}) or {}
    legacy_todo = article.get("what_to_do_and_see", {}) or {}

    # New flat schema fields with sensible fallbacks
    short_description = (
        (article.get("short_description") or about.get("short_description") or "").strip()
    )
    why_people_visit = (
        (article.get("why_people_visit") or about.get("why_people_visit") or "").strip()
    )
    how_to_get_there = (article.get("how_to_get_there") or "").strip()
    luggage_storage_options = (
        (article.get("luggage_storage_options")
         or legacy_luggage.get("official_options")
         or "").strip()
    )

    five_things = article.get("five_things_to_do_around")
    if not isinstance(five_things, list):
        five_things = []
    # Normalise list items to clean strings
    five_things = [str(item).strip() for item in five_things if str(item).strip()]

    lines: List[str] = []

    # Header warning if validator says "needs_review"
    if validation.get("overall_assessment") == "needs_review":
        lines.append("⚠️ NOTE: Some details in this article could not be fully verified.")
        lines.append("   Please review before publishing to customers.\n")

    # Title
    lines.append(loc)
    lines.append("=" * len(loc))
    lines.append("")

    # 1) Short description
    lines.append("Short description")
    lines.append(
        short_description
        or UNKNOWN_PATTERN.format(description="short description unavailable")
    )
    lines.append("")

    # 2) Why people visit
    lines.append("Why people visit")
    lines.append(
        why_people_visit
        or UNKNOWN_PATTERN.format(description="reasons to visit unavailable")
    )
    lines.append("")

    # 3) How to get there
    lines.append("How to get there")
    lines.append(
        how_to_get_there
        or UNKNOWN_PATTERN.format(description="transport information unavailable")
    )
    lines.append("")

    # 4) Luggage storage options
    lines.append("Luggage storage options")
    lines.append(
        luggage_storage_options
        or UNKNOWN_PATTERN.format(description="luggage storage info unavailable")
    )
    lines.append("")

    # 5) 5 things to do around
    lines.append("5 things to do around the area")
    if five_things:
        for i, item in enumerate(five_things, start=1):
            lines.append(f"{i}. {item}")
    else:
        # Fallback to legacy main_paragraph if present, otherwise UNKNOWN
        legacy_main = (legacy_todo.get("main_paragraph") or "").strip()
        if legacy_main:
            lines.append(legacy_main)
        else:
            lines.append(
                UNKNOWN_PATTERN.format(description="things to do info unavailable")
            )
    lines.append("")

    # Count unknowns in the rendered article
    full_text_so_far = "\n".join(lines)
    unknown_count_article = len(re.findall(r"\[\[UNKNOWN:", full_text_so_far))

    # QA footer
    lines.append("")
    lines.append("-----")
    lines.append("INTERNAL QA FOOTER (do not show to customers)")
    lines.append(f"- UNKNOWN placeholders in article text: {unknown_count_article}")
    lines.append(f"- Context unknown_count: {context_qa.get('unknown_count')}")
    lines.append(f"- Context source_count: {context_qa.get('source_count')}")
    lines.append(f"- Context low_source_coverage: {context_qa.get('low_source_coverage')}")
    lines.append(f"- Context schema_problems: {context_qa.get('schema_problems') or 'None'}")
    lines.append(f"- Validation overall_assessment: {validation.get('overall_assessment')}")
    lines.append(
        f"- Validation unsupported_sentences: {len(validation.get('unsupported_sentences', []))}"
    )
    lines.append("")

    return "\n".join(lines)


def save_article(location: str, text: str) -> pathlib.Path:
    """
    Save the rendered article text to a .txt file in OUTPUT_DIR.
    File name is based on a slugified version of the location.
    """
    slug = slugify(location)
    out_path = OUTPUT_DIR / f"{slug}.txt"
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(text)
    return out_path


In [23]:
# Show text
if locations:
    example_location = locations[0]
    example_context = fetch_ai_context(example_location)
    example_context_text = context_json_to_text(example_context)
    example_article = generate_article(example_location, example_context_text)
    example_validation = validate_article(example_location, example_context_text, example_article)
    rendered_text = render_article(example_article, example_validation, qa_context(example_context))
    print("Rendered article preview:\n")
    print(rendered_text[:2000], "...\n")

    

Rendered article preview:

Marseille, France

Short description
Marseille is a bustling port city on France’s southeastern coast, known for its multicultural vibe and deep history. Founded by ancient Greeks as Massalia, it features historic landmarks like Fort Saint-Jean and Château Borély. The city blends old-world charm with lively urban energy. Its archaeological Jardin des Vestiges reveals ancient Greek roots. It’s a hub where Mediterranean beaches meet buzzing street culture.

Why people visit
People come for the Vieux Port’s lively marina, the historic Le Panier neighborhood’s narrow streets, and the MuCEM museum showcasing Mediterranean cultures. Outdoor fans hit the Calanques National Park for hiking limestone cliffs and swimming in turquoise waters. The mix of street art, Mediterranean seafood, and ancient sites makes Marseille unique.

How to get there
[[UNKNOWN: insufficient transport information in context]]

Luggage storage options
[[UNKNOWN: no luggage storage information

Cell 10 – Full pipeline loop + final QA report

In [22]:
# %% [code]
qa_report: List[Dict[str, Any]] = []

for loc in tqdm(locations, desc="Processing locations"):
    print(f"\n=== Processing location: {loc} ===")

    # 1) Research: fetch AI context (with web search)
    context_json = fetch_ai_context(loc)

    # 2) QA for context
    context_qa = qa_context(context_json)
    context_text = context_json_to_text(context_json)

    print(f"  Context sources: {context_qa['source_count']}, "
          f"unknowns: {context_qa['unknown_count']}, "
          f"needs_manual_review: {context_qa['needs_manual_review']}")

    # 3) Generate article from context
    article = generate_article(loc, context_text)

    # 4) Validate article vs context
    validation = validate_article(loc, context_text, article)
    print(f"  Validation assessment: {validation.get('overall_assessment')} "
          f"(unsupported sentences: {len(validation.get('unsupported_sentences', []))})")

    # 5) Render and save
    article_text = render_article(article, validation, context_qa)
    out_path = save_article(loc, article_text)

    # 6) Compute final unknown count in article
    unknown_count_article = len(re.findall(r"\[\[UNKNOWN:", article_text))

    needs_manual_review = (
        context_qa["needs_manual_review"] or
        validation.get("overall_assessment") == "needs_review"
    )

    print(f"  Saved to: {out_path}")
    print(f"  UNKNOWN placeholders in article: {unknown_count_article}")
    if needs_manual_review:
        print("  -> MARKED FOR MANUAL REVIEW")

    qa_report.append({
        "location": loc,
        "output_file": str(out_path),
        "context_unknowns": context_qa["unknown_count"],
        "context_sources": context_qa["source_count"],
        "context_low_source_coverage": context_qa["low_source_coverage"],
        "validation_overall_assessment": validation.get("overall_assessment"),
        "validation_unsupported_sentences": len(validation.get("unsupported_sentences", [])),
        "article_unknowns": unknown_count_article,
        "needs_manual_review": needs_manual_review,
    })

print("\n===== FINAL QA REPORT =====")
print(json.dumps(qa_report, indent=2, ensure_ascii=False))


Processing locations:   0%|          | 0/1 [00:00<?, ?it/s]


=== Processing location: Marseille, France ===
  Context sources: 3, unknowns: 0, needs_manual_review: False


Processing locations: 100%|██████████| 1/1 [00:19<00:00, 19.39s/it]

  Validation assessment: safe (unsupported sentences: 0)
  Saved to: C:\Users\brethm01\travel-content-project\output\marseille-france.txt
  UNKNOWN placeholders in article: 1

===== FINAL QA REPORT =====
[
  {
    "location": "Marseille, France",
    "output_file": "C:\\Users\\brethm01\\travel-content-project\\output\\marseille-france.txt",
    "context_unknowns": 0,
    "context_sources": 3,
    "context_low_source_coverage": false,
    "validation_overall_assessment": "safe",
    "validation_unsupported_sentences": 0,
    "article_unknowns": 1,
    "needs_manual_review": false
  }
]



