In [1]:
import json
import pandas as pd

# Load the JSON file
with open('data/surf_spots_enriched.json', 'r') as file:
    surf_spots_data = json.load(file)

# Flatten the nested JSON structure and convert to a DataFrame
surf_spots_df = pd.json_normalize(surf_spots_data, sep='_')

In [2]:
import pandas as pd
import itertools

# Define wave direction and bottom type mappings
direction_mapping = {
    "Left and right": ["Left", "Right", "Left and right"],
    "Left": ["Left"],
    "Right": ["Right"]
}

bottom_mapping = {
    "Sand, Sand with rocks": ["Sand", "Sand with rocks", "Sand, Sand with rocks"],
    "Sand with rocks, Reef": ["Sand with rocks", "Reef", "Sand with rocks, Reef"],
    "Reef": ["Reef"],
    "Sand": ["Sand"],
    "Sand with rocks": ["Sand with rocks"],
    "Sand, Reef": ["Sand", "Reef", "Sand, Reef"],
    "Sand, Sand with rocks, Reef": ["Sand", "Sand with rocks", "Reef", "Sand, Sand with rocks, Reef"]
}

In [3]:
# List all metadata combinations
directions = list(direction_mapping.keys())
bottoms = list(bottom_mapping.keys())

# Initialize storage for the dataset
ground_truth_data = []

# Loop through metadata combinations
for direction, bottom in itertools.product(directions, bottoms):
    # Get relevant subcategories
    relevant_directions = direction_mapping[direction]
    relevant_bottoms = bottom_mapping[bottom]

    # Filter dataset based on metadata combination
    filtered_df = surf_spots_df[
        surf_spots_df['details_Direction of Wave'].isin(relevant_directions) &
        surf_spots_df['details_Type of Bottom'].isin(relevant_bottoms)
    ]

    # Collect spot descriptions
    descriptions = filtered_df['details_Spot Description'].tolist()

    if descriptions:  # Only process if there are descriptions for this combination
        ground_truth_data.append({
            "Direction of Wave": direction,
            "Type of Bottom": bottom,
            "Descriptions": descriptions
        })

# Convert to DataFrame
ground_truth_df = pd.DataFrame(ground_truth_data)


In [4]:
from openai import OpenAI
import json
import time
import pandas as pd
from config import OPENAI_API_KEY

# Initialize OpenAI client
client = OpenAI(api_key=OPENAI_API_KEY)

def generate_ground_truth_queries(direction, bottom, descriptions, model="gpt-4o-mini", max_retries=3):
    """
    Generate user queries and corresponding expected answers using OpenAI GPT-4o-mini.
    """
    # Prepare descriptions as a structured text input
    descriptions_text = "\n".join([f"- {desc}" for desc in descriptions])

    # Define the prompt
    messages = [
        {"role": "system", "content": "You are an expert surf advisor, generating realistic user queries and their expected responses for an AI-powered surf spot recommendation system."},
        {
            "role": "user",
            "content": f"""
Make sure to use only information from the retrieved spots and do not hallucinate.
Based on the following surf spots that match these conditions:
- Wave Direction: {direction}
- Bottom Type: {bottom}

Spot Descriptions:
{descriptions_text}

Please generate 10 user queries related to all the specific spots. For each query, also provide an ideal answer that sounds like a surf report for these spots in a natural way. Make sure to use only information from the retrieved spots and do not hallucinate.

Return the result as **valid JSON** with exactly two keys: 
- "queries" (a list of 10 questions a surfer might ask)
- "answers" (a list of 10 corresponding responses that an AI surf report would generate).

Example JSON format:
{{
    "queries": ["I want to surf relaxed waves with sand bottom this weekend", "I would like to surf punchy beach breaks barrels"],
    "answers": ["Praia Azul offers left-hand waves on a sandy bottom, best surfed at mid-tide and has mellow waves", "Carcavelos offers great waves for your preferences, having a variety of peaks and punchy waves."]
}}
"""
        }
    ]

    # Implement retry logic for API failures
    for attempt in range(max_retries):
        try:
            completion = client.chat.completions.create(
                model=model,
                messages=messages
            )

            # Extract response text
            response_text = completion.choices[0].message.content

            # Try parsing JSON output
            parsed_response = json.loads(response_text)

            # Ensure required keys exist
            if "queries" in parsed_response and "answers" in parsed_response:
                return parsed_response
            else:
                print(f"Invalid response structure on attempt {attempt + 1}: {response_text}")
        
        except json.JSONDecodeError:
            print(f"JSON parsing failed on attempt {attempt + 1}. Response: {response_text}")

        except Exception as e:
            print(f"API Error on attempt {attempt + 1}: {e}")
        
        # Exponential backoff before retrying
        time.sleep(2 ** attempt)  

    return None  # Return None if all retries fail


# Apply LLM to each metadata combination
ground_truth_with_queries = []

for _, row in ground_truth_df.iterrows():
    response = generate_ground_truth_queries(
        direction=row["Direction of Wave"],
        bottom=row["Type of Bottom"],
        descriptions=row["Descriptions"]
    )
    
    if response:
        for query, answer in zip(response["queries"], response["answers"]):
            ground_truth_with_queries.append({
                "Direction of Wave": row["Direction of Wave"],
                "Type of Bottom": row["Type of Bottom"],
                "Query": query,
                "Expected Answer": answer
            })

# Convert to DataFrame
final_truth_df = pd.DataFrame(ground_truth_with_queries)

# Save results
final_truth_df.to_csv("ground_truth_rag_surf.csv", index=False)
final_truth_df.to_json("ground_truth_rag_surf.json", orient="records", indent=4)

print("Ground truth dataset successfully created and saved.")


JSON parsing failed on attempt 1. Response: ```json
{
    "queries": [
        "Where can I find less crowded waves with both lefts and rights nearby?",
        "What's a good spot for beginners looking for sandy bottom waves?",
        "Are there any intermediate-friendly surf spots with consistent waves?",
        "Which beaches offer fun left and right breaks around Lisbon?",
        "Can you recommend a place to surf with multiple peaks breaking over sand?",
        "What's the best tide to surf at Praia da Almagreira?",
        "Where can I find an uncrowded spot with good sand bottom waves?",
        "Is there a beach break that works well for both beginners and pros?",
        "What surf spot should I check if I want a solid left tube?",
        "Where can I find a quality wave with less localism?"
    ],
    "answers": [
        "Praia do Baleal Norte is ideal for escaping the crowds, offering both lefts and rights across its sandy beach. The wave quality improves significantly

In [5]:
final_truth_df.to_csv("ground_truth_rag_surf.csv", index=False)
final_truth_df.to_json("ground_truth_rag_surf.json", orient="records", indent=4)