In [7]:
import json
import pandas as pd
import random

In [30]:
def get_combine_json_data(path = "POI_data.json", at_least_hawker = 10, at_least_attraction = 30,
                          hawker_csv = "singapore_20_food_with_scores.csv",
                          attraction_csv = "singapore_67_attractions_with_scores.csv"):
    """Load the POI JSON and pad it with placeholder entries from the knowledge-base CSVs.

    Entries already present in the JSON are kept as-is (including names that do
    not appear in the knowledge base). If fewer than the requested number of
    distinct knowledge-base names are present, randomly chosen missing names are
    appended as placeholder records.

    Args:
        path: JSON file with top-level "Hawker" and "Attraction" lists.
        at_least_hawker: target number of distinct knowledge-base hawkers.
        at_least_attraction: target number of distinct knowledge-base attractions.
        hawker_csv: CSV with a "Hawker Name" column.
        attraction_csv: CSV with an "Attraction Name" column.

    Returns:
        The loaded dict with both lists extended in place. The targets are
        best-effort: they cannot be met if the knowledge base is too small.

    Note:
        The selection uses the global ``random`` state; seed it for
        reproducible output.
    """
    # Read the JSON file
    with open(path, "r", encoding="utf-8") as file:
        data = json.load(file)

    ### This is for Hawker
    # Unique, non-null names in CSV order; the set gives O(1) membership tests.
    hawker_names_kb = pd.read_csv(hawker_csv)["Hawker Name"].dropna().drop_duplicates().to_list()
    hawker_kb_lookup = set(hawker_names_kb)
    # Count distinct names so duplicated JSON entries do not inflate the tally.
    hawkers_present = {entry['Hawker Name'] for entry in data["Hawker"]} & hawker_kb_lookup
    remaining_hawkers = [name for name in hawker_names_kb if name not in hawkers_present]
    # Clamp at zero: random.sample raises ValueError for a negative k.
    num_to_take_hawker = max(0, at_least_hawker - len(hawkers_present))
    print(num_to_take_hawker)
    sampled_hawkers = set(random.sample(remaining_hawkers, k=min(num_to_take_hawker, len(remaining_hawkers))))

    # Placeholders are appended in CSV order, not in sampling order.
    data['Hawker'].extend(
        {
            'Hawker Name': name,
            'Description': "NA.",
            'Rating': 2.5,  # normal to the person
            'Satisfaction Score': 2.5,  # normal to the person
            'Entrance Fee': 5.0,
            'Duration': 60,
            'Sources': ["NA"]
        }
        for name in hawker_names_kb if name in sampled_hawkers
    )

    ### This is for Attractions
    attraction_names_kb = pd.read_csv(attraction_csv)["Attraction Name"].dropna().drop_duplicates().to_list()
    attraction_kb_lookup = set(attraction_names_kb)
    attractions_present = {entry['Attraction Name'] for entry in data["Attraction"]} & attraction_kb_lookup
    remaining_attractions = [name for name in attraction_names_kb if name not in attractions_present]
    num_to_take_attraction = max(0, at_least_attraction - len(attractions_present))
    sampled_attractions = set(random.sample(remaining_attractions, k=min(num_to_take_attraction, len(remaining_attractions))))

    data['Attraction'].extend(
        {
            'Hawker Name': None,  # kept so the record schema stays unchanged
            'Attraction Name': name,
            'Description': "NA.",
            'Rating': 2.5,  # normal to the person
            'Satisfaction Score': 2.5,  # normal to the person
            'Entrance Fee': 10.0,
            'Duration': 120,
            'Sources': ["NA"]
        }
        for name in attraction_names_kb if name in sampled_attractions
    )

    return data

In [31]:
# Sanity check: total hawker entries after padding. The function also prints the
# hawker shortfall, so that number appears in the output before the length.
len(get_combine_json_data()["Hawker"])

8


13

In [29]:
# Sanity check: total attraction entries after padding.
# NOTE(review): this cell's execution count (29) is lower than the definition
# cell's (30), so the recorded output may predate the current definition —
# re-run after Restart & Run All to confirm.
len(get_combine_json_data()["Attraction"])

30

In [42]:
def _placeholder_hawker(name):
    """Build the stand-in record for a knowledge-base hawker that has no LLM data."""
    return {
        'Hawker Name': name,
        'Description': "NA.",
        'Rating': 2.5,  # placeholder meaning "normal to the person"
        'Satisfaction Score': 2.5,
        'Entrance Fee': 5.0,
        'Duration': 60,
        'Sources': ["NA"]
    }


def _placeholder_attraction(name):
    """Build the stand-in record for a knowledge-base attraction that has no LLM data."""
    return {
        'Hawker Name': None,  # kept so the record schema stays unchanged
        'Attraction Name': name,
        'Description': "NA.",
        'Rating': 2.5,  # placeholder meaning "normal to the person"
        'Satisfaction Score': 2.5,
        'Entrance Fee': 10.0,
        'Duration': 120,
        'Sources': ["NA"]
    }


def _clean_and_top_up(entries, name_key, kb_names, minimum, label, make_placeholder):
    """Keep unique knowledge-base entries, then pad with random placeholders up to `minimum`."""
    kb_lookup = set(kb_names)
    # Keyed by name: the first occurrence fixes the position, the last one wins.
    unique_entries = {
        entry[name_key]: entry for entry in entries if entry[name_key] in kb_lookup
    }
    kept = list(unique_entries.values())

    # Clamp at zero: random.sample raises ValueError for a negative k.
    shortfall = max(0, minimum - len(kept))
    print(f"Need to add {shortfall} {label} to reach {minimum}")

    candidates = [name for name in kb_names if name not in unique_entries]
    chosen = set(random.sample(candidates, k=min(shortfall, len(candidates))))
    # Placeholders are appended in CSV order, not in sampling order.
    kept.extend(make_placeholder(name) for name in kb_names if name in chosen)
    return kept


def get_combine_json_data(path="POI_data.json", at_least_hawker=10, at_least_attraction=30,
                          hawker_csv="singapore_20_food_with_scores.csv",
                          attraction_csv="singapore_67_attractions_with_scores.csv"):
    """Load the POI JSON, restrict it to the knowledge base, and pad it to a minimum size.

    For hawkers and for attractions: entries whose name is not in the matching
    knowledge-base CSV are dropped, duplicate names are collapsed (the last
    duplicate wins), and randomly chosen missing knowledge-base names are
    appended as placeholder records until the requested minimum is reached.

    Args:
        path: JSON file with top-level "Hawker" and "Attraction" lists.
        at_least_hawker: target number of hawker entries.
        at_least_attraction: target number of attraction entries.
        hawker_csv: CSV with a "Hawker Name" column.
        attraction_csv: CSV with an "Attraction Name" column.

    Returns:
        The loaded dict with both lists replaced by their cleaned and padded
        versions. The targets are best-effort: they cannot be met if the
        knowledge base is too small. Lists already above the target are not
        trimmed.

    Note:
        The selection uses the global ``random`` state; seed it for
        reproducible output.
    """
    # Load existing JSON data
    with open(path, "r", encoding="utf-8") as file:
        data = json.load(file)

    ### --- Load Knowledge Base CSVs (unique, non-null names in file order) ---
    hawker_names_kb = pd.read_csv(hawker_csv)["Hawker Name"].dropna().drop_duplicates().tolist()
    attraction_names_kb = pd.read_csv(attraction_csv)["Attraction Name"].dropna().drop_duplicates().tolist()

    data['Hawker'] = _clean_and_top_up(
        data['Hawker'], 'Hawker Name', hawker_names_kb,
        at_least_hawker, "hawkers", _placeholder_hawker,
    )
    data['Attraction'] = _clean_and_top_up(
        data['Attraction'], 'Attraction Name', attraction_names_kb,
        at_least_attraction, "attractions", _placeholder_attraction,
    )

    return data

In [43]:
# Build the combined dataset once and keep it in `output_data` for the cells below.
# Selection is random (unseeded here), so the padded entries can differ between runs.
output_data = get_combine_json_data()
# Distinct hawker names in the result; a set is used so duplicates would not be counted.
print(f"Total unique Hawkers: {len({h['Hawker Name'] for h in output_data['Hawker']})}")

Need to add 8 hawkers to reach 10
Need to add 26 attractions to reach 30
Total unique Hawkers: 10


In [44]:
# Inspect the combined hawker records: LLM-sourced entries come first, followed by
# the placeholder entries (Description 'NA.', Rating 2.5) appended from the knowledge base.
output_data["Hawker"]

[{'Hawker Name': 'Tangs Market',
  'Dish Name': 'Chili Ban Mian',
  'Description': 'A bowl of noodles with a rich chili sauce, offering a spicy kick.',
  'Satisfaction Score': 4.0,
  'Rating': 4.0,
  'Avg Food Price': 5.0,
  'Duration': 60,
  'Sources': ['https://www.yelp.com/biz/tangs-market-singapore',
   'https://www.pricelisto.com/menu-prices/tangs-market-sg']},
 {'Hawker Name': 'Singapore Zam Zam Restaurant',
  'Dish Name': 'Murtabak',
  'Description': 'A savory stuffed pancake filled with spiced meat, a popular Indian-Muslim dish.',
  'Satisfaction Score': 4.6,
  'Rating': 4.6,
  'Avg Food Price': 6.0,
  'Duration': 60,
  'Sources': ['https://eatzeely.com/zam-zam-restaurant-menu-prices-singapore/',
   'https://www.singmenu.com/zam-zam-restaurant-menu/']},
 {'Hawker Name': 'Amoy Street Food Centre',
  'Description': 'NA.',
  'Rating': 2.5,
  'Satisfaction Score': 2.5,
  'Entrance Fee': 5.0,
  'Duration': 60,
  'Sources': ['NA']},
 {'Hawker Name': 'Tekka Centre',
  'Description': '