In [None]:
import os
import re
import time
import glob
import pickle
import torch
import nltk
import unidecode
import openai
import numpy as np
import pandas as pd
import pyarrow.parquet as pq

from tqdm import tqdm
from numpy.linalg import norm
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer, SimilarityFunction

In [None]:
# Every input below is a semicolon-delimited CSV file.

# OSM category table (cleaned export)
df_osm = pd.read_csv('Database/Clean_categories/categories_OSM_clean.csv', sep=';')

# Foursquare category table (cleaned export), first without descriptions,
# then the variant that also carries a description per category
df_fs = pd.read_csv('Database/Clean_categories/categories_FS_clean.csv', sep=';')
df_fs_desc = pd.read_csv('Database/Clean_categories/categories_FS_clean_description.csv', sep=';')

# Oracle table: reference OSM -> FS correspondences used as ground truth
df_oracle = pd.read_csv('df_oracle.csv', sep=';')


# Cleaning

In [None]:
# Define English stopwords set.
# The NLTK corpus is fetched on demand when it is not installed locally, so
# the notebook also runs on a fresh environment instead of stopping on a
# LookupError.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# Function to clean text: lowercasing, remove accents, punctuation, and stopwords
def clean_text(text):
    """Normalise a label or description for matching.

    The value is converted to ``str``, lowercased, transliterated with
    ``unidecode``, stripped of every character that is neither a word
    character nor whitespace, and finally filtered against the module-level
    ``stop_words`` set.

    Args:
        text: Value to clean; ``None`` and NaN are treated as missing.

    Returns:
        The remaining tokens joined by single spaces (possibly an empty
        string), or ``None`` when the input is missing.
    """
    # Missing values are propagated as None rather than turned into text
    if text is None or pd.isna(text):
        return None

    lowered_ascii = unidecode.unidecode(str(text).lower())
    # Underscores survive this step because they count as word characters
    without_punctuation = re.sub(r'[^\w\s]', '', lowered_ascii)
    return ' '.join(
        token for token in without_punctuation.split() if token not in stop_words
    )

# Ground-truth mapping: normalise both tag columns
for _col in ('OSM_tag', 'FS_tag'):
    df_oracle[_col] = df_oracle[_col].apply(clean_text)

# OSM categories: the raw 'Description' is kept and its cleaned version is
# stored separately in 'token_Description'
df_osm['Tag'] = df_osm['Tag'].apply(clean_text)
df_osm['token_Description'] = df_osm['Description'].apply(clean_text)
for _col in ('Depth_1', 'Depth_2', 'Depth_3'):
    df_osm[_col] = df_osm[_col].apply(clean_text)

# FS categories, without and with descriptions: tag plus the six depth levels
for _frame in (df_fs, df_fs_desc):
    for _col in ('Tag', 'Depth_1', 'Depth_2', 'Depth_3', 'Depth_4', 'Depth_5', 'Depth_6'):
        _frame[_col] = _frame[_col].apply(clean_text)

# Align the FS description column name with the OSM one, then add its
# cleaned counterpart
df_fs_desc = df_fs_desc.rename(columns={"definition_en": "Description"})
df_fs_desc['token_Description'] = df_fs_desc['Description'].apply(clean_text)

# Additional names for the cleaned frames (same objects, not copies)
df_osm_clean = df_osm
df_fs_clean = df_fs
df_fs_desc_clean = df_fs_desc

In [None]:
# Create a column in each dataset containing all information about the POI
def concat_columns_osm(row):
    """Join the depth levels and cleaned description of one OSM row.

    Missing values (None / NaN) and blank strings are skipped, so the result
    never contains doubled or leading spaces.

    Args:
        row: Mapping-like row giving access to 'Depth_1', 'Depth_2',
            'Depth_3' and 'token_Description'.

    Returns:
        The retained values converted to ``str`` and joined by single spaces.
    """
    parts = []
    for col in ('Depth_1', 'Depth_2', 'Depth_3', 'token_Description'):
        val = row[col]
        if val is None or pd.isna(val):
            continue
        text = str(val)
        # A blank entry would only add a stray separator to the joined text
        if text.strip():
            parts.append(text)
    return ' '.join(parts)

def concat_columns_fs(row):
    """Join the six depth levels of one FS row into a single text.

    Missing values (None / NaN) and blank strings are skipped, so the result
    never contains doubled or leading spaces.

    Args:
        row: Mapping-like row giving access to 'Depth_1' ... 'Depth_6'.

    Returns:
        The retained values converted to ``str`` and joined by single spaces.
    """
    parts = []
    for col in ('Depth_1', 'Depth_2', 'Depth_3', 'Depth_4', "Depth_5", "Depth_6"):
        val = row[col]
        if val is None or pd.isna(val):
            continue
        text = str(val)
        # A blank entry would only add a stray separator to the joined text
        if text.strip():
            parts.append(text)
    return ' '.join(parts)

def concat_columns_fs_desc(row):
    """Join the six depth levels and cleaned description of one FS row.

    Missing values (None / NaN) and blank strings are skipped, so the result
    never contains doubled or leading spaces.

    Args:
        row: Mapping-like row giving access to 'Depth_1' ... 'Depth_6' and
            'token_Description'.

    Returns:
        The retained values converted to ``str`` and joined by single spaces.
    """
    parts = []
    for col in ('Depth_1', 'Depth_2', 'Depth_3', 'Depth_4', "Depth_5", "Depth_6", "token_Description"):
        val = row[col]
        if val is None or pd.isna(val):
            continue
        text = str(val)
        # A blank entry would only add a stray separator to the joined text
        if text.strip():
            parts.append(text)
    return ' '.join(parts)

# Row-wise construction of the text columns:
#   OSM 'full_info'          -> depth levels + cleaned description
#   FS  'full_info_and_desc' -> depth levels + cleaned description
#   FS  'full_info'          -> depth levels only
df_osm['full_info'] = df_osm.apply(concat_columns_osm, axis='columns')
df_fs_desc['full_info_and_desc'] = df_fs_desc.apply(concat_columns_fs_desc, axis='columns')
df_fs_desc['full_info'] = df_fs_desc.apply(concat_columns_fs, axis='columns')


# Create a column in each dataset containing the POI path in the categorization
def concat_depths_osm(row):
    """Build the ' > '-separated category path of one OSM row.

    Args:
        row: Mapping-like row giving access to 'Depth_1', 'Depth_2' and
            'Depth_3'.

    Returns:
        The stripped, non-empty depth values joined with ' > '; an empty
        string when no level is usable.
    """
    path_parts = []
    for depth_col in ('Depth_1', 'Depth_2', 'Depth_3'):
        raw = row[depth_col]
        # None and NaN both mark an absent level
        if raw is None or not pd.notna(raw):
            continue
        label = str(raw).strip()
        if label != '':
            path_parts.append(label)
    return ' > '.join(path_parts)

# Category path of every OSM row, computed row by row
df_osm['Path'] = df_osm.apply(concat_depths_osm, axis='columns')

def concat_depths_fs(row):
    """Build the ' > '-separated category path of one FS row.

    Args:
        row: Mapping-like row giving access to 'Depth_1' ... 'Depth_6'.

    Returns:
        The stripped, non-empty depth values joined with ' > '; an empty
        string when no level is usable.
    """
    depth_cols = ('Depth_1', 'Depth_2', 'Depth_3', 'Depth_4', "Depth_5", "Depth_6")
    # None and NaN both mark an absent level
    present = (row[name] for name in depth_cols if pd.notna(row[name]) and row[name] is not None)
    stripped = (str(value).strip() for value in present)
    return ' > '.join(label for label in stripped if label != '')

# Category path of every row of the FS table that carries descriptions
df_fs_desc['Path'] = df_fs_desc.apply(concat_depths_fs, axis='columns')

# Embedding function

In [None]:
# OpenAI client. The key is taken from the OPENAI_API_KEY environment variable
# when it is set, so the secret does not have to be written into the notebook;
# the "your_key" placeholder is only used as a fallback.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "your_key"))

# Load a pre-trained SentenceTransformer model (fast model for embeddings)
model = SentenceTransformer('all-MiniLM-L6-v2')  # fast

def make_matchs(model, model_name, description_osm, description_fs, df_osm, df_fs_desc, desc_fs="", k=5):
    """Find, for every OSM row, the k FS tags with the most similar embedding.

    Both lists of texts are encoded with ``model`` and compared through
    ``model.similarity`` after the model's similarity function has been set
    to cosine. Texts and DataFrame rows are paired by position, so the
    DataFrames may carry any index.

    Args:
        model: Object exposing ``encode(texts, convert_to_tensor=True)`` and
            ``similarity(a, b)``; its ``similarity_fn_name`` attribute is
            overwritten.
        model_name: Unused; kept for backward compatibility.
        description_osm: Texts to embed, one per row of ``df_osm``, in row order.
        description_fs: Texts to embed, one per row of ``df_fs_desc``, in row order.
        df_osm: DataFrame with 'Tag', 'Description' and 'Path' columns.
        df_fs_desc: DataFrame with a 'Tag' column.
        desc_fs: Unused; kept for backward compatibility.
        k: Number of FS candidates per OSM row; capped at the number of FS rows.

    Returns:
        DataFrame with one row per OSM row and the columns 'OSM_tag',
        'OSM_description', 'OSM_path', 'FS_topk_tags' (list of tags, best
        first) and 'FS_topk_scores' (matching list of similarity scores).

    Raises:
        ValueError: If a list of texts and its DataFrame differ in length.
    """
    if len(description_osm) != len(df_osm):
        raise ValueError("description_osm must contain one text per row of df_osm")
    if len(description_fs) != len(df_fs_desc):
        raise ValueError("description_fs must contain one text per row of df_fs_desc")

    # Set similarity function to cosine similarity
    model.similarity_fn_name = SimilarityFunction.COSINE

    # Embeddings for OSM descriptions
    print("-- beginning embedding OSM")
    embeddings_1 = model.encode(description_osm, convert_to_tensor=True)
    print("-- end embedding OSM")

    # Embeddings for FS descriptions
    print("-- beginning embedding FS")
    embeddings_2 = model.encode(description_fs, convert_to_tensor=True)
    print("-- end embedding FS")

    # Similarity matrix: one row per OSM text, one column per FS text
    similarity_score = model.similarity(embeddings_1, embeddings_2)

    # torch.topk raises when k exceeds the number of columns, so cap it
    k_eff = max(0, min(k, similarity_score.shape[1]))
    topk_scores, topk_indices = torch.topk(similarity_score, k_eff, dim=1)
    topk_scores = topk_scores.tolist()
    topk_indices = topk_indices.tolist()

    fs_tags = df_fs_desc['Tag'].tolist()

    # Rows are paired with matrix rows by position, never by index label
    matches = [
        {
            'OSM_tag': osm_tag,
            'OSM_description': osm_description,
            'OSM_path': osm_path,
            'FS_topk_tags': [fs_tags[j] for j in row_indices],
            'FS_topk_scores': row_scores,
        }
        for osm_tag, osm_description, osm_path, row_indices, row_scores in zip(
            df_osm['Tag'].tolist(),
            df_osm['Description'].tolist(),
            df_osm['Path'].tolist(),
            topk_indices,
            topk_scores,
        )
    ]

    # Convert results into a DataFrame
    return pd.DataFrame(matches)

# Different prompt examples for ChatGPT usage

In [None]:
def ask_gpt_to_choose_prompt_1(osm_tag, osm_path, osm_desc, candidates, k):
    """Ask the chat model to pick one FS tag for an OSM tag (prompt variant 1).

    Variant 1 gives the model a numbered list of domain rules (natural
    features, religion, historic sites, crafts, military, barriers, power)
    together with the special tags those rules allow in addition to the
    candidates. The request goes through the module-level ``client`` to
    "gpt-4o-mini" with temperature 0.

    Args:
        osm_tag: OSM tag name inserted into the prompt.
        osm_path: OSM categorisation path inserted into the prompt.
        osm_desc: OSM tag description inserted into the prompt.
        candidates: Object rendered with ``to_string(index=False)``
            (presumably a pandas DataFrame of FS candidates; confirm with caller).
        k: Number of candidates quoted in the prompt text.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    # Rule 10 must name every special tag that rules 3-9 allow; otherwise the
    # rules contradict each other.
    prompt = f"""
    I want to map an OpenStreetMap (OSM) point of interest (POI) to the most semantically similar FourSquare (FS) POI (tag).

    OSM Tag: {osm_tag}
    OSM Tag Description: {osm_desc}
    OSM Tag Categorisation: {osm_path}

    Here are the {k} most similar FS tags (with their descriptions):

    {candidates.to_string(index=False)}

    Rules to follow when choosing:
    1. If an FS tag has exactly the same name as the OSM tag and it makes sense, prefer selecting it.
    2. Copy the FS tag name exactly as it appears in the list (do not shorten, truncate, or modify it).
    3. If the OSM tag represents a natural feature or is related to waterway, add 'landmarks outdoors' to the list of candidate FS tags. It can be selected if nothing else is a better match.
    4. If the OSM tag is related to religion and there is no exact match for the OSM tag among the candidate FS tags, then choose 'spiritual center'.
    5. If the OSM description mentions that it is historical or protected, you can choose 'historic protected site' if it makes sense.
    6. If the OSM tag refers to a manual craft, artisan work, or any creative profession, add 'creative service' to the list of candidate FS tags. It can be selected if nothing else is a better match.
    7. If the OSM tag refers to anything related to the military, classify it as 'military' regardless of other candidate FS tags.
    8. If the OSM tag refers to a barrier or obstruction that affects traffic or circulation, classify it as 'travel transportation' regardless of other candidate FS tags.
    9. If the OSM tag belongs to the category "power", add 'electrical equipment supplier' to the list of candidate FS tags. It can be selected if nothing else is a better match.
    10. Do not invent any FS tag. Only choose from the provided candidates or the special tags named in the rules above ('landmarks outdoors', 'spiritual center', 'historic protected site', 'creative service', 'military', 'travel transportation', 'electrical equipment supplier') when applicable.
    11. Otherwise, select the FS tag that is most semantically similar.

    Question: Which FS tag best matches the OSM tag ? 
    Answer only with the FS tag name.
    """

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )

    return response.choices[0].message.content.strip()



In [None]:
def ask_gpt_to_choose_prompt_2(osm_tag, osm_path, osm_desc, candidates, k):
    """Ask the chat model to pick one FS tag for an OSM tag (prompt variant 2).

    Variant 2 asks for an exact match first and otherwise for the most
    specific containing category among the provided candidates only. The
    request goes through the module-level ``client`` to "gpt-4o-mini" with
    temperature 0.

    Args:
        osm_tag: OSM tag name inserted into the prompt.
        osm_path: OSM categorisation path inserted into the prompt.
        osm_desc: OSM tag description inserted into the prompt.
        candidates: Object rendered with ``to_string(index=False)``
            (presumably a pandas DataFrame of FS candidates; confirm with caller).
        k: Number of candidates quoted in the prompt text.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    prompt = f"""
    I want to map an OpenStreetMap (OSM) point of interest (POI) to the most appropriate FourSquare (FS) POI (tag).

    Rules:
    1. If an FS tag exactly matches the OSM tag (same name or clear synonym), choose that FS tag directly.
    2. If no exact match exists, choose the FS tag that is the most specific and precise category in which the OSM tag could reasonably be classified.
    - Do not just pick the most semantically similar.
    - Prefer the FS tag that fully contains the concept of the OSM tag, even if its wording is broader.
    3. Always answer with exactly one FS tag name from the provided list, nothing else.

    OSM Tag: {osm_tag}  
    OSM Tag Description: {osm_desc}  
    OSM Tag Categorisation in OSM: {osm_path}  

    Here are the {k} most relevant FS tags (with their descriptions):  
    {candidates.to_string(index=False)}  

    Question: Which FS tag best matches the OSM tag?  
    Answer only with the FS tag name. """

    # Single-turn conversation: the whole instruction travels as one user message
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0,
    )

    answer = completion.choices[0].message.content
    return answer.strip()



In [None]:
def ask_gpt_to_choose_prompt_3(osm_tag, osm_path, osm_desc, candidates, k):
    """Ask the chat model to pick one FS tag for an OSM tag (prompt variant 3).

    Variant 3 extends the exact-match / containing-category rules with a
    list of ten broad fallback categories the model may answer with when no
    candidate fits. The request goes through the module-level ``client`` to
    "gpt-4o-mini" with temperature 0.

    Args:
        osm_tag: OSM tag name inserted into the prompt.
        osm_path: OSM categorisation path inserted into the prompt.
        osm_desc: OSM tag description inserted into the prompt.
        candidates: Object rendered with ``to_string(index=False)``
            (presumably a pandas DataFrame of FS candidates; confirm with caller).
        k: Number of candidates quoted in the prompt text.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    prompt = f"""
    I want to map an OpenStreetMap (OSM) point of interest (POI) to the most appropriate FourSquare (FS) POI (tag).

    Rules:
    1. If an FS tag exactly matches the OSM tag (same name or clear synonym), choose that FS tag directly.
    2. If no exact match exists, choose the FS tag that is the most specific and precise category in which the OSM tag could reasonably be classified.
    - Do not just pick the most semantically similar.
    - Prefer the FS tag that fully contains the concept of the OSM tag, even if its wording is broader.
    3. If none of the provided FS tags are a good fit, you may instead choose one category from the following broader fallback categories:
    - landmarks outdoors
    - business professional services
    - travel transportation
    - community government
    - retail
    - sports recreation
    - health medicine
    - arts entertainment
    - dining drinking
    - event
    4. Always answer with exactly one FS tag name from the provided list or, if necessary, one fallback category, nothing else.


    OSM Tag: {osm_tag}  
    OSM Tag Description: {osm_desc}  
    OSM Tag Categorisation in OSM: {osm_path}  

    Here are the {k} most relevant FS tags (with their descriptions):  
    {candidates.to_string(index=False)}  

    Question: Which FS tag best matches the OSM tag?  
    Answer only with the FS tag name. """

    # Single-turn conversation: the whole instruction travels as one user message
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0,
    )

    answer = completion.choices[0].message.content
    return answer.strip()



In [None]:
def ask_gpt_to_choose_prompt_3_bis(osm_tag, osm_path, osm_desc, candidates, k):
    """Ask the chat model to pick one FS tag for an OSM tag (prompt variant 3 bis).

    Same rules and fallback categories as in this function's own prompt
    below; the request additionally sets ``top_p=0.9`` next to temperature 0.
    It goes through the module-level ``client`` to "gpt-4o-mini".

    Args:
        osm_tag: OSM tag name inserted into the prompt.
        osm_path: OSM categorisation path inserted into the prompt.
        osm_desc: OSM tag description inserted into the prompt.
        candidates: Object rendered with ``to_string(index=False)``
            (presumably a pandas DataFrame of FS candidates; confirm with caller).
        k: Number of candidates quoted in the prompt text.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    prompt = f"""
    I want to map an OpenStreetMap (OSM) point of interest (POI) to the most appropriate FourSquare (FS) POI (tag).

    Rules:
    1. If an FS tag exactly matches the OSM tag (same name or clear synonym), choose that FS tag directly.
    2. If no exact match exists, choose the FS tag that is the most specific and precise category in which the OSM tag could reasonably be classified.
    - Do not just pick the most semantically similar.
    - Prefer the FS tag that fully contains the concept of the OSM tag, even if its wording is broader.
    3. If none of the provided FS tags are a good fit, you may instead choose one category from the following broader fallback categories:
    - landmarks outdoors
    - business professional services
    - travel transportation
    - community government
    - retail
    - sports recreation
    - health medicine
    - arts entertainment
    - dining drinking
    - event
    4. Always answer with exactly one FS tag name from the provided list or, if necessary, one fallback category, nothing else.


    OSM Tag: {osm_tag}  
    OSM Tag Description: {osm_desc}  
    OSM Tag Categorisation in OSM: {osm_path}  

    Here are the {k} most relevant FS tags (with their descriptions):  
    {candidates.to_string(index=False)}  

    Question: Which FS tag best matches the OSM tag?  
    Answer only with the FS tag name. """

    # Single-turn conversation: the whole instruction travels as one user message
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0,
        top_p=0.9,
    )

    answer = completion.choices[0].message.content
    return answer.strip()



In [None]:
def ask_gpt_to_choose_prompt_4(osm_tag, osm_path, osm_desc, candidates, k):
    """Ask the chat model to pick one FS tag for an OSM tag (prompt variant 4).

    Variant 4 puts a list of eighteen broader categories on equal footing
    with the provided candidates instead of treating them as a last resort.
    The request goes through the module-level ``client`` to "gpt-4o-mini"
    with temperature 0.

    Args:
        osm_tag: OSM tag name inserted into the prompt.
        osm_path: OSM categorisation path inserted into the prompt.
        osm_desc: OSM tag description inserted into the prompt.
        candidates: Object rendered with ``to_string(index=False)``
            (presumably a pandas DataFrame of FS candidates; confirm with caller).
        k: Number of candidates quoted in the prompt text.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    prompt = f"""
    I want to map an OpenStreetMap (OSM) point of interest (POI) to the most appropriate FourSquare (FS) POI (tag).

    Rules:
    1. If an FS tag exactly matches the OSM tag (same name or clear synonym), choose that FS tag directly.
    2. Otherwise, choose the FS tag that is the most specific and precise category in which the OSM tag could reasonably be classified. 
    - Consider both the provided FS tags AND the following broader categories as possible options:
        - landmarks outdoors
        - military
        - historic protected site
        - business professional services
        - creative service
        - travel transportation
        - community government
        - retail
        - sports recreation
        - health medicine
        - arts entertainment
        - dining drinking
        - event
        - electrical equipment supplier
        - spiritual center
        - states municipalities
        - road
        - train
    - Do not just pick the most semantically similar.
    - Prefer the FS tag that fully contains the concept of the OSM tag, even if its wording is broader.
    3. Always answer with exactly one FS tag name from the combined list of provided FS tags and fallback categories, nothing else.


    OSM Tag: {osm_tag}  
    OSM Tag Description: {osm_desc}  
    OSM Tag Categorisation in OSM: {osm_path}  

    Here are the {k} most relevant FS tags (with their descriptions):  
    {candidates.to_string(index=False)}  

    Question: Which FS tag best matches the OSM tag?  
    Answer only with the FS tag name. """

    # Single-turn conversation: the whole instruction travels as one user message
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0,
    )

    answer = completion.choices[0].message.content
    return answer.strip()




In [None]:
def ask_gpt_to_choose_prompt_5(osm_tag, osm_path, osm_desc, candidates, k):
    """Ask the chat model to pick one FS tag for an OSM tag (prompt variant 5).

    Variant 5 asks for the most precise parent category among the provided
    candidates and illustrates the task with one worked example ("sea").
    The request goes through the module-level ``client`` to "gpt-4o-mini"
    with temperature 0.

    Args:
        osm_tag: OSM tag name inserted into the prompt.
        osm_path: OSM categorisation path inserted into the prompt.
        osm_desc: OSM tag description inserted into the prompt.
        candidates: Object rendered with ``to_string(index=False)``
            (presumably a pandas DataFrame of FS candidates; confirm with caller).
        k: Number of candidates quoted in the prompt text.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    prompt = f"""
    I want to map an OpenStreetMap (OSM) point of interest (POI) to the most appropriate FourSquare (FS) POI (tag).

    Rules:
    1. If an FS tag exactly matches the OSM tag (same name or clear synonym), choose that FS tag directly.
    2. If no exact match exists, select the FS tag that is the most precise parent category that fully contains the OSM tag.
       - Do not just pick the most semantically similar wording.
       - Prefer a broader FS tag that logically includes the OSM tag concept, even if it is less specific.
       - Exclude FS tags that are related but do not actually contain the OSM concept.
    3. Think step by step: first check for an exact match, then find the correct parent category.
    4. Output only one FS tag name from the provided list. No explanations, no extra text.

    Example:
    OSM Tag: "sea"  
    OSM Tag Description: "A large body of salt water part of, or connected to, an ocean."  
    OSM Tag Categorisation in OSM: "place > sea"  

    FS Candidates:  
    ['lake', 'bay', 'bathing area', 'dive spot', 'island', 'surf spot', 'waterfront', 'river', 'landmarks outdoors', 'boat launch']

    Correct Answer: landmarks outdoors

    Now, process the following case:

    OSM Tag: {osm_tag}  
    OSM Tag Description: {osm_desc}  
    OSM Tag Categorisation in OSM: {osm_path}  

    Here are the {k} most relevant FS tags (with their descriptions):  
    {candidates.to_string(index=False)}  

    Question: Which FS tag best matches the OSM tag?  
    Answer only with the FS tag name.
    """

    # Single-turn conversation: the whole instruction travels as one user message
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0,
    )

    answer = completion.choices[0].message.content
    return answer.strip()


In [None]:
def ask_gpt_to_choose_prompt_6(osm_tag, osm_path, osm_desc, candidates, k):
    """Ask the chat model to pick one FS tag for an OSM tag (prompt variant 6).

    Variant 6 combines the parent-category rules and the worked "sea"
    example with a list of ten broader fallback categories. The request goes
    through the module-level ``client`` to "gpt-4o-mini" with temperature 0.

    Args:
        osm_tag: OSM tag name inserted into the prompt.
        osm_path: OSM categorisation path inserted into the prompt.
        osm_desc: OSM tag description inserted into the prompt.
        candidates: Object rendered with ``to_string(index=False)``
            (presumably a pandas DataFrame of FS candidates; confirm with caller).
        k: Number of candidates quoted in the prompt text (used in three places).

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    prompt = f"""
    I want to map an OpenStreetMap (OSM) point of interest (POI) to the most appropriate FourSquare (FS) POI (tag).

    Rules:
    1. If an FS tag exactly matches the OSM tag (same name or clear synonym), choose that FS tag directly.
    2. If no exact match exists, select the FS tag that is the most precise parent category that fully contains the OSM tag.
       - Do not just pick the most semantically similar wording.
       - Prefer a broader FS tag that logically includes the OSM tag concept.
       - Exclude FS tags that are related but do not actually contain the OSM concept.
    3. If none of the {k} FS candidates are suitable, then choose from the following broader FS categories:
       - landmarks outdoors
       - business professional services
       - travel transportation
       - community government
       - retail
       - sports recreation
       - health medicine
       - arts entertainment
       - dining drinking
       - event
    4. Think step by step: first check for an exact match, then find the correct parent category, and only if needed, fall back to the broader FS categories above.
    5. Output only one FS tag name from the provided list (either from the {k} candidates or from the broader categories).
       Do not add explanations, reasoning, or extra text.

    Example:
    OSM Tag: "sea"  
    OSM Tag Description: "A large body of salt water part of, or connected to, an ocean."  
    OSM Tag Categorisation in OSM: "place > sea"  

    FS Candidates:  
    ['lake', 'bay', 'bathing area', 'dive spot', 'island', 'surf spot', 'waterfront', 'river', 'landmarks outdoors', 'boat launch']

    Correct Answer: landmarks outdoors 

    Now, process the following case:

    OSM Tag: {osm_tag}  
    OSM Tag Description: {osm_desc}  
    OSM Tag Categorisation in OSM: {osm_path}  

    Here are the {k} most relevant FS tags (with their descriptions):  
    {candidates.to_string(index=False)}  

    Question: Which FS tag best matches the OSM tag?  
    Answer only with the FS tag name.
    """

    # Single-turn conversation: the whole instruction travels as one user message
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0,
    )

    answer = completion.choices[0].message.content
    return answer.strip()

In [None]:
def ask_gpt_to_choose_prompt_7(osm_tag, osm_path, osm_desc, candidates, k):
    """Ask the chat model to pick one FS tag for an OSM tag (prompt variant 7).

    Variant 7 uses the fallback-category rules together with two worked
    examples ("valley" and "reef"). Every example uses the same field labels
    as the real case at the end of the prompt. The request goes through the
    module-level ``client`` to "gpt-4o-mini" with temperature 0.

    Args:
        osm_tag: OSM tag name inserted into the prompt.
        osm_path: OSM categorisation path inserted into the prompt.
        osm_desc: OSM tag description inserted into the prompt.
        candidates: Object rendered with ``to_string(index=False)``
            (presumably a pandas DataFrame of FS candidates; confirm with caller).
        k: Number of candidates quoted in the prompt text.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    prompt = f"""
    I want to map an OpenStreetMap (OSM) point of interest (POI) to the most appropriate FourSquare (FS) POI (tag).

    Rules:
    1. If an FS tag exactly matches the OSM tag (same name or clear synonym), choose that FS tag directly.
    2. If no exact match exists, choose the FS tag that is the most specific and precise category in which the OSM tag could reasonably be classified.
    - Do not just pick the most semantically similar.
    - Prefer the FS tag that fully contains the concept of the OSM tag, even if its wording is broader.
    3. If none of the provided FS tags are a good fit, you may instead choose one category from the following broader fallback categories:
    - landmarks outdoors
    - business professional services
    - travel transportation
    - community government
    - retail
    - sports recreation
    - health medicine
    - arts entertainment
    - dining drinking
    - event
    4. Always answer with exactly one FS tag name from the provided list or, if necessary, one fallback category, nothing else.

    Example 1:
    OSM Tag: "valley"  
    OSM Tag Description: "A natural depression flanked by ridges or ranges of mountains or hills."  
    OSM Tag Categorisation in OSM: "natural > geology related > valley"  
    FS Candidates: ['hill', 'mountain', 'volcano', 'mountain hut', 'scenic lookout', 'cave', 'hiking trail', 'bridge', 'rock climbing spot', 'well', 'bike trail', 'great outdoors', 'waterfall', 'river', 'village', 'farm', 'state provincial park', 'geological service', 'natural park', 'dam']	
    Correct Answer: landmarks outdoors

    Example 2:
    OSM Tag: "reef"
    OSM Tag Description: "A feature (rock, sandbar, coral, etc) lying permanently beneath the surface of the water."
    OSM Tag Categorisation in OSM: "natural > water related > reef"
    FS Candidates: ['island', 'bay', 'waterfall', 'dive spot', 'bathing area', 'well', 'cave', 'river', 'surf spot', 'harbor marina', 'lake', 'reservoir', 'dam', 'fountain', 'mountain', 'waterfront', 'floating market', 'canal lock', 'tunnel', 'beach']
    Correct answer: landmarks outdoors

    Now, process the following case:

    OSM Tag: {osm_tag}  
    OSM Tag Description: {osm_desc}  
    OSM Tag Categorisation in OSM: {osm_path}  

    Here are the {k} most relevant FS tags (with their descriptions):  
    {candidates.to_string(index=False)}  

    Question: Which FS tag best matches the OSM tag?  
    Answer only with the FS tag name. """

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )

    return response.choices[0].message.content.strip()



In [None]:
def ask_gpt_to_choose_prompt_8(osm_tag, osm_path, osm_desc, candidates, k):
    """Ask the chat model to pick one FS tag for an OSM tag (prompt variant 8).

    Variant 8 lists the ten fallback categories together with a one-line
    description of each. The request goes through the module-level
    ``client`` to "gpt-4o-mini" with temperature 0 and ``top_p=0.9``.

    Args:
        osm_tag: OSM tag name inserted into the prompt.
        osm_path: OSM categorisation path inserted into the prompt.
        osm_desc: OSM tag description inserted into the prompt.
        candidates: Object rendered with ``to_string(index=False)``
            (presumably a pandas DataFrame of FS candidates; confirm with caller).
        k: Number of candidates quoted in the prompt text.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    prompt = f"""
    I want to map an OpenStreetMap (OSM) point of interest (POI) to the most appropriate FourSquare (FS) POI (tag).

    Rules:
    1. If an FS tag exactly matches the OSM tag (same name or clear synonym), choose that FS tag directly.
    2. If no exact match exists, choose the FS tag that is the most specific and precise category in which the OSM tag could reasonably be classified.
    - Do not just pick the most semantically similar.
    - Prefer the FS tag that fully contains the concept of the OSM tag, even if its wording is broader.
    3. If none of the provided FS tags are a good fit, you may instead choose one category from the following broader fallback categories:
       Here are the main FS categories and their descriptions:
    - landmarks outdoors: natural features, outdoor landmarks, scenic locations and administrative boundarie
    - business professional services: companies offering professional or office services
    - travel transportation: places and infrastructure related to travel, transport, or movement, including any obstacles or barriers along paths or routes
    - community government: public, governmental, or community buildings and facilities
    - retail: shops, stores, and commercial outlets selling goods
    - sports recreation: places for sports, fitness, or recreational activities
    - health medicine: medical facilities, clinics, and health-related services
    - arts entertainment: theaters, museums, cultural venues, or entertainment spots
    - dining drinking: restaurants, cafes, bars, and food & beverage locations
    - event: locations for gatherings, festivals, exhibitions, or organized events
    4. Always answer with exactly one FS tag name from the provided list or, if necessary, one fallback category, nothing else.

    Now, process the following case:

    OSM Tag: {osm_tag}  
    OSM Tag Description: {osm_desc}  
    OSM Tag Categorisation in OSM: {osm_path}  

    Here are the {k} most relevant FS tags (with their descriptions):  
    {candidates.to_string(index=False)}  

    Question: Which FS tag best matches the OSM tag?  
    Answer only with the FS tag name. """

    # Single-turn conversation: the whole instruction travels as one user message
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0,
        top_p=0.9,
    )

    answer = completion.choices[0].message.content
    return answer.strip()



In [None]:
def ask_gpt_to_choose_prompt_9(osm_tag, osm_path, osm_desc, candidates, k):
    """Ask the chat model to pick one FS tag for an OSM tag (prompt variant 9).

    Variant 9 lists the ten fallback categories with a one-line description
    of each and adds an instruction to prioritise the main category the OSM
    tag belongs to. The request goes through the module-level ``client`` to
    "gpt-4o-mini" with temperature 0.

    Args:
        osm_tag: OSM tag name inserted into the prompt.
        osm_path: OSM categorisation path inserted into the prompt.
        osm_desc: OSM tag description inserted into the prompt.
        candidates: Object rendered with ``to_string(index=False)``
            (presumably a pandas DataFrame of FS candidates; confirm with caller).
        k: Number of candidates quoted in the prompt text.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    prompt = f"""
    I want to map an OpenStreetMap (OSM) point of interest (POI) to the most appropriate FourSquare (FS) POI (tag).

    Rules:
    1. If an FS tag exactly matches the OSM tag (same name or clear synonym), choose that FS tag directly.
    2. If no exact match exists, choose the FS tag that is the most specific and precise category in which the OSM tag could reasonably be classified.
    - Do not just pick the most semantically similar.
    - Always prioritize placing the OSM tag in the main category it belongs to, rather than matching only semantically similar tags. 
    - Prefer the FS tag that fully contains the concept of the OSM tag, even if its wording is broader.
    3. If none of the provided FS tags are a good fit, you may instead choose one category from the following broader fallback categories:
       Here are the main FS categories and their descriptions:
    - landmarks outdoors: natural features, outdoor landmarks, scenic locations and administrative boundarie
    - business professional services: companies offering professional or office services
    - travel transportation: places and infrastructure related to travel, transport, or movement, including any obstacles or barriers along paths or routes
    - community government: public, governmental, or community buildings and facilities
    - retail: shops, stores, and commercial outlets selling goods
    - sports recreation: places for sports, fitness, or recreational activities
    - health medicine: medical facilities, clinics, and health-related services
    - arts entertainment: theaters, museums, cultural venues, or entertainment spots
    - dining drinking: restaurants, cafes, bars, and food & beverage locations
    - event: locations for gatherings, festivals, exhibitions, or organized events
    4. Always answer with exactly one FS tag name from the provided list or, if necessary, one fallback category, nothing else.

    Now, process the following case:

    OSM Tag: {osm_tag}  
    OSM Tag Description: {osm_desc}  
    OSM Tag Categorisation in OSM: {osm_path}  

    Here are the {k} most relevant FS tags (with their descriptions):  
    {candidates.to_string(index=False)}  

    Question: Which FS tag best matches the OSM tag?  
    Answer only with the FS tag name. """

    # Single-turn conversation: the whole instruction travels as one user message
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0,
    )

    answer = completion.choices[0].message.content
    return answer.strip()



In [None]:
def ask_gpt_to_choose_prompt_10(osm_tag, osm_path, osm_desc, candidates, k):
    """Ask the chat model to pick one FS tag for an OSM tag (prompt variant 10).

    Variant 10 uses the fallback-category rules together with six worked
    examples. In every example the description and the categorisation path
    sit under the same labels as in the real case at the end of the prompt.
    The request goes through the module-level ``client`` to "gpt-4o-mini"
    with temperature 0.

    Args:
        osm_tag: OSM tag name inserted into the prompt.
        osm_path: OSM categorisation path inserted into the prompt.
        osm_desc: OSM tag description inserted into the prompt.
        candidates: Object rendered with ``to_string(index=False)``
            (presumably a pandas DataFrame of FS candidates; confirm with caller).
        k: Number of candidates quoted in the prompt text.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    prompt = f"""
    I want to map an OpenStreetMap (OSM) point of interest (POI) to the most appropriate FourSquare (FS) POI (tag).

    Rules:
    1. If an FS tag exactly matches the OSM tag (same name or clear synonym), choose that FS tag directly.
    2. If no exact match exists, choose the FS tag that is the most specific and precise category in which the OSM tag could reasonably be classified.
    - Do not just pick the most semantically similar.
    - Prefer the FS tag that fully contains the concept of the OSM tag, even if its wording is broader.
    3. If none of the provided FS tags are a good fit, you may instead choose one category from the following broader fallback categories:
    - landmarks outdoors
    - business professional services
    - travel transportation
    - community government
    - retail
    - sports recreation
    - health medicine
    - arts entertainment
    - dining drinking
    - event
    4. Always answer with exactly one FS tag name from the provided list or, if necessary, one fallback category, nothing else.

    Example 1:
    OSM Tag: "valley"  
    OSM Tag Description: "A natural depression flanked by ridges or ranges of mountains or hills."  
    OSM Tag Categorisation in OSM: "natural > geology related > valley"  
    FS Candidates: ['hill', 'mountain', 'volcano', 'mountain hut', 'scenic lookout', 'cave', 'hiking trail', 'bridge', 'rock climbing spot', 'well', 'bike trail', 'great outdoors', 'waterfall', 'river', 'village', 'farm', 'state provincial park', 'geological service', 'natural park', 'dam']	
    Correct Answer: landmarks outdoors

    Example 2:
    OSM Tag: "reef"
    OSM Tag Description: "A feature (rock, sandbar, coral, etc) lying permanently beneath the surface of the water."
    OSM Tag Categorisation in OSM: "natural > water related > reef"
    FS Candidates: ['island', 'bay', 'waterfall', 'dive spot', 'bathing area', 'well', 'cave', 'river', 'surf spot', 'harbor marina', 'lake', 'reservoir', 'dam', 'fountain', 'mountain', 'waterfront', 'floating market', 'canal lock', 'tunnel', 'beach']
    Correct answer: landmarks outdoors

    Example 3:
    OSM Tag: "outcrop"
    OSM Tag Description: "A place where the bedrock or superficial deposits previously covered under the soil have become locally exposed."
    OSM Tag Categorisation in OSM: "geological > outcrop"
    FS Candidates: ['cave', 'hill', 'mountain', 'volcano', 'well', 'bay', 'island', 'geological service', 'village', 'historic protected site', 'farm', 'dam', 'rock climbing spot', 'town', 'city', 'lake', 'nature preserve', 'field', 'state provincial park', 'dump']
    Correct answer: landmarks outdoors

    Example 4:
    OSM Tag: "glassblower"
    OSM Tag Description: "A person or company that blows bottles or other objects from molten glass."
    OSM Tag Categorisation in OSM: "craft > glassblower"
    FS Candidates: ['fireworks store', 'ice bar', 'winery', 'meadery', 'eyecare store', 'wine store', 'distillery', 'plastics supplier', 'bubble tea shop', 'recycling facility', 'arts crafts store', 'manufacturer', 'creative services', 'hobby store', 'chemicals gasses manufacturer', 'champagne bar', 'sunglasses store', 'liquor store', 'waste management service', 'kitchen supply store']
    Correct answer: creative services

    Example 5:
    OSM Tag: "elevator"
    OSM Tag Description: "Company specialized in installing elevators."
    OSM Tag Categorisation in OSM: "craft > elevator"
    FS Candidates: ['factory', 'industrial equipment supplier', 'metals supplier', 'machine shop', 'garage door supplier', 'construction supplies store', 'engineer', 'hardware store', 'electrical equipment supplier', 'equipment rental service', 'plastics supplier', 'doors windows contractor', 'knitting store', 'carpenter', 'tunnel', 'body piercing shop', 'manufacturer', 'transmissions shop', 'construction', 'metro station']
    Correct answer: business professional services

    Example 6:
    OSM Tag: "ditch"
    OSM Tag Description: "A ditch or a trench is a long and narrow man-made barrier dug in the ground to prevent access to the other side."
    OSM Tag Categorisation in OSM: "barrier > linear barriers > ditch"
    FS Candidates: ['bridge', 'well', 'dam', 'tunnel', 'cave', 'canal lock', 'structure', 'road', 'farm', 'nature preserve', 'platform', 'field', 'canal', 'bay', 'stable', 'hill', 'street', 'natural park', 'historic protected site', 'island']
    Correct answer: travel transportation

    Now, process the following case:

    OSM Tag: {osm_tag}  
    OSM Tag Description: {osm_desc}  
    OSM Tag Categorisation in OSM: {osm_path}  

    Here are the {k} most relevant FS tags (with their descriptions):  
    {candidates.to_string(index=False)}  

    Question: Which FS tag best matches the OSM tag?  
    Answer only with the FS tag name. """

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )

    return response.choices[0].message.content.strip()



# Embeddings then selection of the FS tag by ChatGPT via the API

In [None]:
import pandas as pd
from tqdm import tqdm

# --- User-configurable inputs ---

# Prompt variants to evaluate; each id selects the matching
# ask_gpt_to_choose_prompt_<id> function in the loop below.
prompts = [2, 3, 5, 6]

# Number of candidates to give to ChatGPT for each OSM tag
k = 5

def concat_depths_fs(row):
    """Join the filled depth levels of a row into a ' > '-separated path.

    Reads 'Depth_1' through 'Depth_6' from `row`, skips levels that are
    missing (None / NaN) or blank once stripped, and returns the remaining
    levels, stripped, joined by ' > ' (empty string when no level is filled).
    """
    depth_columns = ('Depth_1', 'Depth_2', 'Depth_3', 'Depth_4', "Depth_5", "Depth_6")
    # Missing values are dropped first; everything else is normalised to stripped text
    stripped_levels = (
        str(row[name]).strip() for name in depth_columns if pd.notna(row[name])
    )
    # Blank levels are discarded so the path never contains empty segments
    return ' > '.join(level for level in stripped_levels if level != '')

# Give every FS category its hierarchical path (depth levels joined by ' > ')
df_fs['Path'] = df_fs.apply(concat_depths_fs, axis=1)

# Lookup table: FS tag -> hierarchical path of that tag
tag_to_path = {
    fs_tag: fs_path
    for fs_tag, fs_path in zip(df_fs["Tag"], df_fs["Path"])
}

# Retrieve the k FS candidates of every OSM entry with make_matchs
# (embedding-based ranking, defined earlier in the notebook).
matches_df = make_matchs(
    model=model,
    model_name="MiniLM",
    k=k,
    df_osm=df_osm,
    df_fs_desc=df_fs_desc,
    description_osm=df_osm["full_info"].tolist(),
    description_fs=df_fs_desc["full_info_and_desc"].tolist(),
)


# Evaluate every prompt variant: for each OSM entry, ChatGPT picks one FS tag among the
# retrieved candidates; the picks are then scored against the oracle and saved, one CSV
# file per prompt.

# Prompt id -> function that builds the prompt and queries ChatGPT.
prompt_functions = {
    2: ask_gpt_to_choose_prompt_2,
    3: ask_gpt_to_choose_prompt_3,
    5: ask_gpt_to_choose_prompt_5,
    6: ask_gpt_to_choose_prompt_6,
}

# Fail before any API call is made: with an if/elif chain an unknown id would either
# raise a NameError or silently reuse the answer of the previous row.
unknown_prompts = [script for script in prompts if script not in prompt_functions]
if unknown_prompts:
    raise ValueError(
        f"Unsupported prompt id(s) {unknown_prompts}; expected one of {sorted(prompt_functions)}"
    )

# Make sure the output folder exists up front, so results are not lost on the final
# write after all API calls have completed.
output_dir = "FID gpt models"
os.makedirs(output_dir, exist_ok=True)

# Ground-truth OSM -> FS mapping, loaded once for all prompts (it does not depend on the prompt).
# NOTE(review): this replaces the cleaned `df_oracle` built in the cleaning cell with the raw
# file content — confirm that the merge keys are expected in their raw form.
df_oracle = pd.read_csv('df_oracle.csv', sep=";")


def is_new_correct(fs_manual_tag, fs_tag_gpt, tag_to_path):
    """Relaxed correctness check of a ChatGPT prediction.

    Parameters
    ----------
    fs_manual_tag : FS tag given by the oracle for the OSM entry.
    fs_tag_gpt : FS tag answered by ChatGPT.
    tag_to_path : dict mapping an FS tag to its ' > '-separated hierarchical path.

    Returns
    -------
    int
        1 if both tags are equal, or if the oracle tag is one of the levels of the
        path of the predicted tag; 0 otherwise (including when the predicted tag
        has no known path).
    """
    # Condition 1: exact match
    if fs_manual_tag == fs_tag_gpt:
        return 1
    # Condition 2: the oracle tag appears anywhere in the path of the predicted tag
    path_str = tag_to_path.get(fs_tag_gpt, "")
    path_tags = [tag.strip() for tag in path_str.split(">")]
    if fs_manual_tag in path_tags:
        return 1
    return 0


# For each prompt variation
for script in prompts:
    ask_gpt = prompt_functions[script]
    results = []

    print(f"--- Running prompt {script} ---")

    # Iterate over OSM–FS candidate matches
    for idx, row in tqdm(matches_df.iterrows(), total=len(matches_df)):
        osm_tag = row["OSM_tag"]
        osm_desc = row["OSM_description"]
        osm_path = row["OSM_path"]
        fs_candidates = row["FS_topk_tags"]
        fs_scores = row["FS_topk_scores"]

        # Temporary DataFrame for prompt input (candidates)
        candidates = pd.DataFrame({
            "FS_tag": fs_candidates,
            "Score": fs_scores
        })

        # Call the ChatGPT API with the prompt version under evaluation
        fs_choice = ask_gpt(osm_tag, osm_path, osm_desc, candidates, k)

        # Save the results for this row
        results.append({
            "OSM_tag": osm_tag,
            "OSM_path": osm_path,
            "OSM_description": osm_desc,
            "FS_candidates": fs_candidates,
            "FS_scores": fs_scores,
            "FS_tag_GPT": fs_choice
        })

    # Create a DataFrame with ChatGPT’s predictions
    df_gpt_mapping = pd.DataFrame(results)

    # Merge with oracle to compare predictions with ground truth
    # (left join: predictions without an oracle entry are kept, with a missing manual tag)
    df_match = pd.merge(df_gpt_mapping, df_oracle,
                        on=["OSM_tag", "OSM_path"], how="left")
    df_match = df_match.rename(columns={"FS_tag": "FS_manual_tag"})

    # Column: exact match correctness
    df_match["correct"] = (df_match['FS_manual_tag'] == df_match['FS_tag_GPT']).astype(int)

    # Column: relaxed correctness (exact match or oracle tag found in the predicted tag's path)
    df_match["new_correct"] = [
        is_new_correct(fs_manual_tag, fs_tag_gpt, tag_to_path)
        for fs_manual_tag, fs_tag_gpt in zip(df_match["FS_manual_tag"], df_match["FS_tag_GPT"])
    ]

    # Extract OSM main category and main+sub levels; the .str accessor propagates
    # missing paths instead of raising on them
    osm_levels = df_match["OSM_path"].str.split(" > ")
    df_match["OSM_main"] = osm_levels.str[0]
    df_match["OSM_main_sub"] = osm_levels.str[:2].str.join(" > ")

    # Save results immediately after each prompt
    filename = f"{output_dir}/miniLM + chat_gpt_prompt_{script} (k={k}).csv"
    df_match.to_csv(filename, index=False, sep=";")
    print(f"✅ Prompt {script} finished and saved in {filename}\n")