In [1]:
import pandas as pd
import googlemaps
import numpy as np
import datetime
from tqdm import tqdm

In [2]:
df_base_with_coords = pd.read_csv("merged_datasets/with_coords/france_monuments.csv")
print(df_base_with_coords.shape)
df_base_with_coords.head()

(564, 15)


Unnamed: 0,name,url,short_description,ticket_price,ticket_price_conditions,opening_hours,payment_methods,address,visiting_services,ticket_price_raw,city,category,lat,lng,Tourpedia_id
0,Basilique Cathédrale de Saint-Denis,https://www.saint-denis-basilique.fr/,Discover the necropolis of the kings and queen...,17.0,Free for under 26s,From October to March\nMonday to Saturday: 10:...,coins | bank cards | cheques | Culture cheques...,"1, rue de la Légion d'Honneur, 93200 Saint-Denis",Cabin baggage ('cabines') allowed: maximum 40x...,Price: 17 €,,,48.935461,2.359835,
1,Oppidum et musée archéologique d'Ensérune,https://www.enserune.fr/,Discover a major Gallo-Roman city located betw...,9.0,Free for those under 26 years old.,Please note: last access to the monument one h...,cash | bank cards | cheques | Culture cheques ...,"2901 route d’Ensérune, 34440 Nissan-lez-Ensérune",Drinking fountain | Toilets | Wheelchair | Pos...,Price: 9 €,,,43.3103,3.1152,
2,Château d'Assier,https://www.chateau-assier.fr/,"Between Figeac and Rocamadour, visit the castl...",5.0,Free for under 25s,From May 2nd to 31st\n10:00 AM - 12:30 PM and ...,cash | bank cards | cheques | Culture cheques ...,"41 Rue des Écuries de Galiot, 46320 Assier","Dogs accepted outdoors, kept on a leash | Stro...",Price: 5 €,,,44.6748,1.878194,
3,Cathédrale Saint-Lazare d'Autun,https://www.tresor-cathedrale-autun.fr/,Discover the monumental reliquary intended to ...,4.0,Free for under 26s,From April 1st to October 31st\nMonday to Satu...,,"Place du Terreau, 71400 Autun","During ceremonies, access is via the Clos Gisl...",Price: 4€,,,46.944918,4.299149,
4,Château d'Angers,https://www.chateau-angers.fr/,"An impregnable fortress, a gigantic unique med...",11.0,Free for under 26s,From May 2nd to September 4th\n10:00 AM - 6:30...,,"2, promenade du Bout-du-Monde, 49100 Angers",Visitors under 15 must be accompanied by a res...,Price: 11 €,,,47.470071,-0.560073,


In [None]:
API_KEY = "YOUR_GOOGLE_MAPS_API_KEY"

gmaps = googlemaps.Client(key=API_KEY)


def get_place_id(gmaps, lat, lng, name, address=None):
    """
    Fetches rich data for a POI using a two-step API process (Find + Details).
    Returns a clean dictionary or None if not found.
    """    
    # Construct query: specific address is better, but fallback to name
    search_query = f"{name} {address}" if address else name

    try:
        # --- STEP 1: Find the Place ID ($0.017) ---
        find_result = gmaps.find_place(
            input=search_query,
            input_type="textquery",
            fields=['place_id'], 
            location_bias=f"point:{lat},{lng}"
        )

        if not find_result['candidates']:
            print(f"Skipping: '{name}' not found.")
            return None

        place_id = find_result['candidates'][0]['place_id']
        return place_id
    
    except Exception as e:
        print(f"Error fetching '{name}': {e}")
        return None

def fetch_place_details(gmaps, place_id):
    try:
        # --- STEP 2: Get Details ($0.025) ---
        fetch_timestamp = datetime.datetime.now().timestamp()
        details = gmaps.place(
            place_id=place_id, 
            fields=[
                'name', 'formatted_address', 'address_component', 
                'website', 'formatted_phone_number', 'url', 
                'geometry', 'rating', 'user_ratings_total', 'reviews',
                'price_level', 'business_status', 'photo', 'opening_hours'
            ]
        )
        
        result = details.get('result', {})
        result['fetch_timestamp'] = fetch_timestamp
        result['place_id'] = place_id
        # result['query_input'] = search_query
        # result['query_location_bias'] = address
        return result

    except Exception as e:
        print(f"Error fetching '{place_id}': {e}")
        return None
    
    
def format_opening_hours(raw_data):
    """
    Extracts and formats opening hours from raw Google Places data.
    """
    opening_hours_data = raw_data.get('opening_hours', {})
    weekday_text = opening_hours_data.get('weekday_text', [])
    
    if not weekday_text:
        return "Opening hours not available."

    formatted_text = "\n".join(weekday_text)
    formatted_text = formatted_text.replace('\u202f', ' ').replace('\u2009', ' ')
    
    return formatted_text

def get_processed_data(raw_result):
        # Extract City
        city = None
        components = raw_result.get('address_components') or raw_result.get('address_component')
        
        if components:
            for c in components:
                if 'locality' in c['types']:
                    city = c['long_name']
                    break
            # Fallback for places without a defined 'locality'
            if city is None:
                for c in components:
                    if 'administrative_area_level_2' in c['types']:
                        city = c['long_name']
                        break

        # Parse Price Level
        price_map = {0: "Free", 1: "Inexpensive", 2: "Moderate", 3: "Expensive", 4: "Very Expensive"}
        price_desc = price_map.get(raw_result.get('price_level'), "Price not available")

        # Format Reviews
        # Extracts top 5 reviews (Author + Text + Rating)
        reviews_data = []
        if 'reviews' in raw_result:
            for rev in raw_result['reviews']:
                text = rev.get('text', "")
                            
                reviews_data.append({
                    "author": rev.get('author_name'),
                    "author_url": rev.get('author_url'),
                    "rating": rev.get('rating'),
                    "text": text,
                    "language": rev.get('language'),
                    "original_language": rev.get('original_language'),
                    "time": rev.get('time'),
                    "relative_time": rev.get('relative_time_description'),
                    "fetched_timestamp": raw_result.get('fetch_timestamp')
                })

        return {
            "place_id": raw_result.get('place_id'),
            "name": raw_result.get('name'),
            "status": raw_result.get('business_status'), # e.g., OPERATIONAL
            "price_level": price_desc,
            "address": raw_result.get('formatted_address'),
            "city": city,
            "website": raw_result.get('website'),
            "phone": raw_result.get('formatted_phone_number'),
            "map_url": raw_result.get('url'),
            "coordinates": raw_result.get('geometry', {}).get('location'),
            "rating": raw_result.get('rating'),
            "votes_count": raw_result.get('user_ratings_total'),
            "opening_hours": format_opening_hours(raw_result),
            "reviews": reviews_data
        }

In [5]:
import json
import pathlib
import time

raw_data_path = pathlib.Path("merged_datasets/with_coords/google_maps_raw_data.json")

if raw_data_path.exists():
    raw_data = json.loads(raw_data_path.read_text())
else:        
    place_id_list = []
    for idx, row in tqdm(df_base_with_coords.iterrows(), total=df_base_with_coords.shape[0]):
        raw = get_place_id(
            gmaps, 
            lat=row['lat'], 
            lng=row['lng'], 
            name=row['name'], 
            address=row['address'] if not pd.isna(row['address']) else None
        )
        place_id_list.append(raw)

In [6]:
if raw_data_path.exists():
    raw_data = json.loads(raw_data_path.read_text())
else:        
    raw_data = []
    for idx, place_id in tqdm(enumerate(place_id_list), total=len(place_id_list)):
        if place_id is None:
            raw_data.append(None)
            continue
        raw_result = fetch_place_details(gmaps, place_id)
        for _ in range(2):
            if raw_result is not None:
                break
            time.sleep(1)
            raw_result = fetch_place_details(gmaps, place_id)
        raw_data.append(raw_result)
        if raw_result is None:
            print(f"Failed to fetch data for Place ID: {place_id} at index {idx}")
        
    raw_data_path.write_text(json.dumps(raw_data))


In [7]:
not_founded_fingerprints = [
    'Sites et musée archéologiques de Montmaurin',
    'Circulation(s), Festival de la Jeune Photographie Européenne',
    'Point G',
]

not_founded_ids = df_base_with_coords.loc[
    df_base_with_coords.apply(
        lambda row: np.array([f in (row['name'] + " " + row['address'] if pd.notnull(row['address']) else row['name']) for f in not_founded_fingerprints]).any(), 
        axis=1
    )
].index.tolist()
not_founded_ids

[66, 522, 535]

In [8]:
import pandas as pd
from tqdm import tqdm

places_list = []
reviews_list = []

for idx, raw in tqdm(enumerate(raw_data)):
    if raw is None:
        continue
    
    processed = get_processed_data(raw) 

    place_record = {
        "place_id": processed.get('place_id'),
        "attraction_id": idx,
        "name": processed.get('name'),
        "status": processed.get('status'),
        "lat": processed['coordinates']['lat'] if processed.get('coordinates') else None,
        "lng": processed['coordinates']['lng'] if processed.get('coordinates') else None,
        "price_level": processed.get('price_level'),
        "address": processed.get('address'),
        "city": processed.get('city'),
        "website": processed.get('website'),
        "phone": processed.get('phone'),
        "map_url": processed.get('map_url'),
        "rating": processed.get('rating'),
        "votes_count": processed.get('votes_count'),
        "opening_hours": processed.get('opening_hours'),
    }
    places_list.append(place_record)

    if processed.get('reviews'):
        for rev in processed['reviews']:
            review_record = {
                "place_id": processed.get('place_id'), # Foreign Key
                "author_name": rev.get('author'),
                "rating": rev.get('rating'),
                "text": rev.get('text'),
                "language": rev.get('language'), 
                "original_language": rev.get('original_language'),
                "timestamp": rev.get('time'), 
                "author_url": rev.get('author_url') 
            }
            reviews_list.append(review_record)

df_google_data = pd.DataFrame(places_list)
df_google_reviews = pd.DataFrame(reviews_list)


df_google_data = df_google_data.astype({
    "place_id": "string",
    "attraction_id": "int",
    "name": "string",
    "status": "string",
    "price_level": "string",
    "address": "string",
    "city": "string",
    "website": "string",
    "phone": "string",
    "map_url": "string",
    "lat": "float",
    "lng": "float",
    "rating": "float",
    "opening_hours": "string"
})

# Handle votes_count (convert to numeric, turning errors/None into NaN)
df_google_data['votes_count'] = pd.to_numeric(df_google_data['votes_count'], errors='coerce')

df_google_reviews = df_google_reviews.astype({
    "author_name": "string",
    "author_url": "string",
    "place_id": "string",
    "text": "string",
    "rating": "int",
    "timestamp": "float",
    "language": "string" ,
    "original_language": "string"
})

print(f"Processed {len(df_google_data)} places and {len(df_google_reviews)} reviews.")

564it [00:00, 67124.10it/s]

Processed 561 places and 2044 reviews.





In [9]:
print(df_google_data.shape)
df_google_data.head(2)

(561, 15)


Unnamed: 0,place_id,attraction_id,name,status,lat,lng,price_level,address,city,website,phone,map_url,rating,votes_count,opening_hours
0,ChIJvRj8ErNu5kcRfOX_xli-__I,0,Basilique Cathédrale Saint-Denis,OPERATIONAL,48.935694,2.359117,Price not available,"1 Rue de la Légion d'Honneur, 93200 Saint-Deni...",Saint-Denis,https://www.saint-denis-basilique.fr/,01 48 09 83 54,https://maps.google.com/?cid=17509923164744836476,4.6,7421.0,Monday: 10:00 AM – 4:45 PM\nTuesday: 10:00 AM ...
1,ChIJAyBtWjwIsRIRAZTNwHn7XA0,1,Site and archaeological museum Ensérune,OPERATIONAL,43.310118,3.113364,Price not available,"Site et musée d'enserune, 2901 route d'Ensérun...",Nissan-lez-e-enserune,http://www.enserune.fr/,04 67 32 60 35,https://maps.google.com/?cid=962920920694756353,4.3,666.0,Monday: Closed\nTuesday: Closed\nWednesday: 9:...


In [10]:
print(df_google_reviews.shape)
df_google_reviews.head(2)

(2044, 8)


Unnamed: 0,place_id,author_name,rating,text,language,original_language,timestamp,author_url
0,ChIJvRj8ErNu5kcRfOX_xli-__I,Przemyslaw Kowalski,5,I visited the Basilique-Cathédrale de Saint-De...,en,en,1764147000.0,https://www.google.com/maps/contrib/1091533539...
1,ChIJvRj8ErNu5kcRfOX_xli-__I,Seimen Burum,5,Impressive church and since the 7th century th...,en,en,1762984000.0,https://www.google.com/maps/contrib/1174519139...


In [11]:
import math

def distance_m(lat1, lon1, lat2, lon2):
    """Great-circle distance (Haversine) in meters between two lat/lon points."""
    R = 6371000.0  # Earth radius in meters

    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlambda = math.radians(lon2 - lon1)

    a = math.sin(dphi / 2)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlambda / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c

In [12]:
# Generally replace the cities if the google map coordinates are within 500 meters of the original ones

df_base_with_city = df_base_with_coords.copy()
CITY_RANGE_METERS = 500


# now replace the ones within 500 meters of google maps data
google_df_distance_list = df_google_data.apply(
    lambda row: distance_m(
        row['lat'], 
        row['lng'],
        df_base_with_city.loc[row['attraction_id'], 'lat'], 
        df_base_with_city.loc[row['attraction_id'], 'lng']
    ),
    axis=1
)

df_base_with_city.loc[
    df_google_data.loc[
        google_df_distance_list < CITY_RANGE_METERS
    ].attraction_id.values,
    'city'
] = df_google_data.loc[
    google_df_distance_list < CITY_RANGE_METERS
].city.values

df_base_with_city.city.isna().sum()

np.int64(9)

In [13]:
print(df_base_with_city.shape)
df_base_with_city.head(2)

(564, 15)


Unnamed: 0,name,url,short_description,ticket_price,ticket_price_conditions,opening_hours,payment_methods,address,visiting_services,ticket_price_raw,city,category,lat,lng,Tourpedia_id
0,Basilique Cathédrale de Saint-Denis,https://www.saint-denis-basilique.fr/,Discover the necropolis of the kings and queen...,17.0,Free for under 26s,From October to March\nMonday to Saturday: 10:...,coins | bank cards | cheques | Culture cheques...,"1, rue de la Légion d'Honneur, 93200 Saint-Denis",Cabin baggage ('cabines') allowed: maximum 40x...,Price: 17 €,Saint-Denis,,48.935461,2.359835,
1,Oppidum et musée archéologique d'Ensérune,https://www.enserune.fr/,Discover a major Gallo-Roman city located betw...,9.0,Free for those under 26 years old.,Please note: last access to the monument one h...,cash | bank cards | cheques | Culture cheques ...,"2901 route d’Ensérune, 34440 Nissan-lez-Ensérune",Drinking fountain | Toilets | Wheelchair | Pos...,Price: 9 €,Nissan-lez-e-enserune,,43.3103,3.1152,


# Replace null urls
Fisrt lets try to check similarity scores between existing urls 

In [14]:
import unicodedata
from thefuzz import fuzz
import re
from urllib.parse import urlparse


FRENCH_STOP = {
    # very common French function words
    "de","du","des","la","le","les","d","l","au","aux","et","en","a",
    "un","une","pour","sur","sous","chez","dans","avec","sans","par",
    # address boilerplate
    "rue","avenue","av","boulevard","bd","route","rte","place","pl",
    "impasse","imp","square","sq","chemin","ch","allee","all",
    "saint","sainte","ste","st",
    # phone / contact noise
    "tel","telephone","t","contact"
}


LEGAL_WORDS = {
    "sarl","sas","sa","eurl","sasu","sci","scm","snc","selarl","selas",
    "association","assoc","societe","ste","ets","etablissements"
}

ADDR_ABBREV = {
    "av": "avenue",
    "bd": "boulevard",
    "bld": "boulevard",
    "rte": "route",
    "st": "saint",
    "ste": "sainte",
    "pl": "place",
    "imp": "impasse",
    "sq": "square",
}

def strip_accents(s: str) -> str:
    s = unicodedata.normalize("NFKD", s)
    return "".join(c for c in s if not unicodedata.combining(c))

def normalize_text(s: str) -> str:
    if pd.isna(s):
        return ""
    s = s.lower().strip()
    s = strip_accents(s)
    s = re.sub(r"[^a-z0-9]+", " ", s)   # punctuation -> space
    s = re.sub(r"\s+", " ", s).strip()
    s = " ".join([t for t in s.split() if t not in FRENCH_STOP])
    return s

def normalize_name(name: str) -> str:
    s = normalize_text(str(name) if name and name != "nan" else "")
    toks = [t for t in s.split() if t not in LEGAL_WORDS]
    return " ".join(toks)

def normalize_address(addr: str) -> str:
    s = normalize_text(str(addr) if addr and addr != "nan" else "")
    toks = [ADDR_ABBREV.get(t, t) for t in s.split()]
    toks = [t for t in toks if not t.isdigit() or len(t) == 5]
    return " ".join(toks)

def normalize_url(u: str) -> str:
    if pd.isna(u):
        return ""
    u = u.strip()
    if not re.match(r"^[a-z]+://", u, re.I):
        u = "http://" + u
    p = urlparse(u)
    host = (p.netloc or "").lower()
    host = host[4:] if host.startswith("www.") else host
    path = re.sub(r"/+$", "", (p.path or "").lower())
    return host + path  # drop query/fragment

def make_fingerprint(name: str, address: str, url: str) -> str:
    return f"{normalize_name(name)} {normalize_address(address)} {normalize_url(url)}".strip()

def url_domain(u_norm: str) -> str:
    return u_norm.split("/")[0] if u_norm else ""


def token_set(s: str):
    return set(s.split()) if s else set()

def jaccard(a: str, b: str) -> float:
    A, B = token_set(a), token_set(b)
    if not A or not B:
        return 0.0
    return len(A & B) / len(A | B)

def score_pair(r1, r2):
    n1, a1, u1 = r1["name"], r1["address"], r1["url"]
    n2, a2, u2 = r2["name"], r2["address"], r2["url"]

    # Gate 1: if both URLs exist and domains differ -> non-match and if domains same -> match
    if not pd.isna(u1) and not pd.isna(u2):
        if url_domain(u1) != url_domain(u2):
            return 0.0
        else:
            return 100.0

    # Gate 2: require minimal name token overlap to avoid generic matches
    if jaccard(n1, n2) < 0.15:   # tune 0.15–0.30 depending on your data
        return 0.0

    fingerprint1 = make_fingerprint(n1, a1, u1)
    fingerprint2 = make_fingerprint(n2, a2, u2)

    return fuzz.token_set_ratio(fingerprint1, fingerprint2) if (not pd.isna(fingerprint1) and not pd.isna(fingerprint2)) else 0

In [15]:
SIMILARITY_THRESHOLD = 85
df_base_with_urls_and_openings = df_base_with_city.copy()

similarity_scores = df_google_data.rename(columns={"website": "url"}).apply(
    lambda row: score_pair(
        row,
        df_base_with_urls_and_openings.loc[row.name]
    ),
    axis=1
)
(similarity_scores > SIMILARITY_THRESHOLD).sum()

np.int64(32)

Not so good maybe it's better to combine it with distance which is a much more reliable metric

In [16]:
IDENTICLY_MATCH_RANGE_METERS = 100
(similarity_scores > SIMILARITY_THRESHOLD).sum(), (google_df_distance_list < IDENTICLY_MATCH_RANGE_METERS).sum(), ((similarity_scores > SIMILARITY_THRESHOLD) | (google_df_distance_list < IDENTICLY_MATCH_RANGE_METERS)).sum()

(np.int64(32), np.int64(415), np.int64(425))

In [17]:
df_base_with_urls_and_openings = df_base_with_city.copy()

df_base_with_urls_and_openings.loc[
    df_google_data.loc[
        google_df_distance_list < IDENTICLY_MATCH_RANGE_METERS
    ].attraction_id.values,
    'opening_hours'
] = df_google_data.loc[
    google_df_distance_list < IDENTICLY_MATCH_RANGE_METERS
].opening_hours.values

df_base_with_urls_and_openings.loc[
    df_google_data.loc[
        google_df_distance_list < IDENTICLY_MATCH_RANGE_METERS
    ].attraction_id.values,
    'url'
] = df_google_data.loc[
    google_df_distance_list < IDENTICLY_MATCH_RANGE_METERS
].website.values

In [18]:
df_base_with_city.url.isna().sum(), df_base_with_city.opening_hours.isna().sum()

(np.int64(444), np.int64(476))

In [20]:
df_base_with_urls_and_openings.url.isna().sum(), df_base_with_urls_and_openings.opening_hours.isna().sum()

(np.int64(273), np.int64(119))

# Cleaning 
Now Let's Drop Every google map data with more that 100 meters distance or not high similarity score

In [21]:
df_base_with_urls_and_openings.to_csv("merged_datasets/with_coords/with_google_map_url_city_opening_hour/france_monuments.csv", index=False)

In [22]:
df_google_data_cleaned = df_google_data.loc[(google_df_distance_list < IDENTICLY_MATCH_RANGE_METERS) | (similarity_scores > SIMILARITY_THRESHOLD)]
df_google_reviews_cleaned = df_google_reviews.loc[df_google_reviews['place_id'].isin(df_google_data_cleaned['place_id'])]

df_google_data_cleaned.to_csv("merged_datasets/with_coords/with_google_map_url_city_opening_hour/google_maps_data_cleaned.csv", index=False)
df_google_reviews_cleaned.to_csv("merged_datasets/with_coords/with_google_map_url_city_opening_hour/google_maps_reviews_cleaned.csv", index=False)