In [53]:
import requests
import pandas as pd
from tqdm import tqdm

In [54]:
# Overpass API URL
overpass_url = "http://overpass-api.de/api/interpreter"

# Query for all features in Poland with tags
query = """
[out:json][timeout:180];
(
  node["historic"="monument"](46.0,10.0,55.0,24.0);
  node["tourism"="attraction"](46.0,10.0,55.0,24.0);
  node["amenity"="memorial"](46.0,10.0,55.0,24.0);
  way["historic"="monument"](46.0,10.0,55.0,24.0);
  way["tourism"="attraction"](46.0,10.0,55.0,24.0);
  way["amenity"="memorial"](46.0,10.0,55.0,24.0);
  relation["historic"="monument"](46.0,10.0,55.0,24.0);
  relation["tourism"="attraction"](46.0,10.0,55.0,24.0);
  relation["amenity"="memorial"](46.0,10.0,55.0,24.0);
);
out center tags;
"""




# Fetch data from Overpass API
response = requests.get(overpass_url, params={"data": query})

if response.status_code != 200:
    print(f"Error: {response.status_code}")
    exit()


In [55]:
# Parse response
data = response.json()
elements = data.get("elements", [])

# Extract relevant data
monuments = []
for elem in tqdm(elements, desc="Processing data"):
    tags = elem.get("tags", {})
    lat = elem.get("lat") or elem.get("center", {}).get("lat")
    lon = elem.get("lon") or elem.get("center", {}).get("lon")
    if lat and lon:
        monuments.append({
            "name": tags.get("name", "Unnamed"),
            "latitude": lat,
            "longitude": lon,
            "historic": tags.get("historic", "Unknown"),
            "tourism": tags.get("tourism", "Unknown"),
            "amenity": tags.get("amenity", "Unknown")
        })

Processing data: 100%|██████████| 36014/36014 [00:00<00:00, 1142104.99it/s]


In [56]:
# Convert to DataFrame
df = pd.DataFrame(monuments)

# Remove columns with too many null values
threshold = 0.9  # Keep columns with at least 10% non-null values
df = df.loc[:, df.isnull().mean() < threshold]


In [57]:
# Remove irrelevant place types dynamically
exclude_place_types = ["neighbourhood", "peak", "waterway"]
if "place" in df.columns:
    df = df[~df["place"].isin(exclude_place_types)]

# Drop duplicates
df = df.drop_duplicates(subset=["name", "latitude", "longitude"])

In [58]:
df.isnull().sum()

name         0
latitude     0
longitude    0
historic     0
tourism      0
amenity      0
dtype: int64

In [59]:
print(df.columns)
df = df[['name', 'latitude', 'longitude']]
df = df[
    (df["latitude"] >= 49.0) & (df["latitude"] <= 55.0) &
    (df["longitude"] >= 14.0) & (df["longitude"] <= 24.0)
]

Index(['name', 'latitude', 'longitude', 'historic', 'tourism', 'amenity'], dtype='object')


In [60]:
to_remove = ["Poland", "West Poland", "South Poland", "South Poland", "East Poland"]  
df = df[~df['name'].isin(to_remove)]  
df['name'].unique()

array(['Vlčí rokle', 'Vodní tunel pod tratí', 'Hrad Šternberk', ...,
       'Pomnik Poległych Stoczniowców w Gdańsku', 'Staw Kormoranów',
       'Molo Kołobrzeg'], dtype=object)

In [61]:
print(f"\nTotal Monuments Retrieved: {len(df)}")
print(df.head())


Total Monuments Retrieved: 16566
                     name   latitude  longitude
0              Vlčí rokle  49.874643  14.531274
1   Vodní tunel pod tratí  49.872497  14.514091
3          Hrad Šternberk  49.733078  17.301858
14              Giszowiec  50.221804  19.067786
22     Krypta Oleśnickich  50.859276  21.053433


In [62]:
# Save to CSV
df.to_csv("poland_monuments_filtered.csv", index=False)