In [1]:
import requests
import pandas as pd
from tqdm import tqdm


In [2]:
# Overpass API URL
overpass_url = "http://overpass-api.de/api/interpreter"

# Query for all features in Poland with tags
query = """
[out:json][timeout:1800];
area["name"="Poland"]->.searchArea;
(
  node(area.searchArea);
  way(area.searchArea);
  relation(area.searchArea);
);
out center tags;
>;
out skel qt;
"""

# Fetch data from Overpass API
response = requests.get(overpass_url, params={"data": query})

if response.status_code != 200:
    print(f"Error: {response.status_code}")
    exit()


In [3]:
# Parse the response
data = response.json()
elements = data.get("elements", [])

# Extract data dynamically
monuments = []
for elem in tqdm(elements, desc="Processing data"):
    tags = elem.get("tags", {})
    lat = elem.get("lat") or elem.get("center", {}).get("lat")
    lon = elem.get("lon") or elem.get("center", {}).get("lon")
    if tags.get("name") and lat and lon:
        record = {
            "name": tags.get("name", "Unnamed"),
            "latitude": lat,
            "longitude": lon,
        }
        # Include all tags dynamically
        record.update(tags)
        monuments.append(record)


Processing data: 100%|██████████| 143795/143795 [00:00<00:00, 1893733.22it/s]


In [4]:
# Convert to DataFrame
df = pd.DataFrame(monuments)

# Remove columns with too many null values
threshold = 0.9  # Keep columns with at least 10% non-null values
df = df.loc[:, df.isnull().mean() < threshold]


In [5]:
# Remove irrelevant place types dynamically
exclude_place_types = ["neighbourhood", "peak", "waterway"]
if "place" in df.columns:
    df = df[~df["place"].isin(exclude_place_types)]

# Drop duplicates
df = df.drop_duplicates(subset=["name", "latitude", "longitude"])

In [10]:
df.isnull().sum()

name                 0
latitude             0
longitude            0
building           753
highway            241
tiger:cfcc         496
tiger:county       496
tiger:name_base    496
tiger:name_type    516
tiger:zip_left     670
tiger:zip_right    716
tiger:reviewed     513
dtype: int64

In [None]:
print(df.columns)
df = df[['name', 'latitude', 'longitude']]


Index(['name', 'latitude', 'longitude', 'building', 'highway', 'tiger:cfcc',
       'tiger:county', 'tiger:name_base', 'tiger:name_type', 'tiger:zip_left',
       'tiger:zip_right', 'tiger:reviewed'],
      dtype='object')


In [15]:
to_remove = ["Poland", "West Poland", "South Poland", "South Poland", "East Poland"]  
df = df[~df['name'].isin(to_remove)]  
df['name'].unique()

array(['Poland Spring', 'Oak Hill', 'Five Corners', 'Promised Land',
       'Moussam', 'Hackett Mills', 'Empire',
       'South Main Street Historic District',
       'Chapel of the Evangelical Free Church', 'Bailey Hill',
       'Black Cat Mountain', 'Harris Hill', 'Johnson Hill',
       'Megquier Hill', 'White Oak Hill', 'Range Hill', 'Raspberry Hill',
       'Ricker Hill', 'Shaker Hill', 'Waterhouse Brook Dam',
       'Elan Rehabilitation Center', 'Empire Grove Church',
       'Poland Baptist Church', 'Poland Community Church', 'Potash Cove',
       'Lower Range Seaplane Base', 'Pro Shop', "Dunkin' Donuts",
       'Subway', 'Family Dollar', 'Vernal Pool', 'White Oaks',
       'East Poland Post Office',
       'Androscoggin County Sheriffs Department Poland Unit',
       'FairPoint Central Office', 'Big Apple', 'Maine DOT',
       'West Poland Post Office', 'Poland Fire and Rescue Department',
       'Poland Public Works', "McDonald's", 'Poland Transfer Station',
       'J.M. Morin E

In [16]:
print(f"\nTotal Monuments Retrieved: {len(df)}")
print(df.head())


Total Monuments Retrieved: 839
            name   latitude  longitude
2  Poland Spring  44.075133 -70.416299
5       Oak Hill  44.050350 -70.408668
6   Five Corners  44.075350 -70.416446
8  Promised Land  44.067850 -70.438668
9        Moussam  44.080350 -70.340889


In [17]:
# Save to CSV
df.to_csv("poland_monuments_filtered.csv", index=False)