In [7]:
import random
import re

import osmnx as ox
import pandas as pd
import requests

In [8]:
# Matches the first number in a free-text OSM population value: either digit
# groups joined by thousands separators ("1,000,000", "2 500 000", "1.000.000")
# or a plain run of digits (the "544851" in "544851.0").
_POPULATION_NUMBER = re.compile(r"\d{1,3}(?:[ ,.\u00a0]\d{3})+(?!\d)|\d+")

# Offsets Overpass adds to an OSM element ID to derive the matching area ID.
_OVERPASS_AREA_OFFSETS = {"relation": 3_600_000_000, "way": 2_400_000_000}


def _parse_population(raw):
    """Return the first integer found in an OSM ``population`` tag, or None if it has no digits."""
    match = _POPULATION_NUMBER.search(str(raw))
    if match is None:
        return None
    # Strip the thousands separators that the pattern allowed inside the match.
    return int(re.sub(r"\D", "", match.group()))


def get_top_cities(country_name, limit=4):
    """
    Finds the most populated cities in a country using a lightweight Overpass query.

    Args:
        country_name: Place name passed to ``ox.geocode_to_gdf`` to locate the country.
        limit: Maximum number of cities to return.

    Returns:
        A list of strings ["City, Country", ...] ordered by descending population.
        An empty list when no element carries both a name and a usable population.

    Raises:
        ValueError: If the geocoded element is neither a relation nor a way, because
            no Overpass area ID can be derived from it.
        requests.HTTPError: If the Overpass server answers with an error status.
    """
    # 1. Get the OSM Area ID for the country
    place = ox.geocode_to_gdf(country_name).iloc[0]
    osm_type = place.get("osm_type", "relation")
    offset = _OVERPASS_AREA_OFFSETS.get(osm_type)
    if offset is None:
        raise ValueError(
            f"Cannot derive an Overpass area from OSM {osm_type!r} for {country_name!r}"
        )
    area_id = int(place["osm_id"]) + offset

    # 2. Define a raw Overpass query
    # We ask for nodes/relations tagged as 'city' and ensure they have a 'population' tag
    # We output JSON and only the tags (no heavy geometry)
    overpass_url = "https://overpass-api.de/api/interpreter"
    overpass_query = f"""
    [out:json][timeout:25];
    area({area_id})->.searchArea;
    (
      node["place"="city"]["population"](area.searchArea);
      relation["place"="city"]["population"](area.searchArea);
    );
    out tags;
    """

    print(f"Finding cities in {country_name}...")
    # The client timeout is longer than the 25 s server-side limit in the query so the
    # server's own timeout response can still arrive; without it a stalled connection
    # would block the notebook indefinitely.
    response = requests.get(overpass_url, params={'data': overpass_query}, timeout=60)
    # Rate-limit / overload replies are not JSON; fail with the HTTP status instead of
    # a confusing decode error.
    response.raise_for_status()
    data = response.json()

    # 3. Parse data into a DataFrame
    cities_data = []
    for element in data.get('elements', []):
        tags = element.get('tags', {})
        name = tags.get('name')
        pop = tags.get('population')
        if not (name and pop):
            continue

        # Take the first number only: joining every digit in the tag would turn
        # "544851.0" into 5448510 and "193 333 (2021)" into 1933332021.
        clean_pop = _parse_population(pop)
        if clean_pop is not None:
            cities_data.append({'name': name, 'population': clean_pop})

    df = pd.DataFrame(cities_data, columns=['name', 'population'])

    # 4. Sort and slice
    if df.empty:
        print("No cities with population data found.")
        return []

    # Remove duplicates (sometimes a city appears as both node and relation);
    # sorting first means the highest population figure is the one kept.
    df = df.sort_values('population', ascending=False).drop_duplicates(subset='name')
    top_cities = df.head(limit)['name'].tolist()

    # Format them as "City, Country" for OSMnx
    return [f"{city}, {country_name}" for city in top_cities]

In [9]:
country = "Portugal"

# Look up the four most populous cities of the selected country and show them.
top_4_places = get_top_cities(country_name=country, limit=4)
print(f"Top cities found: {top_4_places}")

Finding cities in Portugal...
Top cities found: ['Lisboa, Portugal', 'Porto, Portugal', 'Vila Nova de Gaia, Portugal', 'Amadora, Portugal']


In [3]:
# Fetch every OSM feature tagged place=city inside the geocoded boundary of Poland.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt while OSMnx
# was still inside its geometry-subdivision step, so a country-wide query looks too
# heavy for interactive use — confirm whether this cell is still needed.
place = "Poland"
tags = {
    "place": "city"
}

cities = ox.features.features_from_place(place, tags)
cities

  multi_poly_proj = utils_geo._consolidate_subdivide_geometry(poly_proj)


KeyboardInterrupt: 

In [32]:
# Place name used for the shop queries.
place = "Warsaw, Poland"

# Columns kept from each downloaded feature table.
cols_of_interest = ['name', 'shop', 'geometry']

# OSM `shop=*` values, grouped by retail segment.
tags_furniture = {
    "shop": [
        "furniture",
        "bed",
        "kitchen",
        "lighting",
        "interior_decoration",
        "bathroom_furnishing",
    ]
}
tags_technology = {
    "shop": [
        "electronics",
        "computer",
        "mobile_phone",
        "hifi",
        "photo",
        "video_games",
    ]
}
tags_office = {
    "shop": [
        "office_supplies",
        "stationery",
        "copyshop",
        "printer_ink",
    ]
}


In [30]:
# Download the shops of each retail segment for `place`, one query per tag set,
# in the order furniture -> technology -> office supplies.
furniture, technology, office_supplies = (
    ox.features.features_from_place(place, shop_tags)
    for shop_tags in (tags_furniture, tags_technology, tags_office)
)

In [37]:
# Keep only the columns of interest and discard any shop that is missing one of them.
furniture_df, technology_df, office_supplies_df = (
    features[cols_of_interest].dropna(how="any")
    for features in (furniture, technology, office_supplies)
)

In [48]:
number_of_stores = 10

# Sample from the distinct shop names so a chain with several branches cannot be
# drawn twice, and cap the sample size so random.sample does not raise ValueError
# when fewer than `number_of_stores` names are available.
# NOTE(review): no random seed is set, so the selection changes on every run.
furniture_names = furniture_df["name"].unique().tolist()
companies = random.sample(furniture_names, min(number_of_stores, len(furniture_names)))
companies

['Zara Home',
 'Egoé',
 'home&you',
 'Studio Dekoracji Okien',
 'MebleSosnowe.eu',
 'Mebloket',
 'TomDom.pl',
 'Sypialnia Plus',
 'Vives',
 'AMS Meble']

In [52]:
# Show the first feature row whose name is 'Zara Home'.
is_zara_home = furniture_df["name"].eq('Zara Home')
print(furniture_df.loc[is_zara_home].iloc[0])

name                            Zara Home
shop                  interior_decoration
geometry    POINT (20.9300639 52.2943521)
Name: (node, 6495128832), dtype: object


In [107]:
# Product categories, hard-coded here. The commented-out lines are the earlier
# approach, which apparently read them from the 'product_category_name' column of
# data2.csv.
# columns_to_extract = ['product_category_name']
# product_values = extract_columns('data2.csv', columns_to_extract)
product_values = ['Paper', 'Machines', 'Tables', 'Copiers', 'Supplies', 'Fasteners', 'Storage', 'Envelopes', 'Accessories', 'Bookcases', 'Chairs', 'Art', 'Appliances', 'Labels', 'Phones', 'Binders', 'Furnishings']

# Distinct non-missing values of the OSM `craft` tag. dropna() replaces the previous
# `str(x) != 'nan'` test, which compared string representations instead of checking
# for missing values.
# NOTE(review): `buildings` is not defined in the cells shown here — confirm it is
# created before this cell on a fresh Restart & Run All.
craft_values = buildings["craft"].dropna().unique().tolist()

In [39]:
# Distinct brand names; the isinstance check drops missing (non-string) entries.
# NOTE(review): `buildings` is not defined in the cells shown here, and this cell
# overwrites the `companies` list built from the furniture shops — confirm both
# are intended.
industrial = [x for x in buildings["brand"].unique() if isinstance(x, str)]
# Cap the sample size so random.sample does not raise ValueError when fewer than
# ten brands are present.
companies = random.sample(industrial, min(10, len(industrial)))
print(companies)

['Dbam o Zdrowie', 'Maxi Zoo', 'Biedronka', 'Lukullus', 'Twoje Auto Twoje SPA', 'Costa', 'Orlen', "Domino's", 'PKO BP', 'Rossmann']
