In [19]:
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import time

geolocator = Nominatim(user_agent="skyscraper-scraper")

def get_country_from_city(city_name):
    try:
        location = geolocator.geocode(city_name, exactly_one=True, timeout=10)
        if location and location.address:
            for part in location.address.split(",")[::-1]:
                if part.strip().isalpha():
                    return part.strip()
    except GeocoderTimedOut:
        time.sleep(1)
        return get_country_from_city(city_name)
    except Exception:
        pass
    return None

In [35]:
scrape_skyscrapers(
    "https://www.skyscrapercenter.com/buildings?status=completed&material=all&function=all&location=world&year=2025",
    "100_tallest_constructed.csv"
)

scrape_skyscrapers(
    "https://www.skyscrapercenter.com/buildings?status=construction&material=all&function=all&location=world&year=2025",
    "100_tallest_under_construction.csv"
)

scrape_skyscrapers(
    "https://www.skyscrapercenter.com/buildings?status=proposed&material=all&function=all&location=world&year=2025",
    "100_tallest_proposed.csv"
)

🔎 Scraping from: https://www.skyscrapercenter.com/buildings?status=completed&material=all&function=all&location=world&year=2025
✅ Saved to 100_tallest_constructed.csv
🔎 Scraping from: https://www.skyscrapercenter.com/buildings?status=construction&material=all&function=all&location=world&year=2025
✅ Saved to 100_tallest_under_construction.csv
🔎 Scraping from: https://www.skyscrapercenter.com/buildings?status=proposed&material=all&function=all&location=world&year=2025
✅ Saved to 100_tallest_proposed.csv


In [None]:
city_country_cache = {
    # Already provided
    "Jeddah": "Saudi Arabia",
    "Dubai": "United Arab Emirates",
    "Shanghai": "China",
    "Dongguan": "China",
    "New York City": "United States",
    "Abidjan": "Ivory Coast",
    "Ras al Khaimah": "United Arab Emirates",
    "Bangkok": "Thailand",
    "Miami": "United States",
    "Austin": "United States",
    "New York": "United States",
    "Toronto": "Canada",
    "Oklahoma City": "United States",

    # Newly identified
    "Mecca": "Saudi Arabia",
    "Seoul": "South Korea",
    "St. Petersburg": "Russia",
    "Ho Chi Minh City": "Vietnam",
    "Busan": "South Korea",
    "Ningbo": "China",
    "Nanning": "China",
    "Abu Dhabi": "United Arab Emirates",
    "Istanbul": "Turkey",
    "Xi’an": "China",
    "Kaohsiung": "Taiwan",
    "Goyang": "South Korea",
    "Durban": "South Africa",
    "Colombo": "Sri Lanka",
    "Watamu": "Kenya",
    "Beijing": "China",
    "Riyadh": "Saudi Arabia",
    "Moscow": "Russia",
    "Philadelphia": "United States",
    "Los Angeles": "United States",
    "Chicago": "United States",
    "Osaka" : "Japan",
    "Bogota" : "Colombai"
}

city_country_cache["Osaka"] = "Japan"
city_country_cache["Johannesburg"] = "South Africa"
city_country_cache["London"] = "United Kingdom"

In [None]:
city_country_cache["Toronto"] = "Canada"

In [53]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import time
import sys
import os

geolocator = Nominatim(user_agent="city-country-mapper")

US_STATES = {
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut",
    "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa",
    "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan",
    "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire",
    "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio",
    "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota",
    "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia",
    "Wisconsin", "Wyoming"
}


# ✅ Add this cache dictionary


def get_country(city):
    if city in city_country_cache:
        return city_country_cache[city]

    try:
        location = geolocator.geocode(city, exactly_one=True, language="en")
        if location and location.address:
            address_parts = [part.strip() for part in location.address.split(",")]
            # Check if any known US state is in the address
            for part in reversed(address_parts):
                if part in US_STATES:
                    city_country_cache[city] = "United States"
                    return "United States"
                elif part.isalpha():
                    city_country_cache[city] = part
                    return part
    except GeocoderTimedOut:
        print("EXCEPT")
        time.sleep(1)
        return get_country(city)
    except Exception:
        pass

    city_country_cache[city] = None
    return None


def add_country_column(csv_path):
    df = pd.read_csv(csv_path)
    print(f"🌍 Resolving countries for {len(df)} rows...")

    countries = []
    for i, city in enumerate(df['City']):
        country = get_country(city)
        countries.append(country)
        print(f"{i+1}/{len(df)}: {city} → {country}")

    df.insert(df.columns.get_loc("City") + 1, "Country", countries)

    output_path = os.path.splitext(csv_path)[0] + "_with_countries.csv"
    df.to_csv(output_path, index=False)
    print(f"✅ Saved: {output_path}")

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

HEADERS = {"User-Agent": "Mozilla/5.0"}

def extract_height_parts(p_tag):
    try:
        text = p_tag.get_text().strip().replace("\xa0", " ")  # e.g., "828 m / 2,717 ft"
        parts = text.split('/')
        meters = parts[0].strip().replace(" m", "").replace(",", "")
        feet = parts[1].strip().replace(" ft", "").replace(",", "")
        return int(meters), int(feet)
    except Exception as e:
        print(f"⚠️ Failed to parse height: {e}")
        return None, None

def scrape_skyscrapers():
    URL = "https://www.skyscrapercenter.com/buildings"
    response = requests.get(URL, headers=HEADERS)
    soup = BeautifulSoup(response.content, "html.parser")
    table_rows = soup.find_all("tr")

    buildings = []
    for row in table_rows:
        cols = row.find_all("td")
        if len(cols) < 9:
            continue  # skip non-data rows

        try:
            name = cols[1].find("a").text.strip()
            city = cols[2].find("a").text.strip()
            completion = cols[4].text.strip()

            height_p = cols[5].find("p")
            height_m, height_ft = extract_height_parts(height_p)

            floors = cols[6].text.strip()
            material = cols[7].text.strip()
            function = cols[8].text.strip()

            buildings.append({
                "Name": name,
                "City": city,
                "Completion Year": completion,
                "Height (m)": height_m,
                "Height (ft)": height_ft,
                "Floors": floors,
                "Material": material,
                "Function": function,
            })
        except Exception as e:
            print(f"⚠️ Skipping row due to error: {e}")
            continue

    df = pd.DataFrame(buildings)
    df.to_csv("100_tallest_buildings.csv", index=False)
    print("✅ Scraping complete. Data saved to tallest_buildings.csv")


✅ Scraping complete. Data saved to tallest_buildings.csv


In [None]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import time
import sys
import os

geolocator = Nominatim(user_agent="city-country-mapper")

# ✅ Add this cache dictionary
city_country_cache = {}

def get_country(city):
    if city in city_country_cache:
        return city_country_cache[city]
    
    try:
        location = geolocator.geocode(city, exactly_one=True, language="en")
        if location and location.address:
            for part in location.address.split(",")[::-1]:
                if part.strip().isalpha():
                    country = part.strip()
                    city_country_cache[city] = country
                    return country
    except GeocoderTimedOut:
        time.sleep(1)
        return get_country(city)
    except Exception:
        pass

    city_country_cache[city] = None
    return None

def add_country_column(csv_path):
    df = pd.read_csv(csv_path)
    print(f"🌍 Resolving countries for {len(df)} rows...")

    countries = []
    for i, city in enumerate(df['City']):
        country = get_country(city)
        countries.append(country)
        print(f"{i+1}/{len(df)}: {city} → {country}")

    df.insert(df.columns.get_loc("City") + 1, "Country", countries)

    output_path = os.path.splitext(csv_path)[0] + "_with_countries.csv"
    df.to_csv(output_path, index=False)
    print(f"✅ Saved: {output_path}")


In [None]:
scrape_skyscrapers(
    "https://www.skyscrapercenter.com/buildings?status=completed&material=all&function=all&location=world&year=2025",
    "100_tallest_constructed.csv"
)

scrape_skyscrapers(
    "https://www.skyscrapercenter.com/buildings?status=construction&material=all&function=all&location=world&year=2025",
    "100_tallest_under_construction.csv"
)

scrape_skyscrapers(
    "https://www.skyscrapercenter.com/buildings?status=proposed&material=all&function=all&location=world&year=2025",
    "100_tallest_proposed.csv"
)

🔎 Scraping from: https://www.skyscrapercenter.com/buildings?status=completed&material=all&function=all&location=world&year=2025
✅ Saved to 100_tallest_constructed.csv
🔎 Scraping from: https://www.skyscrapercenter.com/buildings?status=construction&material=all&function=all&location=world&year=2025
✅ Saved to 100_tallest_under_construction.csv
🔎 Scraping from: https://www.skyscrapercenter.com/buildings?status=proposed&material=all&function=all&location=world&year=2025
✅ Saved to 100_tallest_proposed.csv


In [73]:
for year in range(1890, 1970, 10):
    url = f"https://www.skyscrapercenter.com/buildings?status=completed&material=all&function=all&location=world&year={year}"
    output_filename = f"100_tallest_constructed_world_{year}.csv"
    print(f"🔎 Scraping for year {year}...")
    scrape_skyscrapers(url, output_filename)

🔎 Scraping for year 1890...
🔎 Scraping from: https://www.skyscrapercenter.com/buildings?status=completed&material=all&function=all&location=world&year=1890
✅ Saved to 100_tallest_constructed_world_1890.csv
🔎 Scraping for year 1900...
🔎 Scraping from: https://www.skyscrapercenter.com/buildings?status=completed&material=all&function=all&location=world&year=1900
✅ Saved to 100_tallest_constructed_world_1900.csv
🔎 Scraping for year 1910...
🔎 Scraping from: https://www.skyscrapercenter.com/buildings?status=completed&material=all&function=all&location=world&year=1910
✅ Saved to 100_tallest_constructed_world_1910.csv
🔎 Scraping for year 1920...
🔎 Scraping from: https://www.skyscrapercenter.com/buildings?status=completed&material=all&function=all&location=world&year=1920
⚠️ Failed to parse height: list index out of range
✅ Saved to 100_tallest_constructed_world_1920.csv
🔎 Scraping for year 1930...
🔎 Scraping from: https://www.skyscrapercenter.com/buildings?status=completed&material=all&functio

In [None]:
for year in range(1890, 1970, 10):
    filename = f"100_tallest_constructed_world_{year}.csv"
    print(f"🌍 Resolving countries for {filename}...")
    add_country_column(filename)


🌍 Resolving countries for 100_tallest_constructed_world_1890.csv...
🌍 Resolving countries for 100 rows...
1/100: Turin → Italy
2/100: Ulm → Germany
3/100: Cologne → Germany
4/100: Rouen → France
5/100: Hamburg → Germany
6/100: Strasbourg → France
7/100: Vienna → Austria
8/100: Hamburg → Germany
9/100: Vatican City → None
10/100: Hamburg → Germany
11/100: Landshut → Germany
12/100: Tallinn → Estonia
13/100: Riga → Latvia
14/100: Antwerp → Belgium
15/100: Salisbury → England
16/100: St. Petersburg → Russia
17/100: Hamburg → Germany
18/100: Florence → Italy
19/100: Utrecht → Netherlands
20/100: London → United Kingdom
21/100: Springfield (IL) → United States
22/100: Graz → Austria
23/100: Delft → Netherlands
24/100: Milan → Italy
25/100: Tallinn → Estonia
26/100: Dortmund → Germany
27/100: Dortmund → Germany
28/100: New York City → United States
29/100: London → United Kingdom
30/100: St. Petersburg → Russia
31/100: Amersfoort → Netherlands
32/100: Chicago → United States
33/100: Vienna →