In [14]:
!pip install -q aiohttp aiodns async-timeout requests beautifulsoup4 pandas

In [15]:
import asyncio
import aiohttp
from aiohttp import ClientSession
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
import time

BASE = "https://www.worldometers.info/"

In [16]:
resp = requests.get(BASE)
resp.raise_for_status()
html_text = resp.text
print("=== HTML Source of worldometers homepage (first 2000 chars) ===\n")
print(html_text[:2000])
print("\n... (HTML truncated for display) ...\n")

=== HTML Source of worldometers homepage (first 2000 chars) ===

<!DOCTYPE html><html lang="en" dir="ltr"> <head><meta charset="UTF-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width, initial-scale=1"><title>Worldometer - real time world statistics</title><!-- <I18nTags /> --><script async src="https://www.googletagmanager.com/gtag/js?id=G-ZDP3BFSX60"></script> <script>
    window.dataLayer = window.dataLayer || [];
    function gtag() {
      // eslint-disable-next-line prefer-rest-params
      dataLayer.push(arguments);
    }
    gtag("js", new Date());
    gtag("config", "G-ZDP3BFSX60", {
      page_lang: document.documentElement.lang,
    });
</script><script src="https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js?client=ca-pub-3701697624350410" crossorigin="anonymous"></script> <script async src="https://securepubads.g.doubleclick.net/tag/js/gpt.js"></script> <link rel="preconnect" href="https://a.pub.network/" cro

In [17]:
countries_index = "https://www.worldometers.info/world-population/population-by-country/"
r = requests.get(countries_index)
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")

In [18]:
country_rows = soup.select("table#example2 tbody tr")  # table id may be example2 on their page
country_links = []
for tr in country_rows:
    a = tr.select_one("a")
    if a and a.has_attr("href"):
        href = urljoin(countries_index, a["href"])
        name = a.get_text(strip=True)
        country_links.append({"country": name, "url": href})

print(f"Found {len(country_links)} country links (will attempt to parse each).")

Found 233 country links (will attempt to parse each).


In [19]:
largest_cities_page = urljoin(BASE, "population/largest-cities-in-the-world/")
extra_pages = [{"country": "Largest Cities (global)", "url": largest_cities_page}]

In [20]:
targets = country_links + extra_pages

In [21]:
sem = asyncio.Semaphore(30)   # limit concurrency to be polite
results = []

async def fetch_and_parse(session: ClientSession, item):
    url = item["url"]
    country_name = item["country"]
    try:
        async with sem:
            async with session.get(url, timeout=30) as resp:
                text = await resp.text()
    except Exception as e:
        # on error, skip
        return

    sp = BeautifulSoup(text, "html.parser")

    found = False

    for header_tag in sp.select("h2, h3, h4, strong, p"):
        header_text = header_tag.get_text(" ", strip=True).lower()
        if "main cities" in header_text or "main cities by population" in header_text or "major cities" in header_text:
            # find the next table or list after this header
            nxt_table = header_tag.find_next_sibling("table")
            if nxt_table:
                for row in nxt_table.select("tbody tr"):
                    cols = [td.get_text(strip=True) for td in row.select("td")]
                    if len(cols) >= 1:
                        city = cols[0]
                        pop = cols[1] if len(cols) > 1 else ""
                        results.append({
                            "country": country_name,
                            "city": city,
                            "city_population": pop,
                            "source_url": url,
                            "note": "from table after header"
                        })
                        found = True
            nxt_list = header_tag.find_next_sibling()
            if nxt_list and nxt_list.name in ("ul","ol"):
                for li in nxt_list.select("li"):
                    # try to split name and population by dash or comma
                    text = li.get_text(" ", strip=True)
                    parts = [p.strip() for p in text.replace("—","-").split("-")]
                    city = parts[0]
                    pop = parts[1] if len(parts) > 1 else ""
                    results.append({
                        "country": country_name,
                        "city": city,
                        "city_population": pop,
                        "source_url": url,
                        "note": "from list after header"
                    })
                    found = True

    if not found:
        for table in sp.select("table"):
            headers = [th.get_text(" ", strip=True).lower() for th in table.select("thead th")]
            if any("city" in h or "population" in h for h in headers):
                for row in table.select("tbody tr"):
                    cols = [td.get_text(" ", strip=True) for td in row.select("td")]
                    if not cols:
                        continue
                    # attempt best-guess mapping
                    city = cols[0]
                    pop = ""
                    # try to find numeric-looking column
                    for c in cols[1:]:
                        if any(ch.isdigit() for ch in c):
                            pop = c
                            break
                    results.append({
                        "country": country_name,
                        "city": city,
                        "city_population": pop,
                        "source_url": url,
                        "note": "from guessed table"
                    })
                    found = True

    if "largest-cities-in-the-world" in url:
        table = sp.select_one("table")
        if table:
            for row in table.select("tbody tr"):
                cols = [td.get_text(" ", strip=True) for td in row.select("td")]
                if len(cols) >= 2:
                    city = cols[1]
                    pop = cols[2] if len(cols) > 2 else ""
                    results.append({
                        "country": "Global-largest-cities",
                        "city": city,
                        "city_population": pop,
                        "source_url": url,
                        "note": "from largest cities table"
                    })

    return

In [23]:
async def main_fetch(entries):
    conn = aiohttp.TCPConnector(limit=60)
    timeout = aiohttp.ClientTimeout(total=60)
    async with aiohttp.ClientSession(connector=conn, timeout=timeout) as session:
        tasks = [fetch_and_parse(session, e) for e in entries]
        # run in chunks to be polite and avoid memory spikes
        chunk = 60
        for i in range(0, len(tasks), chunk):
            await asyncio.gather(*tasks[i:i+chunk])
            # small pause between chunks
            await asyncio.sleep(0.5)

In [24]:
import nest_asyncio
nest_asyncio.apply()
asyncio.run(main_fetch(targets))

print("Total city-like records extracted (raw):", len(results))

Total city-like records extracted (raw): 13281


In [25]:
df = pd.DataFrame(results)
if not df.empty:
    df['city'] = df['city'].astype(str).str.replace(r'\s+\(.*\)$', '', regex=True).str.strip()
    df = df.drop_duplicates(subset=["country","city","city_population"])

print("After dedup:", df.shape[0])
display(df.head(15))

After dedup: 13281


Unnamed: 0,country,city,city_population,source_url,note
0,Sudan,2025,51662147,https://www.worldometers.info/world-population...,from guessed table
1,Sudan,2024,50448963,https://www.worldometers.info/world-population...,from guessed table
2,Sudan,2023,50042791,https://www.worldometers.info/world-population...,from guessed table
3,Sudan,2022,49383346,https://www.worldometers.info/world-population...,from guessed table
4,Sudan,2020,46789231,https://www.worldometers.info/world-population...,from guessed table
5,Sudan,2015,40024431,https://www.worldometers.info/world-population...,from guessed table
6,Sudan,2010,35414399,https://www.worldometers.info/world-population...,from guessed table
7,Sudan,2005,31262444,https://www.worldometers.info/world-population...,from guessed table
8,Sudan,2000,27816745,https://www.worldometers.info/world-population...,from guessed table
9,Sudan,1995,24672143,https://www.worldometers.info/world-population...,from guessed table


In [28]:
if df.shape[0] < 1000:
    print("Not enough records (<1000). Attempting a second pass: visiting country pages to find more city entries (looking for lists like 'Cities' or 'Main Cities').")
    extra_results = []
    async def aggressive_fetch(session: ClientSession, item):
        url = item["url"]
        country_name = item["country"]
        try:
            async with sem:
                async with session.get(url, timeout=30) as resp:
                    text = await resp.text()
        except Exception:
            return
        sp = BeautifulSoup(text, "html.parser")
        text_all = sp.get_text(" ", strip=True).lower()

        for li in sp.select("li"):
            txt = li.get_text(" ", strip=True)
            if len(txt) > 20 and any(ch.isdigit() for ch in txt):
                # heuristic: likely a city entry
                parts = [p.strip() for p in txt.replace("—","-").split("-")]
                city = parts[0]
                pop = parts[1] if len(parts)>1 else ""
                extra_results.append({"country": country_name,"city":city,"city_population":pop,"source_url":url,"note":"aggressive-li"})
        for row in sp.select("table tbody tr"):
            cols = [td.get_text(" ", strip=True) for td in row.select("td")]
            if not cols: continue
            # if row contains a city-like name and a numeric value
            if any(any(ch.isdigit() for ch in c) for c in cols):
                city = cols[0]
                pop = ""
                for c in cols[1:]:
                    if any(ch.isdigit() for ch in c):
                        pop = c; break
                extra_results.append({"country":country_name,"city":city,"city_population":pop,"source_url":url,"note":"aggressive-table"})

    async def run_aggressive():
        conn = aiohttp.TCPConnector(limit=60)
        timeout = aiohttp.ClientTimeout(total=60)
        async with aiohttp.ClientSession(connector=conn, timeout=timeout) as session:
            tasks = [aggressive_fetch(session, e) for e in targets]
            for i in range(0, len(tasks), 60):
                await asyncio.gather(*tasks[i:i+60])
                await asyncio.sleep(0.5)
    asyncio.run(run_aggressive())
    if extra_results:
        df_extra = pd.DataFrame(extra_results)
        df_extra['city'] = df_extra['city'].astype(str).str.replace(r'\s+\(.*\)$', '', regex=True).str.strip()
        df_all = pd.concat([df, df_extra], ignore_index=True)
        df_all = df_all.drop_duplicates(subset=["country","city","city_population"])
    else:
        df_all = df.copy()
else:
    df_all = df.copy()

In [29]:
print("Final deduped rows:", df_all.shape[0])
# If still <1000 we can consider adding synthetic rows (not recommended) or scraping other worldometers sections.
csv_path = "/content/worldometers_cities.csv"
df_all.to_csv(csv_path, index=False, encoding="utf-8-sig")
print("Saved CSV to:", csv_path)
display(df_all.head(20))

Final deduped rows: 13281
Saved CSV to: /content/worldometers_cities.csv


Unnamed: 0,country,city,city_population,source_url,note
0,Sudan,2025,51662147,https://www.worldometers.info/world-population...,from guessed table
1,Sudan,2024,50448963,https://www.worldometers.info/world-population...,from guessed table
2,Sudan,2023,50042791,https://www.worldometers.info/world-population...,from guessed table
3,Sudan,2022,49383346,https://www.worldometers.info/world-population...,from guessed table
4,Sudan,2020,46789231,https://www.worldometers.info/world-population...,from guessed table
5,Sudan,2015,40024431,https://www.worldometers.info/world-population...,from guessed table
6,Sudan,2010,35414399,https://www.worldometers.info/world-population...,from guessed table
7,Sudan,2005,31262444,https://www.worldometers.info/world-population...,from guessed table
8,Sudan,2000,27816745,https://www.worldometers.info/world-population...,from guessed table
9,Sudan,1995,24672143,https://www.worldometers.info/world-population...,from guessed table
