In [None]:
import pandas as pd
import requests
import time
import random
import shelve
from urllib.parse import quote
import chardet

# === SETTINGS ===
# Input CSV, output CSV, and the on-disk lookup cache.
INPUT_FILE = "Cities.csv"
OUTPUT_FILE = "Cities_with_population.csv"
CACHE_FILE = "cache.db"

# Sent as the User-Agent header on every HTTP request.
USER_AGENT = "CityPopulationCollector/1.0 (contact: your-email@example.com)"

# Bounds, in seconds, of the random pause taken after each successful request.
SLEEP_MIN, SLEEP_MAX = 0.3, 0.8

# Retry budget for failed requests.
MAX_RETRIES = 6

# === DETECT CSV ENCODING AND DELIMITER ===
# Read the raw bytes once and let chardet guess the text encoding.
with open(INPUT_FILE, 'rb') as f:
    rawdata = f.read()
result = chardet.detect(rawdata)
encoding = result['encoding']

# Guess the delimiter from the first line: ';' takes precedence over a tab,
# and ',' is the fallback when neither occurs.
with open(INPUT_FILE, encoding=encoding) as f:
    first_line = f.readline()
sep = next((delim for delim in (';', '\t') if delim in first_line), ',')

# === HTTP SESSION SETUP ===
# A single shared session so every request carries the same User-Agent.
SESSION = requests.Session()
SESSION.headers["User-Agent"] = USER_AGENT

# === POLITE GET REQUEST ===
def polite_get(url, params=None):
    """Fetch *url* through the shared session and return the decoded JSON body.

    Network errors and rate-limit responses (HTTP 429 / 503) are retried with
    a growing delay; both kinds of failure share one budget of MAX_RETRIES
    retries. After a successful response the function pauses for a random
    SLEEP_MIN..SLEEP_MAX seconds before returning.

    Args:
        url: Address to request.
        params: Optional mapping of query-string parameters.

    Returns:
        The response body parsed as JSON.

    Raises:
        requests.RequestException: The request kept failing after the retry
            budget was spent, or the server answered with a status other
            than 200 (raised as requests.HTTPError).
    """
    attempt = 0
    while True:
        try:
            r = SESSION.get(url, params=params, timeout=30)
        except requests.RequestException:
            attempt += 1
            if attempt > MAX_RETRIES:
                raise
            time.sleep(min(60, 2**attempt) + random.uniform(0, 0.5))
            continue

        if r.status_code == 200:
            time.sleep(random.uniform(SLEEP_MIN, SLEEP_MAX))
            return r.json()

        if r.status_code in (429, 503):
            # Without this bound a permanently throttled endpoint would keep
            # the loop spinning forever.
            if attempt >= MAX_RETRIES:
                r.raise_for_status()
            backoff = min(60, 2**attempt)
            try:
                # Retry-After may also be an HTTP-date; that form (like a
                # missing header) falls back to exponential backoff.
                sleep_time = max(0, int(r.headers.get("Retry-After")))
            except (TypeError, ValueError):
                sleep_time = backoff
            print(f"⚠️  Ограничение скорости. Спим {sleep_time:.1f}s...")
            time.sleep(sleep_time)
            attempt += 1
            continue

        # Any other status is treated as a failure. raise_for_status() only
        # raises for 4xx/5xx, so raise explicitly for the rest (e.g. 204 or
        # 3xx) instead of silently re-requesting in a tight loop.
        r.raise_for_status()
        raise requests.HTTPError(
            f"Unexpected HTTP status {r.status_code} for {r.url}", response=r
        )

# === LOOK UP A CITY'S POPULATION ===
def _coerce_population(value):
    """Turn a Wikidata quantity value into a plain number, or None if it cannot be read."""
    # Quantities arrive as {"amount": "+8336817", "unit": "1", ...}.
    amount = value.get("amount") if isinstance(value, dict) else value
    if isinstance(amount, (int, float)):
        return amount
    if isinstance(amount, str):
        for cast in (int, float):
            try:
                return cast(amount)
            except ValueError:
                continue
    return None


def get_city_population(city, country, cache):
    """Return the population recorded in Wikidata for *city* in *country*.

    The first hit of a ``wbsearchentities`` query for "<city> <country>" is
    taken as the city, and the largest value among its P1082 (population)
    claims is reported.

    Args:
        city: City name.
        country: Country name.
        cache: Mutable mapping (e.g. an open shelve) keyed by
            "<city>||<country>"; it is consulted first and updated with
            every new result.

    Returns:
        A dict ``{"population": <number or None>, "note": <str or None>}``.
        ``note`` is "no_search_results" when the search found nothing.

    Raises:
        Whatever ``polite_get`` raises when a request fails.
    """
    key = f"{city}||{country}"
    if key in cache:
        cached = cache[key]
        # A cache entry may still hold a raw quantity dict instead of a
        # number; normalise it so callers always receive a plain value.
        if isinstance(cached.get("population"), dict):
            cached = {**cached, "population": _coerce_population(cached["population"])}
            cache[key] = cached
        return cached

    api_url = "https://www.wikidata.org/w/api.php"

    # 1. Find the city's entity id.
    # NOTE(review): wbsearchentities matches entity labels/aliases, so the
    # combined "city country" text may find nothing for many rows — confirm
    # against real data.
    data = polite_get(api_url, params={
        "action": "wbsearchentities",
        "search": f"{city} {country}".strip(),
        "language": "en",
        "type": "item",
        "format": "json",
    })
    if not data.get("search"):
        result = {"population": None, "note": "no_search_results"}
        cache[key] = result
        return result

    qid = data["search"][0]["id"]

    # 2. Fetch the entity's claims and read population (P1082).
    entity_data = polite_get(api_url, params={
        "action": "wbgetentities",
        "ids": qid,
        "props": "claims",
        "format": "json",
    })
    claims = entity_data.get("entities", {}).get(qid, {}).get("claims", {})

    amounts = []
    for claim in claims.get("P1082", []):
        # Claims with no concrete value carry no datavalue and are skipped.
        datavalue = claim.get("mainsnak", {}).get("datavalue") or {}
        amount = _coerce_population(datavalue.get("value"))
        if amount is not None:
            amounts.append(amount)

    # Compare numbers, not the raw quantity dicts (dicts are not orderable).
    pop = max(amounts) if amounts else None

    result = {"population": pop, "note": None}
    cache[key] = result
    return result

# === MAIN SCRIPT ===
def _clean_cell(value):
    """Return a CSV cell as stripped text; missing values become ""."""
    # str(NaN) would otherwise yield the literal text "nan".
    if pd.isna(value):
        return ""
    return str(value).strip()


def main():
    """Look up the population of every city in INPUT_FILE and write OUTPUT_FILE.

    Reads the "City" and "Country" columns, resolves each row through
    ``get_city_population`` (backed by the shelve cache in CACHE_FILE), and
    writes City / Country / Population / Note rows. Progress is flushed to
    OUTPUT_FILE every 50 rows and once more at the end.

    Rows whose city cell is empty are not looked up; they are written with
    the note "missing_city". A failed lookup is recorded as "error:<message>"
    in the Note column and does not stop the run.
    """
    # on_bad_lines is the replacement for the removed error_bad_lines option.
    df = pd.read_csv(INPUT_FILE, encoding=encoding, sep=sep, on_bad_lines='skip')
    total = len(df)
    results = []

    with shelve.open(CACHE_FILE) as cache:
        rows = zip(df["City"], df["Country"])
        for n, (raw_city, raw_country) in enumerate(rows, start=1):
            city = _clean_cell(raw_city)
            country = _clean_cell(raw_country)
            print(f"[{n}/{total}] 🔍 {city}, {country}...")

            if not city:
                res = {"population": None, "note": "missing_city"}
            else:
                try:
                    res = get_city_population(city, country, cache)
                except Exception as e:
                    # Batch boundary: one failing city must not abort the
                    # whole run, so the error is recorded on its row.
                    res = {"population": None, "note": f"error:{e}"}

            results.append({
                "City": city,
                "Country": country,
                "Population": res["population"],
                "Note": res["note"]
            })

            # Save progress every 50 rows.
            if n % 50 == 0:
                pd.DataFrame(results).to_csv(OUTPUT_FILE, index=False, encoding="utf-8")
                print(f"💾 Сохранено прогресс ({n} строк).")

    # Save the final result.
    pd.DataFrame(results).to_csv(OUTPUT_FILE, index=False, encoding="utf-8")
    print("✅ Готово! Результаты сохранены в:", OUTPUT_FILE)

if __name__ == "__main__":
    main()


[1/2486] 🔍 Aguascalientes, Mexico...
[2/2486] 🔍 Alamagordo, United States of America...
[3/2486] 🔍 Allentown, United States of America...
[4/2486] 🔍 Apodaca, Mexico...
[5/2486] 🔍 Apodaca NL, Mexico...
[6/2486] 🔍 Apodaca, NL, Mexico...
[7/2486] 🔍 Atlanta GA, United States of America...
[8/2486] 🔍 Auburn Hills, United States of America...
[9/2486] 🔍 Aumsville, Oregon, United States of America...
[10/2486] 🔍 AUSTIN, United States of America...
[11/2486] 🔍 Baltimore MD, United States of America...
[12/2486] 🔍 Bangalore, United States of America...
[13/2486] 🔍 Barberton, United States of America...
[14/2486] 🔍 Barrie, Canada...
[15/2486] 🔍 Barrio Chapultepec Norte, Mexico...
[16/2486] 🔍 Bedford, United States of America...
[17/2486] 🔍 Bethlehem, United States of America...
[18/2486] 🔍 Bettendorf, United States of America...
[19/2486] 🔍 Bettendorf, IA, United States of America...
[20/2486] 🔍 Bloomington, United States of America...
[21/2486] 🔍 Boca Raton, United States of America...
[22/2486