In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import time
import re

# Locations of the raw city dump and of the cleaned outputs.
# Both are relative paths, so they resolve against the current working directory.
INPUT_PATH = Path("..", "..", "raw_data", "worldcitiespop.csv")
OUTPUT_DIR = Path("..", "..", "clean_data")

# Create the output folder up front so the later CSV writes cannot fail on a missing directory.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

OUT_BASIC = OUTPUT_DIR.joinpath("destination_sample_basic.csv")
OUT_WIKI = OUTPUT_DIR.joinpath("destination_sample_wikipedia.csv")

# Only these columns are read from the raw file; everything else is skipped at parse time.
usecols = ["Country", "City", "AccentCity", "Region", "Population", "Latitude", "Longitude"]

# Text columns are read as plain strings and pandas' default NA token list is disabled,
# so legitimate values such as "NA", "null" or "None" are not silently turned into NaN.
# Only a truly empty field counts as missing.
df = pd.read_csv(
    INPUT_PATH,
    encoding="latin-1",
    usecols=usecols,
    dtype={"Country": str, "City": str, "AccentCity": str, "Region": str},
    keep_default_na=False,
    na_values=[""],
    low_memory=False,
)

# Normalise the header names to lower case and give two columns clearer names.
df.columns = [c.lower() for c in df.columns]
df = df.rename(columns={"country": "country_code", "accentcity": "city_display"})

# Missing text must become "" *before* the string cast: astype(str) on NaN yields the
# literal "nan", which would pass every "non-empty" check below.
for col in ["city", "city_display", "country_code", "region"]:
    df[col] = df[col].fillna("").astype(str).str.strip()

# Anything that is not a clean number becomes NaN instead of raising.
df["population"] = pd.to_numeric(df["population"], errors="coerce")
df["latitude"]   = pd.to_numeric(df["latitude"], errors="coerce")
df["longitude"]  = pd.to_numeric(df["longitude"], errors="coerce")

# Keep only rows that have a city name and a usable coordinate pair.
df = df[(df["city"].str.len() > 0) & df["latitude"].notna() & df["longitude"].notna()].copy()

# Prefer the accented display name; fall back to a title-cased plain city name.
df["name"] = np.where(df["city_display"].str.len() > 0, df["city_display"], df["city"].str.title())

df["country"] = df["country_code"].str.upper()

# Largest population first (rows without a population sort last), so drop_duplicates
# keeps the most populous entry per (name, country). A stable sort keeps ties in file
# order, which makes the result reproducible between runs.
df = df.sort_values(["population"], ascending=False, kind="stable")
df = df.drop_duplicates(subset=["name", "country"])

TARGET_N = 150          # number of cities to keep in the sample
MAX_PER_COUNTRY = 5     # cap per country for the first selection pass

# df is already ordered by population (descending), so cumcount() numbers each row
# 0, 1, 2, ... within its country in that order. Keeping ranks below the cap and then
# taking the first TARGET_N rows selects the same rows as walking the frame row by row
# with a per-country counter, without a Python-level loop.
rank_in_country = df.groupby("country", sort=False).cumcount()
sel = df[rank_in_country < MAX_PER_COUNTRY].head(TARGET_N).copy()

# Per-country tally of the capped selection (kept for any later inspection).
country_counts = sel["country"].value_counts().to_dict()

# If the cap left fewer than TARGET_N rows, top up with the next rows in population
# order, ignoring the per-country cap.
if len(sel) < TARGET_N:
    remaining = df[~df.index.isin(sel.index)]
    extra = remaining.head(TARGET_N - len(sel))
    sel = pd.concat([sel, extra])

# Always hand downstream code a clean 0..n-1 index, whether or not the top-up ran.
sel = sel.reset_index(drop=True)

def make_basic_desc(row):
    """Build a short English description of one city from its row.

    Args:
        row: Mapping-like record (e.g. a pandas Series or a dict) with the keys
            ``name``, ``country``, ``population``, ``latitude`` and ``longitude``.
            ``region`` is optional.

    Returns:
        The description sentences joined by single spaces. The population,
        region and hemisphere sentences are each included only when the
        corresponding value is present.
    """
    parts = [f"{row['name']} is a city in {row['country']}."]

    population = row["population"]
    if pd.notna(population) and population > 0:
        parts.append(f"It has an estimated population of about {int(population):,} people.")

    # Look the region up once, with a default, so a row that has no "region"
    # key simply skips this sentence instead of raising KeyError.
    region = row.get("region", "")
    if isinstance(region, str) and region.strip():
        parts.append(f"It is located in region {region.strip()}.")

    lat = row["latitude"]
    lng = row["longitude"]
    if pd.notna(lat) and pd.notna(lng):
        # Zero counts as northern / eastern.
        hemi_ns = "northern" if lat >= 0 else "southern"
        hemi_ew = "eastern" if lng >= 0 else "western"
        parts.append(f"The city lies in the {hemi_ns} and {hemi_ew} hemispheres.")

    return " ".join(parts)

# Generate the plain-text description for every selected city.
basic_descriptions = sel.apply(make_basic_desc, axis=1)
sel["description"] = basic_descriptions

# Export columns in a fixed order, keeping only those present in the selection.
wanted_cols = ["name", "country", "city", "region", "population", "latitude", "longitude", "description"]
basic_cols = [col for col in wanted_cols if col in sel.columns]

basic_frame = sel[basic_cols]
basic_frame.to_csv(OUT_BASIC, index=False, encoding="utf-8")
print(f"[OK] Basic file saved: {OUT_BASIC} ({len(sel)} rows)")

# ---------------- Wikipedia ----------------
# Optional second pass: replace each basic description with a short Wikipedia summary
# and write the result to a separate file. Any failure leaves the basic file untouched.
USE_WIKIPEDIA = True

if USE_WIKIPEDIA:
    try:
        try:
            import wikipedia
        except ImportError:
            # Only a missing package should trigger an install; any other error
            # raised while importing is left to the outer handler.
            import importlib
            import subprocess
            import sys

            subprocess.run([sys.executable, "-m", "pip", "install", "wikipedia"], check=True)
            # Make the import system notice the package installed after interpreter start.
            importlib.invalidate_caches()
            import wikipedia

        wikipedia.set_lang("en")

        def clean_wiki_summary(text):
            """Collapse every whitespace run to one space and trim; non-strings give ""."""
            if not isinstance(text, str):
                return ""
            return re.sub(r"\s+", " ", text).strip()

        wiki_desc = []
        fallback_count = 0  # rows for which no Wikipedia summary could be obtained
        for _, row in sel.iterrows():
            name = row["name"]
            country = row["country"]
            summary = ""
            # Try the more specific "name, country" query first, then the bare name.
            queries = [f"{name}, {country}", name]
            for q in queries:
                try:
                    summary = clean_wiki_summary(
                        wikipedia.summary(q, sentences=2, auto_suggest=True, redirect=True)
                    )
                except Exception:
                    # Deliberate best effort: a failed lookup for one query must not
                    # abort the run; move on to the next query.
                    summary = ""
                if summary:
                    break
            if not summary:
                # No usable summary from any query: keep the generated basic description.
                summary = row["description"]
                fallback_count += 1
            wiki_desc.append(summary)
            # Pause between rows to space out the lookups.
            time.sleep(0.35)

        sel["description"] = wiki_desc
        sel[basic_cols].to_csv(OUT_WIKI, index=False, encoding="utf-8")
        print(f"[OK] Wikipedia file saved: {OUT_WIKI} ({len(sel)} rows)")
        if fallback_count:
            # Make the otherwise silent fallback visible to whoever reads the output.
            print(f"[INFO] {fallback_count} of {len(sel)} rows kept the basic description (no Wikipedia summary found).")
    except Exception as e:
        print("[WARN] Wikipedia enrichment failed:", repr(e))
        print("Kept the basic file only.")


[OK] Basic file saved: ../../clean_data/destination_sample_basic.csv (150 rows)
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py): started
  Building wheel for wikipedia (setup.py): finished with status 'done'
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11678 sha256=6b4fa9620273c59333d6be7f63aee08eeb7335898093fc34f3d2c97b0e8d42a2
  Stored in directory: /Users/zhouyiqin/Library/Caches/pip/wheels/63/47/7c/a9688349aa74d228ce0a9023229c6c0ac52ca2a40fe87679b8
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0




  lis = BeautifulSoup(html).find_all('li')


[OK] Wikipedia file saved: ../../clean_data/destination_sample_wikipedia.csv (150 rows)
