# FINAL PROJECT : tourism in Europe

In [44]:
# Import libraries

import re
from pathlib import Path
from urllib.parse import quote

import requests
import pandas as pd
from bs4 import BeautifulSoup

In [45]:
# Params

TARGET_CITIES = {"Paris":"Paris","Berlin":"Berlin","Madrid":"Madrid","Rome":"Rome"}
URL = "https://en.wikipedia.org/wiki/List_of_cities_by_international_visitors"
OUTPUT_CSV = "../data/clean/wiki_city_international_visitors.csv"
UA = {"User-Agent":"Mozilla/5.0"}

In [46]:
# Utils

def parse_number(x):
    #'24.5', '24,5', '24.5 million', '24,500,000' -> float (absolu)
    s = str(x).lower()
    is_million = "million" in s
    s = (s.replace("million","").replace("millions","")
           .replace("\u2009","").replace("\xa0"," ").strip())
    s = re.sub(r"[^0-9,.\-]", "", s)
    if s.count(",")>0 and "." not in s: s = s.replace(",", ".")
    else: s = s.replace(",", "")
    try: v = float(s)
    except: return float("nan")
    return v*1_000_000 if is_million else v

def city_desc(city: str, lang="en") -> str:
    # Scrape de la page ville: meta description ou 1er paragraphe
    url = f"https://{lang}.wikipedia.org/wiki/{quote(city.replace(' ','_'))}"
    try:
        r = requests.get(url, headers=UA, timeout=15); r.raise_for_status()
        soup = BeautifulSoup(r.text, "lxml")
        meta = soup.find("meta", attrs={"name":"description"}) or soup.find("meta", attrs={"property":"og:description"})
        if meta and meta.get("content"): return meta["content"].strip()
        for p in soup.select("#mw-content-text .mw-parser-output > p"):
            t = re.sub(r"\[.*?\]","", p.get_text(" ", strip=True)).strip()
            t = re.sub(r"^\s*\([^)]*\)\s*", "", t)
            if len(t)>=80: return t
    except: pass
    return ""

def _col_like(df, keys):
    for c in df.columns:
        if any(k in str(c).lower() for k in keys): return c

def _normalize_city(s: pd.Series) -> pd.Series:
    return (s.astype(str).str.replace(r"\s*\(.*?\)$","",regex=True)
                     .str.partition(",")[0].str.strip())

def _try_table(t: pd.DataFrame):
    # Retourne (df_norm, millions_header) ou (None, None)
    city = _col_like(t, ["city","destination"])
    visitors = _col_like(t, ["visitor","arrival","overnight"])
    if not (city and visitors): return None, None
    country = _col_like(t, ["country","nation"])
    yearcol = _col_like(t, ["year"])

    # Année depuis l'en-tête (ex: "Arrivals 2018", "Visitors (2018)")
    m = re.search(r"(20\d{2})", str(visitors))
    header_year = int(m.group(1)) if m else None
    millions_header = "million" in str(visitors).lower()

    ren = {city:"city", visitors:"visitors"}
    if country: ren[country]="country"
    if yearcol: ren[yearcol]="year"
    df = t.rename(columns=ren).copy()

    # City, Country dans la même cellule ?
    if "country" not in df and "city" in df:
        sp = df["city"].astype(str).str.split(",", n=1, expand=True)
        if sp.shape[1]==2:
            df["city"], df["country"] = sp[0].str.strip(), sp[1].str.strip()

    df["city"] = _normalize_city(df["city"])
    if "year" in df: df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")
    else: df["year"] = header_year

    keep = ["city","visitors","year"] + (["country"] if "country" in df else [])
    return df[keep].copy(), millions_header

def scrape_2018(url: str, targets: dict) -> pd.DataFrame:
    """Extrait uniquement 2018 pour les villes cibles."""
    r = requests.get(url, headers=UA, timeout=20); r.raise_for_status()
    rows = []
    for t in pd.read_html(r.text):
        df, millions_header = _try_table(t)
        if df is None: continue
        df = df[df["year"]==2018].copy()
        if df.empty: continue
        df["visitors"] = df["visitors"].apply(parse_number)
        if millions_header and df["visitors"].max(skipna=True) < 1000:
            df["visitors"] = df["visitors"] * 1_000_000
        want = {v.lower():k for k,v in targets.items()}
        df = df[df["city"].str.lower().isin(want.keys())].copy()
        if df.empty: continue
        df["city"] = df["city"].str.lower().map(want)
        rows.append(df[["city","visitors"] + (["country"] if "country" in df.columns else [])])

    if not rows: raise RuntimeError("Aucune ligne 2018 trouvée pour les villes cibles.")
    out = pd.concat(rows, ignore_index=True).dropna(subset=["visitors"])
    out = out.drop_duplicates(subset=["city"])  # garde la 1ère occurrence si doublon
    out = out.rename(columns={"visitors":"international_visitors_2018"})
    return out

In [47]:
# Fonction de nettoyage
def clean_text(text):
    if pd.isna(text):
        return ""
    text = str(text)
    text = re.sub(r"\(.*?\)", "", text)  # supprimer tout ce qu'il y a entre ( )
    text = text.replace('"', "'")  # remplacer guillemets doubles
    text = text.replace("\n", " ").replace("\r", " ")  # enlever retours à la ligne
    text = re.sub(r"[^\x20-\x7E]+", " ", text)  # supprimer caractères spéciaux non-ASCII
    text = re.sub(r"\s+", " ", text)  # remplacer multiples espaces par un seul
    return text.strip()

# Main
if __name__ == "__main__":
    df = scrape_2018(URL, TARGET_CITIES).sort_values("city").reset_index(drop=True)
    df["description"] = [city_desc(c) for c in df["city"]]
    df["extract_date"] = pd.Timestamp.today().normalize()
    df["source_url"] = URL

    # Nettoyage des descriptions
    df["description"] = df["description"].apply(clean_text)

    # Colonnes finales
    cols = [
        "city", "country", "international_visitors_2018",
        "description", "extract_date", "source_url"
    ]
    df = df[[c for c in cols if c in df.columns]]

    # Aperçu complet des descriptions (nettoyées)
    with pd.option_context("display.max_rows", None, "display.max_colwidth", None, "display.width", 2000):
        print(df[["city","international_visitors_2018","description"]].to_string(index=False))

    # Export propre
    Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(
        OUTPUT_CSV,
        index=False,
        encoding="utf-8",
        quoting=csv.QUOTE_ALL  # tous les champs entre guillemets
    )
    print("\n✅ Exporté :", OUTPUT_CSV)

  city  international_visitors_2018                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

NameError: name 'csv' is not defined

In [48]:
df.dtypes

city                                   object
country                                object
international_visitors_2018           float64
description                            object
extract_date                   datetime64[ns]
source_url                             object
dtype: object