# FINAL PROJECT : tourism in Europe

In [3]:
# 1) Import + URLs

import re
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent string sent with every HTTP request.
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)
HEADERS = {"User-Agent": USER_AGENT}

# One tourism page per country, keyed by the country label used in the output.
URLS = dict(
    France="https://www.worlddata.info/europe/france/tourism.php",
    Germany="https://www.worlddata.info/europe/germany/tourism.php",
    Spain="https://www.worlddata.info/europe/spain/tourism.php",
    Italy="https://www.worlddata.info/europe/italy/tourism.php",
)

In [4]:
# 2) Fetch + conversion

def fetch_html(url: str) -> str:
    """Download *url* and return the response body as text.

    The request is sent with the module-level ``HEADERS`` and a 30-second
    timeout. ``raise_for_status()`` is called on the response, so an HTTP
    error status is raised instead of returning an error page.
    """
    response = requests.get(url, timeout=30, headers=HEADERS)
    response.raise_for_status()
    return response.text

def parse_arrivals_cell(x) -> int | None:
    """
    Convertit une cellule de 'Number of tourists' en nombre de personnes.
    Exemples acceptés: '90.91 m', '89,4 m', '81 980 000', '-', 'NA'.
    """
    s = str(x).strip().lower()
    if not s or s in {"-", "na", "nan"}:
        return None

    # détermine le multiplicateur (m = million, bn/billion au cas où)
    mult = 1
    if "b" in s or "billion" in s:
        mult = 1_000_000_000
    if "m" in s and "million" in s or " m" in s or s.endswith("m"):
        mult = 1_000_000

    # garde uniquement chiffres, virgules, points, espaces
    num = re.sub(r"[^0-9,\.\s\-]", "", s)
    # supprime les espaces séparateurs de milliers
    num = num.replace(" ", "")

    # règle virgule décimale européenne -> point
    if "," in num and "." not in num:
        num = num.replace(",", ".")
    else:
        # cas 1,234,567.89 -> enlève les virgules milliers
        num = num.replace(",", "")

    try:
        val = float(num) * mult
        return int(round(val))
    except Exception:
        return None

In [5]:
# 3) Columns 'Year' and 'Number of tourists'

def extract_year_table(html: str) -> pd.DataFrame:
    """Return the first table of *html* holding the yearly tourist counts.

    Every table of the page is parsed with ``pandas.read_html`` and the
    first one that has both a column named "year" and a column whose name
    contains "number of tourists" is returned (comparison is
    case-insensitive and ignores surrounding whitespace).

    Args:
        html: HTML markup of the page.

    Returns:
        The matching table, exactly as parsed by pandas.

    Raises:
        ValueError: If no table has both columns. ``pandas.read_html``
            itself also raises ``ValueError`` when the page has no table.
    """
    # Wrap the markup in a file-like object: passing literal HTML to
    # read_html is deprecated and emits a FutureWarning on every call.
    tables = pd.read_html(StringIO(html))
    for df in tables:
        cols = [str(c).strip().lower() for c in df.columns]
        if "year" in cols and any("number of tourists" in c for c in cols):
            return df
    # Fail here with a clear message rather than returning None and letting
    # the caller crash later on an unrelated attribute access.
    raise ValueError(
        "No table with 'Year' and 'Number of tourists' columns found"
    )

In [6]:
# Scrape each country page and collect one tidy frame per country.
all_frames = []

for country, url in URLS.items():
    html = fetch_html(url)
    df = extract_year_table(html)

    # Normalise the headers to stripped strings.
    df.columns = [str(c).strip() for c in df.columns]
    # Locate the two useful columns (case-insensitive, tolerant to variants).
    year_col = next(filter(lambda c: c.lower() == "year", df.columns))
    tourists_col = next(
        filter(lambda c: "number of tourists" in c.lower(), df.columns)
    )

    out = df[[year_col, tourists_col]]
    out = out.rename(columns={year_col: "year", tourists_col: "arrivals_raw"})

    # Parse the raw cells into a head count and the year into a nullable int.
    out = out.assign(
        arrivals=out["arrivals_raw"].apply(parse_arrivals_cell),
        year=pd.to_numeric(out["year"], errors="coerce").astype("Int64"),
    )

    # Keep valid rows only, then the 10 most recent years that have a value.
    out = out.dropna(subset=["year", "arrivals"])
    out = out.astype({"year": "int"})
    out = out.sort_values("year").tail(10).reset_index(drop=True)
    out = out.assign(country=country)

    all_frames.append(out.loc[:, ["country", "year", "arrivals"]])
    time.sleep(0.5)  # short pause between requests, to be polite to the server

data = pd.concat(all_frames, ignore_index=True)
data = data.sort_values(["country", "year"]).reset_index(drop=True)
data

  tables = pd.read_html(html)
  tables = pd.read_html(html)
  tables = pd.read_html(html)
  tables = pd.read_html(html)


Unnamed: 0,country,year,arrivals
0,France,2012,81980000
1,France,2013,83630000
2,France,2014,83700000
3,France,2015,84450000
4,France,2016,82680000
5,France,2017,86760000
6,France,2018,89320000
7,France,2019,90910000
8,France,2020,41680000
9,France,2021,48400000


In [7]:
# Presentation table: capitalised headers and arrivals expressed in millions
# (one decimal), ordered by country then year.
df = data.rename(columns={"country": "Country", "year": "Year"}).copy()
df["Arrivals (millions)"] = df["arrivals"].div(1_000_000).round(1)

df_out = df.loc[:, ["Country", "Year", "Arrivals (millions)"]]
df_out = df_out.sort_values(by=["Country", "Year"]).reset_index(drop=True)

df_out

Unnamed: 0,Country,Year,Arrivals (millions)
0,France,2012,82.0
1,France,2013,83.6
2,France,2014,83.7
3,France,2015,84.4
4,France,2016,82.7
5,France,2017,86.8
6,France,2018,89.3
7,France,2019,90.9
8,France,2020,41.7
9,France,2021,48.4


In [8]:
# Show the dtype of each column of df_out.
df_out.dtypes

Country                 object
Year                     int64
Arrivals (millions)    float64
dtype: object

In [9]:
# df_out.to_csv("../data/clean/world_data_number_of_arrivals.csv", index=False, encoding="utf-8")