# FINAL PROJECT : tourism in Europe

In [45]:
# Import libraries

import time
import re
import random
import requests
import pandas as pd
import unicodedata
from urllib.parse import quote
from bs4 import BeautifulSoup

In [46]:
# 1) Params

# Cities to scrape; each name is substituted into the URL template below.
CITIES = [
    "Paris",
    "Berlin",
    "Madrid",
    "Rome",
]

# Numbeo cost-of-living page template; `{city}` is filled in per request and
# the query string asks for prices displayed in EUR.
BASE = "https://www.numbeo.com/cost-of-living/in/{city}?displayCurrency=EUR"

In [47]:
# 2) Columns

# Maps each output column name to the list of page labels that may carry its
# value. Labels are tried in order: exact match first, then a case-insensitive
# "contains" match (see `fetch_city`).
ITEMS = {
    # Food and drink
    "Meal_inexpensive_restaurant": [
        "Meal, Inexpensive Restaurant",
    ],
    "Mcdonalds": [
        "McMeal at McDonalds (or Equivalent Combo Meal)",
    ],
    "Cappuccino": [
        "Cappuccino (regular)",
    ],
    # Transport
    "Gasoline_1l": [
        "Gasoline (1 liter)",
    ],
    "One_way_ticket": [
        "One-way Ticket (Local Transport)",
    ],
    "Monthly_pass": [
        "Monthly Pass (Regular Price)",
    ],
    "Taxi_1km": [
        "Taxi 1km (Normal Tariff)",
    ],
    # Leisure
    "Cinema": [
        "Cinema, International Release, 1 seat",
    ],
    "Fitness_monthly": [
        "Fitness Club, Monthly Fee for 1 Adult",
    ],
}

In [48]:
# Quick sanity check: print the inexpensive-restaurant meal price per city.
# NOTE: uses `fetch_city` and `session`, which are defined in later cells of
# this notebook; those cells must have been executed before this one.
for city in CITIES:
    meal_price = fetch_city(city, session).get(
        "Meal_inexpensive_restaurant", "Not found"
    )
    print(city, meal_price)

Paris 15.0
Berlin 15.0
Madrid 15.0
Rome 15.0


In [49]:
# 3) Helpers (robust text parsing)
# Matches one numeric token. Alternatives, tried in order:
#   1. grouped thousands with "." decimals   -> "1,234.56"
#   2. grouped thousands with "," decimals   -> "1.234,56"
#   3. plain number, optional "." or "," decimal part -> "15", "12.50", "12,50"
num_re = re.compile(
    r"\d{1,3}(?:,\d{3})+\.\d+"
    r"|\d{1,3}(?:\.\d{3})+,\d+"
    r"|\d+(?:[.,]\d+)?"
)


def _to_float(token: str) -> float:
    """Convert a single token matched by ``num_re`` into a float."""
    last_dot = token.rfind(".")
    last_comma = token.rfind(",")
    if last_dot != -1 and last_comma != -1:
        # Both separators present: the right-most one is the decimal mark,
        # the other one is a thousands separator and must be dropped.
        if last_dot > last_comma:
            token = token.replace(",", "")
        else:
            token = token.replace(".", "").replace(",", ".")
    else:
        # A lone comma is treated as a decimal comma ("12,50" -> 12.5).
        token = token.replace(",", ".")
    return float(token)


def parse_price(text: str):
    """Extract a price from free text and return it as a float.

    Every number found in ``text`` is parsed and the arithmetic mean is
    returned, so a range such as ``"10–18"`` yields ``14.0``. Currency symbols
    and other non-numeric characters are ignored. Decimal commas (``"12,50"``)
    and thousands separators (``"1,234.50"``, ``"1.234,56"``) are handled.

    Args:
        text: Raw cell text; may be empty or ``None``.

    Returns:
        The mean of the numbers found, or ``None`` when ``text`` is empty or
        contains no number.
    """
    if not text:
        return None
    # NFKC folds compatibility characters (e.g. no-break spaces, full-width
    # digits) into their plain equivalents before matching.
    t = unicodedata.normalize("NFKC", text).replace("\xa0", " ")
    nums = [_to_float(x) for x in num_re.findall(t)]
    if not nums:
        return None
    return sum(nums) / len(nums)

def norm(s: str) -> str:
    """Return ``s`` NFKC-normalized, lower-cased and stripped of outer whitespace."""
    folded = unicodedata.normalize("NFKC", s)
    lowered = folded.lower()
    return lowered.strip()

def fetch_city(city: str, session: requests.Session) -> dict:
    """Download one city's cost-of-living page and extract the ITEMS prices.

    Args:
        city: City name; it is URL-quoted and substituted into ``BASE``.
        session: Session used to perform the GET request.

    Returns:
        A dict with key ``"ville"`` (the city name), one key per entry of
        ``ITEMS`` (a float price, or ``None`` when no matching row was found or
        its text held no number), and ``"taxi_5km"`` (start fare + 5 x per-km
        fare, or ``None`` when either component is unavailable).

    Raises:
        requests.RequestException: on network failure or an HTTP error status
            (via ``raise_for_status``).
    """
    url = BASE.format(city=quote(city))
    r = session.get(url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "lxml")

    # Index every table row with at least two cells: first cell text -> second
    # cell text. A label that occurs more than once keeps its last value.
    by_label = {}
    for tr in soup.select("table tr"):
        tds = tr.find_all("td")
        if len(tds) >= 2:
            label = tds[0].get_text(" ", strip=True)
            value = tds[1].get_text(" ", strip=True)
            by_label[label] = value

    # Normalize the page labels once so the fallback below does not redo it
    # for every item.
    normalized_rows = [(norm(k), v) for k, v in by_label.items()]

    # Look up each item: exact label first, otherwise a case-insensitive
    # "contains" match against the normalized labels.
    out = {"ville": city}
    for col, candidates in ITEMS.items():
        val = None
        # 1) exact
        for lbl in candidates:
            if lbl in by_label:
                val = parse_price(by_label[lbl])
                break
        # 2) contains (fallback)
        if val is None:
            wanted = [norm(c) for c in candidates]
            for k_norm, v in normalized_rows:
                if any(w in k_norm for w in wanted):
                    val = parse_price(v)
                    break
        out[col] = val

    # Derived values. Column names are matched case-insensitively because
    # ITEMS uses capitalised keys such as "Taxi_1km"; the previous lower-case
    # lookup never matched, so taxi_5km was always None. The result is still
    # None unless ITEMS defines both a taxi start and a taxi per-km column.
    prices = {k.lower(): v for k, v in out.items() if k != "ville"}
    ts, tkm = prices.get("taxi_start"), prices.get("taxi_1km")
    out["taxi_5km"] = round(ts + 5 * tkm, 2) if ts is not None and tkm is not None else None

    return out

In [50]:
#  4) Run scraping

# A shared session sends the same identifying User-Agent with every request.
headers = {"User-Agent": "Mozilla/5.0 (compatible; numbeo-scraper; +https://example.com)"}
session = requests.Session()
session.headers.update(headers)

rows = []
for city in CITIES:
    try:
        rows.append(fetch_city(city, session))
    except Exception as e:
        # Best-effort scraping: report the failure and carry on with the
        # remaining cities instead of aborting the whole run.
        print(f"[WARN] {city}: {e}")
    # Polite pause between requests. It sits outside the try block so that it
    # also applies after a failed request, instead of firing the next request
    # immediately.
    time.sleep(random.uniform(1.5, 3.0))

df = pd.DataFrame(rows)

# Final columns (order is adjustable); reindex also drops any column not
# listed here and creates listed-but-missing ones filled with NaN.
cols_order = [
    "ville", "Meal_inexpensive_restaurant", "Mcdonalds", "Cappuccino",
    "Gasoline_1l", "One_way_ticket", "Monthly_pass", "Taxi_1km",
    "Cinema", "Fitness_monthly",
]
df = df.reindex(columns=cols_order)

df = df.rename(columns={"ville": "City"})

df.head(10)

Unnamed: 0,City,Meal_inexpensive_restaurant,Mcdonalds,Cappuccino,Gasoline_1l,One_way_ticket,Monthly_pass,Taxi_1km,Cinema,Fitness_monthly
0,Paris,15.0,12.0,4.17,1.82,2.5,88.8,1.93,14.0,37.05
1,Berlin,15.0,11.6,3.83,1.71,3.8,59.0,2.8,13.0,33.77
2,Madrid,15.0,10.0,2.71,1.6,1.5,39.25,1.3,10.0,43.71
3,Rome,15.0,10.0,1.71,1.73,1.5,35.0,1.5,10.0,60.49


In [51]:
# Reshape from one column per item to one row per (city, item) pair, then
# order the rows by city and item label with a fresh 0..n-1 index.
df_long = df.melt(id_vars="City", var_name="Item_label", value_name="Price_eur")
df_long = df_long.sort_values(["City", "Item_label"]).reset_index(drop=True)
df_long.head(20)

Unnamed: 0,City,Item_label,Price_eur
0,Berlin,Cappuccino,3.83
1,Berlin,Cinema,13.0
2,Berlin,Fitness_monthly,33.77
3,Berlin,Gasoline_1l,1.71
4,Berlin,Mcdonalds,11.6
5,Berlin,Meal_inexpensive_restaurant,15.0
6,Berlin,Monthly_pass,59.0
7,Berlin,One_way_ticket,3.8
8,Berlin,Taxi_1km,2.8
9,Madrid,Cappuccino,2.71


In [52]:
# Inspect the dtype of each column of the long-format table.
df_long.dtypes

City           object
Item_label     object
Price_eur     float64
dtype: object

In [53]:
# df_long.to_csv("../data/clean/numbeo_cost_of_living.csv", index=False, encoding="utf-8")