# Final project: Tourism in Europe

In [4]:
# Import libraries

import pandas as pd
import numpy as np

### 1- Paths & output folder

In [6]:
# Input CSV files, all located under ../data/clean (relative paths).
# Airbnb listings; read into the `airbnb` frame further down.
PATH_AIRBNB = "../data/clean/airbnb_clean.csv"
# Numbeo prices per city and item; read into the `numbeo` frame.
PATH_NUMBEO = "../data/clean/numbeo_cost_of_living.csv"
# Wikipedia international visitors per city; also the source of the
# city -> country lookup built later.
PATH_WIKI   = "../data/clean/wiki_city_international_visitors.csv"
# Eurostat trips table; read into the `eurostat` frame.
PATH_EUROSTAT = "../data/clean/eurostat_trips_all.csv"
# Arrivals per country and year. NOTE(review): in the visible cells only the
# header of this file is inspected; it is not part of the final concat -- confirm
# whether that is intentional.
PATH_WORLD_DATA = "../data/clean/world_data_number_of_arrivals.csv"
# Destination of the combined frame written with DataFrame.to_csv.
OUTPUT_CSV  = "../data/clean/tourism_all.csv"

def normalize_colnames(cols):
    """Return the column labels rewritten in a snake_case-like form.

    Every label is stripped of surrounding whitespace, lower-cased, and each
    remaining space is replaced by an underscore. The result is a new list in
    the same order as ``cols``; labels must be strings.
    """
    normalized = []
    for label in cols:
        trimmed = label.strip()
        normalized.append(trimmed.lower().replace(" ", "_"))
    return normalized

def clean_city(s):
    """Return the city names stripped and Title-Cased, keeping missing values missing.

    Parameters
    ----------
    s : pd.Series
        Raw city names (any dtype; values are converted to text).

    Returns
    -------
    pd.Series
        Same index as ``s``. Non-missing entries are trimmed and title-cased;
        entries that were missing in ``s`` stay missing.
    """
    cleaned = s.astype(str).str.strip().str.title()
    # astype(str) can turn NaN/None into the literal text "Nan"/"None", which would
    # then look like a real city name (and could leak into a city -> country
    # lookup). Restore the original gaps instead.
    return cleaned.where(s.notna())

In [7]:
# Short source label -> CSV path, used only to inspect the file headers below.
paths = dict(
    airbnb=PATH_AIRBNB,
    numbeo=PATH_NUMBEO,
    wiki=PATH_WIKI,
    eurostat=PATH_EUROSTAT,
    world_data=PATH_WORLD_DATA,
)

# Read just the header row of every file (nrows=0); sep=None with the python
# engine lets pandas sniff the delimiter. Then report the column names.
for name, p in paths.items():
    cols = list(pd.read_csv(p, sep=None, engine="python", nrows=0).columns)
    print(f"{name}: {len(cols)} colonnes")
    print(cols, "\n")

airbnb: 17 colonnes
['listing_name', 'neighbourhood_cleansed', 'latitude', 'longitude', 'room_type', 'accommodates', 'bedrooms', 'beds', 'bathrooms', 'bathrooms_text', 'price', 'minimum_nights', 'maximum_nights', 'availability_365', 'number_of_reviews', 'review_scores_rating', 'city'] 

numbeo: 3 colonnes
['City', 'Item_label', 'Price_eur'] 

wiki: 6 colonnes
['city', 'country', 'international_visitors_2018', 'description', 'extract_date', 'source_url'] 

eurostat: 10 colonnes
['Country', 'Time', 'Accommodation', 'Purpose', 'Duration', 'Transport', 'Destination', 'Value_accomodation', 'Value_transport', 'Value_destination'] 

world_data: 3 colonnes
['Country', 'Year', 'Arrivals (millions)'] 



In [8]:
# Helpers
def normalize_colnames(cols):
    """Normalise column labels: trim, lower-case, spaces -> underscores.

    Returns a new list with one normalised label per entry of ``cols``.
    NOTE: this re-defines, with identical behaviour, the helper of the same
    name declared in the paths cell earlier in the notebook.
    """
    # Splitting on a single space and joining with "_" maps every space to one
    # underscore, including consecutive spaces.
    return ["_".join(name.strip().lower().split(" ")) for name in cols]

def clean_city(s: pd.Series) -> pd.Series:
    """Trim and Title-Case city names without inventing values for missing cells.

    Parameters
    ----------
    s : pd.Series
        Raw city names (any dtype; values are converted to text).

    Returns
    -------
    pd.Series
        Same index as ``s``; non-missing entries are stripped and title-cased,
        missing entries remain missing.
    """
    titled = s.astype(str).str.strip().str.title()
    # Converting to str may turn NaN/None into the text "Nan"/"None"; such a fake
    # city would survive dropna() and could end up as a key of a city -> country
    # lookup. Put the gaps back where the input had them.
    return titled.where(s.notna())

# 1) Load the four sources
airbnb, numbeo, wiki, eurostat = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (PATH_AIRBNB, PATH_NUMBEO, PATH_WIKI, PATH_EUROSTAT)
)

# Harmonise the column names (snake_case) -- no column is dropped
airbnb.columns, numbeo.columns, wiki.columns, eurostat.columns = (
    normalize_colnames(frame.columns)
    for frame in (airbnb, numbeo, wiki, eurostat)
)

# 2) Title-case the city column wherever one is available
#    (numbeo may call it 'ville'; rename it first in that case)
if "city" not in numbeo.columns and "ville" in numbeo.columns:
    numbeo = numbeo.rename(columns={"ville": "city"})
if "city" in airbnb.columns:
    airbnb["city"] = clean_city(airbnb["city"])
if "city" in numbeo.columns:
    numbeo["city"] = clean_city(numbeo["city"])
if "city" in wiki.columns:
    wiki["city"] = clean_city(wiki["city"])

# 3) Build a city -> country lookup from the wiki table
if "country_name" in wiki.columns and "country" not in wiki.columns:
    wiki["country"] = wiki["country_name"]

city_country_map = {}
if {"city", "country"}.issubset(wiki.columns):
    # Keep the first country seen for each city, ignoring incomplete rows.
    tmp = wiki[["city", "country"]].dropna().drop_duplicates(subset=["city"])
    city_country_map = tmp.set_index("city")["country"].to_dict()

# Manual fallback (in case the wiki CSV does not cover every city);
# setdefault keeps whatever the wiki table already provided.
for fallback_city, fallback_country in (
    ("Paris", "France"),
    ("Berlin", "Germany"),
    ("Madrid", "Spain"),
    ("Rome", "Italy"),
    ("Warsaw", "Poland"),
):
    city_country_map.setdefault(fallback_city, fallback_country)
# 4) Garantir une colonne 'country' REMPLIE pour CHAQUE source
def ensure_country(df: pd.DataFrame, src_name: str, city_to_country=None) -> pd.DataFrame:
    """Return a copy of ``df`` with a filled ``country`` column and a ``source`` tag.

    The ``country`` column is created if absent, then gaps are filled, in order,
    from ``country_name`` (if that column exists) and from a city -> country
    lookup applied to ``city`` (if that column exists). Values are stripped and
    title-cased; anything still missing becomes ``"Unknown"``. No row and no
    original column is dropped, and ``df`` itself is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Source table.
    src_name : str
        Label written to every row of the new ``source`` column.
    city_to_country : mapping, optional
        City -> country lookup. When omitted, the module-level
        ``city_country_map`` is used (only consulted if ``df`` has a ``city``
        column).

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with ``country`` fully populated and ``source`` added.
    """
    out = df.copy()
    # Create the column when absent so the fill steps below have a target.
    if "country" not in out.columns:
        out["country"] = pd.NA
    country = out["country"]
    # Prefer an explicit 'country_name' column when one exists.
    if "country_name" in out.columns:
        country = country.fillna(out["country_name"])
    # Otherwise derive the country from the city via the lookup.
    if "city" in out.columns:
        lookup = city_country_map if city_to_country is None else city_to_country
        country = country.fillna(out["city"].map(lookup))
    # Record real gaps BEFORE converting to text: str(pd.NA) is "<NA>", which
    # title-cases to "<Na>" and would slip past the text sentinels below.
    missing = country.isna()
    cleaned = country.astype(str).str.strip().str.title()
    # Also treat blank / stringified-null text as missing.
    missing = missing | cleaned.isin(["Nan", "", "None"])
    out["country"] = cleaned.mask(missing, "Unknown")
    # Tag the origin of the rows (all original columns are kept).
    out["source"] = src_name
    return out

# Fill / standardise the country column and tag the three city-level sources
airbnb, numbeo, wiki = (
    ensure_country(frame, label)
    for frame, label in ((airbnb, "airbnb"), (numbeo, "numbeo"), (wiki, "wiki"))
)

# Eurostat sometimes exposes the country under 'geo' instead of 'country'
if "geo" in eurostat.columns and "country" not in eurostat.columns:
    eurostat["country"] = eurostat["geo"]
eurostat = ensure_country(eurostat, "eurostat")

# 5) Final concat: union of all columns, all rows kept
df_all = pd.concat(
    [airbnb, numbeo, wiki, eurostat],
    sort=False,
    ignore_index=True,
)

# Move a few key columns to the front for readability (no column is lost)
front = [col for col in ("source", "city", "country") if col in df_all.columns]
df_all = df_all[front + [col for col in df_all.columns if col not in front]]

# 6) Preview & export
with pd.option_context(
    "display.max_columns", None,
    "display.width", 2000,
    "display.max_colwidth", None,
):
    display(df_all.head(5))

df_all.to_csv(OUTPUT_CSV, index=False)

Unnamed: 0,source,city,country,listing_name,neighbourhood_cleansed,latitude,longitude,room_type,accommodates,bedrooms,beds,bathrooms,bathrooms_text,price,minimum_nights,maximum_nights,availability_365,number_of_reviews,review_scores_rating,item_label,price_eur,international_visitors_2018,description,extract_date,source_url,time,accommodation,purpose,duration,transport,destination,value_accomodation,value_transport,value_destination
0,airbnb,Paris,France,zen and calm,Observatoire,48.83191,2.3187,Entire home/apt,2.0,1.0,1.0,1.0,1 bath,135.0,2.0,30.0,355.0,7.0,5.0,,,,,,,,,,,,,,,
1,airbnb,Paris,France,Your perfect Paris studio on Île Saint-Louis,Hôtel-de-Ville,48.85247,2.35835,Entire home/apt,2.0,0.0,1.0,1.0,1 bath,114.0,1.0,730.0,69.0,452.0,4.62,,,,,,,,,,,,,,,
2,airbnb,Paris,France,MARAIS - 2ROOMS APT - 24 PEOPLE,Hôtel-de-Ville,48.85909,2.35315,Entire home/apt,4.0,2.0,1.0,1.0,1 bath,149.0,10.0,130.0,197.0,380.0,4.73,,,,,,,,,,,,,,,
3,airbnb,Paris,France,"Cozy, Central Paris WALK or VELIB EVERYWHERE !",Louvre,48.86006,2.34863,Entire home/apt,1.0,1.0,1.0,1.0,1 bath,75.0,180.0,360.0,358.0,0.0,,,,,,,,,,,,,,,,
4,airbnb,Paris,France,room in an artists flat with great view!,Buttes-Montmartre,48.88946,2.35867,Private room,1.0,1.0,1.0,1.0,1 shared bath,50.0,5.0,1125.0,82.0,63.0,4.63,,,,,,,,,,,,,,,


In [9]:
# (rows, columns) of the combined frame built by the concat above.
df_all.shape

(159583, 34)

In [10]:
# Number of missing cells per column, largest first.
null_counts = (
    df_all
    .isna()
    .sum()
    .sort_values(ascending=False)
)

In [18]:
# Heading and the per-column counts, each on its own line.
print("Missing values per column:", null_counts, sep="\n")

Missing values per column:
international_visitors_2018    159579
description                    159579
extract_date                   159579
source_url                     159579
price_eur                      159538
item_label                     159538
value_accomodation             157436
accommodation                  157436
value_transport                154260
transport                      154260
value_destination              121732
destination                    121732
purpose                        116409
duration                       116409
time                           114262
review_scores_rating            67037
bedrooms                        45618
beds                            45547
bathrooms_text                  45468
bathrooms                       45418
listing_name                    45414
number_of_reviews               45370
availability_365                45370
maximum_nights                  45370
minimum_nights                  45370
price                  