# Ajout de nouvelles features

In [1]:
# librairies 
#----------------------------------------------
#----- Calculs 
import pandas as pd
import numpy as np
from numpy import mean, std
import numpy.linalg as npl
import math 
from math import sqrt, log
import re #to find integers & floats in a string 
from collections import Counter #to count values in a dataframe or array
#----- Date
from datetime import date, timedelta, datetime #for the date
import datetime as dt
#save model 
import pickle 

import warnings
# NOTE(review): this silences every warning for the whole session, including
# pandas' SettingWithCopyWarning; consider narrowing the filter.
warnings.filterwarnings("ignore")

In [2]:
# Base directory of the input files; change it if the files live elsewhere.
PATH = '.'
# Folder for the saved models, derived from PATH.
model_folder = f"{PATH}//models_sav/"

### Récupération & mise en forme des datasets

In [3]:
# Load the input datasets
#----------------------------------------------

# File holding all the data collected from the pricing requests
pricing_requests = pd.read_csv(PATH + '//pricing_requests_done.csv')

# File holding the hotel features, indexed by (hotel_id, city).
# It is resolved against PATH like the requests file, so that changing PATH
# relocates both inputs instead of only one of them.
hotels = pd.read_csv(PATH + '//features_hotels.csv', index_col=['hotel_id', 'city'])

print("pricing_requests.shape: ",pricing_requests.shape)
pricing_requests.head()

pricing_requests.shape:  (422016, 9)


Unnamed: 0,hotel_id,price,stock,city,date,language,mobile,avatar_id,request_number
0,105,144,102,amsterdam,39,austrian,1,6056,1.0
1,880,113,81,amsterdam,39,austrian,1,6056,1.0
2,588,172,108,amsterdam,39,austrian,1,6056,1.0
3,352,105,84,amsterdam,39,austrian,1,6056,1.0
4,925,103,83,amsterdam,39,austrian,1,6056,1.0


In [4]:
# Enrich each request with the features of its hotel via a join
#----------------------------------------------
# `hotels` is indexed by these two columns, so they are the join keys.
hotel_keys = ['hotel_id', 'city']
pricing_requests = pricing_requests.join(hotels, on=hotel_keys)
pricing_requests.head()

Unnamed: 0,hotel_id,price,stock,city,date,language,mobile,avatar_id,request_number,group,brand,parking,pool,children_policy
0,105,144,102,amsterdam,39,austrian,1,6056,1.0,Yin Yang,Ardisson,0,0,0
1,880,113,81,amsterdam,39,austrian,1,6056,1.0,Boss Western,Boss Western,0,0,0
2,588,172,108,amsterdam,39,austrian,1,6056,1.0,Independant,Independant,0,0,0
3,352,105,84,amsterdam,39,austrian,1,6056,1.0,Independant,Independant,1,0,0
4,925,103,83,amsterdam,39,austrian,1,6056,1.0,Boss Western,Boss Western,1,0,0


In [5]:
# Give every column its proper dtype
#----------------------------------------------
#--- Integer columns
int_list = ["date","avatar_id","hotel_id","stock","request_number","price"]
pricing_requests[int_list] = pricing_requests[int_list].astype(int)
#--- Unordered categorical columns
for cat_col in ("city", "language", "mobile", "parking", "pool",
                "children_policy", "group", "brand"):
    pricing_requests[cat_col] = pd.Categorical(pricing_requests[cat_col], ordered=False)

In [6]:
# Check the dtype of every column after the conversions
#----------------------------------------------
pricing_requests.dtypes

hotel_id              int32
price                 int32
stock                 int32
city               category
date                  int32
language           category
mobile             category
avatar_id             int32
request_number        int32
group              category
brand              category
parking            category
pool               category
children_policy    category
dtype: object

### ajout du PIB par pays
**source:** https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal) estimate by IMF consulté le 17/11/2022.

- amsterdam: netherlands
- copenhagen: denmark
- paris: france
- sofia: bulgaria 
- vienna: austria
- rome:  italy
- madrid: spain
- vilnius: lithuania
- valletta: malta

In [7]:
# Add the GDP of each city's country
#-----------------------------------------
# Maps each city to the GDP of the country in which the city is located
PIB = {
    "amsterdam": 990583,
    "copenhagen": 386724,
    "paris": 2778090,
    "sofia": 85008,
    "vienna": 468046,
    "rome": 1996934,
    "madrid": 1389927,
    "vilnius": 68031,
    "valletta": 17156,
}

# Vectorised lookup assigned directly on the frame. The previous
# `pricing_requests["pib"].iloc[...] = ...` was a chained assignment, which
# does not write through under pandas Copy-on-Write.
# The city column is cast to object first so that `map` returns plain numbers
# rather than a categorical; cities absent from the dictionary keep the
# historical default of 0.0 and the column stays float.
pricing_requests["pib"] = (
    pricing_requests["city"].astype(object).map(PIB).fillna(0.0).astype(float)
)

### ajout du prix moyen par m2 dans chaque ville:

**source**: https://checkinprice.com/europe-square-meter-prices/ consulté le 17/11/22. Données datant de 2018.

In [8]:
# Add the average price per m2 of each city
#-----------------------------------------
# Maps each city to its average price per m2, in euros
price_m2 = {
    "amsterdam": 4610,
    "copenhagen": 5236,
    "paris": 9160,
    "sofia": 1095,
    "vienna": 6550,
    "rome": 3044,
    "madrid": 3540,
    "vilnius": 1469,
    "valletta": 3600,
}

# Vectorised lookup assigned directly on the frame. The previous
# `pricing_requests["price_m2"].iloc[...] = ...` was a chained assignment,
# which does not write through under pandas Copy-on-Write.
# The city column is cast to object first so that `map` returns plain numbers
# rather than a categorical; cities absent from the dictionary keep the
# historical default of 0.0 and the column stays float.
pricing_requests["price_m2"] = (
    pricing_requests["city"].astype(object).map(price_m2).fillna(0.0).astype(float)
)

In [9]:
# Display the frame to check the newly added pib and price_m2 columns
pricing_requests

Unnamed: 0,hotel_id,price,stock,city,date,language,mobile,avatar_id,request_number,group,brand,parking,pool,children_policy,pib,price_m2
0,105,144,102,amsterdam,39,austrian,1,6056,1,Yin Yang,Ardisson,0,0,0,990583.0,4610.0
1,880,113,81,amsterdam,39,austrian,1,6056,1,Boss Western,Boss Western,0,0,0,990583.0,4610.0
2,588,172,108,amsterdam,39,austrian,1,6056,1,Independant,Independant,0,0,0,990583.0,4610.0
3,352,105,84,amsterdam,39,austrian,1,6056,1,Independant,Independant,1,0,0,990583.0,4610.0
4,925,103,83,amsterdam,39,austrian,1,6056,1,Boss Western,Boss Western,1,0,0,990583.0,4610.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
422011,286,107,54,rome,34,greek,1,141896,4011,Accar Hotels,Navatel,0,0,0,1996934.0,3044.0
422012,659,154,107,rome,34,greek,1,141896,4011,Yin Yang,Royal Lotus,1,0,0,1996934.0,3044.0
422013,739,168,97,rome,34,greek,1,141896,4011,Accar Hotels,Marcure,1,0,0,1996934.0,3044.0
422014,364,131,64,rome,34,greek,1,141896,4011,Yin Yang,Ardisson,1,0,0,1996934.0,3044.0


### ajout du nombre de touristes par an dans le pays de chaque ville

**source:** https://www.indexmundi.com/facts/indicators/ST.INT.ARVL/rankings consulté 17/11/22. Chiffres de 2020. 

In [10]:
# Add the number of tourists per country in 2020
#-----------------------------------------
# Maps each city to the number of tourists of its country in 2020.
# Values are expressed in thousands: multiply by 10^3 to get the real counts.
nb_tourists = {
    "amsterdam": 7265,
    "copenhagen": 15595,
    "paris": 117109,
    "sofia": 4973,
    "vienna": 15091,
    "rome": 38419,
    "madrid": 36410,
    "vilnius": 2284,
    "valletta": 718,
}

# Vectorised lookup assigned directly on the frame. The previous
# `pricing_requests["nb_tourists"].iloc[...] = ...` was a chained assignment,
# which does not write through under pandas Copy-on-Write.
# The city column is cast to object first so that `map` returns plain numbers
# rather than a categorical; cities absent from the dictionary keep the
# historical default of 0.0 and the column stays float.
pricing_requests["nb_tourists"] = (
    pricing_requests["city"].astype(object).map(nb_tourists).fillna(0.0).astype(float)
)

### ajout du nombre d'habitants par km2 dans chaque ville.

**source**: Wikipédia: https://fr.wikipedia.org/wiki/La_Valette, https://fr.wikipedia.org/wiki/Vilnius, https://fr.wikipedia.org/wiki/Madrid, https://fr.wikipedia.org/wiki/Rome, https://fr.wikipedia.org/wiki/Vienne_(Autriche), https://fr.wikipedia.org/wiki/Sofia, https://fr.wikipedia.org/wiki/Paris, https://fr.wikipedia.org/wiki/Copenhague, https://fr.wikipedia.org/wiki/Amsterdam

In [11]:
# Add the number of inhabitants per km2 of each city
#-----------------------------------------
# Maps each city to its number of inhabitants per km2
nb_hab_km2 = {
    "amsterdam": 3530,
    "copenhagen": 7064,
    "paris": 20545,
    "sofia": 7354,
    "vienna": 4607,
    "rome": 2213,
    "madrid": 5437,
    "vilnius": 1432,
    "valletta": 8344,
}

# Vectorised lookup assigned directly on the frame. The previous
# `pricing_requests["nb_hab_km2"].iloc[...] = ...` was a chained assignment,
# which does not write through under pandas Copy-on-Write.
# The city column is cast to object first so that `map` returns plain numbers
# rather than a categorical; cities absent from the dictionary keep the
# historical default of 0.0 and the column stays float.
pricing_requests["nb_hab_km2"] = (
    pricing_requests["city"].astype(object).map(nb_hab_km2).fillna(0.0).astype(float)
)

### ajout du type d'hôtel

En tapant `pricing_requests[["group", "brand"]].drop_duplicates()`, on obtient la liste des différentes compagnies hôtelières.

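Les groupes d'hôtels et leurs marques sont fictifs, mais on peut voir que ce sont des pseudonymes de véritables hôtels. Pour chaque hôtel, on recherche le véritable nom et on renseigne la catégorie de l'hôtel parmi : luxe, premium, milieu de gamme, économique et non renseigné (lorsque l'on n'a pas trouvé davantage d'informations sur l'hôtel). Le nombre indiqué après chaque marque semble correspondre au prix médian observé pour cette marque (p. ex. 275 pour Chill Garden Inn, voir la cellule ci-dessous) — à confirmer.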

- Morriott International: Marriott International hotel
    * Morriot: Marriott (premium) 185
    * Corlton: Ritz-Carlton (haut luxe) 265
    * CourtYord: CourtYard (milieu de gamme) 157
- Yin Yang: Jin Jiang international
    * Ardisson: Radisson (premium) 157
    * Royal Lotus: Royal/Golden Tulip (luxe)  187
    * 8 Premium: (economy) 77
- Accar Hotels: Accor Hotels
    * Marcure: mercure (milieu de gamme) 188
    * Navatel: novotel (milieu de gamme) 115
    * Safitel: Sofitel (luxe) 268
    * Ibas: Ibis (economique) 79
- Boss Western: Best Western Hotels 
    * Boss Western: (milieu de gamme) 117
    * J.Halliday Inn: ? 150
- Chillton Worldwide: Hilton Worldwide hotel
    * Quadrupletree: Double Tree (milieu de gamme) 192
    * Tripletree: Double Tree (milieu de gamme) 154
    * Chill Garden Inn: Hilton Garden Inn (premium)  275
- Independant: autres hôtels n'appartenant à aucun des groupes ci-dessus. 
     * Independant: (non renseigné)
    

In [88]:
# Summary statistics of the price for the "Chill Garden Inn" brand
pricing_requests.loc[pricing_requests["brand"] == "Chill Garden Inn", "price"].describe()

count    14597.000000
mean       289.088374
std         70.853727
min        120.000000
25%        243.000000
50%        275.000000
75%        316.000000
max        507.000000
Name: price, dtype: float64

In [46]:
# Display the frame with the added features.
# NOTE(review): the saved output of this cell has no nb_hab_km2 column and its
# execution count is out of sequence; re-run the notebook top to bottom.
pricing_requests

Unnamed: 0,hotel_id,price,stock,city,date,language,mobile,avatar_id,request_number,group,brand,parking,pool,children_policy,pib,price_m2,nb_tourists
0,105,144,102,amsterdam,39,austrian,1,6056,1,Yin Yang,Ardisson,0,0,0,990583.0,4610.0,7265.0
1,880,113,81,amsterdam,39,austrian,1,6056,1,Boss Western,Boss Western,0,0,0,990583.0,4610.0,7265.0
2,588,172,108,amsterdam,39,austrian,1,6056,1,Independant,Independant,0,0,0,990583.0,4610.0,7265.0
3,352,105,84,amsterdam,39,austrian,1,6056,1,Independant,Independant,1,0,0,990583.0,4610.0,7265.0
4,925,103,83,amsterdam,39,austrian,1,6056,1,Boss Western,Boss Western,1,0,0,990583.0,4610.0,7265.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
422011,286,107,54,rome,34,greek,1,141896,4011,Accar Hotels,Navatel,0,0,0,1996934.0,3044.0,38419.0
422012,659,154,107,rome,34,greek,1,141896,4011,Yin Yang,Royal Lotus,1,0,0,1996934.0,3044.0,38419.0
422013,739,168,97,rome,34,greek,1,141896,4011,Accar Hotels,Marcure,1,0,0,1996934.0,3044.0,38419.0
422014,364,131,64,rome,34,greek,1,141896,4011,Yin Yang,Ardisson,1,0,0,1996934.0,3044.0,38419.0


date des données: 2018.
- sofia: https://investsofia.com/wp-content/uploads/2019/10/Sofia-Tourism-and-Air-Transport-Market-Report-2019-ENG.pdf
- rome: 36.6 million overnight stays in 2018 https://www.wantedinrome.com/news/rome-in-top-spot-as-italys-art-city-tourism-booms.html
- madrid: 19.7 million overnight stays in 2018 https://www.colliers.com/-/media/Files/EMEA/Spain/Colliers_5Claves_MAD_BCN-EN090419.ashx
- vienna: record 16.5 million overnight stays during 2018, http://www.xinhuanet.com/english/2019-01/24/c_137771798.htm and https://b2b.wien.info/en/press-media-services/bilanz2018-341332
- vilnius: nb of overnight stays at hotels during 2018: 3.7 million https://www.oecd-ilibrary.org/sites/e314c4fa-en/index.html?itemId=/content/component/e314c4fa-en
- paris 37,771 overnight stays in 2018 https://www.insee.fr/en/statistiques/serie/010607170
- amsterdam 16.67 million hotel nights in 2018 https://www.statista.com/statistics/959108/number-of-hotel-nights-spent-in-amsterdam-netherlands/
- 