# Project : Plan your trip with KAYAK

In [1]:
# Importing useful libraries

import pandas as pd
import requests
import json
import pprint
from datetime import datetime

!pip install plotly -q
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "iframe_connected"

!pip install scrapy -q
import scrapy
import os
import logging
from scrapy.crawler import CrawlerProcess

## Part I : Get coordinates of cities from nominatim.org's API

In [2]:
# Cities to study. The order matters: later cells build one dataframe row
# and one booking.com search URL per entry, in this order.
cities = [
    "Mont Saint Michel",
    "St Malo",
    "Bayeux",
    "Le Havre",
    "Rouen",
    "Paris",
    "Amiens",
    "Lille",
    "Strasbourg",
    "Chateau du Haut Koenigsbourg",
    "Colmar",
    "Eguisheim",
    "Besancon",
    "Dijon",
    "Annecy",
    "Grenoble",
    "Lyon",
    "Gorges du Verdon",
    "Bormes les Mimosas",
    "Cassis",
    "Marseille",
    "Aix en Provence",
    "Avignon",
    "Uzes",
    "Nimes",
    "Aigues Mortes",
    "Saintes Maries de la mer",
    "Collioure",
    "Carcassonne",
    "Ariege",
    "Toulouse",
    "Montauban",
    "Biarritz",
    "Bayonne",
    "La Rochelle",
]

In [3]:
# One row per city, in a single 'city' column
df = pd.DataFrame({'city': cities})
df.head()

Unnamed: 0,city
0,Mont Saint Michel
1,St Malo
2,Bayeux
3,Le Havre
4,Rouen


In [4]:
# Query the OpenStreetMap Nominatim API for geolocation data.
# Example: look up France itself and inspect the raw JSON answer.
r = requests.get(
    'https://nominatim.openstreetmap.org/search',
    params={'country': 'France', 'format': 'json'},
)
resultat = r.json()
resultat

[{'place_id': 282341688,
  'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
  'osm_type': 'relation',
  'osm_id': 2202162,
  'boundingbox': ['-50.2187169', '51.3055721', '-178.3873749', '172.3057152'],
  'lat': '46.603354',
  'lon': '1.8883335',
  'display_name': 'France',
  'class': 'boundary',
  'type': 'administrative',
  'importance': 1.0233264437396503,
  'icon': 'https://nominatim.openstreetmap.org/ui/mapicons//poi_boundary_administrative.p.20.png'}]

In [5]:
# Print the coordinates of the first (and only) match returned for France
for label, key in (('latitude', 'lat'), ('longitude', 'lon')):
    print('France {} is: '.format(label), resultat[0].get(key))

France latitude is:  46.603354
France longitude is:  1.8883335


In [6]:
# Define a function that gets the coordinates of one city
def city_coordinates(city):
    """Return the coordinates of ``city`` as found by the Nominatim search API.

    Parameters
    ----------
    city : str
        Free-form place name (spaces and special characters are allowed).

    Returns
    -------
    tuple
        ``(lat, lon)`` taken from the first search result, as the API returns
        them (strings in the sample output), or ``(None, None)`` when the
        search returns no result.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    try:
        r = requests.get(
            'https://nominatim.openstreetmap.org/search',
            # Let requests build and escape the query string instead of
            # replacing spaces by hand. The free-form `q` must not be mixed
            # with the structured `country` parameter, so the search is
            # restricted to France with `countrycodes`.
            params={'q': city, 'countrycodes': 'fr', 'format': 'json'},
            # NOTE(review): the Nominatim usage policy asks clients to identify
            # themselves; replace this value with a real contact / app name.
            headers={'User-Agent': 'kayak-trip-planner-notebook'},
            timeout=10,
        )
        # Surface HTTP errors (rate limiting, blocked client...) explicitly
        # instead of failing later on an unexpected response body.
        r.raise_for_status()
        resultat = r.json()
        lat = resultat[0].get('lat')
        lon = resultat[0].get('lon')
        return (lat, lon)
    except IndexError:
        # Empty result list: the place is unknown
        return (None, None)

In [7]:
# Test on a city that does not exist: the function returns (None, None)
# when the search yields no result
a = city_coordinates('lkkhkjh kjhkh kjhkj ')
a

(None, None)

In [8]:
# Test on a real city: the result is a (latitude, longitude) tuple
b = city_coordinates('Rouen')
b

('49.4404591', '1.0939658')

The coordinates are returned as strings, so we convert them to floats in the next cell.

In [9]:
# Create columns where cities coordinates will be stored
df['latitude'] = None
df['longitude'] = None

# Iterate on all cities
for i in df.index:
    # One API call per city: the function returns a (lat, lon) tuple
    lat, lon = city_coordinates(df.at[i, 'city'])
    if lat is None or lon is None:
        # City not found: leave both coordinates empty instead of crashing on float(None)
        continue
    # Use .at for a direct cell write (chained `df[col][i] = ...` assignment is
    # unreliable and warns); coordinates arrive as strings, so convert and round
    df.at[i, 'latitude'] = round(float(lat), 4)
    df.at[i, 'longitude'] = round(float(lon), 4)

In [10]:
# Show the dataset
df

Unnamed: 0,city,latitude,longitude
0,Mont Saint Michel,48.636,-1.5115
1,St Malo,48.6495,-2.026
2,Bayeux,49.2765,-0.7025
3,Le Havre,49.4939,0.108
4,Rouen,49.4405,1.094
5,Paris,48.8589,2.32
6,Amiens,49.8942,2.2957
7,Lille,50.6366,3.0635
8,Strasbourg,48.5846,7.7507
9,Chateau du Haut Koenigsbourg,48.2495,7.3443


In [11]:
# Save the cities with their coordinates (the dataframe index is written as the first column)
df.to_csv('src/cities_coordinates.csv')

## Part II : Get weather data from openweathermap API

In [16]:
# Use the requests library to get weather data from the OpenWeatherMap API
# Example request, to have a look at the content of the response

# Read the API key from the environment when available so the secret does not
# have to live in the notebook.
# NOTE(review): the fallback literal below is the key that was hard-coded here;
# it is one character longer than the key used in the per-city download loop
# further down -- confirm which one is valid, then rotate it and drop the fallback.
owm_api_key = os.environ.get('OPENWEATHERMAP_API_KEY', '93dd497abfa8c0ecdc1920ba1bacd20e4')

example_coord = [df.latitude[0],df.longitude[0]]
example_weather = requests.get(
    "https://api.openweathermap.org/data/2.5/onecall",
    # Let requests build the query string
    params={
        'lat': example_coord[0],
        'lon': example_coord[1],
        'exclude': 'hourly,current,minutely',
        'appid': owm_api_key,
        'units': 'metric',
    },
    timeout=10,
)
example_weather

<Response [200]>

In [18]:
# Look at the content of the response. We use Python's pretty-printer module to display it in a clearer format.
# The name `example_weather` is rebound from the Response object to its decoded JSON;
# the guard keeps the cell re-runnable (a dict has no .json() method).
if isinstance(example_weather, requests.Response):
    example_weather = example_weather.json()
pprint.pprint(example_weather)

{'daily': [{'clouds': 89,
            'dew_point': 7.23,
            'dt': 1643284800,
            'feels_like': {'day': 9.73,
                           'eve': 6.71,
                           'morn': -1.04,
                           'night': 8.49},
            'humidity': 80,
            'moon_phase': 0.82,
            'moonrise': 1643252160,
            'moonset': 1643284620,
            'pop': 0.38,
            'pressure': 1033,
            'rain': 0.42,
            'sunrise': 1643269326,
            'sunset': 1643302511,
            'temp': {'day': 10.53,
                     'eve': 8.57,
                     'max': 10.92,
                     'min': 2.32,
                     'morn': 2.35,
                     'night': 9.34},
            'uvi': 0.9,
            'weather': [{'description': 'light rain',
                         'icon': '10d',
                         'id': 500,
                         'main': 'Rain'}],
            'wind_deg': 279,
            'wind_gust': 9.63,


In [19]:
example_weather.keys()

dict_keys(['lat', 'lon', 'timezone', 'timezone_offset', 'daily'])

In [20]:
len(example_weather['daily'])

8

The API provides daily weather forecasts for 8 days.

In [21]:
# Explore the content of 'daily': list the fields available for the first forecast day
weath_keys_all = example_weather['daily'][0].keys()
weath_keys_all

dict_keys(['dt', 'sunrise', 'sunset', 'moonrise', 'moonset', 'moon_phase', 'temp', 'feels_like', 'pressure', 'humidity', 'dew_point', 'wind_speed', 'wind_deg', 'wind_gust', 'weather', 'clouds', 'pop', 'rain', 'uvi'])

In [22]:
example_weather['daily'][0]['temp']

{'day': 10.53,
 'min': 2.32,
 'max': 10.92,
 'night': 9.34,
 'eve': 8.57,
 'morn': 2.35}

In [23]:
example_weather['daily'][0]['feels_like']

{'day': 9.73, 'night': 8.49, 'eve': 6.71, 'morn': -1.04}

In [24]:
# Pick, from the first forecast day, the measures kept for the rest of the study
_day0 = example_weather['daily'][0]
ex_tem_day_d0, ex_tem_night_d0 = _day0['temp']['day'], _day0['temp']['night']
ex_feelslike_day_d0, ex_feelslike_night_d0 = _day0['feels_like']['day'], _day0['feels_like']['night']
ex_clouds_d0 = _day0['clouds']
ex_wind_d0 = _day0['wind_speed']
ex_humidity_d0 = _day0['humidity']

In [25]:
# Print each extracted measure on its own line
for _value in (
    ex_tem_day_d0,
    ex_tem_night_d0,
    ex_feelslike_day_d0,
    ex_feelslike_night_d0,
    ex_clouds_d0,
    ex_wind_d0,
    ex_humidity_d0,
):
    print(_value)

10.53
9.34
9.73
8.49
89
4.21
80


In [26]:
# Get the weather data we chose to keep, for all cities
df_weather = []
days = 8  # maximum number of forecast days kept per city

# Read the API key from the environment when available.
# NOTE(review): the fallback is the key that was hard-coded in this cell;
# rotate it and remove the fallback once the environment variable is in place.
owm_api_key = os.environ.get('OPENWEATHERMAP_API_KEY', '93dd497abfa8c0ecdc1920ba1bacd20e')

for i in range(df.shape[0]):
    city = df.city[i]
    lat = df.latitude[i]
    lon = df.longitude[i]
    # One request per city: a single response already holds every forecast day,
    # so the call must not be repeated for each day.
    r = requests.get(
        "https://api.openweathermap.org/data/2.5/onecall",
        params={
            'lat': lat,
            'lon': lon,
            'exclude': 'hourly,current,minutely',
            'appid': owm_api_key,
            'units': 'metric',
        },
        timeout=10,
    ).json()
    # Slice rather than index so a shorter forecast does not raise IndexError
    for day, d in enumerate(r['daily'][:days]):
        date = datetime.fromtimestamp(d['dt']).strftime('%d/%m/%Y')
        tem_day = d['temp']['day']
        tem_night = d['temp']['night']
        feelslike_day = d['feels_like']['day']
        feelslike_night = d['feels_like']['night']
        clouds = d['clouds']
        wind = d['wind_speed']
        humidity = d['humidity']
        df_weather.append([city, lat, lon, day, date, tem_day, tem_night, feelslike_day, feelslike_night, clouds, wind, humidity])


In [27]:
# Build the per-city, per-day weather dataframe from the collected rows
cols = [
    'city', 'latitude', 'longitude',
    'day', 'date',
    'temp_day', 'temp_night',
    'feels_like_day', 'feels_like_night',
    'clouds', 'wind_speed', 'humidity',
]
# Remove the cap on the number of rows pandas displays
pd.set_option('display.max_rows', None)
df_weather_days = pd.DataFrame(data=df_weather, columns=cols)
df_weather_days.head(10)

Unnamed: 0,city,latitude,longitude,day,date,temp_day,temp_night,feels_like_day,feels_like_night,clouds,wind_speed,humidity
0,Mont Saint Michel,48.636,-1.5115,0,27/01/2022,10.53,9.34,9.73,8.49,89,4.21,80
1,Mont Saint Michel,48.636,-1.5115,1,28/01/2022,10.42,7.99,9.55,6.87,73,2.32,78
2,Mont Saint Michel,48.636,-1.5115,2,29/01/2022,11.73,9.08,11.07,6.91,43,6.38,81
3,Mont Saint Michel,48.636,-1.5115,3,30/01/2022,8.53,4.8,7.39,4.8,100,3.64,68
4,Mont Saint Michel,48.636,-1.5115,4,31/01/2022,7.67,4.43,3.63,1.95,100,9.06,73
5,Mont Saint Michel,48.636,-1.5115,5,01/02/2022,10.51,9.85,9.76,7.27,71,7.9,82
6,Mont Saint Michel,48.636,-1.5115,6,02/02/2022,11.86,7.63,11.22,5.47,88,5.4,81
7,Mont Saint Michel,48.636,-1.5115,7,03/02/2022,9.88,9.91,7.09,6.91,94,7.9,84
8,St Malo,48.6495,-2.026,0,27/01/2022,10.39,9.7,9.5,8.03,92,6.0,77
9,St Malo,48.6495,-2.026,1,28/01/2022,10.42,8.95,9.58,6.98,73,3.47,79


In [28]:
df_weather_days['city'].nunique()

35

In [29]:
# Store weather data for all days in a csv
# (the dataframe index is written too, as an unnamed first column)
df_weather_days.to_csv('src/weather_days.csv')

In [30]:
# Reload the daily weather data from disk. The index column saved in the csv
# comes back as a regular column named 'Unnamed: 0'.
df_weather_days = pd.read_csv('src/weather_days.csv')

In [31]:
# Define a function that gives the names of the top cities according to a specific criteria and sort order
def top_cities_names(nb_cities, criteria, ascending, df=None):
    """Return the names of the best-ranked cities for one weather measure.

    Parameters
    ----------
    nb_cities : int
        Number of city names to return.
    criteria : str
        Column whose per-city mean is used for the ranking (e.g. 'temp_day').
    ascending : bool
        True to rank from the lowest mean to the highest, False for the reverse.
    df : pandas.DataFrame, optional
        Data with a 'city' column and the ``criteria`` column. Defaults to the
        notebook-level ``df_weather_days`` so existing calls keep working.

    Returns
    -------
    numpy.ndarray
        City names, best-ranked first.
    """
    if df is None:
        # Backward-compatible default: the global daily weather dataframe
        df = df_weather_days
    ranked = df.groupby('city')[criteria].mean().sort_values(ascending=ascending)
    res = ranked.reset_index()[0:nb_cities]['city'].values
    return res

In [32]:
top_cities_names(5,'temp_day',False)

array(['Collioure', 'Nimes', 'Aigues Mortes', 'Cassis',
       'Saintes Maries de la mer'], dtype=object)

In [33]:
# Define a dataframe with the average weather measures over the 8 days.
# numeric_only=True skips the text 'date' column: without it, recent pandas
# versions raise a TypeError when asked to average strings.
df_weather_avg = df_weather_days.groupby('city').mean(numeric_only=True)
df_weather_avg.head()

Unnamed: 0_level_0,Unnamed: 0,latitude,longitude,day,temp_day,temp_night,feels_like_day,feels_like_night,clouds,wind_speed,humidity
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Aigues Mortes,203.5,43.5658,4.1913,3.5,12.2225,9.1025,10.86375,6.8775,18.0,7.3175,51.5
Aix en Provence,171.5,43.5298,5.4475,3.5,11.0075,7.07,9.5525,5.865,17.375,3.71875,48.75
Amiens,51.5,49.8942,2.2957,3.5,9.07875,6.81375,7.3525,4.92875,83.625,6.935,80.5
Annecy,115.5,45.8992,6.1289,3.5,7.605,3.0125,6.675,2.56875,58.75,2.145,69.875
Ariege,235.5,42.9455,1.4066,3.5,5.68,0.25,4.25125,-1.7275,36.0,2.59625,58.0


In [34]:
# Drop useless columns.
# Rebinding (instead of inplace=True) with errors='ignore' keeps the cell
# re-runnable: a second run no longer fails on already-removed columns.
df_weather_avg = df_weather_avg.drop(columns=['Unnamed: 0', 'day'], errors='ignore')

In [35]:
df_weather_avg.head()

Unnamed: 0_level_0,latitude,longitude,temp_day,temp_night,feels_like_day,feels_like_night,clouds,wind_speed,humidity
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Aigues Mortes,43.5658,4.1913,12.2225,9.1025,10.86375,6.8775,18.0,7.3175,51.5
Aix en Provence,43.5298,5.4475,11.0075,7.07,9.5525,5.865,17.375,3.71875,48.75
Amiens,49.8942,2.2957,9.07875,6.81375,7.3525,4.92875,83.625,6.935,80.5
Annecy,45.8992,6.1289,7.605,3.0125,6.675,2.56875,58.75,2.145,69.875
Ariege,42.9455,1.4066,5.68,0.25,4.25125,-1.7275,36.0,2.59625,58.0


In [36]:
# Store the average weather data in a csv
# (the index holds the city names here, so it is written as the first column)
df_weather_avg.to_csv('src/weather.csv')

In [37]:
# Define a function that returns the per-city averages of the weather measures, sorted on one column
def df_sorted_cities(df, column, ascending):
    """Average every numeric column per city and sort the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with 'city' either as a column or as the index name.
    column : str
        Numeric column used for the sort.
    ascending : bool
        Sort direction passed to ``sort_values``.

    Returns
    -------
    pandas.DataFrame
        One row per city, with 'city' as a regular column and a fresh integer index.
    """
    # numeric_only=True leaves out text columns such as 'date'; recent pandas
    # versions raise a TypeError when asked to average them.
    res = df.groupby('city').mean(numeric_only=True).sort_values(column, ascending=ascending).reset_index()
    return res

In [38]:
df_sorted_cities(df_weather_days, 'temp_day', False).head()

Unnamed: 0.1,city,Unnamed: 0,latitude,longitude,day,temp_day,temp_night,feels_like_day,feels_like_night,clouds,wind_speed,humidity
0,Collioure,219.5,42.5251,3.0832,3.5,14.5275,10.315,13.2475,8.385,15.0,8.7675,46.375
1,Nimes,195.5,43.8374,4.3601,3.5,12.32375,7.425,10.6375,4.82875,18.625,7.79375,47.125
2,Aigues Mortes,203.5,43.5658,4.1913,3.5,12.2225,9.1025,10.86375,6.8775,18.0,7.3175,51.5
3,Cassis,155.5,43.214,5.5396,3.5,12.15875,10.04,10.805,8.2475,21.375,6.3075,52.625
4,Saintes Maries de la mer,211.5,43.4523,4.4287,3.5,12.09,9.58625,10.5975,6.88375,24.125,10.47625,53.375


In [92]:
# Map of all cities, coloured and sized by their average day temperature
cities_temp = df_sorted_cities(df_weather_avg, 'temp_day', False)
fig1 = px.scatter_mapbox(
    cities_temp,
    lat="latitude",
    lon="longitude",
    color="temp_day",
    hover_name="city",
    size="temp_day",
    mapbox_style="carto-positron",
    zoom=5,
)
fig1.update_layout(title='Cities temperature')
fig1.show()

In [87]:
# Define some variables for the plot loop

# (column, sort code) pairs: which measure to rank on and in which direction
weather_cols_sens = [
    ('temp_day', 'des'),
    ('temp_night', 'des'),
    ('feels_like_day', 'des'),
    ('feels_like_night', 'des'),
    ('clouds', 'asc'),
    ('wind_speed', 'asc'),
    ('humidity', 'asc'),
]

# Human-readable label of each measure, used in the figure titles
dict_titles = dict(
    temp_day='average 8 days temperature during the day',
    temp_night='average 8 days temperature during the night',
    feels_like_day='average 8 days felt temperature during the day',
    feels_like_night='average 8 days felt temperature during the night',
    clouds='average 8 days clouds',
    wind_speed='average 8 days wind speed',
    humidity='average 8 days humidity',
)

# Sort code -> value of the `ascending` argument
dict_sens = dict(asc=True, des=False)

In [108]:
# One map per weather measure, showing the 5 best-ranked cities for it
for col, sort in weather_cols_sens:
    top5 = df_sorted_cities(df_weather_avg, col, dict_sens[sort])[0:5]
    fig2 = px.scatter_mapbox(
        top5,
        lat="latitude",
        lon="longitude",
        color=col,
        hover_name="city",
        size=col,
        mapbox_style="carto-positron",
        zoom=5,
    )
    fig2.update_layout(title='Top 5 cities according to {}'.format(dict_titles[col]))
    fig2.show()

## Part III : Scraping booking.com

In [68]:
# Test URLs that select only hotels, in descending rating order, for all cities.
# The city name (spaces turned into '+') fills the `ss=` search parameter.
list_url_essai = [
    "https://www.booking.com/searchresults.fr.html?label=gen173nr-1DCAEoggI46AdIM1gEaE2IAQGYAQ24ARfIAQzYAQPoAQGIAgGoAgO4ApSdyo8GwAIB0gIkMDcyYTY0N2YtMWQzYS00ZWE1LTlhYTYtYWI0MThkMTM1ZTcz2AIE4AIB&sid=1af4ef95ff0245766f5bd84859ae7b29&aid=304142&lang=fr&sb_lp=1&error_url=https%3A%2F%2Fwww.booking.com%2Findex.fr.html%3Flabel%3Dgen173nr-1DCAEoggI46AdIM1gEaE2IAQGYAQ24ARfIAQzYAQPoAQGIAgGoAgO4ApSdyo8GwAIB0gIkMDcyYTY0N2YtMWQzYS00ZWE1LTlhYTYtYWI0MThkMTM1ZTcz2AIE4AIB%3Bsid%3D1af4ef95ff0245766f5bd84859ae7b29%3Bsb_price_type%3Dtotal%26%3B&ss={}&nflt=ht_id%3D204&shw_aparth=0&order=bayesian_review_score"
    .format(city_name.replace(' ', '+'))
    for city_name in cities
]
list_url_essai[0]

'https://www.booking.com/searchresults.fr.html?label=gen173nr-1DCAEoggI46AdIM1gEaE2IAQGYAQ24ARfIAQzYAQPoAQGIAgGoAgO4ApSdyo8GwAIB0gIkMDcyYTY0N2YtMWQzYS00ZWE1LTlhYTYtYWI0MThkMTM1ZTcz2AIE4AIB&sid=1af4ef95ff0245766f5bd84859ae7b29&aid=304142&lang=fr&sb_lp=1&error_url=https%3A%2F%2Fwww.booking.com%2Findex.fr.html%3Flabel%3Dgen173nr-1DCAEoggI46AdIM1gEaE2IAQGYAQ24ARfIAQzYAQPoAQGIAgGoAgO4ApSdyo8GwAIB0gIkMDcyYTY0N2YtMWQzYS00ZWE1LTlhYTYtYWI0MThkMTM1ZTcz2AIE4AIB%3Bsid%3D1af4ef95ff0245766f5bd84859ae7b29%3Bsb_price_type%3Dtotal%26%3B&ss=Mont+Saint+Michel&nflt=ht_id%3D204&shw_aparth=0&order=bayesian_review_score'

In [44]:
# Booking scraping
class BookingSpider(scrapy.Spider):
    """Scrape booking.com hotels for every entry of the notebook's ``cities`` list.

    For each city a search-results page is requested; every hotel found on it
    is then visited to collect its rating, description and coordinates.

    The city name travels with each request in ``meta``. Responses are handled
    in the order they arrive, which is not necessarily the order of
    ``start_urls``, so a shared counter cannot be used to recover the city.
    """

    name = "booking"

    # Search URL used by the notebook to list hotels by descending rating;
    # `{}` receives the city name with spaces replaced by '+'.
    search_url_template = "https://www.booking.com/searchresults.fr.html?label=gen173nr-1DCAEoggI46AdIM1gEaE2IAQGYAQ24ARfIAQzYAQPoAQGIAgGoAgO4ApSdyo8GwAIB0gIkMDcyYTY0N2YtMWQzYS00ZWE1LTlhYTYtYWI0MThkMTM1ZTcz2AIE4AIB&sid=1af4ef95ff0245766f5bd84859ae7b29&aid=304142&lang=fr&sb_lp=1&error_url=https%3A%2F%2Fwww.booking.com%2Findex.fr.html%3Flabel%3Dgen173nr-1DCAEoggI46AdIM1gEaE2IAQGYAQ24ARfIAQzYAQPoAQGIAgGoAgO4ApSdyo8GwAIB0gIkMDcyYTY0N2YtMWQzYS00ZWE1LTlhYTYtYWI0MThkMTM1ZTcz2AIE4AIB%3Bsid%3D1af4ef95ff0245766f5bd84859ae7b29%3Bsb_price_type%3Dtotal%26%3B&ss={}&nflt=ht_id%3D204&shw_aparth=0&order=bayesian_review_score"

    # One search URL per city, in the same order as `cities`
    list_url = []
    for _city in cities:
        list_url.append(search_url_template.format(_city.replace(' ','+')))

    start_urls = list_url

    def start_requests(self):
        """Yield one search request per city, tagged with that city's name."""
        # NOTE(review): newer Scrapy releases favour an async `start()` method
        # over `start_requests()`; confirm against the installed version.
        for city, url in zip(cities, self.start_urls):
            yield scrapy.Request(
                    url,
                    callback=self.parse,
                    dont_filter=True,
                    meta={'city' : city}
                    )

    def parse(self, response):
        """Follow every hotel link of a search-results page.

        Yields one request per hotel, carrying the city, the hotel URL and the
        hotel name in ``meta`` for ``scrap_hotel``.
        """
        city = response.meta.get('city')
        hotels = response.css('div._fe1927d9e._0811a1b54._a8a1be610._022ee35ec.b9c27d6646.fb3c4512b4.fc21746a73')
        for hotel in hotels:
            hotel_url = hotel.css('a::attr(href)').get()
            if not hotel_url:
                # No link in this card: nothing to follow
                continue
            # Resolve relative links against the page URL (absolute ones are unchanged)
            hotel_url = response.urljoin(hotel_url)
            hotel_name = hotel.css('div.fde444d7ef._c445487e2::text').get()
            yield scrapy.Request(
                    hotel_url,
                    callback=self.scrap_hotel,
                    meta={'city' : city,
                         'hotel_url' : hotel_url,
                         'hotel_name' : hotel_name }
                    )

    def scrap_hotel(self, response):
        """Build the output record of one hotel from its detail page.

        Returns a dict with the city, name and URL received through ``meta``
        and the rating, description and coordinates read from the page.
        Coordinates are ``None`` when the page does not expose them.
        """
        # 'data-atlas-latlng' holds "lat,lon"; read it once and tolerate its absence
        latlng = response.css('a#hotel_header').attrib.get('data-atlas-latlng', '')
        coords = latlng.split(",")
        if len(coords) >= 2:
            hotel_lat, hotel_lon = coords[0], coords[1]
        else:
            hotel_lat, hotel_lon = None, None
        return {
                'city' : response.meta.get('city'),
                'hotel_name': response.meta.get('hotel_name'),
                'hotel_ranking': response.css('div._9c5f726ff.bd528f9ea6::text').get(),
                'hotel_description': ''.join(response.css('div#property_description_content p::text').getall()),
                'hotel_url' : response.meta.get('hotel_url'),
                'hotel_lat': hotel_lat,
                'hotel_lon': hotel_lon
                }

In [45]:
# Name of the file where the results will be saved
filename = "top_hotels.json"
output_path = 'src/' + filename

# Make sure the output folder exists (listing a missing folder would raise)
os.makedirs('src/', exist_ok=True)

# If the file already exists, delete it before crawling
if os.path.exists(output_path):
    os.remove(output_path)

# Declare a new CrawlerProcess
process = CrawlerProcess(settings = {
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) Gecko/20100101 Firefox/96.0', #=> Simulates a browser on an OS
    'LOG_LEVEL': logging.INFO, # => Minimal level of log
    "FEEDS": {
        output_path : {"format": "json"}, # => Where the file will be stored
    },
    "AUTOTHROTTLE_ENABLED": True
})

# Start the crawling
process.crawl(BookingSpider)
process.start()

2022-01-27 12:45:40 [scrapy.utils.log] INFO: Scrapy 2.5.1 started (bot: scrapybot)
2022-01-27 12:45:40 [scrapy.utils.log] INFO: Versions: lxml 4.7.1.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.8.6 | packaged by conda-forge | (default, Oct  7 2020, 19:08:05) - [GCC 7.5.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1h  22 Sep 2020), cryptography 3.1.1, Platform Linux-5.4.129+-x86_64-with-glibc2.10
2022-01-27 12:45:40 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'LOG_LEVEL': 20,
 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) '
               'Gecko/20100101 Firefox/96.0'}
2022-01-27 12:45:40 [scrapy.extensions.telnet] INFO: Telnet Password: 8c8ce0bd750bd326
2022-01-27 12:45:40 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.exte

In [57]:
# Import the scraping results from the .json file written by the crawler
df_hotels = pd.read_json('src/top_hotels.json')

In [58]:
df_hotels.head()

Unnamed: 0,city,hotel_name,hotel_ranking,hotel_description,hotel_url,hotel_lat,hotel_lon
0,Mont Saint Michel,Le Lithana,73,Vous pouvez bénéficier d'une réduction Genius ...,https://www.booking.com/hotel/fr/le-lithana.fr...,48.554097,-1.500629
1,St Malo,Escale Oceania Saint Malo,82,Vous pouvez bénéficier d'une réduction Genius ...,https://www.booking.com/hotel/fr/mascottesaint...,48.65428,-2.008838
2,Le Havre,Hôtel Le Green,42,L'Hôtel Le Green est situé dans le centre du H...,https://www.booking.com/hotel/fr/le-green.fr.h...,49.492303,0.122933
3,Bayeux,Hôtel De La Marine,81,Face à la mer et au célèbre musée du Débarquem...,https://www.booking.com/hotel/fr/de-la-marine....,49.340483,-0.623109
4,Rouen,Astrid,65,Cet hôtel moderne est situé en face de la gare...,https://www.booking.com/hotel/fr/hotelastrid_r...,49.448306,1.093814


In [59]:
# Data cleaning
# Every replacement is a literal one (regex=False): the default of `regex`
# differs between pandas versions, and patterns such as '.' must never be
# read as regular expressions.

# Hotel names: strip the 'NOCNOC' branding
df_hotels['hotel_name'] = (
    df_hotels['hotel_name']
    .str.replace('NOCNOC - ', '', regex=False)
    .str.replace('par NOCNOC', '', regex=False)
)

# Hotel descriptions
df_hotels['hotel_description'] = (
    df_hotels['hotel_description']
    .str.replace('\xa0', '', regex=False)        # non-breaking spaces
    .str.replace(' - ', ' ', regex=False)
    # 'par NOCNOC' must go before the bare 'NOCNOC', otherwise it can never
    # match and a dangling 'par ' is left behind
    .str.replace('par NOCNOC', '', regex=False)
    .str.replace('NOCNOC', '', regex=False)
    .str.replace('.', '. ', regex=False)         # add a space after each period
    .str.replace('  ', ' ', regex=False)         # collapse the double spaces this creates
)

# Ratings use a decimal comma: convert them to floats.
# Skipped when the column is already numeric so the cell can be re-run.
if not pd.api.types.is_numeric_dtype(df_hotels['hotel_ranking']):
    df_hotels['hotel_ranking'] = (
        df_hotels['hotel_ranking']
        .str.replace(',', '.', regex=False)
        .astype(float)
    )

In [60]:
df_hotels.head()

Unnamed: 0,city,hotel_name,hotel_ranking,hotel_description,hotel_url,hotel_lat,hotel_lon
0,Mont Saint Michel,Le Lithana,7.3,Vous pouvez bénéficier d'une réduction Genius ...,https://www.booking.com/hotel/fr/le-lithana.fr...,48.554097,-1.500629
1,St Malo,Escale Oceania Saint Malo,8.2,Vous pouvez bénéficier d'une réduction Genius ...,https://www.booking.com/hotel/fr/mascottesaint...,48.65428,-2.008838
2,Le Havre,Hôtel Le Green,4.2,L'Hôtel Le Green est situé dans le centre du H...,https://www.booking.com/hotel/fr/le-green.fr.h...,49.492303,0.122933
3,Bayeux,Hôtel De La Marine,8.1,Face à la mer et au célèbre musée du Débarquem...,https://www.booking.com/hotel/fr/de-la-marine....,49.340483,-0.623109
4,Rouen,Astrid,6.5,Cet hôtel moderne est situé en face de la gare...,https://www.booking.com/hotel/fr/hotelastrid_r...,49.448306,1.093814


In [61]:
# Merge weather and hotels dataframes on the city name.
# df_weather_avg carries the city as its index (it comes from a groupby on 'city').
# With merge's default inner join, hotels whose city has no weather row are dropped.
df_kayak = df_hotels.merge(df_weather_avg, on=['city'])

In [62]:
df_kayak.columns

Index(['city', 'hotel_name', 'hotel_ranking', 'hotel_description', 'hotel_url',
       'hotel_lat', 'hotel_lon', 'latitude', 'longitude', 'temp_day',
       'temp_night', 'feels_like_day', 'feels_like_night', 'clouds',
       'wind_speed', 'humidity'],
      dtype='object')

In [63]:
# Drop useless columns (the city coordinates; hotels have their own).
# Rebinding with errors='ignore' keeps the cell re-runnable.
df_kayak = df_kayak.drop(columns=['latitude', 'longitude'], errors='ignore')

In [64]:
df_kayak.head()

Unnamed: 0,city,hotel_name,hotel_ranking,hotel_description,hotel_url,hotel_lat,hotel_lon,temp_day,temp_night,feels_like_day,feels_like_night,clouds,wind_speed,humidity
0,Mont Saint Michel,Le Lithana,7.3,Vous pouvez bénéficier d'une réduction Genius ...,https://www.booking.com/hotel/fr/le-lithana.fr...,48.554097,-1.500629,10.14125,7.87875,8.68,6.08375,82.25,5.85125,78.375
1,Mont Saint Michel,Le Beauvoir,7.7,Vous pouvez bénéficier d'une réduction Genius ...,https://www.booking.com/hotel/fr/le-beauvoir-b...,48.597713,-1.513098,10.14125,7.87875,8.68,6.08375,82.25,5.85125,78.375
2,Mont Saint Michel,Hôtel Ariane & SPA,7.6,Ce charmant hôtel est situé à la frontière ent...,https://www.booking.com/hotel/fr/ariane-pontor...,48.552403,-1.507123,10.14125,7.87875,8.68,6.08375,82.25,5.85125,78.375
3,Mont Saint Michel,"Manoir de la Roche Torin, The Originals Relais...",7.7,Vous pouvez bénéficier d'une réduction Genius ...,https://www.booking.com/hotel/fr/le-manoir-de-...,48.637273,-1.425062,10.14125,7.87875,8.68,6.08375,82.25,5.85125,78.375
4,Mont Saint Michel,"The Originals Boutique, Hôtel Les Quatre Salin...",7.9,Vous pouvez bénéficier d'une réduction Genius ...,https://www.booking.com/hotel/fr/lessalines.fr...,48.596404,-1.589203,10.14125,7.87875,8.68,6.08375,82.25,5.85125,78.375


In [65]:
# Export the merged hotels + weather dataframe to a csv
df_kayak.to_csv('src/kayak.csv')

In [70]:
# List of top 5 cities
top_cities_names(5,'temp_day',False)

array(['Collioure', 'Nimes', 'Aigues Mortes', 'Cassis',
       'Saintes Maries de la mer'], dtype=object)

In [69]:
# For each of the five warmest cities, map its 20 highest-rated hotels
for city in top_cities_names(5, 'temp_day', False):
    city_hotels = df_kayak[df_kayak['city'] == city]
    data = city_hotels.sort_values('hotel_ranking', ascending=False).head(20)
    fig = px.scatter_mapbox(
        data,
        lat="hotel_lat",
        lon="hotel_lon",
        color="hotel_ranking",
        hover_name="hotel_name",
        mapbox_style="carto-positron",
        zoom=5,
    )
    fig.update_layout(title='Top 20 hotels in {}'.format(city))
    fig.show()