In [1]:
import pandas as pd , numpy as np , requests
from tqdm import tqdm
import matplotlib.pyplot as plt , seaborn as sns
import plotly.graph_objects as go , plotly.express as px


from statistics import mode

from scraping_booking import get_data_by_city # Use my scraping class
import scrapy 
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By 
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager


import time
import warnings
warnings.filterwarnings('ignore')

In [143]:
##### Constant ##### 
liste_city = ["Mont Saint Michel",
"St Malo",
"Bayeux",
"Le Havre",
"Rouen",
"Paris",
"Amiens",
"Lille",
"Strasbourg",
"Chateau du Haut Koenigsbourg",
"Colmar",
"Eguisheim",
"Besancon",
"Dijon",
"Annecy",
"Grenoble",
"Lyon",
"Gorges du Verdon",
"Bormes les Mimosas",
"Cassis",
"Marseille",
"Aix en Provence",
"Avignon",
"Uzes",
"Nimes",
"Aigues Mortes",
"Saintes Maries de la mer",
"Collioure",
"Carcassonne",
"Ariege",
"Toulouse",
"Montauban",
"Biarritz",
"Bayonne",
"La Rochelle"]

api_gps = 'https://nominatim.org/'
api_weather = 'https://openweathermap.org/api/one-call-api'
api_weather_key = '8227ebf88578625ece9b69b06f786a96'

## Get GPS data
Use https://nominatim.org/ to get the gps coordinates of all the cities (no subscription required) Documentation : https://nominatim.org/release-docs/develop/api/Search/

In [144]:
def get_coordinates(address):
    url = 'https://nominatim.openstreetmap.org/search'
    params = {'q': address, 'format': 'jsonv2'}
    response = requests.get(url, params=params).json()
    if len(response) > 0:
        lat = response[0]['lat']
        lon = response[0]['lon']
        return (float(lat), float(lon))
    else:
        return None
    

lat , long = [] , []    
for city in tqdm(liste_city):

    tuple_latlong = get_coordinates(address=city)
    lat.append(tuple_latlong[0])
    long.append(tuple_latlong[1])

100%|██████████| 35/35 [00:17<00:00,  2.04it/s]


In [145]:
df = pd.DataFrame({'City' : liste_city , 'Lat' : lat , 'Long' : long})
df.head()

Unnamed: 0,City,Lat,Long
0,Mont Saint Michel,48.635954,-1.51146
1,St Malo,48.649518,-2.026041
2,Bayeux,49.276462,-0.702474
3,Le Havre,49.493898,0.107973
4,Rouen,49.440459,1.093966


## Get weather data
Use https://openweathermap.org/appid (you have to subscribe to get a free apikey) and https://openweathermap.org/api/one-call-api to get some information about the weather for the 35 cities and put it in a DataFrame

In [146]:
mean_temps , mean_prob , most_freq_weather = [] , [] , []
for i in range(len(df)):
    
    lat = df.iloc[i]['Lat']
    long = df.iloc[i]['Long']

    url = f'https://api.openweathermap.org/data/3.0/onecall?lat={lat}&lon={long}&exclude=current,minutely,hourly,alerts&units=metric&appid={api_weather_key}'
    response = requests.get(url).json()
    
    temps , prob , weather = [] , [] , []
    for j in range(len(response['daily'])):
        weather.append(response['daily'][j]['weather'][0]['main']) # Main meteo
        prob.append(response['daily'][j]['pop'] * 10) # Probality of precipitation
        temps.append(response['daily'][j]['feels_like']['day']) # Global temperature of the day

    mean_temps.append(np.mean(temps))
    mean_prob.append(np.mean(prob))
    most_freq_weather.append(mode(weather))
    

score = np.array(mean_temps) - np.array(mean_prob) # The KPI here is temperature_of_the_day - (prob_rain * 10) , and the weather has to be sunny
assert len(mean_prob) == len(mean_temps) == len(most_freq_weather) == len(score)

df['score_7_days'] = score 
df['mean_temperature_7_days'] = mean_temps
df['most_freq_weather_7_days'] = most_freq_weather
df['mean_prob_of_rain_7_days'] = np.array(mean_prob) / 10


df = df.sort_values(by='score_7_days' , ascending=False).reset_index(drop=True)
df_bestdest = df[df['most_freq_weather_7_days'] != 'Rain']
df.to_csv('df_with_weather.csv')
df_bestdest.to_csv('df_best_destination.csv')
df.head()

Unnamed: 0,City,Lat,Long,score_7_days,mean_temperature_7_days,most_freq_weather_7_days,mean_prob_of_rain_7_days
0,Collioure,42.52505,3.083155,15.995,16.57,Clouds,0.0575
1,Nimes,43.837425,4.360069,15.965,16.1025,Clouds,0.01375
2,Aix en Provence,43.529842,5.447474,15.53375,15.57125,Clouds,0.00375
3,Bormes les Mimosas,43.150697,6.341928,15.36625,15.44125,Clouds,0.0075
4,Aigues Mortes,43.565823,4.191284,15.18375,15.25875,Clouds,0.0075


## Find the nicest weather
Determine the list of cities where the weather will be the nicest within the next 7 days For example, you can use the values of daily.pop and daily.rain to compute the expected volume of rain within the next 7 days... But it's only an example, actually you can have different opinions on a what a nice weather would be like 😎 Maybe the most important criterion for you is the temperature or humidity, so feel free to change the rules !

## Save all the results in a .csv file
Save all the results in a .csv file, you will use it later 😉 You can save all the informations that seem important to you ! Don't forget to save the name of the cities, and also to create a column containing a unique identifier (id) of each city (this is important for what's next in the project)

## Use plotly to display the best destinations on a map 

In [156]:
fig = px.scatter_mapbox(df_bestdest.head(), lat="Lat", lon="Long", hover_name = 'City', zoom = 5,
                        hover_data={
        'Lat': False,
        'Long': False,
        'score_7_days':False,
        'City': True,
        'mean_temperature_7_days': True,
        }, 
                        color = 'City', color_discrete_sequence = px.colors.sequential.Inferno, template='plotly', size='score_7_days',
                        mapbox_style='open-street-map',width = 1050, height = 900,
                        title='Best rated hotels in selected cities (with the best weather in the last 7 days)')
fig.show()


## Scrape Booking.com
Since BookingHoldings doesn't have aggregated databases, it will be much faster to scrape data directly from booking.com

You can scrap as many information as you want, but we suggest that you get at least:

- hotel name,
- Url to its booking.com page,
- Its coordinates: latitude and longitude
- Score given by the website users
- Text description of the hotel
- Create your data lake using S3
- Once you managed to build your dataset, you should store into S3 as a csv file.

In [2]:
df_ = get_data_by_city('Rouen' , 1)
df_

2023-03-30 17:13:38 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.booking.com/searchresults.fr.html?label=gen173nr-1FCAEoggI46AdIDVgEaE2IAQGYAQ24ARfIAQzYAQHoAQH4AQKIAgGoAgO4AvuylaEGwAIB0gIkMjBiZDJlZWUtZjY0ZC00OWVlLWExZGQtMWQzN2NhYTA1NDA52AIF4AIB&aid=304142&ss=Rouen&ssne=Rouen&ssne_untouched=Rouen&lang=fr&sb=1&src_elem=sb&src=index&dest_id=-1456928&dest_type=city&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset=225> (referer: https://www.booking.com/searchresults.fr.html?label=gen173nr-1FCAEoggI46AdIDVgEaE2IAQGYAQ24ARfIAQzYAQHoAQH4AQKIAgGoAgO4AvuylaEGwAIB0gIkMjBiZDJlZWUtZjY0ZC00OWVlLWExZGQtMWQzN2NhYTA1NDA52AIF4AIB&aid=304142&ss=Rouen&ssne=Rouen&ssne_untouched=Rouen&lang=fr&sb=1&src_elem=sb&src=index&dest_id=-1456928&dest_type=city&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset=200)
Traceback (most recent call last):
  File "C:\Users\lucas\AppData\Roaming\Python\Python310\site-packages\urllib3\connection.py",

Unnamed: 0,city,name_hotel,Loc,Description,Rating,Link
0,Rouen,Urban Bivouac Hotel,"13e arr., Paris",L’Urban Bivouac Hotel propose des hébergements...,87,https://www.booking.com/hotel/fr/ub-paris.fr.h...
1,Rouen,Belambra City - Magendie,"13e arr., Paris",L’hôtel Belambra City - Magendie se trouve à 1...,77,https://www.booking.com/hotel/fr/belambra-city...
2,Rouen,OKKO Hotels Paris Gare de l'Est,"10e arr., Paris",L’OKKO Hotels Paris Gare de l'Est est situé à ...,83,https://www.booking.com/hotel/fr/okko-hotels-p...
3,Rouen,HIGHSTAY - Le Marais - Serviced Apartments,"3e arr., Paris",Le HIGHSTAY - Le Marais - Serviced Apartments ...,88,https://www.booking.com/hotel/fr/highstay-reau...
4,Rouen,LivinParis - Luxury 2 Bedrooms Opera I,"10e arr., Paris","Situé à Paris, à moins de 1 km de la station d...",84,https://www.booking.com/hotel/fr/luxury-apartm...
5,Rouen,Le Marceau Bastille,"12e arr., Paris",Situé à 550 mètres de la place de la Bastille ...,86,https://www.booking.com/hotel/fr/le-marceau-ba...
6,Rouen,Pullman Paris Centre - Bercy,"12e arr., Paris",Le Pullman Paris Centre - Bercy vous accueille...,83,https://www.booking.com/hotel/fr/pullman-paris...
7,Rouen,Hotel Ile de France Opéra,"2e arr., Paris",L'hôtel Île de France Opéra occupe un bâtiment...,76,https://www.booking.com/hotel/fr/idfo.fr.html?...
8,Rouen,Les Patios du Marais 1,"3e arr., Paris",L'établissement Les Patios du Marais propose d...,73,https://www.booking.com/hotel/fr/les-patios-du...
9,Rouen,Hôtel La Nouvelle République & Hammam,"11e arr., Paris","Situé dans le 11ème arrondissement de Paris, l...",87,https://www.booking.com/hotel/fr/la-nouvelle-r...
