In [None]:
import pandas as pd , numpy as np , requests
from statistics import mode
from tqdm import tqdm

import matplotlib.pyplot as plt , seaborn as sns
import plotly.graph_objects as go , plotly.express as px


import boto3
import mysql.connector


import warnings
warnings.filterwarnings('ignore')

In [None]:
##### Constants #####
# Names of the cities to geocode and to fetch a weather forecast for.
liste_city = [
    "Mont Saint Michel",
    "St Malo",
    "Bayeux",
    "Le Havre",
    "Rouen",
    "Paris",
    "Amiens",
    "Lille",
    "Strasbourg",
    "Chateau du Haut Koenigsbourg",
    "Colmar",
    "Eguisheim",
    "Besancon",
    "Dijon",
    "Annecy",
    "Grenoble",
    "Lyon",
    "Gorges du Verdon",
    "Bormes les Mimosas",
    "Cassis",
    "Marseille",
    "Aix en Provence",
    "Avignon",
    "Uzes",
    "Nimes",
    "Aigues Mortes",
    "Saintes Maries de la mer",
    "Collioure",
    "Carcassonne",
    "Ariege",
    "Toulouse",
    "Montauban",
    "Biarritz",
    "Bayonne",
    "La Rochelle",
]

# Home pages of the two external APIs (not referenced by the cells visible in this notebook).
api_gps = 'https://nominatim.org/'
api_weather = 'https://openweathermap.org/api/one-call-api'

# OpenWeatherMap API key: left empty in this file, set it before running the weather cell.
api_weather_key = ''

# AWS credentials: the first line of the file holds the access key and the secret key
# separated by whitespace. The `with` block guarantees the file handle is closed,
# and the line is split only once.
# NOTE(review): this is an absolute, machine-specific path — consider making it configurable.
with open('C:\\Users\\lucas\\OneDrive\\Bureau\\Credential_aws.txt' , 'r') as credentials_file:
    credentials = credentials_file.readlines()
api_key , api_secret_key = credentials[0].split()[:2]

## Get GPS data
Use https://nominatim.org/ to get the GPS coordinates of all the cities (no subscription required). Documentation: https://nominatim.org/release-docs/develop/api/Search/

In [None]:
def get_coordinates(address, timeout=10, user_agent='kayak-project-notebook'):
    """Geocode a free-text address with the Nominatim search API.

    Parameters
    ----------
    address : str
        Free-text query sent as the ``q`` parameter (city name, street address, ...).
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook forever.
    user_agent : str, optional
        Value of the ``User-Agent`` header. The Nominatim usage policy asks
        every client to identify itself; anonymous library defaults may be rejected.

    Returns
    -------
    tuple of (float, float) or None
        ``(latitude, longitude)`` of the first result, or ``None`` when the
        search returned no result.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.RequestException
        On connection problems or when the timeout expires.
    """
    url = 'https://nominatim.openstreetmap.org/search'
    params = {'q': address, 'format': 'jsonv2'}
    headers = {'User-Agent': user_agent}

    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page as results.
    response.raise_for_status()

    results = response.json()
    if not results:
        return None

    # Nominatim returns coordinates as strings; only the first hit is used.
    first_hit = results[0]
    return (float(first_hit['lat']), float(first_hit['lon']))
    

# Geocode every city of `liste_city`. `get_coordinates` returns None when the
# search yields no result; such cities are skipped (and reported) instead of
# crashing the loop with a TypeError on `None[0]`.
lat , long , found_cities , missing_cities = [] , [] , [] , []
for city in tqdm(liste_city):

    tuple_latlong = get_coordinates(address=city)
    if tuple_latlong is None:
        missing_cities.append(city)
        continue

    found_cities.append(city)
    lat.append(tuple_latlong[0])
    long.append(tuple_latlong[1])

# Warnings are silenced globally in this notebook, so report with print().
if missing_cities:
    print(f'No coordinates found for: {missing_cities}')

# One row per successfully geocoded city.
df = pd.DataFrame({'City' : found_cities , 'Lat' : lat , 'Long' : long})
df.head()

## Get weather data
Use https://openweathermap.org/appid (you have to subscribe to get a free API key) and https://openweathermap.org/api/one-call-api to get some information about the weather for the 35 cities, and put it in a DataFrame.

In [None]:
# For every city in `df`, fetch the daily forecast from the OpenWeatherMap
# One Call endpoint and summarise it as: mean "feels like" day temperature,
# mean probability of precipitation, and most frequent main weather label.
# NOTE(review): the output columns are labelled "7_days" but every entry of
# `daily` returned by the API is used — confirm how many days it returns.
mean_temps , mean_prob , most_freq_weather = [] , [] , []
weather_url = 'https://api.openweathermap.org/data/3.0/onecall'
for city_lat , city_long in zip(df['Lat'] , df['Long']):

    # Let requests build and encode the query string instead of formatting it by hand.
    params = {
        'lat': float(city_lat),
        'lon': float(city_long),
        'exclude': 'current,minutely,hourly,alerts',
        'units': 'metric',
        'appid': api_weather_key,
    }
    http_response = requests.get(weather_url , params=params , timeout=30)
    # Surface API errors (e.g. a bad key) as an HTTPError rather than a
    # confusing KeyError on 'daily' further down.
    http_response.raise_for_status()
    daily = http_response.json()['daily']

    weather = [day['weather'][0]['main'] for day in daily]  # Main weather label
    prob = [day['pop'] * 10 for day in daily]               # Probability of precipitation, scaled by 10
    temps = [day['feels_like']['day'] for day in daily]     # "Feels like" temperature during the day

    mean_temps.append(np.mean(temps))
    mean_prob.append(np.mean(prob))
    most_freq_weather.append(mode(weather))


# KPI: mean temperature minus (mean probability of precipitation * 10).
score = np.array(mean_temps) - np.array(mean_prob)
assert len(mean_prob) == len(mean_temps) == len(most_freq_weather) == len(score)

df['score_7_days'] = score
df['mean_temperature_7_days'] = mean_temps
df['most_freq_weather_7_days'] = most_freq_weather
df['mean_prob_of_rain_7_days'] = np.array(mean_prob) / 10  # undo the x10 scaling


# Best score first; the "best destinations" additionally exclude cities whose
# most frequent forecast label is 'Rain'.
df = df.sort_values(by='score_7_days' , ascending=False).reset_index(drop=True)
df_bestdest = df[df['most_freq_weather_7_days'] != 'Rain']
# The index is written too: later cells read it back as the 'Unnamed: 0' column.
df.to_csv('df_with_weather.csv')
df_bestdest.to_csv('df_best_destination.csv')
df.head()

## Find the nicest weather
Determine the list of cities where the weather will be the nicest within the next 7 days. For example, you can use the values of daily.pop and daily.rain to compute the expected volume of rain within the next 7 days... But it's only an example; you can have a different opinion on what nice weather looks like 😎 Maybe the most important criterion for you is the temperature or humidity, so feel free to change the rules!

#### Save all the results in a .csv file
Save all the results in a .csv file; you will use it later 😉 You can save all the information that seems important to you! Don't forget to save the names of the cities, and also to create a column containing a unique identifier (id) for each city (this is important for what comes next in the project).

#### Use plotly to display the best destinations on a map 

In [None]:
# Map of the first rows of `df_bestdest` (DataFrame.head() default), one marker per city.
# Only the city name and its mean temperature are shown in the hover box.
# NOTE(review): marker size is driven by 'score_7_days'; plotly presumably
# expects non-negative sizes, so a negative score may fail — confirm.
hover_columns = {
    'Lat': False,
    'Long': False,
    'score_7_days':False,
    'City': True,
    'mean_temperature_7_days': True,
}

fig = px.scatter_mapbox(
    df_bestdest.head(),
    lat="Lat",
    lon="Long",
    hover_name='City',
    hover_data=hover_columns,
    color='City',
    color_discrete_sequence=px.colors.sequential.Inferno,
    size='score_7_days',
    zoom=5,
    template='plotly',
    mapbox_style='open-street-map',
    width=1050,
    height=900,
    title='Best rated cities (with the best weather in the next 7 days)',
)
fig.show()

## Scrape Booking.com
Since Booking Holdings doesn't have aggregated databases, it will be much faster to scrape data directly from booking.com.

You can scrape as much information as you want, but we suggest that you get at least:

- hotel name,
- Url to its booking.com page,
- Its coordinates: latitude and longitude
- Score given by the website users
- Text description of the hotel
- Create your data lake using S3
- Once you managed to build your dataset, you should store into S3 as a csv file.

In [None]:

# Load the scraped hotel table; the saved index lives in the 'Unnamed: 0' column.
df_scraped = pd.read_csv('Scraped_df.csv' , index_col='Unnamed: 0')

# Geocode every hotel address. When get_coordinates returns None, unpacking it
# raises TypeError and the hotel gets NaN coordinates instead.
lats , longs = [] , []
for adress in tqdm(df_scraped['Adress'].tolist()):
    try:
        hotel_lat , hotel_long = get_coordinates(adress)
    except TypeError:
        hotel_lat , hotel_long = np.nan , np.nan
    lats.append(hotel_lat)
    longs.append(hotel_long)

# Exactly one coordinate pair per hotel row.
assert len(lats) == len(longs) == len(df_scraped)

df_scraped = df_scraped.assign(Lat=lats , Long=longs)

df_scraped.to_csv('df_scraped_with_lat_long.csv')

## Send data to my data lake

In [None]:
# Merge all the information: hotels (with coordinates) + per-city weather summary.
df_scp_with_lat_long = pd.read_csv('df_scraped_with_lat_long.csv' , index_col='Unnamed: 0')
df_with_weather = pd.read_csv('df_with_weather.csv' , index_col='Unnamed: 0')

# Drop the city coordinates so the hotels' own 'Lat'/'Long' columns are kept as-is.
weather_per_city = df_with_weather.drop(columns=['Lat' , 'Long'])
merged = df_scp_with_lat_long.merge(weather_per_city , how='left' , on='City')
merged = merged.sort_values(by='score_7_days' , ascending=False)
merged.to_csv('Full_df.csv' , index=False)

# Connect to the AWS account with the credentials loaded in the constants cell.
bucket_name = 'kayakprojectlucas'
s3 = boto3.client('s3', aws_access_key_id=api_key, aws_secret_access_key=api_secret_key)

# Send the CSV file to the bucket under the same name.
s3.upload_file(Filename='Full_df.csv', Bucket=bucket_name, Key='Full_df.csv')

## Map of the 20 best destinations

In [None]:
# Get the best destination in the best cities
# Keep the hotels located in the first cities of `df_bestdest`, then the
# 4 highest-rated hotels of each of those cities.
best_cities = df_bestdest['City'].head()
hotels_in_best_cities = merged[merged['City'].isin(best_cities)]
to_plot = (
    hotels_in_best_cities
    .groupby('City' , as_index=False)
    .apply(lambda city_hotels: city_hotels.nlargest(4, 'Rating'))
    .reset_index(drop=True)
)

# Plot the figure: one marker per hotel, coloured by city and sized by rating.
hotel_hover_columns = {
    'Lat': False,
    'Long': False,
    'score_7_days':False,
    'Name' : True,
    'City': True,
    'mean_temperature_7_days': True,
}
fig = px.scatter_mapbox(
    to_plot,
    lat="Lat",
    lon="Long",
    hover_name='Name',
    hover_data=hotel_hover_columns,
    color='City',
    color_discrete_sequence=px.colors.sequential.Inferno,
    size='Rating',
    zoom=5,
    template='plotly',
    mapbox_style='open-street-map',
    width=1050,
    height=900,
    title='Best rated hotels in selected cities (with the best weather in the next 7 days)',
)


fig.show()