# Kayak Project

## Imports

In [22]:
import pandas as pd
import numpy as np
import plotly.express as px
import requests
from dotenv import load_dotenv
import os
import logging
import scrapy
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup
import requests
import time
import boto3
import datetime

load_dotenv()
%load_ext dotenv
%dotenv

# Openweathermap API
key = os.getenv('APIKEY')
# RDS Connection
DBHOST = os.getenv("DBHOST")
DBUSER = os.getenv("DBUSER")
DBPASS = os.getenv("DBPASS")
DBNAME = os.getenv("DBNAME")

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [23]:
# Importing the cities list
cities = ["Mont Saint Michel", "St Malo", "Bayeux", "Le Havre", "Rouen", "Paris", "Amiens", "Lille", "Strasbourg", "Chateau du Haut Koenigsbourg", "Colmar", "Eguisheim", "Besancon", "Dijon", "Annecy", "Grenoble", "Lyon", "Gorges du Verdon", "Bormes les Mimosas", "Cassis", "Marseille", "Aix en Provence", "Avignon", "Uzes", "Nimes", "Aigues Mortes", "Saintes Maries de la mer", "Collioure", "Carcassonne", "Ariege", "Toulouse", "Montauban", "Biarritz", "Bayonne", "La Rochelle"]

In [24]:
# Creating a dataframe with the cities list
df_cities = pd.DataFrame(columns=["city"])
df_cities['city'] = cities
df_cities.reset_index(inplace=True)
df_cities.rename(columns={'index': 'id'}, inplace=True)
df_cities

Unnamed: 0,id,city
0,0,Mont Saint Michel
1,1,St Malo
2,2,Bayeux
3,3,Le Havre
4,4,Rouen
5,5,Paris
6,6,Amiens
7,7,Lille
8,8,Strasbourg
9,9,Chateau du Haut Koenigsbourg


## Scraping Booking.com

In [25]:
# Creating a copy of the dataframe
df_booking = df_cities.copy(deep=True)

# Defining the spider class
class BookingSpider(scrapy.Spider):
    name = "Booking_data"
    cities = df_booking["city"]
    start_urls = ['https://www.booking.com/index.fr.html']

    def parse(self, response):
        for i in cities:
            yield scrapy.FormRequest.from_response(
            response,
            formdata={'ss': i },
            callback=self.after_search
        )
    # Important note: the classes names change often and may not be correct at a later date
    def after_search(self, response):
        cities = response.url.split("ss=")[-1].split("&")[0]
                
        booking = response.css('.d4924c9e74')
        
        for k in booking:
            
            yield {
                'location': cities,
                'name': k.css('a div.fcab3ed991.a23c043802::text').getall(),
                'url': k.css('h3.a4225678b2 a::attr(href)').getall(),
            }

        try:
            next_page = response.css('a.paging-next').attrib["href"]
        except KeyError:
            logging.info('No next page. Terminating crawling process.')
        else:
            yield response.follow(next_page, callback=self.after_search)

In [5]:
# Initializing the crawler process
filename = "cities.json"

if filename in os.listdir():
        os.remove(filename)

process = CrawlerProcess(settings = {
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0',
    'LOG_LEVEL': logging.INFO,
    "FEEDS": {
        filename: {"format": "json"},
    },
    "AUTOTHROTTLE_ENABLED": True
})

process.crawl(BookingSpider)
process.start()

2022-04-14 22:32:20 [scrapy.utils.log] INFO: Scrapy 2.6.1 started (bot: scrapybot)
2022-04-14 22:32:20 [scrapy.utils.log] INFO: Versions: lxml 4.8.0.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 22.2.0, Python 3.9.11 (main, Mar 29 2022, 19:08:29) - [GCC 7.5.0], pyOpenSSL 22.0.0 (OpenSSL 1.1.1n  15 Mar 2022), cryptography 36.0.0, Platform Linux-5.13.0-39-generic-x86_64-with-glibc2.31
2022-04-14 22:32:20 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'LOG_LEVEL': 20,
 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) '
               'Gecko/20100101 Firefox/92.0'}
2022-04-14 22:32:20 [scrapy.extensions.telnet] INFO: Telnet Password: fae05b61e55cb43f
2022-04-14 22:32:20 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogS

In [26]:
# Reading the scraped json data into a new dataframe
df = pd.read_json("cities.json")
df.head()

Unnamed: 0,location,name,url
0,Mont+Saint+Michel,"[Hôtel Vert, Le Relais Saint Michel, La Mère P...",[https://www.booking.com/hotel/fr/vert.fr.html...
1,St+Malo,"[Hotel Eden, Hotel d'Aleth, Hotel Ajoncs d'Or,...",[https://www.booking.com/hotel/fr/eden-saint-m...
2,Bayeux,"[ibis budget Bayeux, Hôtel De Brunville et La ...",[https://www.booking.com/hotel/fr/etap-bayeux....
3,Le+Havre,"[Holiday Inn Express - Le Havre Centre, Nomad ...",[https://www.booking.com/hotel/fr/campanile-le...
4,Rouen,"[Radisson Blu Hotel, Rouen Centre, Maison hype...",[https://www.booking.com/hotel/fr/radisson-blu...


In [27]:
# Keeping only the first 20 items from each list
for i in range (len(df["location"])):
    df["name"][i] = df["name"][i][:20]
    df["url"][i] = df["url"][i][:20]


In [28]:
# Initializing the scraper with BeautifulSoup
lat_full = []
lon_full = []
description_full = []
score_full = []

navigator = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)'

for i in range(len(df["url"])):
    lat_list = []
    lon_list = []
    description_list = []
    score_list = []
    hotel_list = df["url"][i]

    for j in hotel_list:

        # In some cases, the process will return an error on the first attempt
        try:
            page = requests.get(j, headers={'User-Agent': navigator})
            soup = BeautifulSoup(page.text, 'html.parser')
        except:
            page = requests.get(j, headers={'User-Agent': navigator})
            soup = BeautifulSoup(page.text, 'html.parser')

        lat_list.append(soup.select('p.address.address_clean a')[0].get("data-atlas-latlng").split(",")[0])
        lon_list.append(soup.select('p.address.address_clean a')[0].get("data-atlas-latlng").split(",")[1])
        description_list.append(soup.select('div#property_description_content')[0].get_text())

        try:
            score_list.append(soup.select('div.b5cd09854e.d10a6220b4')[0].get_text())
            
        except:
            # If the score is missing, setting the default value to 5.0
            score_list.append("5.0")

        # Setting up a wait timer to avoid spamming the website
        time.sleep(1.4)
        
    lat_full.append(lat_list)
    lon_full.append(lon_list)
    description_full.append(description_list)
    score_full.append(score_list)
    
    # Printing the progress, the whole process takes a bit more than 40 minutes
    print (f"{df['location'].iloc[i]} completed")

# Adding new columns based on lists of lists gathered above    
df["lat"] = lat_full
df["lon"] = lon_full
df["description"] = description_full
df["score"] = score_full
print('Task completed')

Mont+Saint+Michel completed
St+Malo completed
Bayeux completed
Le+Havre completed
Rouen completed
Paris completed
Amiens completed
Lille completed
Strasbourg completed
Chateau+du+Haut+Koenigsbourg completed
Colmar completed
Eguisheim completed
Besancon completed
Dijon completed
Grenoble completed
Annecy completed
La+Rochelle completed
Bayonne completed
Biarritz completed
Montauban completed
Toulouse completed
Ariege completed
Carcassonne completed
Collioure completed
Saintes+Maries+de+la+mer completed
Aigues+Mortes completed
Nimes completed
Uzes completed
Avignon completed
Aix+en+Provence completed
Marseille completed
Cassis completed
Bormes+les+Mimosas completed
Gorges+du+Verdon completed
Lyon completed
Task completed


In [29]:
# Replacing the + in cities names and reindexing the dataframe to add the id column that will be useful to merge later
df["location"] = df["location"].str.replace("+", " ", regex=True)
df = df.set_index("location")
df = df.reindex(cities)
df.reset_index(inplace=True)
df.reset_index(inplace=True)
df.rename(columns={'index': 'id'}, inplace=True)
df.to_csv('df_bs4.csv', index=False)

## Getting API Data

### GPS Coordinates

In [30]:
# Creating another copy of the cities dataframe to store the coordinates data from Nominatim API
df_gps = df_cities.copy(deep=True)
lat_list = []
lon_list = []

for i in cities:
    # There is no Gorges du Verdon city, replacing by the closest city instead
    if i == "Gorges du Verdon":
        i = "La%20Palud-sur-Verdon"
        r = requests.get(f"https://nominatim.openstreetmap.org/search?city={i}&format=json").json()
        lat_list.append(r[0]['lat'])
        lon_list.append(r[0]['lon'])
    # Ariege is not a city, using the county instead here
    elif i == 'Ariege':
        r = requests.get(f"https://nominatim.openstreetmap.org/search?county={i}&format=json").json()
        lat_list.append(r[0]['lat'])
        lon_list.append(r[0]['lon'])
    # Replacing the whitespace by a web friendly notation
    else:
        name = i.replace(" ", "%20")
        r = requests.get(f"https://nominatim.openstreetmap.org/search?city={name}&format=json").json()
        lat_list.append(r[0]['lat'])
        lon_list.append(r[0]['lon'])
 
# Adding the coordinates to the dataframe   
df_gps['lat'] = lat_list
df_gps['lon'] = lon_list 

In [31]:
df_gps.head()

Unnamed: 0,id,city,lat,lon
0,0,Mont Saint Michel,48.6359541,-1.511459954959514
1,1,St Malo,48.649518,-2.0260409
2,2,Bayeux,49.2764624,-0.7024738
3,3,Le Havre,49.4938975,0.1079732
4,4,Rouen,49.4404591,1.0939658


### Weather Data

In [33]:
# Getting the weather data based on coordinates from the Openweathermap API
df_full = df_gps.copy(deep=True)
temps_list = []
rain_pop = []
humidity_list = []
# The days list will be used to store data as items of a list for day +1 to day +7
days = list(range(1,8))

for i in df_full.itertuples():
    lat = i.lat
    lon = i.lon
    r = requests.get(f"https://api.openweathermap.org/data/2.5/onecall?lat={lat}&lon={lon}&units=metric&appid={key}").json()
    weather_7_days = r['daily'][1:] # Getting the weather data for the next 7 days, first item is the current weather, which we don't want here
    temps = [j['feels_like']['day'] for j in weather_7_days]
    rain = [int(j['pop'] * 100) for j in weather_7_days]
    humidity = [j['humidity'] for j in weather_7_days]
    temps_list.append(temps)
    rain_pop.append(rain)
    humidity_list.append(humidity)
    
df_full['day_plus'] = [days for _ in range(len(df_full))]
df_full['felt_temperature'] = temps_list
df_full['rain_chances'] = rain_pop
df_full['humidity'] = humidity_list
# Calculation of a weather quality indicator, using absolute value of 35 - temperature (with a weight of 2), chances of rain as a percentage, and humidity as a percentage (with a weight of 0.5) - lower is better
df_full['score_weather'] = df_full.apply(lambda x: abs((35 - np.mean(x['felt_temperature'])) * 2) + np.mean(x['rain_chances']) + (np.mean(x['humidity']) / 2), axis=1)   

KeyError: 'daily'

In [36]:
requests.get(f"https://api.openweathermap.org/data/2.5/onecall?lat={lat}&lon={lon}&units=metric&appid={key}")

<Response [429]>

In [None]:
df_full.head()

Unnamed: 0,id,city,lat,lon,day_plus,felt_temperature,rain_chances,humidity,score_weather
0,0,Mont Saint Michel,48.6359541,-1.511459954959514,"[1, 2, 3, 4, 5, 6, 7]","[15.67, 16.18, 12.48, 15.19, 13.85, 14.85, 17.06]","[0, 0, 85, 100, 6, 0, 77]","[50, 56, 87, 59, 58, 54, 66]",108.92
1,1,St Malo,48.649518,-2.0260409,"[1, 2, 3, 4, 5, 6, 7]","[13.36, 13.98, 11.35, 12.99, 11.72, 12.41, 14.74]","[0, 6, 79, 100, 13, 1, 63]","[59, 67, 89, 69, 67, 67, 76]",116.842857
2,2,Bayeux,49.2764624,-0.7024738,"[1, 2, 3, 4, 5, 6, 7]","[13.74, 12.72, 11.53, 12.14, 11.07, 11.7, 10.62]","[0, 0, 71, 77, 0, 0, 87]","[63, 72, 86, 69, 64, 63, 86]",115.637143
3,3,Le Havre,49.4938975,0.1079732,"[1, 2, 3, 4, 5, 6, 7]","[13.5, 12.63, 14.55, 11.81, 9.59, 10.51, 13.71]","[0, 0, 51, 47, 0, 0, 49]","[61, 69, 70, 66, 67, 65, 65]",99.414286
4,4,Rouen,49.4404591,1.0939658,"[1, 2, 3, 4, 5, 6, 7]","[14.58, 13.61, 15.52, 12.71, 12.44, 13.26, 15.75]","[0, 0, 46, 33, 20, 0, 0]","[57, 64, 67, 60, 58, 58, 59]",86.394286


## Merging Data

In [None]:
# Renaming coordinates columns to avoid confusion between city and hotel coordinates
df.rename(columns={'lat': 'lat_hotels', 'lon': 'lon_hotels'}, inplace=True)

In [None]:
# Merging the hotels and weather dataframes, and dropping the unnecessary location column
df_complete = pd.merge(df_full, df, on='id')
df_complete.drop(columns=['location'], inplace=True)
now = datetime.datetime.now()
df_complete['updated_at'] = now
print(df_complete.shape)

# Saving the merged dataset as a csv file
df_complete.to_csv('df_complete.csv', index=False, header=True)

(35, 15)


## Storing the dataset in a S3 data lake

In [None]:
# Loading the S3 dession and resource
session = boto3.Session()
s3 = boto3.resource('s3')

In [None]:
# Uploading the file to S3
s3.Bucket('kayak-project-garp').upload_file('df_complete.csv', 'df_complete.csv')

In [None]:
# Downloading the file from S3 and checking it kept the same shape.
df_s3 = pd.read_csv('s3://kayak-project-garp/df_complete.csv')
df_s3.shape

(35, 16)

## Transforming

In [None]:
# Splitting the dataframe into 2 tables

df_weather = df_s3[['id', 'city', 'lat', 'lon', 'day_plus', 'felt_temperature', 'rain_chances', 'humidity', 'score_weather']]
df_hotels = df_s3[['id', 'city', 'name', 'url', 'lat_hotels', 'lon_hotels', 'description', 'score', 'score_weather']]

In [None]:
df_weather.head()

Unnamed: 0,id,city,lat,lon,day_plus,felt_temperature,rain_chances,humidity,score_weather
0,0,Mont Saint Michel,48.635954,-1.51146,"[1, 2, 3, 4, 5, 6, 7]","[16.49, 12.81, 14.74, 14.83, 15.44, 18.17, 19.31]","[72, 4, 0, 0, 33, 31, 36]","[61, 64, 53, 52, 59, 67, 60]",92.917143
1,1,St Malo,48.649518,-2.026041,"[1, 2, 3, 4, 5, 6, 7]","[13.73, 11.48, 12.75, 13.03, 13.1, 15.51, 16.39]","[81, 3, 0, 0, 30, 33, 23]","[77, 68, 63, 62, 72, 73, 71]",101.574286
2,2,Bayeux,49.276462,-0.702474,"[1, 2, 3, 4, 5, 6, 7]","[12.89, 10.4, 12.16, 11.22, 13.23, 16.18, 15.91]","[80, 0, 0, 0, 0, 22, 0]","[73, 72, 67, 63, 71, 69, 69]",92.86
3,3,Le Havre,49.493898,0.107973,"[1, 2, 3, 4, 5, 6, 7]","[12.84, 11.51, 12.31, 12.08, 14.94, 15.96, 15.4]","[88, 2, 0, 0, 0, 0, 0]","[70, 70, 63, 60, 59, 68, 69]",88.488571
4,4,Rouen,49.440459,1.093966,"[1, 2, 3, 4, 5, 6, 7]","[15.42, 13.54, 13.91, 13.99, 16.97, 19.3, 17.3]","[70, 42, 0, 0, 0, 0, 0]","[61, 61, 54, 55, 52, 52, 58]",82.52


In [None]:
df_hotels.head()

Unnamed: 0,id,city,name,url,lat_hotels,lon_hotels,description,score,score_weather
0,0,Mont Saint Michel,"['Le Relais Saint Michel', 'La Mère Poulard', ...",['https://www.booking.com/hotel/fr/le-relais-s...,"['48.61758727', '48.63508532', '48.61424653', ...","['-1.51039615', '-1.51053965', '-1.51054502', ...","[""\nVous pouvez bénéficier d'une réduction Gen...","['7,8', '7,2', '8,2', '7,3', '7,2', '8,1', '8,...",92.917143
1,1,St Malo,"['Studio cocooning', 'Apartment, St Malo', 'Pe...",['https://www.booking.com/hotel/fr/studio-coco...,"['48.65537970', '48.65874000', '48.65956907', ...","['-2.00327690', '-1.97702800', '-1.98920718', ...","[""\nVous pouvez bénéficier d'une réduction Gen...","['9,5', '5.0', '9,3', '8,7', '7,5', '5.0', '7,...",101.574286
2,2,Bayeux,"['Premiere Classe Bayeux', 'ibis budget Bayeux...",['https://www.booking.com/hotel/fr/premiere-cl...,"['49.26942872', '49.25424209', '49.27571400', ...","['-0.70668697', '-0.64648747', '-0.69804200', ...","[""\nVous pouvez bénéficier d'une réduction Gen...","['7,7', '8,2', '9,7', '5.0', '7,8', '8,1', '7,...",92.86
3,3,Le Havre,"['Holiday Inn Express - Le Havre Centre', 'Ibi...",['https://www.booking.com/hotel/fr/campanile-l...,"['49.49823800', '49.49424468', '49.49331760', ...","['0.12883700', '0.14285243', '0.12530688', '0....","[""\nL’Holiday Inn Express - Le Havre Centre, s...","['8,2', '8,4', '8,6', '8,1', '8.8', '7,5', '8,...",88.488571
4,4,Rouen,"['Maison hypercentre Rouen tout confort', ""L'A...",['https://www.booking.com/hotel/fr/maison-hype...,"['49.44465180', '49.44454400', '49.43572312', ...","['1.09132620', '1.09117800', '1.10273123', '1....","[""\nVous pouvez bénéficier d'une réduction Gen...","['8,8', '9,1', '8,2', '8,9', '9,1', '8,5', '8,...",82.52


In [None]:
# The csv has converted the lists into strings, we need to revert that first for both dataframes
df_weather_final = pd.DataFrame()
df_weather_final[['id', 'city', 'lat', 'lon', 'score_weather']] = df_weather[['id', 'city', 'lat', 'lon', 'score_weather']]
df_weather_final['day_plus'] = df_weather['day_plus'].apply(eval)
df_weather_final['felt_temperature'] = df_weather['felt_temperature'].apply(eval)
df_weather_final['rain_chances'] = df_weather['rain_chances'].apply(eval)
df_weather_final['humidity'] = df_weather['humidity'].apply(eval)

df_hotels_final = pd.DataFrame()
df_hotels_final[['id', 'city', 'score_weather']] = df_hotels[['id', 'city', 'score_weather']]
df_hotels_final['name'] = df_hotels['name'].apply(eval)
df_hotels_final['url'] = df_hotels['url'].apply(eval)
df_hotels_final['lat_hotels'] = df_hotels['lat_hotels'].apply(eval)
df_hotels_final['lon_hotels'] = df_hotels['lon_hotels'].apply(eval)
df_hotels_final['description'] = df_hotels['description'].apply(eval)
df_hotels_final['score'] = df_hotels['score'].apply(eval)

# We need to convert coordinates to numeric type as well
df_weather_final[['lat', 'lon']] = df_weather[['lat', 'lon']].apply(pd.to_numeric)

In [None]:
# We need to sort the data by weather score and explode the lists to prepare the data for warehousing and plotting
df_weather_full = df_weather_final.sort_values('score_weather')
df_weather_full.reset_index(inplace=True, drop=True)
df_weather_full = df_weather_full.apply(pd.Series.explode)
df_weather_full[['day_plus', 'felt_temperature', 'rain_chances', 'humidity']] = df_weather_full[['day_plus', 'felt_temperature', 'rain_chances', 'humidity']].apply(pd.to_numeric)

df_hotels_full = df_hotels_final.sort_values('score_weather')
df_hotels_full.reset_index(inplace=True, drop=True)
df_hotels_full = df_hotels_full.apply(pd.Series.explode)
df_hotels_full['description'] = df_hotels_full['description'].replace("\\n", "", regex=True)
df_hotels_full['score'] = df_hotels_full['score'].replace(",", ".", regex=True)
df_hotels_full[['lat_hotels', 'lon_hotels', 'score']] = df_hotels_full[['lat_hotels', 'lon_hotels', 'score']].apply(pd.to_numeric)

In [None]:
# We reset the index before pushing to SQL in order to give each line a unique id
df_weather_full_sql = df_weather_full.reset_index()
df_weather_full_sql.reset_index(inplace=True)
df_weather_full_sql.drop(columns = ['index', 'updated_at'], inplace=True)
df_weather_full_sql.rename({'level_0': 'index'}, axis=1, inplace=True)

df_hotels_full_sql = df_hotels_full.reset_index()
df_hotels_full_sql.reset_index(inplace=True)
df_hotels_full_sql.drop(columns = ['index', 'updated_at'], inplace=True)
df_hotels_full_sql.rename({'level_0': 'index'}, axis=1, inplace=True)

## Storing the transformed data in a RDS data warehouse

In [None]:
# Importing libraries for the data warehousing and initialization of the SQL engine
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker

engine = create_engine(f"postgresql+psycopg2://{DBUSER}:{DBPASS}@{DBHOST}/{DBNAME}", echo=True)
Session = sessionmaker(bind=engine)
session = Session()

In [None]:
# Storing the two dataframes as SQL tables
df_weather_full_sql.to_sql('weather', con=engine, if_exists='replace', index=False)
df_hotels_full_sql.to_sql('hotels', con=engine, if_exists='replace', index=False)

2022-04-23 18:33:07,651 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2022-04-23 18:33:07,652 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-04-23 18:33:07,678 INFO sqlalchemy.engine.Engine select current_schema()
2022-04-23 18:33:07,678 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-04-23 18:33:07,702 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2022-04-23 18:33:07,702 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-04-23 18:33:07,729 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
2022-04-23 18:33:07,729 INFO sqlalchemy.engine.Engine [generated in 0.00043s] {'name': 'weather'}
2022-04-23 18:33:07,769 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
2022-04-23 18:33:07,770 INFO sqlalchemy.engine.Engine [cached sinc

700

In [None]:
# Testing the data with a query
query = text("SELECT * FROM weather LIMIT 21")
pd.read_sql(query, engine)

2022-04-23 18:33:08,754 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
2022-04-23 18:33:08,755 INFO sqlalchemy.engine.Engine [cached since 1.026s ago] {'name': 'SELECT * FROM weather LIMIT 21'}
2022-04-23 18:33:08,794 INFO sqlalchemy.engine.Engine SELECT * FROM weather LIMIT 21
2022-04-23 18:33:08,794 INFO sqlalchemy.engine.Engine [generated in 0.00042s] {}


Unnamed: 0,index,id,city,lat,lon,score_weather,day_plus,felt_temperature,rain_chances,humidity
0,0,7,Lille,50.636565,3.063528,68.314286,1,15.55,18,47
1,1,7,Lille,50.636565,3.063528,68.314286,2,11.91,22,57
2,2,7,Lille,50.636565,3.063528,68.314286,3,13.2,0,41
3,3,7,Lille,50.636565,3.063528,68.314286,4,13.27,0,46
4,4,7,Lille,50.636565,3.063528,68.314286,5,17.86,0,39
5,5,7,Lille,50.636565,3.063528,68.314286,6,18.71,0,46
6,6,7,Lille,50.636565,3.063528,68.314286,7,15.15,0,43
7,7,21,Aix en Provence,43.529842,5.447474,70.3,1,10.58,94,82
8,8,21,Aix en Provence,43.529842,5.447474,70.3,2,16.83,0,49
9,9,21,Aix en Provence,43.529842,5.447474,70.3,3,19.81,0,35


In [None]:
query = text("SELECT * FROM hotels LIMIT 40")
pd.read_sql(query, engine)

2022-04-23 18:33:08,903 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
2022-04-23 18:33:08,903 INFO sqlalchemy.engine.Engine [cached since 1.175s ago] {'name': 'SELECT * FROM hotels LIMIT 40'}
2022-04-23 18:33:08,939 INFO sqlalchemy.engine.Engine SELECT * FROM hotels LIMIT 40
2022-04-23 18:33:08,940 INFO sqlalchemy.engine.Engine [generated in 0.00043s] {}


Unnamed: 0,index,id,city,score_weather,name,url,lat_hotels,lon_hotels,description,score
0,0,7,Lille,68.314286,NOCNOC - La Villa Cachee,https://www.booking.com/hotel/fr/la-villa-cach...,50.629146,3.064199,Vous pouvez bénéficier d'une réduction Genius ...,8.4
1,1,7,Lille,68.314286,Hotel Lille Europe,https://www.booking.com/hotel/fr/lille-europe....,50.637888,3.072685,Vous pouvez bénéficier d'une réduction Genius ...,8.1
2,2,7,Lille,68.314286,Moxy Lille City,https://www.booking.com/hotel/fr/moxy-lille-ci...,50.627831,3.063592,"Le Moxy Lille City est situé à Lille, à 700 mè...",8.7
3,3,7,Lille,68.314286,B&B Hôtel Lille Centre Grand Palais,https://www.booking.com/hotel/fr/b-b-lille-cen...,50.628138,3.081424,Le B&B Hôtel Lille Centre Grand Palais est sit...,7.9
4,4,7,Lille,68.314286,ibis Styles Lille Centre Gare Beffroi,https://www.booking.com/hotel/fr/ibis-styles-l...,50.632382,3.067943,L’ibis Styles Lille Centre Gare Beffroi vous a...,7.8
5,233,4,Rouen,82.52,Superbe appartement St sever,https://www.booking.com/hotel/fr/superbe-appar...,49.428775,1.080964,Le Superbe appartement St Sever est situé à Ro...,5.0
6,5,7,Lille,68.314286,Hôtel Calm Lille,https://www.booking.com/hotel/fr/calm-lille.fr...,50.635288,3.069707,Vous pouvez bénéficier d'une réduction Genius ...,7.8
7,6,7,Lille,68.314286,ibis budget Lille Centre,https://www.booking.com/hotel/fr/ibis-budget-l...,50.642414,3.06787,L’ibis budget Lille Centre propose des héberge...,7.1
8,7,7,Lille,68.314286,LoveLoc,https://www.booking.com/hotel/fr/tip-top-studi...,50.637242,3.069105,"Situé à Lille, à 300 mètres de l'opéra, le Lov...",5.0
9,8,7,Lille,68.314286,"Holiday Inn Express Lille Centre, an IHG Hotel",https://www.booking.com/hotel/fr/expressbyholi...,50.630759,3.058088,L'Holiday Inn Express Lille Centre est situé d...,8.1


## Plotting Maps

In [None]:
# Creating a new column to be used to determine the size of the points in the plot
df_weather_full['inverted_score'] = max(df_weather_full['score_weather']) - df_weather_full['score_weather']

### Weather Map

In [None]:
# Creating a plotly scatter mapbox showing the 5 cities with the lowest weather score
px.set_mapbox_access_token(open(".mapbox_token").read())

fig = px.scatter_mapbox(
    df_weather_full[:35],
    lat='lat',
    lon='lon',
    color='felt_temperature',
    size='inverted_score',
    color_continuous_scale=px.colors.sequential.Bluered,
    size_max=35,
    zoom=4.7,
    range_color = [min(df_weather_full[:35]['felt_temperature']), max(df_weather_full[:35]['felt_temperature'])],
    hover_name='city',
    hover_data={
        'lat': False,
        'lon': False,
        'day_plus': False,
        'rain_chances': True,
        'humidity': True,
        'felt_temperature': True,
        'inverted_score': False,
        },
    animation_frame='day_plus',
    labels={'felt_temperature': 'Felt Temperature', 'rain_chances': 'Rain Chances', 'humidity': 'Humidity'}
)

fig.update_layout(
    width = 1100,
    height = 800,
    template='plotly_dark',
    title_x=0.5,
    title_text='The 5 cities with the best weather over the next 7 days')
fig.show()

### Hotels Map

In [None]:
# Creating a plotly scatter mapbox showing the 20 hotels from the previous 5 top cities
px.set_mapbox_access_token(open(".mapbox_token").read())

fig = px.scatter_mapbox(
    df_hotels_full[:100],
    lat='lat_hotels',
    lon='lon_hotels',
    color='score',
    color_continuous_scale=px.colors.diverging.RdYlGn,
    size='score',
    size_max=25,
    zoom=4.7,
    range_color = [min(df_hotels_full[:100]['score']), max(df_hotels_full[:100]['score'])],
    hover_name='city',
    hover_data={
        'lat_hotels': False,
        'lon_hotels': False,
        'name': True,
        'url': False,
        'description': False,
        'score_weather': False,
        'id': False,
        },
    labels={'score': 'Score', 'city': 'City'}
)

fig.update_layout(
    width = 1100,
    height = 800,
    template='plotly_dark',
    title_x=0.5,
    title_text='20 best hotels from the 5 cities with the best weather')
fig.show()