# Kayak Project

## Imports

In [101]:

import ast
import logging
import os
import time

import boto3
import numpy as np
import pandas as pd
import plotly.express as px
import requests
import scrapy
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from scrapy.crawler import CrawlerProcess

# Load the variables declared in the local .env file into the process environment.
load_dotenv()
# NOTE(review): the two magics below look like a second way of loading the same
# .env file through the IPython extension — confirm whether both are needed.
%load_ext dotenv
%dotenv

# Openweathermap API
# os.getenv returns None when the variable is not set.
key = os.getenv('APIKEY')
# RDS Connection
DBHOST = os.getenv("DBHOST")
DBUSER = os.getenv("DBUSER")
DBPASS = os.getenv("DBPASS")
DBNAME = os.getenv("DBNAME")

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [102]:
# The destinations covered by the project, in the order that defines their ids.
cities = [
    "Mont Saint Michel",
    "St Malo",
    "Bayeux",
    "Le Havre",
    "Rouen",
    "Paris",
    "Amiens",
    "Lille",
    "Strasbourg",
    "Chateau du Haut Koenigsbourg",
    "Colmar",
    "Eguisheim",
    "Besancon",
    "Dijon",
    "Annecy",
    "Grenoble",
    "Lyon",
    "Gorges du Verdon",
    "Bormes les Mimosas",
    "Cassis",
    "Marseille",
    "Aix en Provence",
    "Avignon",
    "Uzes",
    "Nimes",
    "Aigues Mortes",
    "Saintes Maries de la mer",
    "Collioure",
    "Carcassonne",
    "Ariege",
    "Toulouse",
    "Montauban",
    "Biarritz",
    "Bayonne",
    "La Rochelle",
]

In [103]:
# Build the reference table of cities: one row per city, with the positional
# index exposed as an explicit 'id' column.
df_cities = (
    pd.DataFrame({"city": cities})
    .reset_index()
    .rename(columns={'index': 'id'})
)
df_cities

Unnamed: 0,id,city
0,0,Mont Saint Michel
1,1,St Malo
2,2,Bayeux
3,3,Le Havre
4,4,Rouen
5,5,Paris
6,6,Amiens
7,7,Lille
8,8,Strasbourg
9,9,Chateau du Haut Koenigsbourg


## Scraping Booking.com

In [4]:
# Creating a copy of the dataframe
df_booking = df_cities.copy(deep=True)


# Defining the spider class
class BookingSpider(scrapy.Spider):
    """Search Booking.com for every city and collect hotel names and URLs.

    The spider loads the French home page, submits the search form once per
    city, and then follows the "next page" link of each result listing.
    Every yielded item is a dict with the keys 'location', 'name' and 'url'.
    """

    name = "Booking_data"
    # Plain list of city names submitted to the search form.
    cities = df_booking["city"].tolist()
    start_urls = ['https://www.booking.com/index.fr.html']

    def parse(self, response):
        """Submit the home-page search form once for each city."""
        # Iterate the spider's own city list rather than whatever global
        # named `cities` happens to exist in the notebook namespace.
        for city in self.cities:
            yield scrapy.FormRequest.from_response(
                response,
                formdata={'ss': city},
                callback=self.after_search,
            )

    # Important note: the classes names change often and may not be correct at a later date
    def after_search(self, response):
        """Yield one item per result block, then follow pagination if present."""
        # The searched city is read back from the 'ss' query parameter of the
        # result URL, so it is still URL-encoded (e.g. 'Mont+Saint+Michel').
        location = response.url.split("ss=")[-1].split("&")[0]

        for result in response.css('.d4924c9e74'):
            yield {
                'location': location,
                'name': result.css('a div.fcab3ed991.a23c043802::text').getall(),
                'url': result.css('h3.a4225678b2 a::attr(href)').getall(),
            }

        # .get() returns None when there is no next-page link (or it has no href).
        next_page = response.css('a.paging-next::attr(href)').get()
        if next_page is None:
            logging.info('No next page. Terminating crawling process.')
        else:
            yield response.follow(next_page, callback=self.after_search)

In [5]:
# Configure and run the Scrapy crawl, exporting every item to a JSON feed.
filename = "cities.json"

# Start from a clean feed file: remove the output of a previous run if present.
if filename in os.listdir():
    os.remove(filename)

crawler_settings = dict(
    USER_AGENT='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0',
    LOG_LEVEL=logging.INFO,
    FEEDS={filename: {"format": "json"}},
    AUTOTHROTTLE_ENABLED=True,
)
process = CrawlerProcess(settings=crawler_settings)

process.crawl(BookingSpider)
process.start()

2022-04-14 22:32:20 [scrapy.utils.log] INFO: Scrapy 2.6.1 started (bot: scrapybot)
2022-04-14 22:32:20 [scrapy.utils.log] INFO: Versions: lxml 4.8.0.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 22.2.0, Python 3.9.11 (main, Mar 29 2022, 19:08:29) - [GCC 7.5.0], pyOpenSSL 22.0.0 (OpenSSL 1.1.1n  15 Mar 2022), cryptography 36.0.0, Platform Linux-5.13.0-39-generic-x86_64-with-glibc2.31
2022-04-14 22:32:20 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'LOG_LEVEL': 20,
 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) '
               'Gecko/20100101 Firefox/92.0'}
2022-04-14 22:32:20 [scrapy.extensions.telnet] INFO: Telnet Password: fae05b61e55cb43f
2022-04-14 22:32:20 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogS

In [10]:
# Load the items exported by the spider (location, name, url) into a dataframe.
df = pd.read_json(path_or_buf="cities.json")
df.head(5)

Unnamed: 0,location,name,url
0,Mont+Saint+Michel,"[Le Relais Saint Michel, La Mère Poulard, Merc...",[https://www.booking.com/hotel/fr/le-relais-sa...
1,St+Malo,"[Studio cocooning, Apartment, St Malo, Petite ...",[https://www.booking.com/hotel/fr/studio-cocoo...
2,Bayeux,"[Premiere Classe Bayeux, ibis budget Bayeux, L...",[https://www.booking.com/hotel/fr/premiere-cla...
3,Le+Havre,"[Holiday Inn Express - Le Havre Centre, Ibis B...",[https://www.booking.com/hotel/fr/campanile-le...
4,Rouen,"[Maison hypercentre Rouen tout confort, L'Aute...",[https://www.booking.com/hotel/fr/maison-hyper...


In [11]:
# Keeping only the first 20 items from each list.
# Whole-column assignment is used instead of df["name"][i] = ...: chained
# assignment writes to a temporary object, raises SettingWithCopyWarning and
# has no effect at all when pandas copy-on-write is enabled.
MAX_HOTELS_PER_CITY = 20

df["name"] = df["name"].apply(lambda names: names[:MAX_HOTELS_PER_CITY])
df["url"] = df["url"].apply(lambda urls: urls[:MAX_HOTELS_PER_CITY])


In [13]:
# Scraping every hotel page with requests + BeautifulSoup to collect its
# coordinates, description and review score.

navigator = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)'

REQUEST_TIMEOUT = 30   # seconds before a hanging request is abandoned
REQUEST_DELAY = 1.4    # wait between two pages to avoid spamming the website
DEFAULT_SCORE = "5.0"  # value stored when a hotel page shows no score


def _fetch_soup(url, user_agent, attempts=2):
    """Download `url` and return its parsed HTML.

    In some cases the first request fails, so network errors are retried until
    `attempts` requests have been made; the last error is then re-raised.
    """
    for attempt in range(1, attempts + 1):
        try:
            page = requests.get(url, headers={'User-Agent': user_agent}, timeout=REQUEST_TIMEOUT)
            return BeautifulSoup(page.text, 'html.parser')
        except requests.RequestException:
            if attempt == attempts:
                raise
            logging.warning("Request to %s failed (attempt %d), retrying", url, attempt)


def _extract_hotel_details(soup):
    """Return (lat, lon, description, score) read from a hotel page.

    lat/lon/description are None when the matching element is absent, so that
    one incomplete page does not abort the whole run; a missing score falls
    back to DEFAULT_SCORE.
    """
    lat = lon = None
    address_links = soup.select('p.address.address_clean a')
    latlng = address_links[0].get("data-atlas-latlng") if address_links else None
    if latlng and "," in latlng:
        lat, lon = latlng.split(",")[:2]

    description_nodes = soup.select('div#property_description_content')
    description = description_nodes[0].get_text() if description_nodes else None

    score_nodes = soup.select('div.b5cd09854e.d10a6220b4')
    score = score_nodes[0].get_text() if score_nodes else DEFAULT_SCORE

    return lat, lon, description, score


lat_full = []
lon_full = []
description_full = []
score_full = []

for location, hotel_urls in zip(df["location"], df["url"]):
    hotel_details = []

    for hotel_url in hotel_urls:
        hotel_details.append(_extract_hotel_details(_fetch_soup(hotel_url, navigator)))
        # Setting up a wait timer to avoid spamming the website
        time.sleep(REQUEST_DELAY)

    # One list per city and per attribute, aligned with the city's url list.
    lat_full.append([details[0] for details in hotel_details])
    lon_full.append([details[1] for details in hotel_details])
    description_full.append([details[2] for details in hotel_details])
    score_full.append([details[3] for details in hotel_details])

    # Printing the progress, the whole process takes a bit more than 40 minutes
    print(f"{location} completed")

# Adding new columns based on lists of lists gathered above
df["lat"] = lat_full
df["lon"] = lon_full
df["description"] = description_full
df["score"] = score_full

city Mont+Saint+Michel done
city St+Malo done
city Bayeux done
city Le+Havre done
city Rouen done
city Paris done
city Amiens done
city Lille done
city Strasbourg done
city Chateau+du+Haut+Koenigsbourg done
city Colmar done
city Eguisheim done
city Besancon done
city Dijon done
city Annecy done
city Grenoble done
city La+Rochelle done
city Bayonne done
city Biarritz done
city Montauban done
city Toulouse done
city Ariege done
city Carcassonne done
city Collioure done
city Saintes+Maries+de+la+mer done
city Aigues+Mortes done
city Nimes done
city Uzes done
city Avignon done
city Aix+en+Provence done
city Marseille done
city Cassis done
city Bormes+les+Mimosas done
city Gorges+du+Verdon done
city Lyon done


In [None]:
# Replacing the + in cities names and reindexing the dataframe to add the id column that will be useful to merge later.
# regex=False is required: as a regular expression a lone "+" is a dangling
# quantifier and raises re.error on pandas versions that honour regex=True for
# single-character patterns.
df["location"] = df["location"].str.replace("+", " ", regex=False)
# Reorder the rows to follow the reference `cities` list ...
df = df.set_index("location").reindex(cities)
# ... bring 'location' back as a regular column ...
df = df.reset_index()
# ... and expose the resulting 0..n-1 position as the 'id' merge key.
df = df.reset_index().rename(columns={'index': 'id'})

In [None]:
# px.set_mapbox_access_token(open(".mapbox_token").read())
# fig = px.scatter_mapbox(
#     df_hotels.sort_values('name'),
#     lat='lat',
#     lon='lon',
#     color='score',
#     size='score',
#     color_continuous_scale=px.colors.sequential.Bluered,
#     size_max=30,
#     zoom=5,
#     hover_name='name'
# )

# fig.update_layout(width = 1300, height = 800, template='plotly_dark' ,title='Weather')
# fig.show()

## Getting API Data

### GPS Coordinates

In [104]:
# Creating another copy of the cities dataframe to store the coordinates data from Nominatim API
df_gps = df_cities.copy(deep=True)

NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
# NOTE(review): the Nominatim usage policy asks clients to identify themselves
# and to stay at or below one request per second — adjust the identifier below.
NOMINATIM_HEADERS = {"User-Agent": "kayak-project-notebook/1.0"}
NOMINATIM_DELAY = 1.0  # seconds between two requests

# Entries that cannot be looked up as a city under their own name.
GEOCODE_OVERRIDES = {
    # There is no Gorges du Verdon city, replacing by the closest city instead
    "Gorges du Verdon": {"city": "La Palud-sur-Verdon"},
    # Ariege is not a city, using the county instead here
    "Ariege": {"county": "Ariege"},
}


def _geocode(place):
    """Return the (lat, lon) strings of the first Nominatim match for `place`.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        ValueError: if the API returns no match for `place`.
    """
    # `params` lets requests do the URL encoding (whitespace, accents, ...).
    params = dict(GEOCODE_OVERRIDES.get(place, {"city": place}), format="json")
    response = requests.get(NOMINATIM_URL, params=params, headers=NOMINATIM_HEADERS, timeout=30)
    response.raise_for_status()
    matches = response.json()
    if not matches:
        raise ValueError(f"Nominatim returned no result for {place!r}")
    return matches[0]['lat'], matches[0]['lon']


lat_list = []
lon_list = []

for city in df_gps["city"]:
    lat, lon = _geocode(city)
    lat_list.append(lat)
    lon_list.append(lon)
    time.sleep(NOMINATIM_DELAY)

# Adding the coordinates to the dataframe
df_gps['lat'] = lat_list
df_gps['lon'] = lon_list

In [106]:
# Preview the first rows of the cities table with its new lat/lon columns.
df_gps.head()

Unnamed: 0,id,city,lat,lon
0,0,Mont Saint Michel,48.6359541,-1.511459954959514
1,1,St Malo,48.649518,-2.0260409
2,2,Bayeux,49.2764624,-0.7024738
3,3,Le Havre,49.4938975,0.1079732
4,4,Rouen,49.4404591,1.0939658


### Weather Data

In [109]:
# Getting the weather data based on coordinates from the Openweathermap API
df_full = df_gps.copy(deep=True)
temps_list = []
rain_pop = []
humidity_list = []
# The days list will be used to store data as items of a list for day +1 to day +7
days = list(range(1, 8))

# NOTE(review): OpenWeather has announced the retirement of the One Call 2.5
# endpoint — confirm it is still served for this API key.
OPENWEATHER_URL = "https://api.openweathermap.org/data/2.5/onecall"

# Iterate the CITY coordinates (df_full), not the hotels dataframe `df`, whose
# lat/lon columns hold lists of hotel coordinates (or no longer exist after
# the rename done in the merge section).
for row in df_full.itertuples():
    response = requests.get(
        OPENWEATHER_URL,
        params={"lat": row.lat, "lon": row.lon, "units": "metric", "appid": key},
        timeout=30,
    )
    response.raise_for_status()
    # The first daily item is skipped, as we only want day +1 onwards here.
    weather_7_days = response.json()['daily'][1:]
    temps_list.append([day['feels_like']['day'] for day in weather_7_days])
    # round() rather than int(): 0.29 * 100 is 28.999999999999996 in floating
    # point, which int() would truncate to 28.
    rain_pop.append([round(day['pop'] * 100) for day in weather_7_days])
    humidity_list.append([day['humidity'] for day in weather_7_days])

# Each row gets its own copy of the day offsets instead of sharing one list object.
df_full['day_plus'] = [list(days) for _ in range(len(df_full))]
df_full['felt_temperature'] = temps_list
df_full['rain_chances'] = rain_pop
df_full['humidity'] = humidity_list
# Calculation of a weather quality indicator, using absolute value of 35 - temperature (with a weight of 2), chances of rain as a percentage, and humidity as a percentage (with a weight of 0.5) - lower is better
df_full['score_weather'] = df_full.apply(
    lambda x: abs((35 - np.mean(x['felt_temperature'])) * 2)
    + np.mean(x['rain_chances'])
    + (np.mean(x['humidity']) / 2),
    axis=1,
)

In [112]:
# Preview the first rows of the weather table, including the computed score_weather.
df_full.head()

Unnamed: 0,id,city,lat,lon,day_plus,felt_temperature,rain_chances,humidity,score_weather
0,0,Mont Saint Michel,48.6359541,-1.511459954959514,"[1, 2, 3, 4, 5, 6, 7]","[15.83, 15.06, 15, 15.98, 13.56, 15.26, 14.15]","[43, 24, 49, 100, 86, 45, 24]","[59, 48, 60, 66, 72, 61, 67]",123.974286
1,1,St Malo,48.649518,-2.0260409,"[1, 2, 3, 4, 5, 6, 7]","[12.29, 13.22, 13.46, 13.7, 12.37, 12.76, 12.97]","[37, 23, 60, 98, 76, 38, 8]","[79, 59, 69, 75, 76, 74, 72]",128.637143
2,2,Bayeux,49.2764624,-0.7024738,"[1, 2, 3, 4, 5, 6, 7]","[12.54, 12.48, 11.93, 15.25, 12.8, 12.83, 12.12]","[10, 0, 0, 97, 89, 0, 19]","[71, 61, 71, 72, 79, 65, 67]",109.728571
3,3,Le Havre,49.4938975,0.1079732,"[1, 2, 3, 4, 5, 6, 7]","[13.34, 12.66, 12.82, 14.04, 10.5, 11.9, 12.36]","[0, 0, 0, 100, 75, 25, 18]","[68, 56, 67, 75, 94, 67, 61]",110.965714
4,4,Rouen,49.4404591,1.0939658,"[1, 2, 3, 4, 5, 6, 7]","[14.93, 13.59, 14.52, 15.67, 10.08, 13.9, 13.95]","[0, 0, 0, 100, 56, 55, 7]","[58, 53, 63, 77, 93, 59, 55]",106.245714


In [51]:
# px.set_mapbox_access_token(open(".mapbox_token").read())
# fig = px.scatter_mapbox(
#     df_plotly.sort_values('day_plus'),
#     lat='lat',
#     lon='lon',
#     color='felt_temperature',
#     size='humidity',
#     color_continuous_scale=px.colors.sequential.Bluered,
#     size_max=30,
#     zoom=4.7,
#     hover_name='city',
#     hover_data={
#         'lat': False,
#         'lon': False,
#         'day_plus': False,
#         'rain_chances': True,
#         'humidity': True,
#         'felt_temperature': True        
#         },
#     animation_frame='day_plus'
# )

# fig.update_layout(width = 1300, height = 800, template='plotly_dark' ,title='Weather')
# fig.show()

## Merging Data

In [None]:
# The hotels frame and the weather frame both carry lat/lon columns; give the
# hotel ones distinct names before the two frames are merged.
df.rename(
    columns=dict(lat='lat_hotels', lon='lon_hotels'),
    inplace=True,
)

In [58]:
# Join weather (left) and hotels (right) on the shared city id; the hotels'
# 'location' column duplicates 'city' and is dropped.
df_complete = df_full.merge(df, on='id').drop(columns=['location'])
print(df_complete.shape)

# Persist the merged dataset locally as CSV (with header, without the index).
df_complete.to_csv('df_complete.csv', index=False, header=True)

## Storing the dataset in a S3 data lake

In [10]:
# Loading the S3 session and resource.
# The resource is created from the explicit session so that the session object
# is actually used, rather than being left idle next to boto3's default one.
session = boto3.Session()
s3 = session.resource('s3')

In [11]:
# Push the local CSV to the project bucket under the same object key.
s3.Bucket('kayak-project-garp').upload_file(
    Filename='df_complete.csv',
    Key='df_complete.csv',
)

In [13]:
# Read the dataset back straight from the bucket and display its shape to
# confirm that the upload kept every row and column.
df_s3 = pd.read_csv(filepath_or_buffer='s3://kayak-project-garp/df_complete.csv')
df_s3.shape

(35, 16)

## Transforming

In [45]:
# Separate the merged dataset into a weather table and a hotels table; both
# keep id, city and score_weather.
weather_columns = ['id', 'city', 'lat', 'lon', 'day_plus', 'felt_temperature', 'rain_chances', 'humidity', 'score_weather']
hotels_columns = ['id', 'city', 'name', 'url', 'lat_hotels', 'lon_hotels', 'description', 'score', 'score_weather']

df_weather = df_s3[weather_columns]
df_hotels = df_s3[hotels_columns]

In [27]:
# Preview the first rows of the weather table read back from S3.
df_weather.head()

Unnamed: 0,id,city,lat,lon,day_plus,felt_temperature,rain_chances,humidity,score_weather
0,0,Mont Saint Michel,48.635954,-1.51146,"[1, 2, 3, 4, 5, 6, 7]","[17.62, 18.3, 15.16, 10.7, 12.04, 9.64, 10.46]","[69.0, 0, 0, 0, 0, 67.0, 96.0]","[71, 59, 50, 84, 67, 95, 66]",111.451429
1,1,St Malo,48.649518,-2.026041,"[1, 2, 3, 4, 5, 6, 7]","[14.67, 15.18, 13.59, 9.95, 10.35, 11.25, 9.65]","[47.0, 0, 0, 0, 0, 98.0, 93.0]","[80, 72, 56, 79, 71, 89, 71]",116.817143
2,2,Bayeux,49.276462,-0.702474,"[1, 2, 3, 4, 5, 6, 7]","[17.97, 16.15, 14.29, 9.73, 10.77, 10.25, 9.08]","[28.999999999999996, 0, 0, 4.0, 0, 61.0, 96.0]","[68, 68, 52, 89, 68, 90, 68]",107.86
3,3,Le Havre,49.493898,0.107973,"[1, 2, 3, 4, 5, 6, 7]","[15.04, 14.54, 12.27, 11.81, 10.59, 9.79, 6.27]","[0, 0, 0, 0, 9.0, 76.0, 94.0]","[74, 70, 54, 68, 74, 78, 81]",108.268571
4,4,Rouen,49.440459,1.093966,"[1, 2, 3, 4, 5, 6, 7]","[17.9, 16.29, 13.83, 15.38, 12.96, 8.98, 7.94]","[4.0, 0, 0, 0, 50.0, 81.0, 75.0]","[61, 60, 50, 56, 78, 84, 93]",107.777143


In [28]:
# Preview the first rows of the hotels table read back from S3.
df_hotels.head()

Unnamed: 0,id,city,name,url,lat_hotels,lon_hotels,description,score
0,0,Mont Saint Michel,"['Le Relais Saint Michel', 'La Mère Poulard', ...",['https://www.booking.com/hotel/fr/le-relais-s...,"['48.61758727', '48.63508532', '48.61424653', ...","['-1.51039615', '-1.51053965', '-1.51054502', ...","[""\nVous pouvez bénéficier d'une réduction Gen...","['7,8', '7,2', '8,2', '7,3', '7,2', '8,1', '8,..."
1,1,St Malo,"['Studio cocooning', 'Apartment, St Malo', 'Pe...",['https://www.booking.com/hotel/fr/studio-coco...,"['48.65537970', '48.65874000', '48.65956907', ...","['-2.00327690', '-1.97702800', '-1.98920718', ...","[""\nVous pouvez bénéficier d'une réduction Gen...","['9,5', '0.0', '9,3', '8,7', '7,5', '0.0', '7,..."
2,2,Bayeux,"['Premiere Classe Bayeux', 'ibis budget Bayeux...",['https://www.booking.com/hotel/fr/premiere-cl...,"['49.26942872', '49.25424209', '49.27571400', ...","['-0.70668697', '-0.64648747', '-0.69804200', ...","[""\nVous pouvez bénéficier d'une réduction Gen...","['7,7', '8,2', '9,7', '0.0', '7,8', '8,1', '7,..."
3,3,Le Havre,"['Holiday Inn Express - Le Havre Centre', 'Ibi...",['https://www.booking.com/hotel/fr/campanile-l...,"['49.49823800', '49.49424468', '49.49331760', ...","['0.12883700', '0.14285243', '0.12530688', '0....","[""\nL’Holiday Inn Express - Le Havre Centre, s...","['8,2', '8,4', '8,6', '8,1', '0.0', '7,5', '8,..."
4,4,Rouen,"['Maison hypercentre Rouen tout confort', ""L'A...",['https://www.booking.com/hotel/fr/maison-hype...,"['49.44465180', '49.44454400', '49.43572312', ...","['1.09132620', '1.09117800', '1.10273123', '1....","[""\nVous pouvez bénéficier d'une réduction Gen...","['8,8', '9,1', '8,2', '8,9', '9,2', '8,5', '8,..."


In [84]:
# The csv has converted the lists into strings, we need to revert that first for both dataframes.
# ast.literal_eval only accepts Python literals, so unlike eval() it cannot run
# arbitrary code that might be embedded in the file read from S3.
WEATHER_SCALAR_COLUMNS = ['id', 'city', 'lat', 'lon', 'score_weather']
WEATHER_LIST_COLUMNS = ['day_plus', 'felt_temperature', 'rain_chances', 'humidity']
HOTELS_SCALAR_COLUMNS = ['id', 'city', 'score_weather']
HOTELS_LIST_COLUMNS = ['name', 'url', 'lat_hotels', 'lon_hotels', 'description', 'score']

# .copy() makes the result independent from df_weather / df_hotels, so adding
# columns below cannot trigger SettingWithCopyWarning.
df_weather_final = df_weather[WEATHER_SCALAR_COLUMNS].copy()
for column in WEATHER_LIST_COLUMNS:
    df_weather_final[column] = df_weather[column].apply(ast.literal_eval)

df_hotels_final = df_hotels[HOTELS_SCALAR_COLUMNS].copy()
for column in HOTELS_LIST_COLUMNS:
    df_hotels_final[column] = df_hotels[column].apply(ast.literal_eval)

# We need to convert coordinates to numeric type as well
df_weather_final[['lat', 'lon']] = df_weather[['lat', 'lon']].apply(pd.to_numeric)

In [85]:
# Rank the cities by weather score (lower is better), then unfold the list
# columns into one row per day / per hotel, ready for warehousing and plotting.
weather_value_columns = ['day_plus', 'felt_temperature', 'rain_chances', 'humidity']
df_weather_full = (
    df_weather_final
    .sort_values('score_weather')
    .reset_index(drop=True)
    .apply(pd.Series.explode)
)
df_weather_full[weather_value_columns] = df_weather_full[weather_value_columns].apply(pd.to_numeric)

hotel_value_columns = ['lat_hotels', 'lon_hotels', 'score']
df_hotels_full = (
    df_hotels_final
    .sort_values('score_weather')
    .reset_index(drop=True)
    .apply(pd.Series.explode)
)
# Strip line breaks from the descriptions and switch the scores from a decimal
# comma to a decimal point so that they can be parsed as numbers.
df_hotels_full['description'] = df_hotels_full['description'].replace("\\n", "", regex=True)
df_hotels_full['score'] = df_hotels_full['score'].replace(",", ".", regex=True)
df_hotels_full[hotel_value_columns] = df_hotels_full[hotel_value_columns].apply(pd.to_numeric)

In [87]:
# Show the first 100 exploded hotel rows.
df_hotels_full.head(100)

Unnamed: 0,id,city,score_weather,name,url,lat_hotels,lon_hotels,description,score
0,21,Aix en Provence,55.088571,TheCamp Hotel&Lodges - Aix en Provence,https://www.booking.com/hotel/fr/thecamp-amp-l...,43.496316,5.342152,Vous pouvez bénéficier d'une réduction Genius ...,8.5
0,21,Aix en Provence,55.088571,Odalys City Aix en Provence L'Atrium,https://www.booking.com/hotel/fr/atrium-d-anai...,43.524091,5.455216,Vous pouvez bénéficier d'une réduction Genius ...,7.4
0,21,Aix en Provence,55.088571,Nice apartment for 4 people in the center of A...,https://www.booking.com/hotel/fr/nice-apartmen...,43.518571,5.458349,"Situé à Aix-en-Provence, à proximité du cours ...",0.0
0,21,Aix en Provence,55.088571,Furnished apartment with terrace in the center...,https://www.booking.com/hotel/fr/furnished-apa...,43.526627,5.446262,Situé dans le centre historique d'Aix-en-Prove...,0.0
0,21,Aix en Provence,55.088571,Séjours & Affaires Aix-en-Provence Mirabeau,https://www.booking.com/hotel/fr/residence-mir...,43.525241,5.441266,Vous pouvez bénéficier d'une réduction Genius ...,8.2
...,...,...,...,...,...,...,...,...,...
4,20,Marseille,66.391429,Climatisé Gare StCHARLES 4 chambres Grand Balc...,https://www.booking.com/hotel/fr/climatise-gar...,43.306098,5.381638,Situé dans le quartier Saint-Charles à Marseil...,9.4
4,20,Marseille,66.391429,Sherwood,https://www.booking.com/hotel/fr/grand-apparte...,43.295631,5.379223,Le Sherwood propose un hébergement avec connex...,0.0
4,20,Marseille,66.391429,Le Paradis-Prado vélodrome parc Chanot plage à...,https://www.booking.com/hotel/fr/le-paradis-pr...,43.270241,5.386880,Doté d'une connexion Wi-Fi gratuite et offrant...,0.0
4,20,Marseille,66.391429,Studio Calme et Lumineux proche gare St Charles,https://www.booking.com/hotel/fr/studio-proche...,43.306563,5.380532,"Situé à Marseille, à 1,2 km du centre commerci...",0.0


## Storing the transformed data in a RDS data warehouse

In [91]:
# Importing libraries for the data warehousing and initialization of the SQL engine
from urllib.parse import quote_plus

from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker

# Fail early with a clear message instead of trying to connect to "None".
_missing_settings = [
    setting
    for setting, value in (("DBHOST", DBHOST), ("DBUSER", DBUSER), ("DBPASS", DBPASS), ("DBNAME", DBNAME))
    if not value
]
if _missing_settings:
    raise RuntimeError(f"Missing database settings in the environment: {', '.join(_missing_settings)}")

# User name and password are URL-escaped: characters such as '@', ':' or '/'
# would otherwise corrupt the connection URL.
# NOTE(review): assumes the values in .env are stored unescaped — confirm.
engine = create_engine(
    f"postgresql+psycopg2://{quote_plus(DBUSER)}:{quote_plus(DBPASS)}@{DBHOST}/{DBNAME}",
    echo=True,  # log every SQL statement sent to the database
)
Session = sessionmaker(bind=engine)
session = Session()

In [None]:
# Write each dataframe to its own table, replacing any table of the same name.
df_weather_full.to_sql(name='weather', con=engine, if_exists='replace')
df_hotels_full.to_sql(name='hotels', con=engine, if_exists='replace')

In [97]:
# Sanity check: read a few rows back from the 'weather' table.
query = text("SELECT * FROM weather LIMIT 21")
pd.read_sql(sql=query, con=engine)

2022-04-19 00:36:06,655 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
2022-04-19 00:36:06,656 INFO sqlalchemy.engine.Engine [cached since 298s ago] {'name': 'SELECT * FROM weather LIMIT 21'}
2022-04-19 00:36:06,687 INFO sqlalchemy.engine.Engine SELECT * FROM weather LIMIT 21
2022-04-19 00:36:06,688 INFO sqlalchemy.engine.Engine [generated in 0.00053s] {}


Unnamed: 0,id,city,lat,lon,score_weather,day_plus,felt_temperature,rain_chances,humidity
0,21,Aix en Provence,43.529842,5.447474,55.088571,1,21.87,1.0,33
1,21,Aix en Provence,43.529842,5.447474,55.088571,2,23.09,0.0,40
2,21,Aix en Provence,43.529842,5.447474,55.088571,3,18.47,0.0,42
3,21,Aix en Provence,43.529842,5.447474,55.088571,4,17.98,0.0,41
4,21,Aix en Provence,43.529842,5.447474,55.088571,5,17.03,0.0,51
5,21,Aix en Provence,43.529842,5.447474,55.088571,6,15.07,8.0,52
6,21,Aix en Provence,43.529842,5.447474,55.088571,7,17.43,2.0,34
7,24,Nimes,43.837425,4.360069,60.705714,1,23.66,2.0,39
8,24,Nimes,43.837425,4.360069,60.705714,2,22.79,0.0,33
9,24,Nimes,43.837425,4.360069,60.705714,3,19.51,0.0,39


In [98]:
# Sanity check: read a few rows back from the 'hotels' table.
query = text("SELECT * FROM hotels LIMIT 40")
pd.read_sql(sql=query, con=engine)

2022-04-19 00:36:31,769 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
2022-04-19 00:36:31,770 INFO sqlalchemy.engine.Engine [cached since 323.2s ago] {'name': 'SELECT * FROM hotels LIMIT 40'}
2022-04-19 00:36:31,801 INFO sqlalchemy.engine.Engine SELECT * FROM hotels LIMIT 40
2022-04-19 00:36:31,802 INFO sqlalchemy.engine.Engine [generated in 0.00068s] {}


Unnamed: 0,id,city,score_weather,name,url,lat_hotels,lon_hotels,description,score
0,21,Aix en Provence,55.088571,TheCamp Hotel&Lodges - Aix en Provence,https://www.booking.com/hotel/fr/thecamp-amp-l...,43.496316,5.342152,Vous pouvez bénéficier d'une réduction Genius ...,8.5
1,21,Aix en Provence,55.088571,Odalys City Aix en Provence L'Atrium,https://www.booking.com/hotel/fr/atrium-d-anai...,43.524091,5.455216,Vous pouvez bénéficier d'une réduction Genius ...,7.4
2,21,Aix en Provence,55.088571,Nice apartment for 4 people in the center of A...,https://www.booking.com/hotel/fr/nice-apartmen...,43.518571,5.458349,"Situé à Aix-en-Provence, à proximité du cours ...",0.0
3,21,Aix en Provence,55.088571,Furnished apartment with terrace in the center...,https://www.booking.com/hotel/fr/furnished-apa...,43.526627,5.446262,Situé dans le centre historique d'Aix-en-Prove...,0.0
4,21,Aix en Provence,55.088571,Séjours & Affaires Aix-en-Provence Mirabeau,https://www.booking.com/hotel/fr/residence-mir...,43.525241,5.441266,Vous pouvez bénéficier d'une réduction Genius ...,8.2
5,21,Aix en Provence,55.088571,Campanile Aix-en-Provence Sud - Pont de l'Arc,https://www.booking.com/hotel/fr/campanile-aix...,43.511916,5.436654,"Installé à Aix-en-Provence, à 3 km du centre h...",7.4
6,21,Aix en Provence,55.088571,Furnished apartment in the heart of the city c...,https://www.booking.com/hotel/fr/furnished-apa...,43.53017,5.44735,Doté d'une connexion Wi-Fi gratuite et offrant...,0.0
7,21,Aix en Provence,55.088571,ibis budget Aix en Provence,https://www.booking.com/hotel/fr/etap-aix-en-p...,43.49453,5.368881,L'ibis budget Aix en Provence est situé en pér...,7.7
8,21,Aix en Provence,55.088571,Joli T2 avec parking à deux pas du centre vill...,https://www.booking.com/hotel/fr/joli-t2-avec-...,43.530129,5.458953,"Doté d'une terrasse, le Joli T2 avec parking à...",0.0
9,21,Aix en Provence,55.088571,Furnished apartment in the heart of the city n...,https://www.booking.com/hotel/fr/furnished-apa...,43.531277,5.44944,"Situé dans le centre d'Aix-en-Provence, à seul...",0.0
