# Kayak Project

## Imports

In [1]:

import pandas as pd
import numpy as np
import plotly.express as px
import requests
from dotenv import load_dotenv
import os
import logging
import scrapy
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup
import requests
import time

load_dotenv()
%load_ext dotenv
%dotenv

key = os.getenv('APIKEY')

In [2]:
cities = ["Mont Saint Michel", "St Malo", "Bayeux", "Le Havre", "Rouen", "Paris", "Amiens", "Lille", "Strasbourg", "Chateau du Haut Koenigsbourg", "Colmar", "Eguisheim", "Besancon", "Dijon", "Annecy", "Grenoble", "Lyon", "Gorges du Verdon", "Bormes les Mimosas", "Cassis", "Marseille", "Aix en Provence", "Avignon", "Uzes", "Nimes", "Aigues Mortes", "Saintes Maries de la mer", "Collioure", "Carcassonne", "Ariege", "Toulouse", "Montauban", "Biarritz", "Bayonne", "La Rochelle"]

In [3]:
df_cities = pd.DataFrame(columns=["city"])
df_cities['city'] = cities
df_cities.reset_index(inplace=True)
df_cities.rename(columns={'index': 'id'}, inplace=True)
df_cities

Unnamed: 0,id,city
0,0,Mont Saint Michel
1,1,St Malo
2,2,Bayeux
3,3,Le Havre
4,4,Rouen
5,5,Paris
6,6,Amiens
7,7,Lille
8,8,Strasbourg
9,9,Chateau du Haut Koenigsbourg


## Scraping Booking.com

In [4]:
df_booking = df_cities.copy(deep=True)

class BookingSpider(scrapy.Spider):
    name = "Booking_data"
    cities = df_booking["city"]
    start_urls = ['https://www.booking.com/index.fr.html']

    def parse(self, response):
        for x in cities:
            yield scrapy.FormRequest.from_response(
            response,
            formdata={'ss': x },
            callback=self.after_search
        )

    def after_search(self, response):
        cities = response.url.split("ss=")[-1].split("&")[0]
                
        booking = response.css('.d4924c9e74')
        
        for data in booking:
            
            yield {
                'location': cities,
                'name': data.css('a div.fcab3ed991.a23c043802::text').getall(),
                'url': data.css('h3.a4225678b2 a::attr(href)').getall(),
                # 'score': data.css('div.b5cd09854e.d10a6220b4::text').getall(),
                # 'description': data.css('div.d8eab2cf7f::text').getall(),
            }

        try:
            next_page = response.css('a.paging-next').attrib["href"]
        except KeyError:
            logging.info('No next page. Terminating crawling process.')
        else:
            yield response.follow(next_page, callback=self.after_search)

In [5]:
filename = "cities.json"

if filename in os.listdir():
        os.remove(filename)

process = CrawlerProcess(settings = {
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0',
    'LOG_LEVEL': logging.INFO,
    "FEEDS": {
        filename: {"format": "json"},
    },
    "AUTOTHROTTLE_ENABLED": True
})

process.crawl(BookingSpider)
process.start()

2022-04-14 22:32:20 [scrapy.utils.log] INFO: Scrapy 2.6.1 started (bot: scrapybot)
2022-04-14 22:32:20 [scrapy.utils.log] INFO: Versions: lxml 4.8.0.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 22.2.0, Python 3.9.11 (main, Mar 29 2022, 19:08:29) - [GCC 7.5.0], pyOpenSSL 22.0.0 (OpenSSL 1.1.1n  15 Mar 2022), cryptography 36.0.0, Platform Linux-5.13.0-39-generic-x86_64-with-glibc2.31
2022-04-14 22:32:20 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'LOG_LEVEL': 20,
 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) '
               'Gecko/20100101 Firefox/92.0'}
2022-04-14 22:32:20 [scrapy.extensions.telnet] INFO: Telnet Password: fae05b61e55cb43f
2022-04-14 22:32:20 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogS

In [10]:
df = pd.read_json("cities.json")
df.head()

Unnamed: 0,location,name,url
0,Mont+Saint+Michel,"[Le Relais Saint Michel, La Mère Poulard, Merc...",[https://www.booking.com/hotel/fr/le-relais-sa...
1,St+Malo,"[Studio cocooning, Apartment, St Malo, Petite ...",[https://www.booking.com/hotel/fr/studio-cocoo...
2,Bayeux,"[Premiere Classe Bayeux, ibis budget Bayeux, L...",[https://www.booking.com/hotel/fr/premiere-cla...
3,Le+Havre,"[Holiday Inn Express - Le Havre Centre, Ibis B...",[https://www.booking.com/hotel/fr/campanile-le...
4,Rouen,"[Maison hypercentre Rouen tout confort, L'Aute...",[https://www.booking.com/hotel/fr/maison-hyper...


In [11]:
for i in range (len(df["location"])):
    df["name"][i] = df["name"][i][:20]
    df["url"][i] = df["url"][i][:20]


In [12]:
df[df['url'].isna()]

Unnamed: 0,location,name,url


In [13]:
lat_full = []
lon_full = []
description_full = []
score_full = []

navigator = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)'

for i in range(len(df["url"])):
    lat_list = []
    lon_list = []
    description_list = []
    score_list = []
    hotel_list = df["url"][i]

    for j in hotel_list:

        try:
            page = requests.get(j, headers={'User-Agent': navigator})
            soup = BeautifulSoup(page.text, 'html.parser')
        except:
            page = requests.get(j, headers={'User-Agent': navigator})
            soup = BeautifulSoup(page.text, 'html.parser')

        lat_list.append(soup.select('p.address.address_clean a')[0].get("data-atlas-latlng").split(",")[0])
        lon_list.append(soup.select('p.address.address_clean a')[0].get("data-atlas-latlng").split(",")[1])
        description_list.append(soup.select('div#property_description_content')[0].get_text())

        try:
            score_list.append(soup.select('div.b5cd09854e.d10a6220b4')[0].get_text())
            
        except:
            # Some hotels dont have a score yet but I still need one for the visualization. I set it to 0.
            score_list.append("0.0")

        time.sleep(1.4)
        
    lat_full.append(lat_list)
    lon_full.append(lon_list)
    description_full.append(description_list)
    score_full.append(score_list)
    
    print (f"city {df['location'].iloc[i]} done")
    
df["lat"] = lat_full
df["lon"] = lon_full
df["description"] = description_full
df["score"] = score_full

city Mont+Saint+Michel done
city St+Malo done
city Bayeux done
city Le+Havre done
city Rouen done
city Paris done
city Amiens done
city Lille done
city Strasbourg done
city Chateau+du+Haut+Koenigsbourg done
city Colmar done
city Eguisheim done
city Besancon done
city Dijon done
city Annecy done
city Grenoble done
city La+Rochelle done
city Bayonne done
city Biarritz done
city Montauban done
city Toulouse done
city Ariege done
city Carcassonne done
city Collioure done
city Saintes+Maries+de+la+mer done
city Aigues+Mortes done
city Nimes done
city Uzes done
city Avignon done
city Aix+en+Provence done
city Marseille done
city Cassis done
city Bormes+les+Mimosas done
city Gorges+du+Verdon done
city Lyon done


In [None]:
df["location"] = df["location"].str.replace("+", " ", regex=True)
df = df.set_index("location")
df = df.reindex(cities)
df.reset_index(inplace=True)
df.reset_index(inplace=True)
df.rename(columns={'index': 'id'}, inplace=True)

In [41]:
df_hotels = df.copy(deep=True)
df_hotels = df_hotels.apply(pd.Series.explode)
df_hotels["score"] = df_hotels["score"].replace(",", ".", regex=True)
df_hotels["description"] = df_hotels["description"].replace("\\n", "", regex=True)
df_hotels[['lat', 'lon', 'score']] = df_hotels[['lat', 'lon', 'score']].apply(pd.to_numeric)
df_hotels


Unnamed: 0,id,location,name,url,lat,lon,description,score
0,0,Mont Saint Michel,Le Relais Saint Michel,https://www.booking.com/hotel/fr/le-relais-sai...,48.617587,-1.510396,Vous pouvez bénéficier d'une réduction Genius ...,7.8
0,0,Mont Saint Michel,La Mère Poulard,https://www.booking.com/hotel/fr/la-mere-poula...,48.635085,-1.510540,Vous pouvez bénéficier d'une réduction Genius ...,7.2
0,0,Mont Saint Michel,Mercure Mont Saint Michel,https://www.booking.com/hotel/fr/mont-saint-mi...,48.614247,-1.510545,Installé dans des espaces verts à seulement 2 ...,8.2
0,0,Mont Saint Michel,Les Terrasses Poulard,https://www.booking.com/hotel/fr/les-terrasses...,48.635349,-1.510379,Vous pouvez bénéficier d'une réduction Genius ...,7.3
0,0,Mont Saint Michel,Hotel De La Digue,https://www.booking.com/hotel/fr/de-la-digue.f...,48.616882,-1.510918,Cet établissement est à 1 minute à pied de la ...,7.2
...,...,...,...,...,...,...,...,...
34,34,La Rochelle,Hôtel Le Rochelois,https://www.booking.com/hotel/fr/le-rochelois....,46.154182,-1.182437,Vous pouvez bénéficier d'une réduction Genius ...,8.0
34,34,La Rochelle,Joli appartement de standing meublé avec balco...,https://www.booking.com/hotel/fr/joli-appartem...,46.149940,-1.155554,"Doté d'un balcon, le Joli appartement de stand...",0.0
34,34,La Rochelle,"Studio La Rochelle, 2 pièces, 4 personnes - FR...",https://www.booking.com/hotel/fr/apartment-la-...,46.143361,-1.153995,"Le Studio La Rochelle, 2 pièces, 4 personnes -...",0.0
34,34,La Rochelle,QUAI 2 Les MINIMES,https://www.booking.com/hotel/fr/quai-2-d203.f...,46.143132,-1.162204,Cet établissement est à 3 minutes à pied de la...,7.9


In [42]:
df_hotels.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 700 entries, 0 to 34
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           700 non-null    int64  
 1   location     700 non-null    object 
 2   name         700 non-null    object 
 3   url          700 non-null    object 
 4   lat          700 non-null    float64
 5   lon          700 non-null    float64
 6   description  700 non-null    object 
 7   score        700 non-null    float64
dtypes: float64(3), int64(1), object(4)
memory usage: 49.2+ KB


In [44]:
px.set_mapbox_access_token(open(".mapbox_token").read())
fig = px.scatter_mapbox(
    df_hotels.sort_values('name'),
    lat='lat',
    lon='lon',
    color='score',
    size='score',
    color_continuous_scale=px.colors.sequential.Bluered,
    size_max=30,
    zoom=5,
    hover_name='name'
)

fig.update_layout(width = 1300, height = 800, template='plotly_dark' ,title='Weather')
fig.show()

## Getting API Data

### GPS Coordinates

In [46]:
def get_gps(df):

    df_new = df.copy(deep=True)
    lat_list = []
    lon_list = []
    for i in cities:
        if i == "Gorges du Verdon":
            i = "La%20Palud-sur-Verdon"
            r = requests.get(f"https://nominatim.openstreetmap.org/search?city={i}&format=json").json()
            lat_list.append(r[0]['lat'])
            lon_list.append(r[0]['lon'])
        elif i == 'Ariege':
            r = requests.get(f"https://nominatim.openstreetmap.org/search?county={i}&format=json").json()
            lat_list.append(r[0]['lat'])
            lon_list.append(r[0]['lon'])
        else:
            name = i.replace(" ", "%20")
            r = requests.get(f"https://nominatim.openstreetmap.org/search?city={name}&format=json").json()
            lat_list.append(r[0]['lat'])
            lon_list.append(r[0]['lon'])
        
    df_new['lat'] = lat_list
    df_new['lon'] = lon_list 
    return df_new

In [47]:
df_gps = get_gps(df_cities)
df_gps

Unnamed: 0,id,city,lat,lon
0,0,Mont Saint Michel,48.6359541,-1.511459954959514
1,1,St Malo,48.649518,-2.0260409
2,2,Bayeux,49.2764624,-0.7024738
3,3,Le Havre,49.4938975,0.1079732
4,4,Rouen,49.4404591,1.0939658
5,5,Paris,48.8588897,2.3200410217200766
6,6,Amiens,49.8941708,2.2956951
7,7,Lille,50.6365654,3.0635282
8,8,Strasbourg,48.584614,7.7507127
9,9,Chateau du Haut Koenigsbourg,48.2495226,7.3454923


### Weather Data

In [48]:
def get_weather(df):
    
    df_new = df.copy(deep=True)
    temps_list = []
    rain_pop = []
    humidity_list = []
    days = list(range(1,8))
    
    for i in df.itertuples():
        lat = i.lat
        lon = i.lon
        r = requests.get(f"https://api.openweathermap.org/data/2.5/onecall?lat={lat}&lon={lon}&units=metric&appid={key}").json()
        weather_7_days = r['daily'][1:]
        temps = [i['feels_like']['day'] for i in weather_7_days]
        rain = [i['pop'] * 100 for i in weather_7_days]
        humidity = [i['humidity'] for i in weather_7_days]
        temps_list.append(temps)
        rain_pop.append(rain)
        humidity_list.append(humidity)
        
    df_new['day_plus'] = [days for _ in range(len(df))]
    df_new['felt_temperature'] = temps_list
    df_new['rain_chances'] = rain_pop
    df_new['humidity'] = humidity_list
    # Calculation of a weather quality indicator, using absolute value of 35 - temperature (with a weight of 2), chances of rain as a percentage, and humidity as a percentage (with a weight of 0.5)
    df_new['score_weather'] = df_new.apply(lambda x: abs((35 - np.mean(x['felt_temperature'])) * 2) + np.mean(x['rain_chances']) + (np.mean(x['humidity']) / 2), axis=1)
    
    return df_new
    

In [49]:
df_full = get_weather(df_gps)
df_full.head(20)

Unnamed: 0,id,city,lat,lon,day_plus,felt_temperature,rain_chances,humidity,score_weather
0,0,Mont Saint Michel,48.6359541,-1.511459954959514,"[1, 2, 3, 4, 5, 6, 7]","[17.62, 18.3, 15.16, 10.7, 12.04, 9.64, 10.46]","[69.0, 0, 0, 0, 0, 67.0, 96.0]","[71, 59, 50, 84, 67, 95, 66]",111.451429
1,1,St Malo,48.649518,-2.0260409,"[1, 2, 3, 4, 5, 6, 7]","[14.67, 15.18, 13.59, 9.95, 10.35, 11.25, 9.65]","[47.0, 0, 0, 0, 0, 98.0, 93.0]","[80, 72, 56, 79, 71, 89, 71]",116.817143
2,2,Bayeux,49.2764624,-0.7024738,"[1, 2, 3, 4, 5, 6, 7]","[17.97, 16.15, 14.29, 9.73, 10.77, 10.25, 9.08]","[28.999999999999996, 0, 0, 4.0, 0, 61.0, 96.0]","[68, 68, 52, 89, 68, 90, 68]",107.86
3,3,Le Havre,49.4938975,0.1079732,"[1, 2, 3, 4, 5, 6, 7]","[15.04, 14.54, 12.27, 11.81, 10.59, 9.79, 6.27]","[0, 0, 0, 0, 9.0, 76.0, 94.0]","[74, 70, 54, 68, 74, 78, 81]",108.268571
4,4,Rouen,49.4404591,1.0939658,"[1, 2, 3, 4, 5, 6, 7]","[17.9, 16.29, 13.83, 15.38, 12.96, 8.98, 7.94]","[4.0, 0, 0, 0, 50.0, 81.0, 75.0]","[61, 60, 50, 56, 78, 84, 93]",107.777143
5,5,Paris,48.8588897,2.3200410217200766,"[1, 2, 3, 4, 5, 6, 7]","[18.77, 16.44, 14.62, 15.71, 16.56, 14.59, 11.97]","[3.0, 0, 0, 0, 21.0, 37.0, 81.0]","[50, 44, 32, 34, 43, 47, 63]",81.597143
6,6,Amiens,49.8941708,2.2956951,"[1, 2, 3, 4, 5, 6, 7]","[16.58, 15.26, 13.49, 14.31, 15.36, 13.67, 12.76]","[0, 0, 0, 0, 42.0, 35.0, 60.0]","[63, 53, 47, 50, 55, 56, 63]",88.234286
7,7,Lille,50.6365654,3.0635282,"[1, 2, 3, 4, 5, 6, 7]","[16.33, 15.12, 13.98, 14.85, 15.63, 14.54, 12.67]","[0, 0, 0, 0, 0, 28.000000000000004, 72.0]","[54, 47, 37, 40, 47, 44, 61]",78.394286
8,8,Strasbourg,48.584614,7.7507127,"[1, 2, 3, 4, 5, 6, 7]","[17.33, 13.01, 12.83, 13.78, 14.27, 13.19, 15.74]","[9.0, 0, 0, 0, 0, 0, 84.0]","[58, 52, 38, 41, 46, 44, 44]",77.742857
9,9,Chateau du Haut Koenigsbourg,48.2495226,7.3454923,"[1, 2, 3, 4, 5, 6, 7]","[14.89, 10.21, 8.84, 10.45, 10.61, 11.56, 11.28]","[18.0, 0, 0, 0, 0, 0, 83.0]","[57, 49, 39, 41, 47, 39, 50]",85.188571


In [50]:
def df_map(df):

    df_sorted = df_full.sort_values('score_weather').head()
    df_sorted.reset_index(inplace=True, drop=True)
    df_sorted = df_sorted.apply(pd.Series.explode)
    df_sorted[['lat', 'lon', 'day_plus', 'felt_temperature', 'rain_chances', 'humidity']] = df_sorted[['lat', 'lon', 'day_plus', 'felt_temperature', 'rain_chances', 'humidity']].apply(pd.to_numeric)
    return df_sorted

df_plotly = df_map(df_full)

df_plotly

Unnamed: 0,id,city,lat,lon,day_plus,felt_temperature,rain_chances,humidity,score_weather
0,21,Aix en Provence,43.529842,5.447474,1,21.87,1.0,33,55.088571
0,21,Aix en Provence,43.529842,5.447474,2,23.09,0.0,40,55.088571
0,21,Aix en Provence,43.529842,5.447474,3,18.47,0.0,42,55.088571
0,21,Aix en Provence,43.529842,5.447474,4,17.98,0.0,41,55.088571
0,21,Aix en Provence,43.529842,5.447474,5,17.03,0.0,51,55.088571
0,21,Aix en Provence,43.529842,5.447474,6,15.07,8.0,52,55.088571
0,21,Aix en Provence,43.529842,5.447474,7,17.43,2.0,34,55.088571
1,24,Nimes,43.837425,4.360069,1,23.66,2.0,39,60.705714
1,24,Nimes,43.837425,4.360069,2,22.79,0.0,33,60.705714
1,24,Nimes,43.837425,4.360069,3,19.51,0.0,39,60.705714


In [51]:
px.set_mapbox_access_token(open(".mapbox_token").read())
fig = px.scatter_mapbox(
    df_plotly.sort_values('day_plus'),
    lat='lat',
    lon='lon',
    color='felt_temperature',
    size='humidity',
    color_continuous_scale=px.colors.sequential.Bluered,
    size_max=30,
    zoom=4.7,
    hover_name='city',
    hover_data={
        'lat': False,
        'lon': False,
        'day_plus': False,
        'rain_chances': True,
        'humidity': True,
        'felt_temperature': True        
        },
    animation_frame='day_plus'
)

fig.update_layout(width = 1300, height = 800, template='plotly_dark' ,title='Weather')
fig.show()

## Merging Data

In [None]:
df.rename(columns={'lat': 'lat_hotels', 'lon': 'lon_hotels'}, inplace=True)

In [58]:
#df_complete = pd.merge(df_full, df, on='id')
csv = df_complete.to_csv('df_complete.csv', index=False, header=True)

## ETL

In [None]:
df = pd.read_csv('df_complete.csv')


In [25]:
df.to_csv('df.csv', index=False, header=True)

In [1]:
import boto3

session = boto3.Session()
s3 = boto3.resource('s3')

In [30]:
bucket = s3.create_bucket(Bucket='kayak-project-garp', CreateBucketConfiguration={'LocationConstraint': 'eu-west-3'})

In [31]:
put_object = bucket.put_object(Body='df.csv', Key='df.csv')

In [32]:
df_s3 = pd.read_csv('s3://kayak-project-garp/df.csv')
df_s3.head()

Unnamed: 0,df.csv
