# Kayak Project

## Imports

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import requests
from dotenv import load_dotenv
import os
import logging
import scrapy
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup
import requests
import time

# Load variables from a local .env file into the process environment.
load_dotenv()
# The IPython dotenv extension is loaded and invoked as well.
%load_ext dotenv
%dotenv

# API key read from the APIKEY environment variable; os.getenv returns None
# when the variable is not set. It is used for the OpenWeatherMap requests
# made later in this notebook.
key = os.getenv('APIKEY')

In [2]:
# Destinations to evaluate. The order matters: a city's position in this list
# becomes its `id` in the DataFrames built from it.
cities = [
    "Mont Saint Michel",
    "St Malo",
    "Bayeux",
    "Le Havre",
    "Rouen",
    "Paris",
    "Amiens",
    "Lille",
    "Strasbourg",
    "Chateau du Haut Koenigsbourg",
    "Colmar",
    "Eguisheim",
    "Besancon",
    "Dijon",
    "Annecy",
    "Grenoble",
    "Lyon",
    "Gorges du Verdon",
    "Bormes les Mimosas",
    "Cassis",
    "Marseille",
    "Aix en Provence",
    "Avignon",
    "Uzes",
    "Nimes",
    "Aigues Mortes",
    "Saintes Maries de la mer",
    "Collioure",
    "Carcassonne",
    "Ariege",
    "Toulouse",
    "Montauban",
    "Biarritz",
    "Bayonne",
    "La Rochelle",
]

In [3]:
# One row per destination: `id` is the city's position in `cities`,
# `city` is its name.
df_cities = pd.DataFrame({"id": range(len(cities)), "city": cities})
df_cities

Unnamed: 0,id,city
0,0,Mont Saint Michel
1,1,St Malo
2,2,Bayeux
3,3,Le Havre
4,4,Rouen
5,5,Paris
6,6,Amiens
7,7,Lille
8,8,Strasbourg
9,9,Chateau du Haut Koenigsbourg


## Scraping Booking.com

In [4]:
df_booking = df_cities.copy(deep=True)


class BookingSpider(scrapy.Spider):
    """Submit the Booking.com search form once per city and collect hotel names and URLs.

    Each yielded item has:
      * ``location`` - the search term taken from the ``ss=`` query parameter of
        the result URL (still URL-encoded, e.g. spaces appear as ``+``),
      * ``name``     - list of hotel names found in one result container,
      * ``url``      - list of hotel page links found in the same container.
    """

    name = "Booking_data"
    # Search terms submitted by this spider.
    cities = df_booking["city"]
    start_urls = ['https://www.booking.com/index.fr.html']

    def parse(self, response):
        """Fill the search form on the landing page once for every city."""
        # Use the spider's own attribute rather than whatever global happens
        # to be called `cities` in the notebook namespace.
        for city in self.cities:
            yield scrapy.FormRequest.from_response(
                response,
                formdata={'ss': city},
                callback=self.after_search,
            )

    def after_search(self, response):
        """Extract hotel names/links from a result page and follow pagination."""
        # The searched term is recovered from the `ss` query parameter.
        location = response.url.split("ss=")[-1].split("&")[0]

        for container in response.css('.d4924c9e74'):
            yield {
                'location': location,
                'name': container.css('a div.fcab3ed991.a23c043802::text').getall(),
                'url': container.css('h3.a4225678b2 a::attr(href)').getall(),
            }

        # `.attrib` is an empty mapping when the selector matches nothing,
        # so `.get` returns None instead of raising.
        next_page = response.css('a.paging-next').attrib.get("href")
        if next_page is None:
            logging.info('No next page. Terminating crawling process.')
        else:
            yield response.follow(next_page, callback=self.after_search)

In [5]:
filename = "cities.json"

# Delete the export of a previous run, if one is present in the working
# directory, before the crawler is configured to write to the same file.
if any(entry == filename for entry in os.listdir()):
    os.remove(filename)

process = CrawlerProcess(
    settings=dict(
        USER_AGENT='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0',
        LOG_LEVEL=logging.INFO,
        # Export every scraped item to `filename` as JSON.
        FEEDS={filename: dict(format="json")},
        AUTOTHROTTLE_ENABLED=True,
    )
)

process.crawl(BookingSpider)
process.start()

2022-04-13 21:39:48 [scrapy.utils.log] INFO: Scrapy 2.6.1 started (bot: scrapybot)
2022-04-13 21:39:48 [scrapy.utils.log] INFO: Versions: lxml 4.8.0.0, libxml2 2.9.4, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 22.2.0, Python 3.9.10 | packaged by conda-forge | (main, Feb  1 2022, 21:27:43) - [Clang 11.1.0 ], pyOpenSSL 22.0.0 (OpenSSL 1.1.1n  15 Mar 2022), cryptography 36.0.1, Platform macOS-12.3.1-arm64-arm-64bit
2022-04-13 21:39:48 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'LOG_LEVEL': 20,
 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) '
               'Gecko/20100101 Firefox/92.0'}
2022-04-13 21:39:48 [scrapy.extensions.telnet] INFO: Telnet Password: b3178f7049a58bbe
2022-04-13 21:39:48 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions

In [None]:
# Load the items exported by the crawl ("cities.json") into a DataFrame
# and preview the first rows.
df = pd.read_json("cities.json")
df.head()

In [None]:
# Keep at most the first 20 hotels of every row. The columns are assigned as a
# whole: writing through chained indexing (df["name"][i] = ...) is not
# guaranteed to modify `df`.
df["name"] = df["name"].map(lambda names: names[0:20])
df["url"] = df["url"].map(lambda urls: urls[0:20])

In [None]:
# Show the rows whose 'url' value is missing.
df[df['url'].isna()]

In [None]:
navigator = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)'
request_headers = {'User-Agent': navigator}

# One entry per row of `df`; every entry is a list with one value per hotel URL.
lat_column = []
lon_column = []
description_column = []
score_column = []

for location, hotel_list in zip(df["location"], df["url"]):
    lat_list = []
    lon_list = []
    description_list = []
    score_list = []

    for hotel_url in hotel_list:
        try:
            page = requests.get(hotel_url, headers=request_headers, timeout=30)
        except requests.RequestException:
            # Network hiccup: retry once, and let a second failure propagate.
            page = requests.get(hotel_url, headers=request_headers, timeout=30)
        soup = BeautifulSoup(page.text, 'html.parser')

        # "lat,lon" is stored in a data attribute of the address link.
        coordinates = soup.select('p.address.address_clean a')[0].get("data-atlas-latlng").split(",")
        lat_list.append(coordinates[0])
        lon_list.append(coordinates[1])
        description_list.append(soup.select('div#property_description_content')[0].get_text())

        # Some hotels don't have a score yet, but the visualization still
        # needs one, so it falls back to "0.0".
        score_nodes = soup.select('div.b5cd09854e.d10a6220b4')
        score_list.append(score_nodes[0].get_text() if score_nodes else "0.0")

        # Pause between requests to avoid hammering the site.
        time.sleep(1.4)

    lat_column.append(lat_list)
    lon_column.append(lon_list)
    description_column.append(description_list)
    score_column.append(score_list)

    print (f"city {location} done")

# Assign the list-valued columns in one go, as object columns aligned on
# df's index, instead of writing lists cell by cell into scalar placeholder
# columns through chained indexing.
df["lat"] = pd.Series(lat_column, index=df.index, dtype=object)
df["lon"] = pd.Series(lon_column, index=df.index, dtype=object)
df["description"] = pd.Series(description_column, index=df.index, dtype=object)
df["score"] = pd.Series(score_column, index=df.index, dtype=object)

In [None]:
# The location comes from a URL query string, where spaces appear as "+".
# regex=False makes "+" a literal character regardless of the pandas version
# ("+" is a regex metacharacter and the default of `regex` has changed).
df["location"] = df["location"].str.replace("+", " ", regex=False)

# Put the rows in the same order as `cities`; cities without scraped data
# become all-NaN rows.
df = df.set_index("location").reindex(cities).reset_index()

# Expose the resulting row position as an explicit `id` column.
df = df.reset_index().rename(columns={'index': 'id'})
df.head(2)

In [None]:
df2 = df.copy()

# Flatten every list-valued column into one entry per hotel. Each resulting
# Series keeps the index label of the row it came from.
df_name, df_url, df_lat, df_lon, df_description, df_score = (
    df2[column].explode()
    for column in ("name", "url", "lat", "lon", "description", "score")
)

In [None]:
# Put the exploded Series side by side as the columns of one DataFrame.
df_final = pd.concat([df_name, df_url, df_lat, df_lon, df_description, df_score], axis=1)
# Preview only; the reset index is not stored back into df_final.
df_final.reset_index().head()

In [None]:
# Scores are scraped as text and may use a decimal comma. Convert them to
# floats and assign the column back: an in-place replace on
# `df_final["score"]` acts on a temporary selection and is not guaranteed to
# update df_final. Values that cannot be parsed become NaN.
df_final["score"] = pd.to_numeric(
    df_final["score"].astype(str).str.replace(",", ".", regex=False),
    errors="coerce",
)

In [None]:
# Print a summary of df_final (columns, non-null counts, dtypes).
df_final.info()

In [None]:
# Read the Mapbox token through a context manager so the file handle is closed.
with open(".mapbox_token") as token_file:
    px.set_mapbox_access_token(token_file.read().strip())

# Coordinates and scores were scraped as text. Convert them to numbers and
# drop hotels lacking any of them, since they are used as position, marker
# size and colour.
hotels_map = df_final.copy()
hotels_map["score"] = hotels_map["score"].astype(str).str.replace(",", ".", regex=False)
for column in ("lat", "lon", "score"):
    hotels_map[column] = pd.to_numeric(hotels_map[column], errors="coerce")
hotels_map = hotels_map.dropna(subset=["lat", "lon", "score"])

fig = px.scatter_mapbox(
    hotels_map.sort_values('name'),
    lat='lat',
    lon='lon',
    color='score',
    size='score',
    color_continuous_scale=px.colors.diverging.Picnic,
    size_max=30,
    zoom=5,
    hover_name='name'
)

# This map shows hotels, not weather, so the title says so.
fig.update_layout(width = 1300, height = 800, template='plotly_dark' ,title='Booking.com hotels by review score')
fig.show()

## Getting API Data

### GPS Coordinates

In [4]:
def get_gps(df, pause=1.0, user_agent="kayak-project-notebook"):
    """Return a copy of ``df`` with ``lat`` and ``lon`` columns from Nominatim.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``city`` column; one lookup is made per row.
    pause : float, default 1.0
        Seconds to wait between two consecutive requests (0 disables the wait).
    user_agent : str
        Value sent in the ``User-Agent`` header to identify this client.

    Returns
    -------
    pandas.DataFrame
        Deep copy of ``df`` with the ``lat`` / ``lon`` values of the first
        search result, exactly as returned by the service.

    Raises
    ------
    IndexError
        If the service returns no result for a city.
    requests.HTTPError
        If the service answers with an error status.
    """
    endpoint = "https://nominatim.openstreetmap.org/search"
    # Places that are not looked up as a plain `city=<name>` query.
    special_queries = {
        "Gorges du Verdon": {"city": "La Palud-sur-Verdon"},
        "Ariege": {"county": "Ariege"},
    }

    df_new = df.copy(deep=True)
    lat_list = []
    lon_list = []

    # Iterate over the frame that was passed in, not over a notebook global.
    for position, city in enumerate(df_new["city"]):
        if position and pause:
            time.sleep(pause)

        query = dict(special_queries.get(city, {"city": city}), format="json")
        # `params` lets requests do the URL encoding.
        response = requests.get(
            endpoint,
            params=query,
            headers={"User-Agent": user_agent},
            timeout=30,
        )
        response.raise_for_status()
        matches = response.json()
        if not matches:
            raise IndexError(f"Nominatim returned no result for {city!r}")

        lat_list.append(matches[0]['lat'])
        lon_list.append(matches[0]['lon'])

    df_new['lat'] = lat_list
    df_new['lon'] = lon_list
    return df_new

In [5]:
# Add lat/lon coordinates to the city table and display the result.
df_gps = get_gps(df_cities)
df_gps

Unnamed: 0,id,city,lat,lon
0,0,Mont Saint Michel,48.6359541,-1.511459954959514
1,1,St Malo,48.649518,-2.0260409
2,2,Bayeux,49.2764624,-0.7024738
3,3,Le Havre,49.4938975,0.1079732
4,4,Rouen,49.4404591,1.0939658
5,5,Paris,48.8588897,2.3200410217200766
6,6,Amiens,49.8941708,2.2956951
7,7,Lille,50.6365654,3.0635282
8,8,Strasbourg,48.584614,7.7507127
9,9,Chateau du Haut Koenigsbourg,48.2495226,7.3454923


### Weather Data

In [24]:
def get_weather(df):
    """Return a copy of ``df`` enriched with a daily forecast and a weather score.

    For every row, the OpenWeatherMap "onecall" endpoint is queried with the
    row's ``lat`` / ``lon`` and the module-level ``key``. The first entry of
    the ``daily`` list is skipped and the remaining days are stored as lists:

    * ``day_plus``         - 1, 2, ... (one number per kept forecast day)
    * ``felt_temperature`` - ``feels_like.day`` of each day
    * ``rain_chances``     - ``pop`` * 100 of each day
    * ``humidity``         - ``humidity`` of each day
    * ``score``            - ``(35 - mean(felt)) * 2 + mean(rain) + mean(humidity) / 2``;
      the notebook treats the lowest scores as the best destinations.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (for example an invalid key).
    """
    # NOTE(review): this is the 2.5 "onecall" endpoint - confirm that it is
    # still served for the API key in use.
    endpoint = "https://api.openweathermap.org/data/2.5/onecall"

    df_new = df.copy(deep=True)
    days_list = []
    temps_list = []
    rain_pop = []
    humidity_list = []
    score_list = []

    for row in df.itertuples():
        # `params` handles the URL encoding and keeps the key out of
        # hand-built strings.
        response = requests.get(
            endpoint,
            params={"lat": row.lat, "lon": row.lon, "units": "metric", "appid": key},
            timeout=30,
        )
        # Fail with the HTTP status instead of a later KeyError on 'daily'.
        response.raise_for_status()

        forecast = response.json()['daily'][1:]
        temps = [day['feels_like']['day'] for day in forecast]
        rain = [day['pop'] * 100 for day in forecast]
        humidity = [day['humidity'] for day in forecast]

        # Number the days from the data actually returned, so every list in
        # a row has the same length.
        days_list.append(list(range(1, len(forecast) + 1)))
        temps_list.append(temps)
        rain_pop.append(rain)
        humidity_list.append(humidity)
        score_list.append((35 - np.mean(temps)) * 2 + np.mean(rain) + np.mean(humidity) / 2)

    # Object Series keep one list per cell and also work for an empty frame.
    df_new['day_plus'] = pd.Series(days_list, index=df_new.index, dtype=object)
    df_new['felt_temperature'] = pd.Series(temps_list, index=df_new.index, dtype=object)
    df_new['rain_chances'] = pd.Series(rain_pop, index=df_new.index, dtype=object)
    df_new['humidity'] = pd.Series(humidity_list, index=df_new.index, dtype=object)
    df_new['score'] = pd.Series(score_list, index=df_new.index, dtype=float)

    return df_new
    

In [25]:
# Add the forecast lists and the weather score to every city, then preview
# the first 20 rows.
df_full = get_weather(df_gps)
df_full.head(20)

Unnamed: 0,id,city,lat,lon,day_plus,felt_temperature,rain_chances,humidity,score
0,0,Mont Saint Michel,48.6359541,-1.511459954959514,"[1, 2, 3, 4, 5, 6, 7]","[17.48, 18.47, 15.31, 10.6, 10.91, 7.45, 11.52]","[56.99999999999999, 0, 0, 38.0, 79.0, 82.0, 87.0]","[72, 60, 50, 80, 68, 85, 72]",127.574286
1,1,St Malo,48.649518,-2.0260409,"[1, 2, 3, 4, 5, 6, 7]","[14.66, 15.23, 14.15, 10.06, 9.83, 6.6, 10.4]","[24.0, 0, 0, 38.0, 62.0, 98.0, 83.0]","[75, 75, 58, 72, 70, 84, 75]",126.805714
2,2,Bayeux,49.2764624,-0.7024738,"[1, 2, 3, 4, 5, 6, 7]","[18.04, 16.27, 14.39, 10.03, 10.31, 10.82, 9.25]","[49.0, 0, 0, 55.00000000000001, 68.0, 38.0, 53.0]","[69, 67, 51, 90, 70, 67, 79]",117.325714
3,3,Le Havre,49.4938975,0.1079732,"[1, 2, 3, 4, 5, 6, 7]","[14.82, 14.48, 12.82, 10.52, 9.46, 10.44, 10.4]","[13.0, 0, 0, 72.0, 47.0, 20.0, 56.00000000000001]","[75, 70, 53, 93, 72, 58, 73]",111.302857
4,4,Rouen,49.4404591,1.0939658,"[1, 2, 3, 4, 5, 6, 7]","[17.96, 16.5, 14.2, 15.56, 9.97, 12.64, 12.84]","[3.0, 0, 0, 65.0, 82.0, 28.000000000000004, 65.0]","[60, 60, 49, 56, 75, 51, 53]",105.094286
5,5,Paris,48.8588897,2.3200410217200766,"[1, 2, 3, 4, 5, 6, 7]","[18.24, 16.73, 15.48, 16.52, 12.97, 13.87, 14.95]","[10.0, 0, 0, 0, 70.0, 2.0, 38.0]","[51, 45, 31, 29, 55, 38, 40]",76.711429
6,6,Amiens,49.8941708,2.2956951,"[1, 2, 3, 4, 5, 6, 7]","[16.55, 15.44, 13.97, 14.79, 9.75, 11.63, 13.74]","[0, 0, 0, 0, 89.0, 57.99999999999999, 26.0]","[63, 54, 46, 44, 87, 61, 51]",96.322857
7,7,Lille,50.6365654,3.0635282,"[1, 2, 3, 4, 5, 6, 7]","[15.74, 15.23, 14.64, 15.38, 11.84, 11.22, 14.22]","[0, 0, 0, 0, 57.99999999999999, 65.0, 0]","[54, 47, 35, 36, 62, 63, 47]",84.065714
8,8,Strasbourg,48.584614,7.7507127,"[1, 2, 3, 4, 5, 6, 7]","[17.33, 12.93, 13.76, 13.2, 13.99, 9.14, 14.21]","[7.000000000000001, 0, 0, 0, 0, 95.0, 37.0]","[59, 50, 35, 41, 50, 91, 58]",90.268571
9,9,Chateau du Haut Koenigsbourg,48.2495226,7.3454923,"[1, 2, 3, 4, 5, 6, 7]","[14.93, 10.15, 9.98, 9.56, 11.31, 6.25, 11.04]","[14.000000000000002, 0, 0, 0, 0, 100, 49.0]","[59, 51, 36, 40, 48, 89, 54]",99.294286


In [26]:
def df_map(df, top_n=5):
    """Build the long-format table used for the weather map.

    Keeps the ``top_n`` rows of ``df`` with the lowest ``score``, then expands
    the list-valued forecast columns so that every (city, day) pair becomes
    one row, and converts coordinates and forecast values to numeric dtypes.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``score``, ``lat``, ``lon`` and the list columns ``day_plus``,
        ``felt_temperature``, ``rain_chances`` and ``humidity``.
    top_n : int, default 5
        Number of lowest-score rows to keep.

    Returns
    -------
    pandas.DataFrame
        One row per kept city and forecast day; the index repeats the rank
        (0 = lowest score) of the city.
    """
    list_columns = ['day_plus', 'felt_temperature', 'rain_chances', 'humidity']

    # Work on the frame that was passed in, not on a notebook global.
    # Exploding several columns at once needs pandas >= 1.3.
    df_sorted = (
        df.sort_values('score')
        .head(top_n)
        .reset_index(drop=True)
        .explode(list_columns)
    )

    # Exploded columns come out as object dtype, and lat/lon arrive as strings.
    for column in ['lat', 'lon'] + list_columns:
        df_sorted[column] = pd.to_numeric(df_sorted[column])
    return df_sorted

# Long-format forecast table (one row per city and day) used by the map below.
df_plotly = df_map(df_full)

df_plotly

Unnamed: 0,id,city,lat,lon,day_plus,felt_temperature,rain_chances,humidity,score
0,5,Paris,48.85889,2.320041,1,18.24,10.0,51,76.711429
0,5,Paris,48.85889,2.320041,2,16.73,0.0,45,76.711429
0,5,Paris,48.85889,2.320041,3,15.48,0.0,31,76.711429
0,5,Paris,48.85889,2.320041,4,16.52,0.0,29,76.711429
0,5,Paris,48.85889,2.320041,5,12.97,70.0,55,76.711429
0,5,Paris,48.85889,2.320041,6,13.87,2.0,38,76.711429
0,5,Paris,48.85889,2.320041,7,14.95,38.0,40,76.711429
1,24,Nimes,43.837425,4.360069,1,23.74,2.0,38,80.188571
1,24,Nimes,43.837425,4.360069,2,22.68,0.0,34,80.188571
1,24,Nimes,43.837425,4.360069,3,20.39,0.0,36,80.188571


In [27]:
# Read the Mapbox token through a context manager so the file handle is closed.
with open(".mapbox_token") as token_file:
    px.set_mapbox_access_token(token_file.read().strip())

# Animated map: one frame per forecast day, marker colour = felt temperature,
# marker size = humidity.
fig = px.scatter_mapbox(
    df_plotly.sort_values('day_plus'),
    lat='lat',
    lon='lon',
    color='felt_temperature',
    size='humidity',
    color_continuous_scale=px.colors.sequential.Bluered,
    size_max=30,
    zoom=4.7,
    hover_name='city',
    # Show only the weather values in the hover box.
    hover_data={
        'lat': False,
        'lon': False,
        'day_plus': False,
        'rain_chances': True,
        'humidity': True,
        'felt_temperature': True
        },
    animation_frame='day_plus'
)

fig.update_layout(width = 1300, height = 800, template='plotly_dark' ,title='Weather')
fig.show()