# Kayak Project

## Imports

In [1]:
import pandas as pd
import plotly.express as px
import requests
from dotenv import load_dotenv
import os
import logging
import scrapy
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup
import requests
import time

# Load variables from a local .env file into the process environment.
load_dotenv()
# NOTE(review): the two magics below appear to load the same .env file a second
# time through the IPython extension — confirm one of the two mechanisms can go.
%load_ext dotenv
%dotenv

# API key read from the APIKEY environment variable; os.getenv returns None when
# the variable is not set.
key = os.getenv('APIKEY')

In [2]:
# Destinations covered by the project. Names are written without accents; the
# list order is reused later to order the scraped results and the geocoding calls.
cities = ["Mont Saint Michel", "St Malo", "Bayeux", "Le Havre", "Rouen", "Paris", "Amiens", "Lille", "Strasbourg", "Chateau du Haut Koenigsbourg", "Colmar", "Eguisheim", "Besancon", "Dijon", "Annecy", "Grenoble", "Lyon", "Gorges du Verdon", "Bormes les Mimosas", "Cassis", "Marseille", "Aix en Provence", "Avignon", "Uzes", "Nimes", "Aigues Mortes", "Saintes Maries de la mer", "Collioure", "Carcassonne", "Ariege", "Toulouse", "Montauban", "Biarritz", "Bayonne", "La Rochelle"]

In [3]:
# City lookup table: one row per destination, with the positional index exposed
# as an explicit ``id`` column placed before ``city``.
df_cities = (
    pd.DataFrame({"city": cities})
    .reset_index()
    .rename(columns={"index": "id"})
)
df_cities

Unnamed: 0,id,city
0,0,Mont Saint Michel
1,1,St Malo
2,2,Bayeux
3,3,Le Havre
4,4,Rouen
5,5,Paris
6,6,Amiens
7,7,Lille
8,8,Strasbourg
9,9,Chateau du Haut Koenigsbourg


## Scraping Booking.com

In [4]:
df_booking = df_cities.copy(deep=True)


class BookingSpider(scrapy.Spider):
    """Run one Booking.com search per city and collect hotel names and URLs.

    The crawl starts on the French landing page, submits the search form once
    for every entry of ``cities`` and yields one item per matched result
    container. Each item holds the searched location plus the lists of hotel
    names and hotel URLs found inside that container.
    """

    name = "Booking_data"
    cities = df_booking["city"]
    start_urls = ['https://www.booking.com/index.fr.html']

    def parse(self, response):
        """Submit the landing-page search form once per city."""
        # Go through ``self``: a bare ``cities`` inside a method skips the class
        # scope and resolves to whatever module-level ``cities`` happens to exist.
        for city in self.cities:
            yield scrapy.FormRequest.from_response(
                response,
                formdata={'ss': city},
                callback=self.after_search,
            )

    def after_search(self, response):
        """Extract hotels from a result page and follow pagination if present."""
        # The searched term is read back from the ``ss`` query parameter of the
        # result URL, so it is still URL-encoded at this point.
        location = response.url.split("ss=")[-1].split("&")[0]

        for data in response.css('.d4924c9e74'):
            yield {
                'location': location,
                'name': data.css('a div.fcab3ed991.a23c043802::text').getall(),
                'url': data.css('h3.a4225678b2 a::attr(href)').getall(),
            }

        try:
            next_page = response.css('a.paging-next').attrib["href"]
        except KeyError:
            # No matching link (or no href on it): nothing more to crawl here.
            logging.info('No next page. Terminating crawling process.')
        else:
            yield response.follow(next_page, callback=self.after_search)

In [5]:
filename = "cities.json"

# Remove the export left by a previous run (presumably so the feed exporter
# does not add new items to an existing file — confirm against Scrapy's FEEDS
# behaviour).
if filename in os.listdir():
    os.remove(filename)

crawler_settings = {
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0',
    'LOG_LEVEL': logging.INFO,
    # Export every yielded item to ``filename`` as JSON.
    "FEEDS": {filename: {"format": "json"}},
    "AUTOTHROTTLE_ENABLED": True,
}

process = CrawlerProcess(settings=crawler_settings)
process.crawl(BookingSpider)
# Starts the crawl and returns once it has finished.
process.start()

2022-04-13 21:39:48 [scrapy.utils.log] INFO: Scrapy 2.6.1 started (bot: scrapybot)
2022-04-13 21:39:48 [scrapy.utils.log] INFO: Versions: lxml 4.8.0.0, libxml2 2.9.4, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 22.2.0, Python 3.9.10 | packaged by conda-forge | (main, Feb  1 2022, 21:27:43) - [Clang 11.1.0 ], pyOpenSSL 22.0.0 (OpenSSL 1.1.1n  15 Mar 2022), cryptography 36.0.1, Platform macOS-12.3.1-arm64-arm-64bit
2022-04-13 21:39:48 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'LOG_LEVEL': 20,
 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) '
               'Gecko/20100101 Firefox/92.0'}
2022-04-13 21:39:48 [scrapy.extensions.telnet] INFO: Telnet Password: b3178f7049a58bbe
2022-04-13 21:39:48 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions

In [None]:
# Load the items exported by the spider into a DataFrame and preview the first rows.
df = pd.read_json("cities.json")
df.head()

In [None]:
# Keep at most the first 20 hotels (names and URLs) of every scraped row.
# Whole-column assignment replaces the element-wise ``df["name"][i] = ...``
# writes: that form is chained assignment, which pandas warns about and which
# does not update ``df`` at all when Copy-on-Write is enabled.
df["name"] = df["name"].map(lambda names: names[0:20])
df["url"] = df["url"].map(lambda urls: urls[0:20])

In [None]:
# Sanity check: display any rows whose url field is missing.
df[df['url'].isna()]

In [None]:
navigator = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)'


def _fetch_hotel_soup(url, attempts=2, timeout=30):
    """Download a hotel page and return it parsed with BeautifulSoup.

    Network errors are retried until ``attempts`` requests have been made; the
    last error is re-raised. ``timeout`` (seconds) keeps a stalled connection
    from blocking the notebook forever.
    """
    for attempt in range(1, attempts + 1):
        try:
            page = requests.get(url, headers={'User-Agent': navigator}, timeout=timeout)
            return BeautifulSoup(page.text, 'html.parser')
        except requests.RequestException:
            if attempt == attempts:
                raise
            # Short pause before trying again.
            time.sleep(1.4)


def _scrape_hotel(url):
    """Return lat, lon, description and score for one hotel page.

    Coordinates and description are ``None`` when the page lacks the expected
    element, so one unusual page does not abort the whole run. A missing score
    becomes the string "0.0" because the map needs a value for every hotel.
    """
    soup = _fetch_hotel_soup(url)

    address_link = soup.select_one('p.address.address_clean a')
    latlng = address_link.get("data-atlas-latlng") if address_link is not None else None
    # The attribute is read as "<lat>,<lon>".
    parts = latlng.split(",") if latlng else []

    description = soup.select_one('div#property_description_content')
    score = soup.select_one('div.b5cd09854e.d10a6220b4')

    return {
        "lat": parts[0] if len(parts) > 0 else None,
        "lon": parts[1] if len(parts) > 1 else None,
        "description": description.get_text() if description is not None else None,
        "score": score.get_text() if score is not None else "0.0",
    }


# One list of hotel records per row of ``df``, in row order.
hotel_details = []

for row_number, hotel_urls in enumerate(df["url"]):
    row_details = []
    for hotel_url in hotel_urls:
        row_details.append(_scrape_hotel(hotel_url))
        # Pause between hotel pages to keep the request rate low.
        time.sleep(1.4)
    hotel_details.append(row_details)

    print (f"city {df['location'].iloc[row_number]} done")

# Assign whole columns in one go instead of writing lists cell by cell through
# chained indexing into columns that were initialised with scalars.
for field in ("lat", "lon", "description", "score"):
    df[field] = pd.Series(
        [[hotel[field] for hotel in row] for row in hotel_details],
        index=df.index,
        dtype=object,
    )

In [None]:
# The location was read from the search URL, where spaces appear as "+".
# ``regex=False`` makes the literal replacement explicit: the default of the
# ``regex`` argument differs between pandas versions and "+" is a regex
# metacharacter.
# Rows are then put in the order of ``cities``, and the resulting position is
# exposed as an ``id`` column. Writing it as one chain avoids the in-place
# steps of the original cell.
df = (
    df.assign(location=df["location"].str.replace("+", " ", regex=False))
    .set_index("location")
    .reindex(cities)
    .reset_index()
    .reset_index()
    .rename(columns={'index': 'id'})
)
df.head(2)

In [None]:
df2 = df.copy()

# Unpack the per-row lists into one entry per hotel, one Series per field.
df_name, df_url, df_lat, df_lon, df_description, df_score = (
    df2[column].explode()
    for column in ("name", "url", "lat", "lon", "description", "score")
)

In [None]:
# Place the exploded Series side by side as the columns of a single frame.
df_final = pd.concat([df_name, df_url, df_lat, df_lon, df_description, df_score], axis=1)
# Preview only: the reset_index() result is displayed, df_final itself is unchanged.
df_final.reset_index().head()

In [None]:
# Turn decimal commas into decimal points in the score strings.
# The result is assigned back to the column: ``inplace=True`` on a column
# selection is chained assignment and leaves ``df_final`` untouched when
# pandas Copy-on-Write is active.
df_final["score"] = df_final["score"].replace(",", ".", regex=True)

In [None]:
# Inspect column dtypes and non-null counts of the combined hotel frame.
df_final.info()

In [None]:
# Read the Mapbox token through a context manager so the file handle is closed,
# and strip surrounding whitespace such as a trailing newline.
with open(".mapbox_token") as token_file:
    px.set_mapbox_access_token(token_file.read().strip())

# Map of the scraped hotels: marker colour and size both follow the score.
# NOTE(review): 'lat', 'lon' and 'score' are filled from scraped text, so they
# look like strings at this point — confirm plotly receives numeric values.
fig = px.scatter_mapbox(
    df_final.sort_values('name'),
    lat='lat',
    lon='lon',
    color='score',
    size='score',
    color_continuous_scale=px.colors.diverging.Picnic,
    size_max=30,
    zoom=5,
    hover_name='name'
)

# The title previously read 'Weather', copied from the weather map below.
fig.update_layout(width = 1300, height = 800, template='plotly_dark' ,title='Hotels')
fig.show()

## Getting API Data

### GPS Coordinates

In [4]:
def _geocode(field, value):
    """Return ``(lat, lon)`` of the first Nominatim match for ``field=value``.

    ``field`` is the Nominatim search parameter to use ("city" or "county").
    The query string is built by ``requests`` so the value is encoded properly.

    Raises:
        IndexError: when Nominatim returns no match for the query.
    """
    response = requests.get(
        "https://nominatim.openstreetmap.org/search",
        params={field: value, "format": "json"},
        timeout=30,
    )
    response.raise_for_status()
    matches = response.json()
    if not matches:
        raise IndexError(f"Nominatim returned no result for {field}={value!r}")
    return matches[0]['lat'], matches[0]['lon']


def get_gps(df):
    """Return a copy of ``df`` with ``lat`` and ``lon`` columns added.

    Every value of ``df['city']`` is looked up with the Nominatim search API.
    Coordinates are stored exactly as returned by the API (strings).

    Args:
        df: DataFrame with a ``city`` column. It is not modified.

    Returns:
        A deep copy of ``df`` with the two coordinate columns appended.
    """
    # Names that need a different query: field and value to search instead.
    query_overrides = {
        "Gorges du Verdon": ("city", "La Palud-sur-Verdon"),
        "Ariege": ("county", "Ariege"),
    }

    df_new = df.copy(deep=True)

    # Iterate over the frame that was passed in rather than a module-level
    # list, so the function works for any input frame.
    coordinates = [
        _geocode(*query_overrides.get(city, ("city", city)))
        for city in df_new['city']
    ]

    df_new['lat'] = [lat for lat, _ in coordinates]
    df_new['lon'] = [lon for _, lon in coordinates]
    return df_new

In [5]:
# Geocode every city of the lookup table; the result adds 'lat' and 'lon' columns.
df_gps = get_gps(df_cities)
df_gps

Unnamed: 0,id,city,lat,lon
0,0,Mont Saint Michel,48.6359541,-1.511459954959514
1,1,St Malo,48.649518,-2.0260409
2,2,Bayeux,49.2764624,-0.7024738
3,3,Le Havre,49.4938975,0.1079732
4,4,Rouen,49.4404591,1.0939658
5,5,Paris,48.8588897,2.3200410217200766
6,6,Amiens,49.8941708,2.2956951
7,7,Lille,50.6365654,3.0635282
8,8,Strasbourg,48.584614,7.7507127
9,9,Chateau du Haut Koenigsbourg,48.2495226,7.3454923


### Weather Data

In [6]:
def get_weather(df, api_key=None):
    """Return one row per location and forecast day with weather figures.

    For every row of ``df`` the OpenWeatherMap One Call endpoint is queried
    with the row's coordinates. The first entry of the daily forecast is
    skipped (presumably the current day — confirm against the API docs); each
    remaining entry becomes one output row.

    Args:
        df: DataFrame with ``id``, ``lat`` and ``lon`` columns. Not modified.
        api_key: OpenWeatherMap key. Defaults to the module-level ``key``.

    Returns:
        DataFrame with the columns of ``df`` (``id`` first) plus ``day_plus``
        (1 for the first kept day), ``felt_temperatures``, ``rain_chances``
        (probability of precipitation times 100) and ``humidity``. ``lat``,
        ``lon`` and the forecast columns are converted to numeric dtypes.

    Raises:
        KeyError: when a response carries no ``daily`` forecast, for example
            when the API answers with an error payload.
    """
    if api_key is None:
        api_key = key

    df_new = df.copy(deep=True)
    day_offsets = []
    temps_list = []
    rain_pop = []
    humidity_list = []

    for row in df.itertuples():
        payload = requests.get(
            "https://api.openweathermap.org/data/2.5/onecall",
            params={"lat": row.lat, "lon": row.lon, "units": "metric", "appid": api_key},
            timeout=30,
        ).json()
        if 'daily' not in payload:
            raise KeyError(
                f"no 'daily' forecast for lat={row.lat}, lon={row.lon}: {payload}"
            )

        forecast = payload['daily'][1:]
        # Number the days from what was actually returned instead of assuming
        # exactly seven entries, so every list of a row has the same length.
        day_offsets.append(list(range(1, len(forecast) + 1)))
        temps_list.append([day['feels_like']['day'] for day in forecast])
        rain_pop.append([day['pop'] * 100 for day in forecast])
        humidity_list.append([day['humidity'] for day in forecast])

    df_new['day_plus'] = day_offsets
    df_new['felt_temperatures'] = temps_list
    df_new['rain_chances'] = rain_pop
    df_new['humidity'] = humidity_list

    forecast_columns = ['day_plus', 'felt_temperatures', 'rain_chances', 'humidity']
    # Explode the list columns together: one output row per (location, day).
    df_new = df_new.set_index('id').explode(forecast_columns).reset_index()

    numeric_columns = ['lat', 'lon'] + forecast_columns
    df_new[numeric_columns] = df_new[numeric_columns].apply(pd.to_numeric)
    return df_new
    

In [7]:
# Fetch the forecast for every geocoded city: one row per city and forecast day.
df_full = get_weather(df_gps)
df_full.head(20)

Unnamed: 0,id,city,lat,lon,day_plus,felt_temperatures,rain_chances,humidity
0,0,Mont Saint Michel,48.635954,-1.51146,1,16.87,63.0,74
1,0,Mont Saint Michel,48.635954,-1.51146,2,18.68,0.0,59
2,0,Mont Saint Michel,48.635954,-1.51146,3,16.02,2.0,53
3,0,Mont Saint Michel,48.635954,-1.51146,4,11.03,28.0,67
4,0,Mont Saint Michel,48.635954,-1.51146,5,10.45,28.0,70
5,0,Mont Saint Michel,48.635954,-1.51146,6,11.44,19.0,66
6,0,Mont Saint Michel,48.635954,-1.51146,7,13.5,82.0,70
7,1,St Malo,48.649518,-2.026041,1,15.62,34.0,73
8,1,St Malo,48.649518,-2.026041,2,15.43,5.0,75
9,1,St Malo,48.649518,-2.026041,3,14.72,13.0,64


In [8]:
# Complement of the rain probability (100 - rain_chances), so that larger
# values mean a lower chance of rain.
df_full['rain_chances_inverted'] = 100 - df_full['rain_chances']
df_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245 entries, 0 to 244
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     245 non-null    int64  
 1   city                   245 non-null    object 
 2   lat                    245 non-null    float64
 3   lon                    245 non-null    float64
 4   day_plus               245 non-null    int64  
 5   felt_temperatures      245 non-null    float64
 6   rain_chances           245 non-null    float64
 7   humidity               245 non-null    int64  
 8   rain_chances_inverted  245 non-null    float64
dtypes: float64(5), int64(3), object(1)
memory usage: 17.4+ KB


In [12]:
# Weather score per city and day. It grows with the rain probability and the
# humidity and shrinks as the felt temperature rises, so smaller values
# correspond to warmer and drier days.
temperature_term = (35 - df_full['felt_temperatures']) * 2
humidity_term = df_full['humidity'] / 2
df_full['score'] = temperature_term + df_full['rain_chances'] + humidity_term
df_full.head(15)

Unnamed: 0,id,city,lat,lon,day_plus,felt_temperatures,rain_chances,humidity,rain_chances_inverted,score
0,0,Mont Saint Michel,48.635954,-1.51146,1,16.87,63.0,74,37.0,136.26
1,0,Mont Saint Michel,48.635954,-1.51146,2,18.68,0.0,59,100.0,62.14
2,0,Mont Saint Michel,48.635954,-1.51146,3,16.02,2.0,53,98.0,66.46
3,0,Mont Saint Michel,48.635954,-1.51146,4,11.03,28.0,67,72.0,109.44
4,0,Mont Saint Michel,48.635954,-1.51146,5,10.45,28.0,70,72.0,112.1
5,0,Mont Saint Michel,48.635954,-1.51146,6,11.44,19.0,66,81.0,99.12
6,0,Mont Saint Michel,48.635954,-1.51146,7,13.5,82.0,70,18.0,160.0
7,1,St Malo,48.649518,-2.026041,1,15.62,34.0,73,66.0,109.26
8,1,St Malo,48.649518,-2.026041,2,15.43,5.0,75,95.0,81.64
9,1,St Malo,48.649518,-2.026041,3,14.72,13.0,64,87.0,85.56


In [9]:
# Read the Mapbox token through a context manager so the file handle is closed,
# and strip surrounding whitespace such as a trailing newline.
with open(".mapbox_token") as token_file:
    px.set_mapbox_access_token(token_file.read().strip())

# Animated forecast map: one frame per forecast day, marker colour follows the
# felt temperature and marker size follows the humidity.
fig = px.scatter_mapbox(
    df_full.sort_values('day_plus'),
    lat='lat',
    lon='lon',
    color='felt_temperatures',
    size='humidity',
    color_continuous_scale=px.colors.diverging.Picnic,
    size_max=30,
    zoom=5,
    hover_name='city',
    animation_frame='day_plus'
)

fig.update_layout(width = 1300, height = 800, template='plotly_dark' ,title='Weather')
fig.show()