# 0. Imports

In [1]:
from bs4 import BeautifulSoup

import requests

import pandas as pd
import numpy as np

from time import sleep

from selenium import webdriver 
from webdriver_manager.chrome import ChromeDriverManager  
from selenium.webdriver.common.keys import Keys  
from selenium.webdriver.support.ui import Select 
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException 

from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="my-geopy-app")
import random
import re
import datetime
import json
import math

# import suppor functions
import sys 
sys.path.append("..")
from src.data_extraction_support import create_country_airport_code_df, request_flight_itineraries, create_itineraries_dataframe
from src.data_extraction_support import extract_all_accommodations, extract_all_activities

# 1. Conditions for travel itinerary planning

For this project, our clients requested that we organise and plan their holiday, including flight, accommodation and activities suggestions according to the following restrictions:
- Holiday dates must be from Friday 8th to Sunday 17th November. 
- Flights should be direct from Madrid to Paris, on the 8th November, to Berlin on the 13th and back to Madrid on the 17th.
- Accommodation must be under 100 euros per night for a couple
- Total activity budget is X
- Activities should be about X, Y and Z
- Activities suggested should not overlap

# 2. Extracting information

## 2.1 Flights

### 2.1.1 Air scrapper - API

In [2]:
list_of_countries = ["france","spain", "germany", "thailand","bali","philippines","china",
                     "australia","italy","russia","romania","united states","argentina","mexico","brasil",
                     "portugal","austria","belgium","cuba","colombia","morocco","south africa","madagascar","new zealand"]



NOTE: The below two lines are commented in order to avoid their execution when running the notebook, as that execution's result is stored in the 'countries_airports.csv' file.

In [3]:
# lines commented to not execute them when running the notebook
# countries_airports = create_country_airport_code_df(list_of_countries)

# countries_airports.to_csv("../data/airport_codes/countries_airports.csv")


In [4]:
countries_airports = pd.read_csv("../data/airport_codes/countries_airports.csv")

countries_airports

Unnamed: 0.1,Unnamed: 0,country,city,city_entityId,skyId,entityId,airport_name
0,0,france,Paris,27539733,CDG,95565041,Paris Charles de Gaulle
1,1,france,Paris,27539733,ORY,95565040,Paris Orly
2,2,france,Paris,27539733,BVA,95566278,Paris Beauvais
3,3,france,Paris,27539733,LBG,129053609,Paris Le Bourget
4,4,france,Fort De France,27541003,FDF,99539664,Fort De France
...,...,...,...,...,...,...,...
137,137,new zealand,Queenstown,27537522,ZQN,104120283,Queenstown
138,138,new zealand,Wellington,27536222,WLG,128668499,Wellington
139,139,new zealand,Nelson,27545157,NSN,128667093,Nelson
140,140,new zealand,Dunedin,27540825,DUD,128668336,Dunedin


Information needed for extraction from each flight:
- Duration
- Price
- Stops
- Departure
- Arrival
- Company
- Self_transfer
- Fare_policy columns: 'isChangeAllowed', 'isPartiallyChangeable', 'isCancellationAllowed', 'isPartiallyRefundable'
- Score
- Luggage price (optional)
- Origin airport
- Destination airport


In [5]:
origin_city = "madrid"
n_adults = 2

destination_city1 = "paris"
checkin_city1="2024-11-08"
checkout_city1="2024-11-13"

destination_city2= "berlin"
checkin_city2="2024-11-13"
checkout_city2="2024-11-17"

In [6]:
# itineraries_dict_list_city1 = request_flight_itineraries(countries_airports, origin_city,destination_city1, n_adults=2, date=checkin_city1)
# itineraries_dict_list_city2 = request_flight_itineraries(countries_airports, origin_city,destination_city2, n_adults=2, date=checkin_city2)

In [7]:
# flight_itinerary_df1 = create_itineraries_dataframe(itineraries_dict_list_city1)
# flight_itinerary_df2 = create_itineraries_dataframe(itineraries_dict_list_city2)

NOTE: The below two lines are commented in order to avoid their execution when running the notebook, as that execution's result is stored in the 'countries_airports.csv' file.

In [8]:
# flight_itinerary_df1.to_csv("../data/flights/flight_itinieraries1.csv")
# flight_itinerary_df2.to_csv("../data/flights/flight_itinieraries2.csv")

In [9]:
flight_itineraries1 = pd.read_csv("../data/flights/flight_itinieraries1.csv",index_col=0)
flight_itineraries2 = pd.read_csv("../data/flights/flight_itinieraries1.csv",index_col=0)

In [10]:
flight_itineraries1.head()

Unnamed: 0,duration,price,price_currency,stops,departure,arrival,company,self_transfer,fare_isChangeAllowed,fare_isPartiallyChangeable,fare_isCancellationAllowed,fare_isPartiallyRefundable,score,origin_airport,destination_airport
0,130,225,€,0,2024-11-08 13:20:00,2024-11-08 15:30:00,transavia,False,False,False,False,False,0.999,Madrid,Paris Orly
1,130,255,€,0,2024-11-08 09:00:00,2024-11-08 11:10:00,transavia,False,False,False,False,False,0.841351,Madrid,Paris Orly
2,115,278,€,0,2024-11-08 16:30:00,2024-11-08 18:25:00,Air Europa,False,False,False,False,False,0.66002,Madrid,Paris Orly
3,115,365,€,0,2024-11-08 08:00:00,2024-11-08 09:55:00,Air Europa,False,False,False,False,False,0.576296,Madrid,Paris Orly
4,115,322,€,0,2024-11-08 14:25:00,2024-11-08 16:20:00,Air Europa,False,False,False,False,False,0.547622,Madrid,Paris Orly


In [11]:
flight_itineraries2.head()

Unnamed: 0,duration,price,price_currency,stops,departure,arrival,company,self_transfer,fare_isChangeAllowed,fare_isPartiallyChangeable,fare_isCancellationAllowed,fare_isPartiallyRefundable,score,origin_airport,destination_airport
0,130,225,€,0,2024-11-08 13:20:00,2024-11-08 15:30:00,transavia,False,False,False,False,False,0.999,Madrid,Paris Orly
1,130,255,€,0,2024-11-08 09:00:00,2024-11-08 11:10:00,transavia,False,False,False,False,False,0.841351,Madrid,Paris Orly
2,115,278,€,0,2024-11-08 16:30:00,2024-11-08 18:25:00,Air Europa,False,False,False,False,False,0.66002,Madrid,Paris Orly
3,115,365,€,0,2024-11-08 08:00:00,2024-11-08 09:55:00,Air Europa,False,False,False,False,False,0.576296,Madrid,Paris Orly
4,115,322,€,0,2024-11-08 14:25:00,2024-11-08 16:20:00,Air Europa,False,False,False,False,False,0.547622,Madrid,Paris Orly


## 2.2 Accommodations

In [12]:
n_adults=2
n_children=0
n_rooms=1
star_ratings=[3,4,5] 
meal_plan="breakfast"
review_score= [70,80,90]
max_distance_meters=3000 

#### 2.2.1 First-city: Paris

In [13]:
destination_city1 = "paris"
checkin_city1="2024-11-08"
checkout_city1="2024-11-13"
max_price_night1=500

# acommodations_city1 = extract_all_accommodations(destination=destination_city1, checkin=checkin_city1, checkout=checkout_city1, adults= n_adults, children= n_children,
#                            rooms= n_rooms, max_price= max_price_night1, star_ratings= star_ratings, 
#                            meal_plan= meal_plan, review_score= review_score, max_distance_meters= max_distance_meters, verbose=False)

#### 2.2.2 Second city: Berlin

In [14]:
destination_city2= "berlin"
checkin_city2="2024-11-13"
checkout_city2="2024-11-17"
max_price_night2=500

# acommodations_city2 = extract_all_accommodations(destination=destination_city2, checkin=checkin_city2, checkout=checkout_city2, adults= n_adults, children= n_children,
#                            rooms= n_rooms, max_price= max_price_night2, star_ratings= star_ratings, 
#                            meal_plan= meal_plan, review_score= review_score, max_distance_meters= max_distance_meters, verbose=False)

NOTE: The below two lines are commented in order to avoid their execution when running the notebook, as that execution's result is stored in the 'countries_airports.csv' file.

In [15]:
# acommodations_city1.to_csv("../data/accommodations/acommodations_city1.csv")
# acommodations_city2.to_csv("../data/accommodations/acommodations_city2.csv")

In [16]:
acommodations_city1 = pd.read_csv("../data/accommodations/acommodations_city1.csv",index_col=0)
acommodations_city2 = pd.read_csv("../data/accommodations/acommodations_city2.csv",index_col=0)

In [17]:
acommodations_city1.head(5)

Unnamed: 0,name,url,price_currency,total_price_amount,distance_city_center_km,score,n_comments,close_to_metro,sustainability_cert,room_type,double_bed,single_bed,free_cancellation,breakfast_included,pay_at_hotel,location_score,free_taxi
0,Five Boutique Hotel Paris Quartier Latin,https://www.booking.com/hotel/fr/the-five.es.h...,€,1175,2.1,8.3,169,Yes,No,Habitación Doble Superior,Yes,No,Yes,Yes,No,,No
1,Sonder Le Frochot,https://www.booking.com/hotel/fr/opera-frochot...,€,1221,2.9,7.6,787,Yes,No,Habitación con cama grande,Yes,No,Yes,Yes,No,,No
2,Appart'City Collection Paris Gare de Lyon,https://www.booking.com/hotel/fr/appart-city-c...,€,896,2.9,9.0,1237,Yes,No,Estudio Doble,Yes,No,Yes,Yes,Yes,,No
3,Le Patio Bastille,https://www.booking.com/hotel/fr/le-patio-st-a...,€,958,3.0,8.3,3037,Yes,No,Habitación Clásica,Yes,Yes,Yes,Yes,No,,No
4,Austin's Saint Lazare Hotel,https://www.booking.com/hotel/fr/austin-s-sain...,€,959,2.9,8.1,3281,Yes,No,Habitación Doble,Yes,No,Yes,Yes,No,,No


In [18]:
acommodations_city2.head(5)

Unnamed: 0,name,url,price_currency,total_price_amount,distance_city_center_km,score,n_comments,close_to_metro,sustainability_cert,room_type,double_bed,single_bed,free_cancellation,breakfast_included,pay_at_hotel,location_score,free_taxi
0,"InterContinental Berlin, an IHG Hotel",https://www.booking.com/hotel/de/interconti-be...,€,644,2.4,8.8,3828,Yes,No,Habitación Clásica,Yes,Yes,No,No,No,,No
1,"Holiday Inn Express - Berlin - Alexanderplatz,...",https://www.booking.com/hotel/de/holiday-inn-e...,€,687,2.4,8.4,10416,Yes,No,Habitación Estándar,Yes,Yes,Yes,Yes,Yes,,No
2,Hotel Gat Point Charlie,https://www.booking.com/hotel/de/gat-point-cha...,€,676,1.1,8.2,7993,Yes,Yes,Habitación Doble Grande,Yes,Yes,Yes,Yes,Yes,9.4,No
3,"Hotel Berlin, Berlin, a member of Radisson Ind...",https://www.booking.com/hotel/de/bbberlin.es.h...,€,610,2.1,8.0,17744,Yes,Yes,Habitación Estándar,No,Yes,No,No,No,,No
4,Berlin Marriott Hotel,https://www.booking.com/hotel/de/marriott-am-p...,€,1015,0.6,8.6,2826,Yes,Yes,Habitación Deluxe - 1 cama extragrande,Yes,No,Yes,Yes,Yes,9.5,No


## 2.3 Activities

As a first option, the range of activities proposed to our clients will come from the Civitatis catalog. If needed, more catalogs will be built on top of it to make it more compelling.

### 2.3.1 Civitatis - scraping


As for the information to extract from civitatis, the requirements are:
- Total activity budget is X
- Activities should be about X, Y and Z
- Activities suggested should not overlap

Therefore, the fields to be extracted should ideally be, at least:
- Date [X]
- Time [X]
- Address [X]
- Duration [X]
- Price [X]
- Name [X]
- Description [X]
- Category [X]
- Image [X]
- URL [X]
- Score [X]
- Location Score [X]
- Reviews (Optional)
- Language (Optional)

The checkin and checkout for activity dates must account for a minimum of 1 day of travel, therefore they are added 1 day on arrival and substracted 1 day on departure. In this way, it allows for a minimum of rest/journey time.

In [19]:
activities_checkin1 = (pd.to_datetime(checkin_city1) + datetime.timedelta(days=1)).strftime("%Y-%m-%d")
activities_checkout1 = (pd.to_datetime(checkout_city1) - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
activities_checkin2 = (pd.to_datetime(checkin_city2) + datetime.timedelta(days=1)).strftime("%Y-%m-%d")
activities_checkout2 = (pd.to_datetime(checkout_city2) - datetime.timedelta(days=1)).strftime("%Y-%m-%d")

In [20]:
# civitatis_activities_city1 = extract_all_activities(destination_city1,activities_checkin1,activities_checkout1)
# civitatis_activities_city2 = extract_all_activities(destination_city2,activities_checkin2,activities_checkout2)

In [21]:
# civitatis_activities_city1.to_csv("../data/activities/civitatis_activities_city1.csv")
# civitatis_activities_city2.to_csv("../data/activities/civitatis_activities_city2.csv")

In [22]:
civitatis_activities_city1 = pd.read_csv("../data/activities/civitatis_activities_city1.csv",index_col=0)
civitatis_activities_city2 = pd.read_csv("../data/activities/civitatis_activities_city2.csv",index_col=0)

In [23]:
civitatis_activities_city1.head()

Unnamed: 0,activity_name,description,url,image,image2,available_days,available_times,duration,latitude,longitude,address,price,currency,category
0,Free tour por el Barrio Latino,En este free tour por el Barrio Latino descubr...,www.civitatis.com/es/paris/visita-guiada-barri...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/francia/paris/visita-guiad...,"['10', '11', '09', '12']","[['10:30', '15:30'], ['10:30', '15:30'], ['10:...",2h 15m -2h 30m,48.853209,2.343522,"9, Place Saint-Michel, Quartier de la Monnaie,...",2.5,EUR,Español
1,Tour privado por París,"Montmartre, Notre Dame o los Campos Elíseos so...",www.civitatis.com/es/paris/tour-privado-paris/,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/francia/paris/tour-privado...,"['11', '12', '09', '10']","[['9:00', '10:00', '14:00', '16:00'], ['9:00',...",2 -4h,0.0,0.0,,34.0,EUR,Español
2,Entrada a la Sainte-Chapelle y Conciergerie,Con esta entrada combinada a la Conciergerie y...,www.civitatis.com/es/paris/entrada-conciergeri...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/francia/paris/entrada-conc...,"['11', '12', '10', '09']","[[], [], [], []]",Entradas,48.855639,2.345388,"Cour du Mai, Quartier Saint-Germain-l'Auxerroi...",5.15,EUR,
3,"Cena en Madame Brasserie, el restaurante de la...","Si buscáis una velada inolvidable, esta cena e...",www.civitatis.com/es/paris/cena-restaurante-to...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/francia/paris/cena-restaur...,"['12', '10', '11', '09']","[['21:00'], ['21:00'], ['21:00'], ['21:00']]",2 horas,48.858537,2.293964,"Tour Eiffel, Quai Jacques Chirac, Quartier du ...",11.77,EUR,Gastronomía y enoturismo
4,Excursión a los Castillos del Loira,Si estáis buscando una preciosa excursión desd...,www.civitatis.com/es/paris/excursion-castillos...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/francia/paris/excursion-ca...,['12'],[['7:00']],12h 30m,48.831234,2.387336,"15, Avenue des Terroirs de France, Quartier de...",37.0,EUR,Español


In [24]:
civitatis_activities_city2.head()

Unnamed: 0,activity_name,description,url,image,image2,available_days,available_times,duration,latitude,longitude,address,price,currency,category
0,Free tour de los misterios y leyendas de Berlín,Los secretos de la capital alemana son numeros...,www.civitatis.com/es/berlin/free-tour-misterio...,www.civitatis.com/f/alemania/berlin/free-tour-...,,"['14', '16']","[['10:00'], ['10:00']]",3 horas,52.520625,13.407143,"St. Marienkirche, 8, Karl-Liebknecht-Straße, S...",2.2,EUR,Español
1,Entrada al Icebar Berlín,Si queréis saber cómo se vive dentro de un igl...,www.civitatis.com/es/berlin/entrada-icebar-ber...,www.civitatis.com/f/alemania/berlin/entrada-ic...,,"['16', '14', '15']","[[], [], []]",45 minutos,52.521041,13.404021,"2, Spandauer Straße, Spandauer Vorstadt, Mitte...",5.0,EUR,Entradas
2,Visita guiada por la Isla de los Museos,Acompañadnos en esta visita guiada por la Isla...,www.civitatis.com/es/berlin/visita-guiada-isla...,www.civitatis.com/f/alemania/berlin/visita-gui...,,"['14', '16']","[['10:00'], ['10:00']]",3 horas,52.517258,13.394698,"Unter den Linden, Friedrichswerder, Mitte, Ber...",4.0,EUR,Español
3,Tour en bicicleta por Berlín,En este tour en bicicleta por Berlín practicar...,www.civitatis.com/es/berlin/berlin-bicicleta/,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/alemania/berlin/berlin-bic...,['16'],[['11:00']],3h 30m,52.540055,13.413424,"Kulturbrauerei, 36, Schönhauser Allee, Bremer ...",6.8,EUR,Inglés
4,"Autobús turístico de Berlín, City Sightseeing",El autobús turístico recorre Berlín realizando...,www.civitatis.com/es/berlin/autobus-turistico-...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/alemania/berlin/autobus-tu...,"['15', '16', '14']","[['0:00'], ['0:00'], ['0:00']]",1 -2d,0.0,0.0,,9.6,EUR,Español
