# 0. Imports

In [1]:
from bs4 import BeautifulSoup

import requests

import pandas as pd
import numpy as np

from time import sleep

from selenium import webdriver 
from webdriver_manager.chrome import ChromeDriverManager  
from selenium.webdriver.common.keys import Keys  
from selenium.webdriver.support.ui import Select 
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException 

from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="my-geopy-app")
import random
import re
import datetime
import json
import math
from tqdm import tqdm
tqdm.pandas()

# import suppor functions
import sys 
sys.path.append("..")
from src.data_extraction_support import create_country_airport_code_df, request_flight_itineraries, create_itineraries_dataframe
from src.data_extraction_support import extract_all_accommodations, extract_all_activities, create_itineraries_dataframe_aller_retour, request_flight_itineraries_aller_retour

# 1. Conditions for travel itinerary planning

For this project, our clients requested that we organise and plan their holiday, including flight, accommodation and activities suggestions according to the following restrictions:
- Holiday dates must be from Friday 8th to Sunday 17th November. 
- Flights should be direct from Madrid to Paris, on the 8th November, to Berlin on the 13th and back to Madrid on the 17th.
- Accommodation must be under 100 euros per night for a couple
- Total activity budget is X
- Activities should be about X, Y and Z
- Activities suggested should not overlap

# 2. Extracting information

## 2.1 Flights

### 2.1.1 Air scrapper - API

In [2]:
list_of_countries = ["france","spain", "germany", "thailand","bali","philippines","china",
                     "australia","italy","russia","romania","united states","argentina","mexico","brasil",
                     "portugal","austria","belgium","cuba","colombia","morocco","south africa","madagascar","new zealand"]



NOTE: The below two lines are commented in order to avoid their execution when running the notebook, as that execution's result is stored in the 'countries_airports.csv' file.

In [3]:
# lines commented to not execute them when running the notebook
# countries_airports = create_country_airport_code_df(list_of_countries)

# countries_airports.to_csv("../data/airport_codes/countries_airports.csv")


In [4]:
countries_airports = pd.read_csv("../data/airport_codes/countries_airports.csv")

countries_airports

Unnamed: 0.1,Unnamed: 0,country,city,city_entityId,skyId,entityId,airport_name
0,0,france,Paris,27539733,CDG,95565041,Paris Charles de Gaulle
1,1,france,Paris,27539733,ORY,95565040,Paris Orly
2,2,france,Paris,27539733,BVA,95566278,Paris Beauvais
3,3,france,Paris,27539733,LBG,129053609,Paris Le Bourget
4,4,france,Fort De France,27541003,FDF,99539664,Fort De France
...,...,...,...,...,...,...,...
137,137,new zealand,Queenstown,27537522,ZQN,104120283,Queenstown
138,138,new zealand,Wellington,27536222,WLG,128668499,Wellington
139,139,new zealand,Nelson,27545157,NSN,128667093,Nelson
140,140,new zealand,Dunedin,27540825,DUD,128668336,Dunedin


Information needed for extraction from each flight:
- Duration
- Price
- Stops
- Departure
- Arrival
- Company
- Self_transfer
- Fare_policy columns: 'isChangeAllowed', 'isPartiallyChangeable', 'isCancellationAllowed', 'isPartiallyRefundable'
- Score
- Luggage price (optional)
- Origin airport
- Destination airport


In [5]:
origin_city = "madrid"
n_adults = 1

destination_cities = ["paris","berlin","phuket","rome","new york","madeira","havana"]
checkins=["2024-11-04","2024-11-11","2024-11-18"]
checkouts=["2024-11-11","2024-11-18","2024-11-25"]


In [6]:
# flight_itineraries_df = pd.DataFrame()
# for destination_city in tqdm(destination_cities):
#     for e, (checkin, checkout) in enumerate(zip(checkins,checkouts)):
#         itineraries_dict_list = request_flight_itineraries_aller_retour(countries_airports, origin_city,destination_city, n_adults=n_adults, date_departure=checkin,date_return=checkout)
#         flight_itineraries = create_itineraries_dataframe_aller_retour(itineraries_dict_list)
#         flight_itineraries["week"] = e + 1
#         flight_itineraries_df = pd.concat([flight_itineraries_df,flight_itineraries])

In [7]:
# flight_itineraries_df

NOTE: The below two lines are commented in order to avoid their execution when running the notebook, as that execution's result is stored in the 'countries_airports.csv' file.

In [8]:
# flight_itineraries_df.to_csv("../data/flights/flight_itinieraries_weeks_destinations.csv")

In [9]:
flight_itineraries_df = pd.read_csv("../data/flights/flight_itinieraries_weeks_destinations.csv",index_col=0)

In [10]:
flight_itineraries_df

Unnamed: 0,score,price,price_currency,duration_departure,duration_return,stops_departure,stops_return,departure_departure,arrival_departure,departure_return,...,self_transfer,fare_isChangeAllowed,fare_isPartiallyChangeable,fare_isCancellationAllowed,fare_isPartiallyRefundable,origin_airport_departure,destination_airport_departure,origin_airport_return,destination_airport_return,week
0,0.999000,248,€,135,135,0,0,2024-11-04 14:25:00,2024-11-04 16:40:00,2024-11-11 11:35:00,...,False,False,False,False,False,Madrid,Paris Charles de Gaulle,Paris Charles de Gaulle,Madrid,1
1,0.902639,187,€,130,130,0,0,2024-11-04 20:00:00,2024-11-04 22:10:00,2024-11-11 17:15:00,...,False,False,False,False,False,Madrid,Paris Beauvais,Paris Beauvais,Madrid,1
2,0.821090,289,€,130,130,0,0,2024-11-04 16:20:00,2024-11-04 18:30:00,2024-11-11 07:00:00,...,False,False,False,False,False,Madrid,Paris Charles de Gaulle,Paris Charles de Gaulle,Madrid,1
3,0.774970,288,€,130,130,0,0,2024-11-04 17:40:00,2024-11-04 19:50:00,2024-11-11 14:45:00,...,False,False,False,False,False,Madrid,Paris Orly,Paris Orly,Madrid,1
4,0.759652,190,€,130,130,0,0,2024-11-04 20:00:00,2024-11-04 22:10:00,2024-11-11 06:40:00,...,False,False,False,False,False,Madrid,Paris Beauvais,Paris Beauvais,Madrid,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53,0.172074,1424,€,1275,1840,1,2,2024-11-18 23:45:00,2024-11-19 15:00:00,2024-11-25 16:15:00,...,False,False,False,False,False,Madrid,Havana,Havana,Madrid,3
54,0.156223,1602,€,1845,1205,2,2,2024-11-18 20:35:00,2024-11-19 21:20:00,2024-11-25 14:00:00,...,False,False,False,False,False,Madrid,Havana,Havana,Madrid,3
55,0.139322,1734,€,2075,1085,1,1,2024-11-18 10:25:00,2024-11-19 15:00:00,2024-11-25 16:15:00,...,False,False,False,False,False,Madrid,Havana,Havana,Madrid,3
56,0.131635,1949,€,1890,1085,2,1,2024-11-18 13:30:00,2024-11-19 15:00:00,2024-11-25 16:15:00,...,False,False,False,False,False,Madrid,Havana,Havana,Madrid,3


## 2.2 Accommodations

In [11]:
n_rooms=None
star_ratings=[3,4,5] 
meal_plan=None
review_score= [70,80,90]
max_distance_meters=1500 
max_price_night = 200

In [12]:
# destination_cities = ["paris","berlin","phuket","rome","new york","madeira","havana"]
# checkins=["2024-11-04","2024-11-11","2024-11-18"]
# checkouts=["2024-11-11","2024-11-18","2024-11-25"]

# n_adults = [1,2,2,5]
# n_children = [0,0,2,0]

# acommodations_df = pd.DataFrame()
# for destination_city in tqdm(destination_cities):
#     for e, (checkin, checkout) in enumerate(zip(checkins,checkouts)):
#         for i, (adults, children) in enumerate(zip(n_adults,n_children)):
#             acommodations_city = extract_all_accommodations(destination=destination_city, checkin=checkin, checkout=checkout, adults= adults, children= children,
#                                     rooms= n_rooms, max_price= max_price_night, star_ratings= star_ratings, 
#                                     meal_plan= meal_plan, review_score= review_score, max_distance_meters= max_distance_meters, verbose=False)
#             acommodations_city["week"] = e + 1
#             if adults == 1 and children == 0:
#                 acommodations_city["type"] = "solitary"
#             elif adults == 2 and children == 2:
#                 acommodations_city["type"] = "two people"
#             elif adults == 2 and children == 0:
#                 acommodations_city["type"] = "couple"
#             else:
#                 acommodations_city["type"] = "friends"
#             acommodations_df = pd.concat([acommodations_df,acommodations_city])


NOTE: The below two lines are commented in order to avoid their execution when running the notebook, as that execution's result is stored in the 'countries_airports.csv' file.

In [13]:
# acommodations_df

In [14]:
# acommodations_df.to_csv("../data/accommodations/acommodations.csv")

In [15]:
acommodations_df = pd.read_csv("../data/accommodations/acommodations.csv",index_col=0)

In [16]:
acommodations_df.head(5)

Unnamed: 0,name,url,price_currency,total_price_amount,distance_city_center_km,score,n_comments,close_to_metro,sustainability_cert,room_type,double_bed,single_bed,free_cancellation,breakfast_included,pay_at_hotel,location_score,free_taxi,week,type
0,Hotel Meslay Republique,https://www.booking.com/hotel/fr/meslay-republ...,€,1143,1.4,8.1,3153.0,Yes,No,Habitación Doble Estándar - 2 camas,No,Yes,No,No,No,8.2,No,1,solitary
1,Superbe Triplex au Coeur de Montorgueil,https://www.booking.com/hotel/fr/superbe-tripl...,€,1272,1.1,8.5,28.0,Yes,No,Casa con 1 dormitorio,Yes,No,No,No,No,8.4,No,1,solitary
2,Moxy Paris Bastille,https://www.booking.com/hotel/fr/moxy-paris-ba...,€,1311,1.4,8.0,1337.0,Yes,No,MOXY Sleeper con cama grande,Yes,No,Yes,Yes,Yes,8.2,No,1,solitary
3,MBA Splendide Appart Saint-Michel Paris Centre,https://www.booking.com/hotel/fr/splendide-app...,€,776,0.8,7.1,15.0,Yes,No,Apartamento - Planta baja,No,No,No,No,No,,No,1,solitary
4,Paris Bastille,https://www.booking.com/hotel/fr/hotel-paris-b...,€,1304,1.5,8.2,1919.0,Yes,No,Habitación Doble - 1 o 2 camas,Yes,Yes,No,No,No,8.6,No,1,solitary


## 2.3 Activities

As a first option, the range of activities proposed to our clients will come from the Civitatis catalog. If needed, more catalogs will be built on top of it to make it more compelling.

### 2.3.1 Civitatis - scraping


As for the information to extract from civitatis, the requirements are:
- Total activity budget is X
- Activities should be about X, Y and Z
- Activities suggested should not overlap

Therefore, the fields to be extracted should ideally be, at least:
- Date [X]
- Time [X]
- Address [X]
- Duration [X]
- Price [X]
- Name [X]
- Description [X]
- Category [X]
- Image [X]
- URL [X]
- Score [X]
- Location Score [X]
- Reviews (Optional)
- Language (Optional)

The checkin and checkout for activity dates must account for a minimum of 1 day of travel, therefore they are added 1 day on arrival and substracted 1 day on departure. In this way, it allows for a minimum of rest/journey time.

In [17]:
destination_cities = ["roma","madeira"]
checkins=["2024-11-04","2024-11-11","2024-11-18"]
checkouts=["2024-11-11","2024-11-18","2024-11-25"]

activities_df2 = pd.DataFrame()
for destination_city in tqdm(destination_cities):
    for e, (checkin, checkout) in enumerate(zip(checkins,checkouts)):

        activities_checkin = (pd.to_datetime(checkin) + datetime.timedelta(days=1)).strftime("%Y-%m-%d")
        activities_checkout = (pd.to_datetime(checkout) - datetime.timedelta(days=1)).strftime("%Y-%m-%d")

        civitatis_activities = extract_all_activities(destination_city,activities_checkin,activities_checkout)
        civitatis_activities["week"] = e + 1
        civitatis_activities["destination"] = destination_city
        activities_df2 = pd.concat([activities_df2,civitatis_activities])



  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [04:47<00:00, 143.79s/it]


In [18]:
activities_df2

Unnamed: 0,activity_name,description,url,image,image2,available_days,available_times,duration,latitude,longitude,address,price,currency,category,category2,week,destination
0,Tour privado por Roma ¡Tú eliges!,Si queréis conocer la Ciudad del Amor de forma...,www.civitatis.com/es/roma/tour-privado-roma/,www.civitatis.com/f/italia/roma/tour-privado-r...,,"[07, 10, 06, 09, 05, 08]","[[9:00, 10:00, 11:00, 12:00, 13:00, 14:00, 15:...",2 horas,41.906658,12.48199,"Piazza di Spagna, Piazza della Trinità dei Mon...",18.2,EUR,2 horas,Español,1,roma
1,Visita guiada por la Basílica de San Pedro,Historia y arte se entremezclan en esta visita...,www.civitatis.com/es/roma/subida-cupula-san-pe...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/italia/roma/subida-cupula-...,"[05, 09, 07, 08]","[[8:15], [8:15], [8:15], [8:15]]",1h 30m -3h,41.9071788,12.4564381,"Minimarket, 27, Via Vespasiano, Prati, Municip...",10.5,EUR,1h 30m -3h,Español,1,roma
2,Excursión a Venecia en tren de alta velocidad,En esta excursión a Venecia en tren de alta ve...,www.civitatis.com/es/roma/excursion-venecia/,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/italia/roma/excursion-vene...,"[07, 09]","[[7:30], [7:30]]",15 horas,41.90198,12.503145,"48, Via Marsala, Castro Pretorio, Municipio Ro...",54.6,EUR,15 horas,Español,1,roma
3,Visita guiada por los Museos Capitolinos y alr...,El Monumento a Víctor Manuel II y la Plaza del...,www.civitatis.com/es/roma/visita-guiada-museos...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/italia/roma/visita-guiada-...,"[08, 10, 06]","[[9:30, 16:30], [9:30, 16:30], [9:30, 16:30]]",3 horas,41.893293,12.482938,"Campitelli, Municipio Roma I, Roma, Roma Capit...",14.0,EUR,3 horas,Español,1,roma
4,Tour por las plazas y fuentes de Roma,"La Fontana de Trevi, la Piazza Navona o el Cam...",www.civitatis.com/es/roma/paseo-plazas-fuentes...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/italia/roma/paseo-plazas-f...,"[09, 07]","[[15:00], [15:00]]",2h 30m,41.9036625,12.488483,"Fontana del Tritone, Piazza Barberini, Trevi, ...",10.0,EUR,2h 30m,Español,1,roma
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50,Paseo en la carabela de Cristóbal Colón,Sigue los pasos del descubridor del Nuevo Mund...,www.civitatis.com/es/funchal/paseo-carabela-cr...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/portugal/funchal/paseo-car...,"[23, 20, 22, 24, 19, 21]","[[10:00, 15:00], [10:00, 15:00], [10:00, 15:00...",3 horas,32.6461526,-16.9100422,"VMT Madeira - Catamaran Trips, Loja nº 9, Mari...",8.0,EUR,3 horas,Paseos en barco,3,madeira
51,Excursión en 4x4 por el norte de Madeira + Pis...,Recored el norte de Madeira en un 4x4 y explor...,www.civitatis.com/es/funchal/excursion-norte-m...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/portugal/funchal/excursion...,"[21, 24, 19, 22, 20, 23]","[[8:45], [8:45], [8:45], [8:45], [8:45], [8:45]]",7h 30m,0,0,,13.8,EUR,7h 30m,Español,3,madeira
52,Bautismo de buceo en Garajau,Con este curso de buceo para principiantes des...,www.civitatis.com/es/funchal/bautismo-buceo-fu...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/portugal/funchal/bautismo-...,"[22, 23, 20, 21, 19]","[[9:00], [9:00], [9:00], [9:00], [9:00]]",2h 30m,32.63847169205027,-16.85300386186423,"Estrada do Cristo Rei, Garajau, Caniço, Santa ...",13.6,EUR,2h 30m,Español,3,madeira
53,Senderismo por la Levada do Moinho,Antiguos molinos de agua y cascadas nos aguard...,www.civitatis.com/es/funchal/senderismo-levada...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/portugal/funchal/senderism...,[21],[[14:00]],4h -5h 30m,0,0,,7.5,EUR,4h -5h 30m,Español,3,madeira


In [23]:
activities_df2.to_csv("../data/activities/civitatis_activities2.csv")

In [31]:
destination_cities = ["paris","madeira"]

for destination_city in tqdm(destination_cities):
    for e, (checkin, checkout) in enumerate(zip(checkins,checkouts)):

        activities_checkin = (pd.to_datetime(checkin) + datetime.timedelta(days=1)).strftime("%Y-%m-%d")
        activities_checkout = (pd.to_datetime(checkout) - datetime.timedelta(days=1)).strftime("%Y-%m-%d")

        civitatis_activities = extract_all_activities(destination_city,activities_checkin,activities_checkout)
        civitatis_activities["week"] = e + 1
        civitatis_activities["destination"] = destination_city
        activities_df = pd.concat([activities_df,civitatis_activities])


100%|██████████| 2/2 [05:51<00:00, 175.71s/it]


In [32]:
activities_df.to_csv("../data/activities/civitatis_activities.csv")

In [34]:
destination_cities = ["berlin","phuket"]

for destination_city in tqdm(destination_cities):
    for e, (checkin, checkout) in enumerate(zip(checkins,checkouts)):

        activities_checkin = (pd.to_datetime(checkin) + datetime.timedelta(days=1)).strftime("%Y-%m-%d")
        activities_checkout = (pd.to_datetime(checkout) - datetime.timedelta(days=1)).strftime("%Y-%m-%d")

        civitatis_activities = extract_all_activities(destination_city,activities_checkin,activities_checkout)
        civitatis_activities["week"] = e + 1
        civitatis_activities["destination"] = destination_city
        activities_df = pd.concat([activities_df,civitatis_activities])


  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [03:31<00:00, 105.55s/it]


In [35]:
activities_df.to_csv("../data/activities/civitatis_activities.csv")

In [37]:
destination_cities = ["madeira"]
checkins=["2024-11-04","2024-11-11","2024-11-18"]
checkouts=["2024-11-11","2024-11-18","2024-11-25"]

for destination_city in tqdm(destination_cities):
    for e, (checkin, checkout) in enumerate(zip(checkins,checkouts)):

        activities_checkin = (pd.to_datetime(checkin) + datetime.timedelta(days=1)).strftime("%Y-%m-%d")
        activities_checkout = (pd.to_datetime(checkout) - datetime.timedelta(days=1)).strftime("%Y-%m-%d")

        civitatis_activities = extract_all_activities(destination_city,activities_checkin,activities_checkout)
        civitatis_activities["week"] = e + 1
        civitatis_activities["destination"] = destination_city
        activities_df = pd.concat([activities_df,civitatis_activities])

100%|██████████| 1/1 [02:07<00:00, 127.84s/it]


In [38]:
activities_df.to_csv("../data/activities/civitatis_activities.csv")

In [42]:
activities_df.head()

Unnamed: 0,activity_name,description,url,image,image2,available_days,available_times,duration,latitude,longitude,address,price,currency,category,week,destination
0,Tour privado por Roma ¡Tú eliges!,Si queréis conocer la Ciudad del Amor de forma...,www.civitatis.com/es/roma/tour-privado-roma/,www.civitatis.com/f/italia/roma/tour-privado-r...,,"[05, 08, 09, 06, 10, 07]","[[9:00, 10:00, 11:00, 12:00, 13:00, 14:00, 15:...",2 horas,41.906658,12.48199,"19, Piazza di Spagna, Campo Marzio, Municipio ...",18.2,EUR,Español,1,roma
1,Visita guiada por la Basílica de San Pedro,Historia y arte se entremezclan en esta visita...,www.civitatis.com/es/roma/subida-cupula-san-pe...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/italia/roma/subida-cupula-...,"[09, 07, 05, 08]","[[8:15], [8:15], [8:15], [8:15]]",1h 30m -3h,41.9071788,12.4564381,"Minimarket, 27, Via Vespasiano, Prati, Municip...",10.5,EUR,Español,1,roma
2,Excursión a Venecia en tren de alta velocidad,En esta excursión a Venecia en tren de alta ve...,www.civitatis.com/es/roma/excursion-venecia/,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/italia/roma/excursion-vene...,"[09, 07]","[[7:30], [7:30]]",15 horas,41.90198,12.503145,"48, Via Marsala, Castro Pretorio, Municipio Ro...",54.6,EUR,Español,1,roma
3,Visita guiada por los Museos Capitolinos y alr...,El Monumento a Víctor Manuel II y la Plaza del...,www.civitatis.com/es/roma/visita-guiada-museos...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/italia/roma/visita-guiada-...,"[10, 08, 06]","[[9:30, 16:30], [9:30, 16:30], [9:30, 16:30]]",3 horas,41.893293,12.482938,"Piazza del Campidoglio, Campitelli, Municipio ...",14.0,EUR,Español,1,roma
4,Tour por las plazas y fuentes de Roma,"La Fontana de Trevi, la Piazza Navona o el Cam...",www.civitatis.com/es/roma/paseo-plazas-fuentes...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/italia/roma/paseo-plazas-f...,"[09, 07]","[[15:00], [15:00]]",2h 30m,41.9036625,12.488483,"Fontana del Tritone, Piazza Barberini, Trevi, ...",10.0,EUR,Español,1,roma
