### 0. Imports

In [1]:
import pandas as pd
import numpy as np

### 1. Introduction to this notebook

# 2. Transformation

## 2.1 Flights

In [2]:
itineraries = pd.read_parquet("../data/flights/itineraries.parquet")
itineraries.info()
itineraries.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   date_query                  349 non-null    datetime64[ns]
 1   score                       349 non-null    float64       
 2   duration                    349 non-null    int64         
 3   price                       349 non-null    int64         
 4   price_currency              349 non-null    object        
 5   stops                       349 non-null    int64         
 6   departure                   349 non-null    datetime64[ns]
 7   arrival                     349 non-null    datetime64[ns]
 8   company                     349 non-null    object        
 9   self_transfer               349 non-null    bool          
 10  fare_isChangeAllowed        349 non-null    bool          
 11  fare_isPartiallyChangeable  349 non-null    bool          

Unnamed: 0,date_query,score,duration,price,price_currency,stops,departure,arrival,company,self_transfer,fare_isChangeAllowed,fare_isPartiallyChangeable,fare_isCancellationAllowed,fare_isPartiallyRefundable,origin_airport,destination_airport
0,2024-11-02 19:10:25.310576,0.999,85,100,€,0,2024-11-03 10:30:00,2024-11-03 11:55:00,Iberia,False,False,False,False,False,Barcelona,Madrid
1,2024-11-02 19:10:25.313576,0.694966,85,110,€,0,2024-11-03 11:50:00,2024-11-03 13:15:00,Vueling Airlines,False,False,False,False,False,Barcelona,Madrid
2,2024-11-02 19:10:25.314575,0.564122,90,139,€,0,2024-11-03 20:30:00,2024-11-03 22:00:00,Air Europa,False,False,False,False,False,Barcelona,Madrid
3,2024-11-02 19:10:25.315574,0.432498,90,142,€,0,2024-11-03 11:50:00,2024-11-03 13:20:00,Air Europa,False,False,False,False,False,Barcelona,Madrid
4,2024-11-02 19:10:25.316572,0.425545,85,168,€,0,2024-11-03 13:30:00,2024-11-03 14:55:00,Iberia,False,False,False,False,False,Barcelona,Madrid


Apparently, all data types and values come out correctly from the Air Scraper API. Something that is missing and would have to be provided to the user would be the suggested url to get the flights.

## 2.2 Accommodations

In [3]:
booking_df = pd.read_parquet("../data/accommodations/booking.parquet")
booking_df.info()
booking_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233 entries, 0 to 232
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   query_date               233 non-null    datetime64[ns]
 1   city                     233 non-null    object        
 2   checkin                  233 non-null    object        
 3   checkout                 233 non-null    object        
 4   n_adults_search          233 non-null    object        
 5   n_children_search        233 non-null    object        
 6   n_rooms_search           233 non-null    object        
 7   name                     233 non-null    object        
 8   url                      233 non-null    object        
 9   price_currency           233 non-null    object        
 10  total_price_amount       233 non-null    object        
 11  distance_city_center_km  233 non-null    object        
 12  score                    223 non-nul

Unnamed: 0,query_date,city,checkin,checkout,n_adults_search,n_children_search,n_rooms_search,name,url,price_currency,...,close_to_metro,sustainability_cert,room_type,double_bed,single_bed,free_cancellation,breakfast_included,pay_at_hotel,location_score,free_taxi
0,2024-11-03 12:39:39.962865,barcelona,2024-11-08,2024-11-10,2,0,1,Pensión 45,https://www.booking.com/hotel/es/pension-45.es...,€,...,Yes,No,Habitación Doble,Yes,No,No,No,No,,No
1,2024-11-03 12:39:39.965792,barcelona,2024-11-08,2024-11-10,2,0,1,Nice and comfortable room for your stay in BCN,https://www.booking.com/hotel/es/nice-and-comf...,€,...,Yes,No,Habitación Doble Económica,Yes,No,No,No,No,10.0,No
2,2024-11-03 12:39:39.967792,barcelona,2024-11-08,2024-11-10,2,0,1,Travelodge Barcelona Poblenou,https://www.booking.com/hotel/es/travelodge-ba...,€,...,Yes,No,Habitación Doble,Yes,No,No,No,No,,No
3,2024-11-03 12:39:39.970791,barcelona,2024-11-08,2024-11-10,2,0,1,Hotel Derby,https://www.booking.com/hotel/es/derby.es.html...,€,...,Yes,No,Habitación (1 o 2 adultos) - 1 o 2 camas,Yes,Yes,No,No,No,,No
4,2024-11-03 12:39:39.975548,barcelona,2024-11-08,2024-11-10,2,0,1,Hostal Benidorm,https://www.booking.com/hotel/es/hostal-benido...,€,...,Yes,No,Habitación Doble,Yes,No,No,No,No,9.5,No


Convert checkin and checkout to datetime

In [4]:
booking_df[["checkin","checkout"]] = booking_df[["checkin","checkout"]].astype("datetime64[ns]")

In [5]:
booking_df["room_type"].value_counts()[:20]

room_type
Habitación Doble con baño compartido                                      23
Cama en habitación compartida mixta de 10 camas                           18
Habitación Doble - 2 camas                                                15
Habitación Doble                                                          12
Habitación Doble - 1 o 2 camas                                             8
Habitación Doble con baño compartido - 2 camas                             7
Cama en habitación compartida mixta de 8 camas                             6
Cama en habitación compartida mixta de 6 camas                             6
Habitación Doble Económica                                                 6
Habitación Doble Estándar con baño compartido                              5
Cama en habitación compartida de 8 camas                                   5
Habitación Doble con baño privado - 2 camas                                4
Habitación Doble Superior                                         

Standardize room types

In [6]:
conditions = [
    booking_df["room_type"].str.startswith("Apartamento"),
    booking_df["room_type"].str.startswith("Cama en habitacion compartida"),
    booking_df["room_type"].str.startswith("Habitacion Doble Superior"),
    booking_df["room_type"].str.startswith("Habitacion Doble")
]

choices = [
    "Apartamento",
    booking_df["room_type"],  # keeps the original value for "Cama en habitacion compartida"
    "Habitacion Doble Superior",
    "Habitacion Doble"
]


booking_df["standardized_room_type"] = np.select(conditions, choices, default=booking_df["room_type"])

Apartamento
Cama en habitacion compartida
Habitacion Doble Superior
Habitacion Doble 

In [7]:
booking_df["shared_bathroom"] = np.where(booking_df["room_type"].str.contains("baño compartido"),"Yes","No")

Add separate info about balcony

In [8]:
booking_df["balcony"] = np.where(booking_df["room_type"].str.contains("balcón"),"Yes","No")

In [9]:
booking_df["sustainability_cert"].value_counts()

sustainability_cert
No     213
Yes     20
Name: count, dtype: int64

In [10]:
booking_df["free_cancellation"].value_counts()

free_cancellation
No     209
Yes     24
Name: count, dtype: int64

In [11]:
booking_df["breakfast_included"].value_counts()

breakfast_included
No     209
Yes     24
Name: count, dtype: int64

## 2.3 Activities

In [12]:
activities_df = pd.read_parquet("../data/activities/90_day_availability.parquet")

In [13]:
activities_df

Unnamed: 0,query_date,city,activity_date_range_start,activity_date_range_end,activity_name,description,url,image,image2,available_days,available_times,duration,latitude,longitude,price,currency,category,spanish
0,2024-11-03 12:47:29.458558,barcelona,2024-11-28,2024-12-04,Tour privado por Barcelona ¡Tú eliges!,Reservando nuestra visita privada tendréis un ...,www.civitatis.com/es/barcelona/tour-privado-ba...,www.civitatis.com/f/espana/barcelona/tour-priv...,,"[30, 29, 02, 03, 01, 28, 04]","[[9:00, 10:00, 11:00, 12:00, 13:00, 14:00, 15:...",2h 30m -5h,0,0,40.00,EUR,Visitas guiadas y free tours,Español
1,2024-11-03 12:47:29.464586,barcelona,2024-11-28,2024-12-04,Excursión a Montserrat + Visita a una bodega,En esta excursión a Montserrat no solo disfrut...,www.civitatis.com/es/barcelona/tour-tapas-vino...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/espana/barcelona/tour-tapa...,"[04, 30, 28, 03, 02, 01, 29]","[[8:45], [8:45], [8:45], [8:45], [8:45], [8:45...",7h 30m,41.3940236912484,2.181866082214644,19.98,EUR,Gastronomía y enoturismo,Español
2,2024-11-03 12:47:29.468587,barcelona,2024-11-28,2024-12-04,Paseo en catamarán al atardecer con música en ...,Contempla el skyline de Barcelona mientras dis...,www.civitatis.com/es/barcelona/paseo-catamaran...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/espana/barcelona/paseo-cat...,"[30, 29, 01, 04, 28, 02, 03]","[[16:30], [16:30], [16:30], [16:30], [16:30], ...",1h 30m,41.37495867288118,2.17849589524371,7.65,EUR,Paseos en barco,Español
3,2024-11-03 12:47:29.472608,barcelona,2024-11-28,2024-12-04,Free tour por el Parque Güell,En este free tour por el Parque Güell conocere...,www.civitatis.com/es/barcelona/free-tour-parqu...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/espana/barcelona/free-tour...,"[03, 29, 28, 02, 04]","[[11:30], [11:30], [11:30], [11:30], [9:30, 11...",1h 30m,41.41508351,2.154768947,2.00,EUR,Visitas guiadas y free tours,Español
4,2024-11-03 12:47:29.476617,barcelona,2024-11-28,2024-12-04,Teleférico de Montjuïc,Con esta entrada al Teleférico de Montjuïc pod...,www.civitatis.com/es/barcelona/billete-telefer...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/espana/barcelona/billete-t...,"[04, 29, 28, 30, 01, 03, 02]","[[10:00], [10:00], [10:00], [10:00], [10:00], ...",,41.368762,2.163434,3.20,EUR,Entradas,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269,2024-11-03 12:47:29.615915,barcelona,2024-12-04,2024-12-10,Tour de los misterios y leyendas de Barcelona,En esta ruta nocturna por Barcelona descubrire...,www.civitatis.com/es/barcelona/tour-misterios-...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/espana/barcelona/tour-mist...,,,1h 30m,41.387471,2.172001,4.20,EUR,Visitas guiadas y free tours,Español
270,2024-11-03 12:47:29.616916,barcelona,2024-12-04,2024-12-10,Entrada al Spotify Camp Nou,Si queréis visitar el estadio del Fútbol Club ...,www.civitatis.com/es/barcelona/camp-nou-experi...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/espana/barcelona/camp-nou-...,,,1h 30m,41.380053,2.121457,2.91,EUR,Entradas,Español
271,2024-11-03 12:47:29.617916,barcelona,2024-12-04,2024-12-10,Free tour de los misterios y leyendas del Barr...,Acompañadnos en este free tour por Barcelona e...,www.civitatis.com/es/barcelona/free-tour-miste...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/espana/barcelona/free-tour...,,,2 horas,41.3838057,2.1757634,2.00,EUR,Visitas guiadas y free tours,Español
272,2024-11-03 12:47:29.618916,barcelona,2024-12-04,2024-12-10,"Excursión a Gerona, Figueras y Museo Dalí",En esta excursión a Gerona y Figueras conocere...,www.civitatis.com/es/barcelona/excursion-museo...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/espana/barcelona/excursion...,,,9h - 10h 45m,41.3940811,2.1826233,23.70,EUR,Excursiones de un día,Español


### 2.3.1 Cleaning neccessary

1. Remove duplicates
2. Keep only the good image: The second if it has 2, otherwise the first.
3. Take unique, non null and non zero latitudes and longitudes, to convert to address.
4. Format the available dates and times

Careful as some non-available times might mean they do not apply, i.e. tickets.

##### 2.3.1.1 Remove duplicates

By activity_name and activity_date_range_start

In [14]:
activities_df[activities_df.duplicated(subset=["activity_date_range_start","activity_name"],keep=False)].sort_values(by="activity_name").head()

Unnamed: 0,query_date,city,activity_date_range_start,activity_date_range_end,activity_name,description,url,image,image2,available_days,available_times,duration,latitude,longitude,price,currency,category,spanish
100,2024-11-03 12:47:29.072592,barcelona,2024-11-28,2024-12-04,Barcelona Card Express,Con la tarjeta Barcelona Card Express obtendrá...,www.civitatis.com/es/barcelona/barcelona-card-...,www.civitatis.com/f/espana/barcelona/barcelona...,,"[29, 28, 04, 30, 01, 02, 03]","[[0:00], [0:00], [0:00], [0:00], [0:00], [0:00...",2 días,0.0,0.0,5.4,EUR,Entradas,
109,2024-11-03 12:47:29.169355,barcelona,2024-11-28,2024-12-04,Barcelona Card Express,Con la tarjeta Barcelona Card Express obtendrá...,www.civitatis.com/es/barcelona/barcelona-card-...,www.civitatis.com/f/espana/barcelona/barcelona...,,"[28, 30, 03, 04, 29, 01, 02]","[[0:00], [0:00], [0:00], [0:00], [0:00], [0:00...",2 días,0.0,0.0,5.4,EUR,Entradas,
245,2024-11-03 12:47:29.544854,barcelona,2024-12-04,2024-12-10,Espectáculo flamenco en el Tablao El Duende,Asistiendo a un espectáculo flamenco en el Tab...,www.civitatis.com/es/barcelona/espectaculo-fla...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/espana/barcelona/espectacu...,"[06, 10, 08, 09, 04, 05, 07]","[[19:00, 20:15], [19:00, 20:15], [19:00, 20:15...",50 minutos,41.3789078,2.1747321,6.75,EUR,Espectáculos,
117,2024-11-03 12:47:29.273398,barcelona,2024-11-28,2024-12-04,Espectáculo flamenco en el Tablao El Duende,Asistiendo a un espectáculo flamenco en el Tab...,www.civitatis.com/es/barcelona/espectaculo-fla...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/espana/barcelona/espectacu...,"[29, 02, 28, 03, 04, 01, 30]","[[19:00, 20:15], [19:00, 20:15], [19:00, 20:15...",50 minutos,41.3789078,2.1747321,6.75,EUR,Espectáculos,
108,2024-11-03 12:47:29.108767,barcelona,2024-11-28,2024-12-04,Espectáculo flamenco en el Tablao El Duende,Asistiendo a un espectáculo flamenco en el Tab...,www.civitatis.com/es/barcelona/espectaculo-fla...,"www.civitatis.comdata:image/gif;base64,R0lGODl...",www.civitatis.com/f/espana/barcelona/espectacu...,"[01, 02, 29, 03, 28, 04, 30]","[[19:00, 20:15], [19:00, 20:15], [19:00, 20:15...",50 minutos,41.3789078,2.1747321,6.75,EUR,Espectáculos,


In [15]:
activities_df.drop_duplicates(subset=["activity_date_range_start","activity_name"], inplace=True)

#### 2.3.1.2 Keep only the good image: The second if it has 2, otherwise the first.

In [16]:
activities_df["image"] = np.where(~activities_df["image2"].isna(), activities_df["image2"], activities_df["image"])
activities_df.drop(columns="image2",inplace=True)

In [17]:
activities_df.head()

Unnamed: 0,query_date,city,activity_date_range_start,activity_date_range_end,activity_name,description,url,image,available_days,available_times,duration,latitude,longitude,price,currency,category,spanish
0,2024-11-03 12:47:29.458558,barcelona,2024-11-28,2024-12-04,Tour privado por Barcelona ¡Tú eliges!,Reservando nuestra visita privada tendréis un ...,www.civitatis.com/es/barcelona/tour-privado-ba...,www.civitatis.com/f/espana/barcelona/tour-priv...,"[30, 29, 02, 03, 01, 28, 04]","[[9:00, 10:00, 11:00, 12:00, 13:00, 14:00, 15:...",2h 30m -5h,0.0,0.0,40.0,EUR,Visitas guiadas y free tours,Español
1,2024-11-03 12:47:29.464586,barcelona,2024-11-28,2024-12-04,Excursión a Montserrat + Visita a una bodega,En esta excursión a Montserrat no solo disfrut...,www.civitatis.com/es/barcelona/tour-tapas-vino...,www.civitatis.com/f/espana/barcelona/tour-tapa...,"[04, 30, 28, 03, 02, 01, 29]","[[8:45], [8:45], [8:45], [8:45], [8:45], [8:45...",7h 30m,41.3940236912484,2.181866082214644,19.98,EUR,Gastronomía y enoturismo,Español
2,2024-11-03 12:47:29.468587,barcelona,2024-11-28,2024-12-04,Paseo en catamarán al atardecer con música en ...,Contempla el skyline de Barcelona mientras dis...,www.civitatis.com/es/barcelona/paseo-catamaran...,www.civitatis.com/f/espana/barcelona/paseo-cat...,"[30, 29, 01, 04, 28, 02, 03]","[[16:30], [16:30], [16:30], [16:30], [16:30], ...",1h 30m,41.37495867288118,2.17849589524371,7.65,EUR,Paseos en barco,Español
3,2024-11-03 12:47:29.472608,barcelona,2024-11-28,2024-12-04,Free tour por el Parque Güell,En este free tour por el Parque Güell conocere...,www.civitatis.com/es/barcelona/free-tour-parqu...,www.civitatis.com/f/espana/barcelona/free-tour...,"[03, 29, 28, 02, 04]","[[11:30], [11:30], [11:30], [11:30], [9:30, 11...",1h 30m,41.41508351,2.154768947,2.0,EUR,Visitas guiadas y free tours,Español
4,2024-11-03 12:47:29.476617,barcelona,2024-11-28,2024-12-04,Teleférico de Montjuïc,Con esta entrada al Teleférico de Montjuïc pod...,www.civitatis.com/es/barcelona/billete-telefer...,www.civitatis.com/f/espana/barcelona/billete-t...,"[04, 29, 28, 30, 01, 03, 02]","[[10:00], [10:00], [10:00], [10:00], [10:00], ...",,41.368762,2.163434,3.2,EUR,Entradas,


#### 2.3.1.3 Take unique, non null and non zero latitudes and longitudes, to convert to address.

In [18]:
from dotenv import load_dotenv
import os 
load_dotenv()

True

In [19]:
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

In [20]:
import aiohttp
import asyncio

async def get_address(lat: float, lon: float, api_key: str) -> str:
    url = f"https://maps.googleapis.com/maps/api/geocode/json?latlng={lat},{lon}&key={api_key}"
    
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            data = await response.json()
            
            if data.get("status") == "OK":
                address = data["results"][0]["formatted_address"]
                return address
            else:
                return "Address not found"

# Example usage
async def get_adresses_async(lat_lon_df):
    api_key = GOOGLE_API_KEY
    lat_lon_tasks = [get_address(lat, lon, api_key) for lat, lon in lat_lon_df.itertuples(index=False, name=None)]
    addresses = await asyncio.gather(*lat_lon_tasks)
    return addresses


In [21]:
not_nan = activities_df[["latitude", "longitude"]].notna().all(axis=1)
unique_combinations = activities_df.loc[not_nan,["latitude","longitude"]].drop_duplicates()

In [22]:
unique_combinations["address"] = await get_adresses_async(unique_combinations[["latitude", "longitude"]])

In [23]:
unique_combinations["lat_lon"] = list(zip(unique_combinations.latitude, unique_combinations.longitude))
activities_df["lat_lon"] = list(zip(activities_df.latitude, activities_df.longitude))

In [24]:
unique_combinations["address"] = await get_adresses_async(unique_combinations[["latitude", "longitude"]])

In [25]:
lat_lon_to_address_ampping = dict(zip(unique_combinations["lat_lon"], unique_combinations["address"]))

In [26]:
activities_df["address"] = activities_df["lat_lon"].map(lat_lon_to_address_ampping)
activities_df.drop(columns="lat_lon",inplace=True)

In [27]:
activities_df.head(5)

Unnamed: 0,query_date,city,activity_date_range_start,activity_date_range_end,activity_name,description,url,image,available_days,available_times,duration,latitude,longitude,price,currency,category,spanish,address
0,2024-11-03 12:47:29.458558,barcelona,2024-11-28,2024-12-04,Tour privado por Barcelona ¡Tú eliges!,Reservando nuestra visita privada tendréis un ...,www.civitatis.com/es/barcelona/tour-privado-ba...,www.civitatis.com/f/espana/barcelona/tour-priv...,"[30, 29, 02, 03, 01, 28, 04]","[[9:00, 10:00, 11:00, 12:00, 13:00, 14:00, 15:...",2h 30m -5h,0.0,0.0,40.0,EUR,Visitas guiadas y free tours,Español,Address not found
1,2024-11-03 12:47:29.464586,barcelona,2024-11-28,2024-12-04,Excursión a Montserrat + Visita a una bodega,En esta excursión a Montserrat no solo disfrut...,www.civitatis.com/es/barcelona/tour-tapas-vino...,www.civitatis.com/f/espana/barcelona/tour-tapa...,"[04, 30, 28, 03, 02, 01, 29]","[[8:45], [8:45], [8:45], [8:45], [8:45], [8:45...",7h 30m,41.3940236912484,2.181866082214644,19.98,EUR,Gastronomía y enoturismo,Español,"Barcelona (Bus Terminal Nord), Eixample, 08013..."
2,2024-11-03 12:47:29.468587,barcelona,2024-11-28,2024-12-04,Paseo en catamarán al atardecer con música en ...,Contempla el skyline de Barcelona mientras dis...,www.civitatis.com/es/barcelona/paseo-catamaran...,www.civitatis.com/f/espana/barcelona/paseo-cat...,"[30, 29, 01, 04, 28, 02, 03]","[[16:30], [16:30], [16:30], [16:30], [16:30], ...",1h 30m,41.37495867288118,2.17849589524371,7.65,EUR,Paseos en barco,Español,"Moll de les Drassanes, 3P, Ciutat Vella, 08039..."
3,2024-11-03 12:47:29.472608,barcelona,2024-11-28,2024-12-04,Free tour por el Parque Güell,En este free tour por el Parque Güell conocere...,www.civitatis.com/es/barcelona/free-tour-parqu...,www.civitatis.com/f/espana/barcelona/free-tour...,"[03, 29, 28, 02, 04]","[[11:30], [11:30], [11:30], [11:30], [9:30, 11...",1h 30m,41.41508351,2.154768947,2.0,EUR,Visitas guiadas y free tours,Español,"Ctra. del Carmel, 23, Horta-Guinardó, 08024 Ba..."
4,2024-11-03 12:47:29.476617,barcelona,2024-11-28,2024-12-04,Teleférico de Montjuïc,Con esta entrada al Teleférico de Montjuïc pod...,www.civitatis.com/es/barcelona/billete-telefer...,www.civitatis.com/f/espana/barcelona/billete-t...,"[04, 29, 28, 30, 01, 03, 02]","[[10:00], [10:00], [10:00], [10:00], [10:00], ...",,41.368762,2.163434,3.2,EUR,Entradas,,"Avinguda Miramar, 30, Sants-Montjuïc, 08038 Ba..."


#### 2.3.1.4 Format the available dates and times

Available dates and times for each activity will have to go to another table to ensure database normalisation. To do that, it will be necessary to explode the df twice. 

Then, to complete the date of availability, some rules for transformation have to be established:

In [28]:
availability_activities = activities_df[["available_days","available_times","activity_name","activity_date_range_start","activity_date_range_end"]]
availability_activities["paired"] = availability_activities[["available_days","available_times"]].fillna("-").apply(lambda x: list(zip(x["available_days"], x["available_times"])), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  availability_activities["paired"] = availability_activities[["available_days","available_times"]].fillna("-").apply(lambda x: list(zip(x["available_days"], x["available_times"])), axis=1)


In [29]:
availability_activities = availability_activities.explode("paired", ignore_index=True) #.drop(columns=["available_days","available_times"])

In [30]:
availability_activities.head()

Unnamed: 0,available_days,available_times,activity_name,activity_date_range_start,activity_date_range_end,paired
0,"[30, 29, 02, 03, 01, 28, 04]","[[9:00, 10:00, 11:00, 12:00, 13:00, 14:00, 15:...",Tour privado por Barcelona ¡Tú eliges!,2024-11-28,2024-12-04,"(30, [9:00, 10:00, 11:00, 12:00, 13:00, 14:00,..."
1,"[30, 29, 02, 03, 01, 28, 04]","[[9:00, 10:00, 11:00, 12:00, 13:00, 14:00, 15:...",Tour privado por Barcelona ¡Tú eliges!,2024-11-28,2024-12-04,"(29, [9:00, 10:00, 11:00, 12:00, 13:00, 14:00,..."
2,"[30, 29, 02, 03, 01, 28, 04]","[[9:00, 10:00, 11:00, 12:00, 13:00, 14:00, 15:...",Tour privado por Barcelona ¡Tú eliges!,2024-11-28,2024-12-04,"(02, [9:00, 10:00, 11:00, 12:00, 13:00, 14:00,..."
3,"[30, 29, 02, 03, 01, 28, 04]","[[9:00, 10:00, 11:00, 12:00, 13:00, 14:00, 15:...",Tour privado por Barcelona ¡Tú eliges!,2024-11-28,2024-12-04,"(03, [9:00, 10:00, 11:00, 12:00, 13:00, 14:00,..."
4,"[30, 29, 02, 03, 01, 28, 04]","[[9:00, 10:00, 11:00, 12:00, 13:00, 14:00, 15:...",Tour privado por Barcelona ¡Tú eliges!,2024-11-28,2024-12-04,"(01, [9:00, 10:00, 11:00, 12:00, 13:00, 14:00,..."


Now, there is a table with the 

In [31]:
availability_activities[["available_days", "available_times"]] = pd.DataFrame(availability_activities["paired"].tolist(), index=availability_activities.index)
availability_activities = availability_activities.drop(columns="paired")
availability_activities

Unnamed: 0,available_days,available_times,activity_name,activity_date_range_start,activity_date_range_end
0,30,"[9:00, 10:00, 11:00, 12:00, 13:00, 14:00, 15:0...",Tour privado por Barcelona ¡Tú eliges!,2024-11-28,2024-12-04
1,29,"[9:00, 10:00, 11:00, 12:00, 13:00, 14:00, 15:0...",Tour privado por Barcelona ¡Tú eliges!,2024-11-28,2024-12-04
2,02,"[9:00, 10:00, 11:00, 12:00, 13:00, 14:00, 15:0...",Tour privado por Barcelona ¡Tú eliges!,2024-11-28,2024-12-04
3,03,"[9:00, 10:00, 11:00, 12:00, 13:00, 14:00, 15:0...",Tour privado por Barcelona ¡Tú eliges!,2024-11-28,2024-12-04
4,01,"[9:00, 10:00, 11:00, 12:00, 13:00, 14:00, 15:0...",Tour privado por Barcelona ¡Tú eliges!,2024-11-28,2024-12-04
...,...,...,...,...,...
1177,-,-,Tour de los misterios y leyendas de Barcelona,2024-12-04,2024-12-10
1178,-,-,Entrada al Spotify Camp Nou,2024-12-04,2024-12-10
1179,-,-,Free tour de los misterios y leyendas del Barr...,2024-12-04,2024-12-10
1180,-,-,"Excursión a Gerona, Figueras y Museo Dalí",2024-12-04,2024-12-10


In [32]:
availability_activities = availability_activities.explode("available_times")
availability_activities

Unnamed: 0,available_days,available_times,activity_name,activity_date_range_start,activity_date_range_end
0,30,9:00,Tour privado por Barcelona ¡Tú eliges!,2024-11-28,2024-12-04
0,30,10:00,Tour privado por Barcelona ¡Tú eliges!,2024-11-28,2024-12-04
0,30,11:00,Tour privado por Barcelona ¡Tú eliges!,2024-11-28,2024-12-04
0,30,12:00,Tour privado por Barcelona ¡Tú eliges!,2024-11-28,2024-12-04
0,30,13:00,Tour privado por Barcelona ¡Tú eliges!,2024-11-28,2024-12-04
...,...,...,...,...,...
1177,-,-,Tour de los misterios y leyendas de Barcelona,2024-12-04,2024-12-10
1178,-,-,Entrada al Spotify Camp Nou,2024-12-04,2024-12-10
1179,-,-,Free tour de los misterios y leyendas del Barr...,2024-12-04,2024-12-10
1180,-,-,"Excursión a Gerona, Figueras y Museo Dalí",2024-12-04,2024-12-10


In [33]:
import datetime

In [34]:
availability_activities["activity_date_range_start"] = pd.to_datetime(availability_activities["activity_date_range_start"])
availability_activities["activity_date_range_end"] = pd.to_datetime(availability_activities["activity_date_range_end"])

availability_activities[["available_days","available_times"]] = availability_activities[["available_days","available_times"]].replace("-",np.nan)

availability_activities["available_times"] = availability_activities["available_times"].apply(lambda x: datetime.datetime.strptime(x, "%H:%M").time() if pd.notna(x) else pd.NaT)


def calculate_adjusted_date(row):
    # return Not a Time if nan
    if pd.isna(row["available_days"]):
        return pd.NaT
    
    # convert from string
    day = int(row["available_days"]) 

    # if available date is bigger than end range, belongs to first month. Else second.
    if day > row["activity_date_range_end"].day:
        return row["activity_date_range_start"].replace(day=day)
    else:
        return row["activity_date_range_end"].replace(day=day)


availability_activities["available_date"] = availability_activities.apply(calculate_adjusted_date, axis=1)

availability_activities.drop(columns="available_days",inplace=True)



In [35]:
availability_activities.dtypes

available_times                      object
activity_name                        object
activity_date_range_start    datetime64[ns]
activity_date_range_end      datetime64[ns]
available_date               datetime64[ns]
dtype: object

# 3. Save for data load

## 3.1 Itineraries

In [36]:
import os

In [37]:
os.makedirs("../data/flights/transformed/",exist_ok=True)

In [38]:
itineraries.to_parquet("../data/flights/transformed/itineraries.parquet")

## 3.2 Accommodations

### 3.2.1 Accommodations - booking

In [39]:
os.makedirs("../data/accommodations/transformed/",exist_ok=True)

In [40]:
booking_df.to_parquet("../data/accommodations/transformed/booking.parquet")

## 3.3 Activities

Activities has 2 tables, the activities per se and their available times and dates.

In [41]:
os.makedirs("../data/activities/transformed/",exist_ok=True)

In [42]:
activities_df.to_parquet("../data/activities/transformed/activities.parquet")
availability_activities.to_parquet("../data/activities/transformed/availabilities.parquet")